# Statistical Testing Templates for CPTAC Data

<b>Standard imports for playing with and plotting data frames.</b>

In [1]:
import pandas as pd
import numpy as np
import scipy.stats
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import re
import seaborn as sns
import statsmodels.stats.multitest
import copy
#import altair as alt
#alt.renderers.enable('notebook') #Necessary for Altair to work

In [2]:
import CPTAC

Loading CPTAC data:
Loading Dictionary...
Loading Clinical Data...
Loading Proteomics Data...
Loading Transcriptomics Data...
Loading CNA Data...
Loading Phosphoproteomics Data...
Loading Somatic Mutation Data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter CPTAC.embargo() to open the webpage for more details.


In [3]:
# Pull the data tables used by this notebook out of the CPTAC package.
proteomics = CPTAC.get_proteomics()
phos = CPTAC.get_phosphoproteomics()
somatic_mutations = CPTAC.get_somatic()

# Candidate proteins: every column of the proteomics table, rather than
# restricting the search to known interacting proteins.
all_proteins = list(proteomics.columns.values)

### Test for phosphorylation levels (mutation in a chosen gene vs. phosphorylation of every site on any protein)

In [4]:
# Proteins to test: every phosphorylation site found on each of these proteins
# is compared between mutated and wildtype samples.
# NOTE: this binds a second name to the same list object as all_proteins;
# it is not a copy.
phosProtList = all_proteins

In [None]:
# For every phosphorylation site on every candidate protein, run a two-sample
# t-test comparing tumor samples that carry a mutation in `gene` against tumor
# samples that are wildtype for it. Results accumulate in two parallel lists:
# site_names[i] is the site whose p-value is p_values[i].
sites = phos.columns
p_values = []
site_names = []
gene = 'PIK3CA'


def _tumor_only(frame):
    """Return the rows whose Patient_Type is 'Tumor', without that column."""
    return frame.loc[frame['Patient_Type'] == 'Tumor'].drop('Patient_Type', axis=1)


for protein in phosProtList:
    # Skip proteins that have no phosphosite column at all. The name is escaped
    # so it is matched literally instead of being interpreted as a regex.
    pattern = re.compile(re.escape(protein))
    if not any(pattern.search(site) for site in sites):
        continue

    phosphositesdf = _tumor_only(CPTAC.compare_mutations(phos, protein, gene))

    for site in phosphositesdf.columns:
        # 'Mutation' is the grouping label, not a measurement. Compare by
        # value: `is not` tests object identity, which is not guaranteed to
        # distinguish (or match) equal strings.
        if site == 'Mutation':
            continue

        sitedf = _tumor_only(CPTAC.compare_mutations(phos, site, gene))
        mutateddf = sitedf.loc[sitedf['Mutation'] != 'Wildtype'].dropna(axis=0)
        wtdf = sitedf.loc[sitedf['Mutation'] == 'Wildtype'].dropna(axis=0)

        _statistic, p_value = scipy.stats.ttest_ind(mutateddf[site], wtdf[site])

        # ttest_ind yields NaN when it cannot compute a statistic (for example
        # when one group has no observations). Those results are dropped here,
        # so p_values and site_names never contain NaN entries and need no
        # further cleanup before being passed to an FDR correction.
        if not np.isnan(p_value):
            p_values.append(p_value)
            site_names.append(site)

print(p_values)
print(site_names)
        


  **kwargs)
  ret = ret.dtype.type(ret / rcount)


## Get the N most significant phosphorylation sites

In [None]:
# Number of lowest-p-value sites to carry forward.
N = 2500

# Work on independent copies so the original result lists are left untouched.
copy_site_names = copy.deepcopy(site_names)
copy_p_values = copy.deepcopy(p_values)

# Map site name -> p-value, then order the (site, p-value) pairs by ascending
# p-value.
pvalDict = {name: p_value for name, p_value in zip(copy_site_names, copy_p_values)}
orig_sorted_dict = sorted(pvalDict.items(), key=lambda pair: pair[1])

# Despite its name, sorted_dict is a list of (site, p-value) tuples: the first
# N entries of the ordered list.
sorted_dict = copy.deepcopy(orig_sorted_dict)[:N]

print(sorted_dict)

In [None]:
# Effect size for each selected site: mean value in mutated tumor samples
# minus mean value in wildtype tumor samples, in the same order as sorted_dict.
gene = "PIK3CA"
diffMeans = []
for site, _p_value in sorted_dict:
    tumor_df = CPTAC.compare_mutations(phos, site, gene)
    tumor_df = tumor_df.loc[tumor_df["Patient_Type"] == "Tumor"].drop("Patient_Type", axis = 1)

    # Split the tumor samples by mutation status.
    is_mutated = tumor_df["Mutation"] != "Wildtype"
    is_wildtype = tumor_df["Mutation"] == "Wildtype"

    mean_mutated = np.mean(tumor_df.loc[is_mutated][site])
    mean_wildtype = np.mean(tumor_df.loc[is_wildtype][site])
    diffMeans.append(mean_mutated - mean_wildtype)

In [None]:
# Put each selected p-value on a -log10 scale, keeping sorted_dict's order.
pvals = [-1 * np.log10(p_value) for _site, p_value in sorted_dict]

# Pair each mean difference with its transformed p-value for inspection.
# Keys are the mean differences, so equal differences collapse to one entry.
pvalsAndMeanDiffDict = dict(zip(diffMeans, pvals))
print(pvalsAndMeanDiffDict)

In [None]:
# Two-column table for plotting: one row per selected site.
df = pd.DataFrame(
    data={
        '-Log10(PValue)': pvals,
        'Mean Difference': diffMeans,
    }
)

In [None]:
# Scatter of effect size (x) against significance (y) for the selected sites.
ax = sns.scatterplot(x="Mean Difference", y="-Log10(PValue)", data=df)

# Build the title from `gene` so it stays correct if the analysis is rerun for
# a different gene; for gene == "PIK3CA" the text is unchanged.
ax.set_title(
    "Difference in mean phosphorylation levels when " + str(gene)
    + " is mutated (mutated mean - wildtype mean) for top "
    + str(len(pvals)) + " phosphorylation sites"
)