# Statistical Testing Templates for CPTAC Data

<b>Standard imports for playing with and plotting data frames.</b>

In [3]:
import pandas as pd
import numpy as np
import scipy.stats
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import re
import seaborn as sns
import statsmodels.stats.multitest
#import altair as alt
#alt.renderers.enable('notebook') #Necessary for Altair to work

In [4]:
import CPTAC

Loading CPTAC data:
Loading Dictionary...
Loading Clinical Data...
Loading Proteomics Data...
Loading Transcriptomics Data...
Loading CNA Data...
Loading Phosphoproteomics Data...
Loading Somatic Mutation Data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter CPTAC.embargo() to open the webpage for more details.


In [5]:
# Pull the three data tables used throughout this notebook from the CPTAC package.
somatic_mutations = CPTAC.get_somatic()
proteomics = CPTAC.get_proteomics()
phos = CPTAC.get_phosphoproteomics()

# Show the phosphoproteomics table (columns are named like 'AAAS-S495').
print(phos)

# Every column name of the proteomics table, as a plain list; assign this to
# protList to test all proteins rather than only the interacting ones.
all_proteins = proteomics.columns.tolist()

idx   AAAS-S495  AAAS-S541  AAAS-Y485  AACS-S618  AAED1-S12  AAGAB-S310  \
S001        NaN        NaN        NaN    -0.8810    -1.8100         NaN   
S002        NaN        NaN        NaN        NaN     0.0840         NaN   
S003   -0.20200        NaN        NaN        NaN    -1.8800         NaN   
S004    0.25000        NaN        NaN        NaN        NaN         NaN   
S005   -0.00200        NaN     -0.407        NaN        NaN         NaN   
S006    0.55600    -0.0461        NaN        NaN     0.9410         NaN   
S007        NaN        NaN        NaN        NaN     0.0796         NaN   
S008    0.30000        NaN        NaN    -0.3710    -1.1400         NaN   
S009    0.49000        NaN        NaN        NaN        NaN         NaN   
S010   -0.23100     0.0108        NaN        NaN     0.2420         NaN   
S011    0.26800        NaN        NaN        NaN    -0.1120         NaN   
S012   -0.06200        NaN        NaN     1.8600    -1.4000         NaN   
S013   -1.49000        Na

## t-test for effects of all mutations combined
<b>Note:</b> You can put any number of proteins in the list; the mutation status of the chosen gene is compared against the protein levels of each of them in turn.

### List of interacting proteins (according to STRING and Uniprot)

In [9]:
#Build the protein list; this may have only a single protein if desired
#Each entry is looked up as a column name of the proteomics table by the t-test loop
#below; entries that are not present there are skipped.
#Alternative 1: a larger hand-picked list of proteins
#protList = ['IRS1', 'IRS2', 'RRAS', 'AKT2', 'NRAS', 'PTEN', 'AKT1', 'MRAS', 'HRAS', 'RPS6KB1', 'PIK3R1', 'PKC', 'MTOR', 'S6K', 'MAPK', 'ERBB3', 'P85A', 'P55G', 'CDK5']
#Alternative 2: every protein in the proteomics table (all_proteins is built in the data-loading cell)
#protList = all_proteins
protList = ['AHNAK','PPP1R12A']

In [10]:
# For each protein in protList, t-test its abundance in samples where `gene` is
# mutated against samples where it is wildtype.
#
# Results:
#   tested -- proteins for which a usable (non-NaN) p-value was obtained
#   p_vals -- the raw, uncorrected p-values, aligned index-for-index with `tested`
gene = 'PIK3CA'
tested = []
p_vals = []
for protein in protList:
    # Proteins without a column in the proteomics table cannot be tested.
    if protein not in proteomics.columns:
        continue

    cross = CPTAC.compare_mutations(proteomics, protein, gene)
    # NOTE(review): only the first 100 rows are kept -- presumably these are the
    # tumor samples; confirm. The phosphoproteomics cell further down selects
    # tumors with Patient_Type == 'Tumor' instead of a positional slice.
    cross = cross[:100]
    cross = cross[["Mutation", protein]].dropna(axis=0)

    is_wildtype = cross["Mutation"] == "Wildtype"
    mutated = cross.loc[~is_wildtype]
    wt = cross.loc[is_wildtype]

    ttest = scipy.stats.ttest_ind(mutated[protein], wt[protein])
    p_value = ttest[1]

    # A NaN p-value (e.g. one of the two groups is empty) is not valid input for
    # the multiple-testing correction that follows, so leave the protein out of
    # both lists -- the same policy the phosphosite cell applies to its p-values.
    if np.isnan(p_value):
        continue

    tested.append(protein)
    p_vals.append(p_value)

In [11]:
# FDR correction (statsmodels defaults) across all tested proteins.
# fdrcorrection returns a pair (reject flags, corrected p-values); unpack it
# once instead of running the whole correction twice.
areSignificant, pvals = statsmodels.stats.multitest.fdrcorrection(p_vals)

# Keep only the proteins (and their corrected p-values) flagged as significant.
significant_proteins = np.array(tested)[areSignificant]
significant_pvals = pvals[areSignificant]

In [12]:
# Show the significant proteins first, then their FDR-corrected p-values,
# one array per output line.
for result in (significant_proteins, significant_pvals):
    print(result)

[]
[]


### Test for phosphorylation levels (mutation in any gene with phosphorylation of any protein)

In [13]:
#List of proteins (will test all phosphorylation sites on these proteins)
#This is a plain assignment, not a copy: phosProtList refers to the same list object
#as protList. Replace it with a separate list to test a different set of proteins.
phosProtList = protList

In [24]:
# For every phosphosite belonging to a protein in phosProtList, t-test the site's
# phosphorylation level in samples where `gene` is mutated against wildtype samples.
#
# Results:
#   p_values   -- raw (uncorrected) p-values, NaN results excluded
#   site_names -- the phosphosite column names, aligned index-for-index with p_values
#   avg        -- mean of p_values (NaN when nothing could be tested)

# A site is only tested when more than this many mutated samples remain after
# rows with missing values are dropped.
MIN_MUTATED_SAMPLES = 20

sites = phos.columns
p_values = []
site_names = []

for protein in phosProtList:
    # Skip proteins that match no phosphosite column. The protein name is used as
    # an unanchored regular expression, so it matches anywhere in a column name.
    pattern = re.compile(protein)
    if not any(pattern.search(column) for column in sites):
        continue

    # Collect the protein's phosphosite columns, restricted to tumor samples.
    phosphositesdf = CPTAC.compare_mutations(phos, protein, gene)
    phosphositesdf = phosphositesdf.loc[phosphositesdf['Patient_Type'] == 'Tumor'].drop('Patient_Type', axis=1)

    for site in phosphositesdf.columns:
        # 'Mutation' is the grouping column, not a phosphosite. Compare by value:
        # an identity test (`is not`) against a string literal is not reliable.
        if site == 'Mutation':
            continue

        # NOTE(review): this per-site frame is not restricted to tumor samples the
        # way phosphositesdf is -- confirm whether that is intended.
        sitedf = CPTAC.compare_mutations(phos, site, gene)
        is_wildtype = sitedf['Mutation'] == 'Wildtype'
        mutateddf = sitedf.loc[~is_wildtype].dropna(axis=0)
        wtdf = sitedf.loc[is_wildtype].dropna(axis=0)

        if len(mutateddf) <= MIN_MUTATED_SAMPLES:
            continue

        p_value = float(scipy.stats.ttest_ind(mutateddf[site], wtdf[site])[1])

        # NaN p-values must not reach the Bonferroni / FDR steps below, so the
        # site is dropped here together with its name.
        if np.isnan(p_value):
            continue

        p_values.append(p_value)
        site_names.append(site)

print(p_values)
print(site_names)

# Mean p-value. Guard the empty case instead of dividing by zero, and avoid
# rebinding the builtin `sum` as an accumulator variable.
avg = float(np.mean(p_values)) if p_values else float('nan')

print("AVG p_val = ", avg)
        


  **kwargs)
  ret = ret.dtype.type(ret / rcount)


[0.03698162328955382, 0.06537114967136168, 0.6482265109802732, 0.7437593229334798, 0.20406242970801175, 0.31254706751687333, 0.07208096894115837, 0.7902480128721662, 0.8262950543597634, 0.11665125432611836, 0.9214644898859095, 0.1389827479340972, 0.1429078869830693, 0.2657174032109103, 0.16761277093936472, 0.5319595272895374, 0.39008183165945076, 0.1749412698064083, 0.27175015292636706, 0.30223087274684357, 0.0886133317420495, 0.14574103344918116, 0.002096691367289287, 0.08696314048080471, 0.15483950969356947, 0.04143886092030965, 0.11418235543821204, 0.0213526361208086, 0.12138171266843717, 0.32146684684268606, 0.42032817210343676, 0.8133055628506575, 0.025178358290441474, 0.5877701736307979, 0.37069679998046545, 0.14369426295145668, 0.4867198564855847, 0.5500492392283083, 0.38451235689436125, 0.8084685609644895, 0.5589794591971767, 0.11546797527952174, 0.07630875935520931, 0.07785816024195638, 0.4127812728193687, 0.06862053151997978, 0.7935866027057291, 0.4652518729254843, 0.58626954

### Assessing significance of p-values using Bonferroni correction

In [25]:
# Bonferroni correction: a site is significant when its raw p-value is at or
# below ALPHA divided by the number of tests performed.
ALPHA = .05

# max(..., 1) keeps the cell from raising ZeroDivisionError when no site was
# tested; in that case there is nothing to compare against the threshold anyway.
threshold_pval = ALPHA / max(len(site_names), 1)
print("threshold_pval: ", threshold_pval)

# Note: despite its name, bonferonni_corrected_pvals holds the *raw* p-values of
# the sites that pass the Bonferroni threshold; the values are not rescaled.
bonferonni_corrected_pvals = []
bonferonni_sig_sites = []
for site_name, p_value in zip(site_names, p_values):
    if p_value <= threshold_pval:
        bonferonni_corrected_pvals.append(p_value)
        bonferonni_sig_sites.append(site_name)

# Mapping of significant site name -> its p-value.
bf_significant_vals = dict(zip(bonferonni_sig_sites, bonferonni_corrected_pvals))

threshold_pval:  0.00032679738562091506


### Print significant p-values

In [26]:
# List each Bonferroni-significant site with its p-value, ordered by site name.
# (To find the single most significant site, use
#  min(bf_significant_vals, key=bf_significant_vals.get) when the dict is non-empty.)
for site_name, p_value in sorted(bf_significant_vals.items()):
    print(site_name, p_value)

# Counts: significant sites, their p-values (same number), and all tested sites.
for collection in (bonferonni_sig_sites, bonferonni_corrected_pvals, site_names):
    print(len(collection))

0
0
153


### Use FDR Correction

In [23]:
# FDR correction over all tested phosphosites.
# fdrcorrection returns a pair (reject flags, corrected p-values); unpack it once
# rather than running the identical correction twice.
areSignificant, pvalues = statsmodels.stats.multitest.fdrcorrection(p_values, alpha=0.05, method='indep')

# Keep only the sites flagged as significant, with their corrected p-values.
significant_sites = np.array(site_names)[areSignificant]
significant_pvalues = pvalues[areSignificant]
significant_vals = dict(zip(significant_sites, significant_pvalues))


print("\nSignificant P-vals (FDR): ", significant_pvalues)
print("\nSignificant sites (FDR): ", significant_sites)
print("\nSignificant values (FDR): ", significant_vals)

print(len(significant_sites))


Significant P-vals (FDR):  []

Significant sites (FDR):  []

Significant values (FDR):  {}
0


### Plot phosphorylation levels and gene mutation
<b>Note:</b> There may be fewer data points due to NA values

In [2]:
#Specify the gene and the site; you may use a string to specify the site or reference the significant results above
#To plot the most significant site from the FDR cell instead (only when it found any):
#    site = significant_sites[np.argmin(significant_pvalues)]
gene = 'PIK3CA'
site = "AKAP12-S629"

#Build the dataframe for plotting: the site's phosphorylation level plus the
#'Mutation' status of the gene (the column used for the x axis below)
genedf = CPTAC.compare_mutations(phos, site, gene)

#Draw the box plot and the individual data points once, on one explicit Axes.
#(Previously the same four plotting calls were repeated, drawing everything a
#second time on top of the first plot.)
fig, ax = plt.subplots()
sns.boxplot(data=genedf, x="Mutation", y=site, ax=ax)
sns.stripplot(data=genedf, x="Mutation", y=site, jitter=True, color=".3", ax=ax)
ax.set_title(gene + " gene mutation and " + site + " phosphorylation levels")
ax.set(xlabel="Somatic Gene Mutation", ylabel="Phosphoproteomics")

#Keep the original name bound to the Axes for any later cell that refers to it
phos_boxplot = ax

NameError: name 'CPTAC' is not defined