## Standard Imports

In [2]:
import pandas as pd
import numpy as np
import scipy.stats
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import re
import seaborn as sns
import statsmodels.stats.multitest

In [3]:
import CPTAC

Loading CPTAC data:
Loading Dictionary...
Loading Clinical Data...
Loading Proteomics Data...
Loading Transcriptomics Data...
Loading CNA Data...
Loading Phosphoproteomics Data...
Loading Somatic Mutation Data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter CPTAC.embargo() to open the webpage for more details.


In [4]:
somatic_mutations = CPTAC.get_somatic()
proteomics = CPTAC.get_proteomics()
phos = CPTAC.get_phosphoproteomics()

## T-Test for affect of mutation on protein abundance

#### Use interacting proteins for protlist

In [5]:
all_proteins = list(proteomics.columns.values)
protList= ["CPEB1", "JTB", "TACC1", "TPX2", "PPP2CA","PP1","PPP1CA","PPP1CB","PPP1CC","PPP1R2","ARHGEF2","BORA", "BRCA1", "KIF2A", "PARD3","p53","TP53","PLK1","PIFO","GADD45A","AUNIP","NIN","MLN8054","FRY","SIRT2","MYCN","FBXW7","MYCN","HNRNPU","TACC3","UBE2C","BIRC5","DLGAP5","TPX2","CDK1","CDC20","CENPA","CCNB2"]

#### Perform T-Test

In [9]:
gene = 'AURKA'
tested = [];
p_vals = [];
for protein in protList: 
    if protein in proteomics.columns:
        tested.append(protein)
        cross = CPTAC.compare_mutations(proteomics, protein, gene)
        #print(cross)
        cross = cross[:100]
        cross = cross[["Mutation", protein]].dropna(axis=0)
        mutated = cross.loc[cross["Mutation"] != "Wildtype"]
        print(mutated)
        wt = cross.loc[cross["Mutation"] == "Wildtype"]
        ttest = scipy.stats.ttest_ind(mutated[protein], wt[protein])
        p_vals.append(ttest[1])

               Mutation    JTB
S055  Missense_Mutation  0.438
S059  Missense_Mutation  0.206
               Mutation  TACC1
S055  Missense_Mutation  0.077
S059  Missense_Mutation -0.361
               Mutation   TPX2
S055  Missense_Mutation  0.179
S059  Missense_Mutation  0.850
               Mutation  PPP2CA
S055  Missense_Mutation -0.0716
S059  Missense_Mutation -0.2550
               Mutation  PPP1CA
S055  Missense_Mutation   0.104
S059  Missense_Mutation   0.323
               Mutation  PPP1CB
S055  Missense_Mutation  -0.373
S059  Missense_Mutation   0.353
               Mutation  PPP1CC
S055  Missense_Mutation -0.0832
S059  Missense_Mutation -0.1480
               Mutation  PPP1R2
S055  Missense_Mutation -0.0101
S059  Missense_Mutation -0.1710
               Mutation  ARHGEF2
S055  Missense_Mutation   0.0549
S059  Missense_Mutation   0.0355
               Mutation  KIF2A
S055  Missense_Mutation  0.551
S059  Missense_Mutation  0.181
               Mutation  PARD3
S055  Missense_Mut

### See if P-values are significant using fdr correction

In [19]:
testersss = CPTAC.compare_mutations(proteomics, 'PTEN', 'PTEN')
#testersss.head(10)
mutated222 = testersss.loc[testersss["Mutation"] != "Wildtype"]
print(len(mutated222))

78


In [7]:
areSignificant = statsmodels.stats.multitest.fdrcorrection(p_vals)[0]
pvals = statsmodels.stats.multitest.fdrcorrection(p_vals)[1]
significant_proteins = np.array(tested)[np.array(areSignificant)]
significant_pvals = np.array(pvals)[np.array(areSignificant)]

  reject = pvals_sorted <= ecdffactor*alpha


### Print Significant P-values and their corresponding proteins

In [None]:
print(significant_proteins)
print(significant_pvals)

# Test for phosphorylation levels

#### We will use all proteins again

In [None]:
phosProtList = protList

In [None]:
sites = phos.columns
p_values = []
site_names = []

for protein in phosProtList:
    pattern = re.compile(protein)
    isInList = filter(pattern.search, sites)
    if next(isInList, None) is not None:
        phosphositesdf = CPTAC.compare_mutations(phos, protein, gene)
        phosphositesdf = phosphositesdf.loc[phosphositesdf['Patient_Type'] == 'Tumor'].drop('Patient_Type', axis = 1)
        for site in phosphositesdf.columns:
            if (site is not 'Mutation'):
                sitedf = CPTAC.compare_mutations(phos, site, gene)
                mutateddf = sitedf.loc[sitedf['Mutation'] != 'Wildtype'].dropna(axis=0)
                wtdf = sitedf.loc[sitedf['Mutation'] == 'Wildtype'].dropna(axis=0)
                ttest = scipy.stats.ttest_ind(mutateddf[site], wtdf[site])
                p_values.append(ttest[1])
                site_names.append(site)     



#We need to remove all 'nan' p-values and their corresponding site names before passing it in for the fdr correction
indexesToRemove=[]

for index in range(0, len(p_values)):
    if np.isnan(p_values[index]):
        indexesToRemove.append(index)

for rem in range( len(indexesToRemove)-1, -1, -1):
    p_values.pop(indexesToRemove[rem])
    site_names.pop(indexesToRemove[rem])
#p_values and site names have now had all entries removed where the corresponding p-value is 'nan'

print(p_values)
print(site_names)

sum = 0
for p in p_values:
    sum += p
avg = sum/len(p_values)

print("AVG p_val = ", avg)

index_min = p_values.index(min(p_values))
print("INDEX MIN: ", index_min, " PVAL: ", p_values[index_min])
print("SITE OF MIN: ", site_names[index_min])

### See significance of P-Values using Bonferroni correction

In [None]:
threshold_pval = .05/len(site_names)
print("threshold_pval: ", threshold_pval)
bonferonni_corrected_pvals = list()
bonferonni_sig_sites = list()

for ind in range(0, len(p_values)):
    if p_values[ind] <= threshold_pval:
        bonferonni_corrected_pvals.append(p_values[ind])
        bonferonni_sig_sites.append(site_names[ind])
        
#print("\nBonferonni corrected significant Pvals: ", bonferonni_corrected_pvals, "\n")
        


### Print significant sites and pvalues 

In [None]:
print("\nSignificant Sites: ", bonferonni_sig_sites)
print("\nSignificant P-Values: ", bonferonni_corrected_pvals)
#print(significant_vals)

### Now try FDR correction

In [None]:
pvalues = statsmodels.stats.multitest.fdrcorrection(p_values,alpha=0.05, method='indep')[1]         
areSignificant = statsmodels.stats.multitest.fdrcorrection(p_values,alpha=0.05, method='indep')[0]

significant_sites = np.array(site_names)[np.array(areSignificant)]
significant_pvalues = np.array(pvalues)[np.array(areSignificant)]

significant_vals = dict(zip(significant_sites, significant_pvalues))

### Print significant sites and pvalues

In [None]:
print(significant_sites)
print(significant_pvalues)
print(significant_vals)