## Standard Imports

In [1]:
import pandas as pd
import numpy as np
import scipy.stats
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import re
import seaborn as sns
import statsmodels.stats.multitest

In [2]:
import CPTAC

Loading CPTAC data:
Loading Dictionary...
Loading Clinical Data...
Loading Proteomics Data...
Loading Transcriptomics Data...
Loading CNA Data...
Loading Phosphoproteomics Data...
Loading Somatic Mutation Data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter CPTAC.embargo() to open the webpage for more details.


In [3]:
somatic_mutations = CPTAC.get_somatic()
proteomics = CPTAC.get_proteomics()
phos = CPTAC.get_phosphoproteomics()

## T-Test for affect of mutation on protein abundance

#### Use interacting proteins for protlist

In [4]:
all_proteins = list(proteomics.columns.values)
protList= ["CPEB1", "JTB", "TACC1", "TPX2", "PPP2CA","PP1","PPP1CA","PPP1CB","PPP1CC","ARHGEF2","BORA", "BRCA1", "KIF2A", "PARD3","p53","TP53","PLK1","PIFO","GADD45A","AUNIP","NIN","FRY","SIRT2","MYCN","FBXW7","MYCN","HNRNPU","TACC3","UBE2C","BIRC5","DLGAP5","TPX2","CDK1","CDC20","CENPA","CCNB2"]

#### Perform T-Test

In [5]:
gene = 'AURKA'
tested = [];
p_vals = [];
for protein in protList: 
    if protein in proteomics.columns:
        tested.append(protein)
        cross = CPTAC.compare_mutations(proteomics, protein, gene)
        cross = cross[:100]
        cross = cross[["Mutation", protein]].dropna(axis=0)
        mutated = cross.loc[cross["Mutation"] != "Wildtype"]
        wt = cross.loc[cross["Mutation"] == "Wildtype"]
        ttest = scipy.stats.ttest_ind(mutated[protein], wt[protein])
        p_vals.append(ttest[1])

  **kwargs)
  ret = ret.dtype.type(ret / rcount)


### See if P-values are significant using fdr correction

In [6]:
areSignificant = statsmodels.stats.multitest.fdrcorrection(p_vals)[0]
pvals = statsmodels.stats.multitest.fdrcorrection(p_vals)[1]
significant_proteins = np.array(tested)[np.array(areSignificant)]
significant_pvals = np.array(pvals)[np.array(areSignificant)]

  reject = pvals_sorted <= ecdffactor*alpha


### Print Significant P-values and their corresponding proteins

In [7]:
print(significant_proteins)
print(significant_pvals)

[]
[]


# Test for phosphorylation levels

#### We will use all proteins again

In [8]:
phosProtList = protList

In [9]:
sites = phos.columns
p_values = []
site_names = []

for protein in phosProtList:
    pattern = re.compile(protein)
    isInList = filter(pattern.search, sites)
    if next(isInList, None) is not None:
        phosphositesdf = CPTAC.compare_mutations(phos, protein, gene)
        phosphositesdf = phosphositesdf.loc[phosphositesdf['Patient_Type'] == 'Tumor'].drop('Patient_Type', axis = 1)
        for site in phosphositesdf.columns:
            if (site is not 'Mutation'):
                sitedf = CPTAC.compare_mutations(phos, site, gene)
                mutateddf = sitedf.loc[sitedf['Mutation'] != 'Wildtype'].dropna(axis=0)
                wtdf = sitedf.loc[sitedf['Mutation'] == 'Wildtype'].dropna(axis=0)
                ttest = scipy.stats.ttest_ind(mutateddf[site], wtdf[site])
                p_values.append(ttest[1])
                site_names.append(site)     



#We need to remove all 'nan' p-values and their corresponding site names before passing it in for the fdr correction
indexesToRemove=[]

for index in range(0, len(p_values)):
    if np.isnan(p_values[index]):
        indexesToRemove.append(index)

for rem in range( len(indexesToRemove)-1, -1, -1):
    p_values.pop(indexesToRemove[rem])
    site_names.pop(indexesToRemove[rem])
#p_values and site names have now had all entries removed where the corresponding p-value is 'nan'

print(p_values)
print(site_names)

sum = 0
for p in p_values:
    sum += p
avg = sum/len(p_values)

print("AVG p_val = ", avg)

[0.36880837022107027, 0.10533615483605772, 0.5282239026568312, 0.4311617130222527, 0.8312987286021523, 0.07610858945399845, 0.007631874847793643, 0.7249169741034017, 0.28985452615455715, 0.8237271194551231, 0.0001777351529621046, 0.40270655377963716, 0.2059011382154486, 0.06585936645773272, 0.5818716760793893, 0.2515228110968836, 0.21315643688446625, 0.4553339079988249, 0.4047117808523285, 0.03742601870980346, 0.4745940086737651, 0.6362242536366376, 0.09328183427244217, 0.9562119275034813, 0.7858286613654919, 0.943390438783714, 0.20665283912695223, 0.7859634639999156, 0.5551019843790959, 0.5968201534228672, 0.34744657100684784, 0.29845728852361253, 0.6473246067912157, 0.871639076222802, 0.8638625569939554, 0.09494909554478458, 0.49193843126479586, 0.801749584188487, 0.7556114822319573, 0.10099558527138851, 0.2266563732415546, 0.9588667324489621, 0.8552105849383433, 0.9199559800729479, 0.3773713097840439, 0.07555741258834422, 0.8456948727537471, 0.3237032322354991, 0.17284853117884258, 

### See significance of P-Values using Bonferroni correction

In [10]:
threshold_pval = .05/len(site_names)
print("threshold_pval: ", threshold_pval)
bonferonni_corrected_pvals = list()
bonferonni_sig_sites = list()

for ind in range(0, len(p_values)):
    if p_values[ind] <= threshold_pval:
        bonferonni_corrected_pvals.append(p_values[ind])
        bonferonni_sig_sites.append(site_names[ind])
        
#print("\nBonferonni corrected significant Pvals: ", bonferonni_corrected_pvals, "\n")
        


threshold_pval:  0.001388888888888889


### Print significant sites and pvalues 

In [11]:
print("\nSignificant Sites: ", bonferonni_sig_sites)
print("\nSignificant P-Values: ", bonferonni_corrected_pvals)
#print(significant_vals)


Significant Sites:  ['PHLPP1-S317']

Significant P-Values:  [0.0001777351529621046]


### Now try FDR correction

In [12]:
pvalues = statsmodels.stats.multitest.fdrcorrection(p_values,alpha=0.05, method='indep')[1]         
areSignificant = statsmodels.stats.multitest.fdrcorrection(p_values,alpha=0.05, method='indep')[0]

significant_sites = np.array(site_names)[np.array(areSignificant)]
significant_pvalues = np.array(pvalues)[np.array(areSignificant)]

significant_vals = dict(zip(significant_sites, significant_pvalues))

### Print significant sites and pvalues

In [13]:
print(significant_sites)
print(significant_pvalues)
print(significant_vals)

[]
[]
{}


In [14]:
print(len(phosProtList))

36
