   # TP53: Effects of Mutation on Interacting Proteins

<b>Standard imports for playing with and plotting data frames.</b>

In [1]:
import pandas as pd
import numpy as np
import re
import scipy.stats
import statsmodels.stats.multitest
import matplotlib.pyplot as plt
import seaborn as sns

<b>Import CPTAC data</b>

In [2]:
import CPTAC

Loading Dictionary...
Loading Clinical Data...
Loading Proteomics Data...
Loading Transcriptomics Data...
Loading CNA Data...
Loading Phosphoproteomics Data...
Loading Somatic Mutation Data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter CPTAC.embargo() to open the webpage for more details.


In [3]:
somatic_mutations = CPTAC.get_somatic()
proteomics = CPTAC.get_proteomics()
phos = CPTAC.get_phosphoproteomics()

### List of proteins that interact with TP53 (according to Uniprot and String)

In [4]:
gene = 'TP53'
protList = ['TGFB1', 'AXIN1', 'EP300', 'HRMT1L2', 'CARM1', 'TAF1', 'ING4', 'CABLES1', 
            'TP73', 'HIPK1', 'HIPK2', 'TP53INP1', 'TP53BP', 'WWOX', 'HCV', 'USP7', 'SYVN1', 'HSP90AB1',
           'CHD8', 'ARMC10', 'BANP', 'CDKN2AIP', 'NUAK1', 'STK11', 'LKB1', 'UHRF2', 'E4F1', 'YWHAZ',
           'MAML1', 'MKRN1', 'PML', 'MDM2', 'FBXO42', 'ATM', 'PP2R2A', 'AURKA', 'DAXX', 'BRD7', 'TRIM24',
           'L3MBTL1', 'GRK5', 'CAK', 'PTK2B', 'PYK2', 'MDM2', 'PRKCG', 'PPIF', 'KAT6A', 'UBC9', 'ZNF385B',
           'ZNF385A', 'ANKRD2', 'RFFL', 'RNF34', 'MTA1', 'COP1', 'CCAR2', 'MORC3', 'POU4F2', 'AFG1L', 'UBD',
           'TAF6', 'HPV', 'UL123', 'E1B-55K', 'BAX', 'FAS', 'BCL2', 'CREBBP', 'CDK2', 'MDM2', 'CDKN2A', 'ATM',
            'CCNB1', 'TERT', 'IGF1R', 'ALB', 'MMP1']

## t-test for effects of missense mutations

In [5]:
tested = [];
p_vals = [];
for protein in protList: 
    if protein in proteomics.columns:
        tested.append(protein)
        cross = CPTAC.compare_mutations(proteomics, protein, gene)
        cross = cross.loc[cross['Patient_Type'] == 'Tumor'][["Mutation", protein]].dropna(axis=0)
        mutated = cross.loc[cross["Mutation"] == "Missense_Mutation"]
        wt = cross.loc[cross["Mutation"] == "Wildtype"]
        ttest = scipy.stats.ttest_ind(mutated[protein], wt[protein])
        p_vals.append(ttest[1])

In [6]:
areSignificant = statsmodels.stats.multitest.fdrcorrection(p_vals)[0]
pvals = statsmodels.stats.multitest.fdrcorrection(p_vals)[1]
significant_proteins = np.array(tested)[np.array(areSignificant)]
significant_pvals = np.array(pvals)[np.array(areSignificant)]

### List of significantly affected proteins and their respective p-values (FDR adjusted)

In [7]:
print(significant_proteins)
print(significant_pvals)

['CABLES1' 'HSP90AB1' 'CHD8' 'CDKN2AIP' 'UHRF2' 'YWHAZ' 'AURKA' 'DAXX'
 'AFG1L' 'TAF6' 'FAS' 'CDKN2A']
[0.01198163 0.02933247 0.01198163 0.03426805 0.02920811 0.01198163
 0.01036627 0.01554115 0.03943006 0.03426805 0.00441439 0.02874655]


## t-test for effects of frame shift and nonsense mutations (truncating)

In [8]:
tested = [];
p_vals = [];
for protein in protList: 
    if protein in proteomics.columns:
        tested.append(protein)
        cross = CPTAC.compare_mutations(proteomics, protein, gene)
        cross = cross.loc[cross['Patient_Type'] == 'Tumor'][["Mutation", protein]].dropna(axis=0)
        mutated = cross.loc[(cross["Mutation"] == "Frame_Shift_Del") | 
                            (cross["Mutation"] == "Frame_Shift_Ins") |
                            (cross["Mutation"] == 'Nonsense_Mutation')]
        wt = cross.loc[cross["Mutation"] == "Wildtype"]
        ttest = scipy.stats.ttest_ind(mutated[protein], wt[protein])
        p_vals.append(ttest[1])

In [9]:
areSignificant = statsmodels.stats.multitest.fdrcorrection(p_vals)[0]
pvals = statsmodels.stats.multitest.fdrcorrection(p_vals)[1]
significant_proteins = np.array(tested)[np.array(areSignificant)]
significant_pvals = np.array(pvals)[np.array(areSignificant)]

In [10]:
print(significant_proteins)
print(significant_pvals)

['CABLES1' 'CHD8' 'FAS' 'CDK2' 'CDKN2A' 'CCNB1']
[0.0059016  0.00027794 0.0059016  0.0408439  0.00625023 0.00866919]


## t-test for effects of missense and truncating mutations combined

In [11]:
tested = [];
p_vals = [];
for protein in protList: 
    if protein in proteomics.columns:
        tested.append(protein)
        cross = CPTAC.compare_mutations(proteomics, protein, gene)
        cross = cross.loc[cross['Patient_Type'] == 'Tumor'][["Mutation", protein]].dropna(axis=0)
        mutated = cross.loc[(cross["Mutation"] == "Frame_Shift_Del") | 
                            (cross["Mutation"] == "Frame_Shift_Ins") |
                            (cross["Mutation"] == 'Nonsense_Mutation') |
                            (cross["Mutation"] == 'Missense_Mutation')]
        wt = cross.loc[cross["Mutation"] == "Wildtype"]
        ttest = scipy.stats.ttest_ind(mutated[protein], wt[protein])
        p_vals.append(ttest[1])

In [12]:
areSignificant = statsmodels.stats.multitest.fdrcorrection(p_vals)[0]
pvals = statsmodels.stats.multitest.fdrcorrection(p_vals)[1]
significant_proteins = np.array(tested)[np.array(areSignificant)]
significant_pvals = np.array(pvals)[np.array(areSignificant)]

In [13]:
print(significant_proteins)
print(significant_pvals)

['TAF1' 'CABLES1' 'HSP90AB1' 'CHD8' 'CDKN2AIP' 'YWHAZ' 'AURKA' 'DAXX'
 'AFG1L' 'TAF6' 'FAS' 'CDK2' 'CDKN2A' 'CCNB1']
[3.34175606e-02 4.03994861e-04 3.14875026e-03 1.76268430e-04
 1.01630359e-02 1.71636933e-02 2.56707953e-03 2.56707953e-03
 2.98265751e-02 9.46604364e-03 5.39752132e-05 1.25904262e-02
 1.83899711e-03 2.56707953e-03]


## t-test for effects of all mutations

In [14]:
tested = [];
p_vals = [];
for protein in protList: 
    if protein in proteomics.columns:
        tested.append(protein)
        cross = CPTAC.compare_mutations(proteomics, protein, gene)
        cross = cross.loc[cross['Patient_Type'] == 'Tumor'][["Mutation", protein]].dropna(axis=0)
        mutated = cross.loc[cross["Mutation"] != "Wildtype"]
        wt = cross.loc[cross["Mutation"] == "Wildtype"]
        ttest = scipy.stats.ttest_ind(mutated[protein], wt[protein])
        p_vals.append(ttest[1])

In [15]:
areSignificant = statsmodels.stats.multitest.fdrcorrection(p_vals)[0]
pvals = statsmodels.stats.multitest.fdrcorrection(p_vals)[1]
significant_proteins = np.array(tested)[np.array(areSignificant)]
significant_pvals = np.array(pvals)[np.array(areSignificant)]

In [16]:
print(significant_proteins)
print(significant_pvals)

['CABLES1' 'HSP90AB1' 'CHD8' 'CDKN2AIP' 'STK11' 'YWHAZ' 'AURKA' 'DAXX'
 'AFG1L' 'FAS' 'CDK2' 'CDKN2A' 'CCNB1']
[2.78773236e-04 1.74402946e-03 2.58375695e-04 7.19163403e-03
 1.54582617e-02 4.39318009e-03 6.07336786e-04 6.07336786e-04
 1.04749693e-02 1.47599338e-05 3.77291095e-03 4.56910336e-04
 5.87412307e-04]


### Build the dataframe for plotting

In [17]:
plotdf = CPTAC.compare_mutations(proteomics, gene)
plotdf = plotdf.loc[plotdf['Patient_Type'] == 'Tumor'][['Mutation', gene]]

for protein in significantResults:
    if(protein != 'TP53'):
        proteindf = CPTAC.compare_mutations(proteomics, protein, gene)[[protein]]
        plotdf = plotdf.join(proteindf)

NameError: name 'significantResults' is not defined

In [None]:
plotdf = plotdf.replace({'Mutation': {'Frame_Shift_Del': 'Truncating',
                                      'Frame_Shift_Ins': 'Truncating',
                                      'Nonsense_Mutation': 'Truncating',
                                      'Wildtype': 'Control',
                                      'Missense_Mutation': 'Missense'
                                     }})

In [None]:
plotdf = pd.melt(plotdf, id_vars=['Mutation'], var_name='Protein', value_name='Proteomics')
plotdf = plotdf.loc[(plotdf["Mutation"] == "Control") | (plotdf["Mutation"] == "Missense") | (plotdf["Mutation"] == "Truncating")]
plotdf = plotdf.dropna(axis=0)

In [None]:
a4_dims = (23.4, 16.54)
fig, ax = plt.subplots(figsize=a4_dims)
my_pal = {"Missense": "#6C906D", "Control": "#596B94", "Truncating": "#A35857"}

#Create the plot
boxplt = sns.boxplot(x='Protein', y='Proteomics', hue='Mutation', data=plotdf, palette=my_pal)
boxplt = sns.stripplot(data=plotdf, x='Protein', y='Proteomics', hue='Mutation', dodge=True, jitter=True, color='.3')

#Add styling
boxplt.set_xlabel('')
boxplt.set_ylabel('Protein Level', fontsize='40')
boxplt.tick_params(labelsize='20')

#Adjust legend
handles, labels = ax.get_legend_handles_labels()
plt.legend(handles[0:3], ['Missense', 'Control', 'Truncating'], title='TP53 Status', fontsize='20', frameon=False)
boxplt.get_legend().get_title().set_fontsize('20')
boxplt.get_legend().set_bbox_to_anchor((0.15, 0.15, 0, 0))

#Add significance indicator: TP53/Missense
x1, x2 = -0.25, -0.05
y, h, col = 4.45, .3, 'k'
plt.plot([x1, x1, x2, x2], [y+h, y+h, y+h, y+h], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, "****", ha='center', va='bottom', color=col, fontsize='23')

#Add significance indicator: TP53/Truncation
x1, x2 = 0.05, 0.25
y, h, col = 4.25, .3, 'k'
plt.plot([x1, x1, x2, x2], [y+h, y+h, y+h, y+h], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, "ns", ha='center', va='bottom', color=col, fontsize='23')

#Add significance indicator: CABLES1/Missense
x1, x2 = 0.75, 0.95
y, h, col = 1.3, .3, 'k'
plt.plot([x1, x1, x2, x2], [y+h, y+h, y+h, y+h], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, "***", ha='center', va='bottom', color=col, fontsize='23')

#Add significance indicator: CABLES1/Truncation
x1, x2 = 1.05, 1.25
y, h, col = 0.8 + .3, .3, 'k'
plt.plot([x1, x1, x2, x2], [y+h, y+h, y+h, y+h], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, "****", ha='center', va='bottom', color=col, fontsize='23')

#Add significance indicator: HSP90AB1/Missense
x1, x2 = 1.75, 1.95
y, h, col = 1 + .3, .3, 'k'
plt.plot([x1, x1, x2, x2], [y+h, y+h, y+h, y+h], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, "****", ha='center', va='bottom', color=col, fontsize='23')

#Add significance indicator: HSP90AB1/Truncation
x1, x2 = 2.05, 2.25
y, h, col = 0.8 + .3, .3, 'k'
plt.plot([x1, x1, x2, x2], [y+h, y+h, y+h, y+h], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, "***", ha='center', va='bottom', color=col, fontsize='23')

#Add significance indicator: CHD8/Missense
x1, x2 = 2.75, 2.95
y, h, col = 1 + .3, .3, 'k'
plt.plot([x1, x1, x2, x2], [y+h, y+h, y+h, y+h], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, "****", ha='center', va='bottom', color=col, fontsize='23')

#Add significance indicator: CHD8/Truncation
x1, x2 = 3.05, 3.25
y, h, col = 0.8 + .3, .3, 'k'
plt.plot([x1, x1, x2, x2], [y+h, y+h, y+h, y+h], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, "****", ha='center', va='bottom', color=col, fontsize='23')

#Add significance indicator: UHRF2/Missense
x1, x2 = 3.75, 3.95
y, h, col = 1 + .3, .3, 'k'
plt.plot([x1, x1, x2, x2], [y+h, y+h, y+h, y+h], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, "****", ha='center', va='bottom', color=col, fontsize='23')

#Add significance indicator: UHRF2/Truncation
x1, x2 = 4.05, 4.25
y, h, col = 0.8 + .3, .3, 'k'
plt.plot([x1, x1, x2, x2], [y+h, y+h, y+h, y+h], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, "ns", ha='center', va='bottom', color=col, fontsize='23')

#Add significance indicator: AURKA/Missense
x1, x2 = 4.75, 4.95
y, h, col = 3 + .3, .3, 'k'
plt.plot([x1, x1, x2, x2], [y+h, y+h, y+h, y+h], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, "****", ha='center', va='bottom', color=col, fontsize='23')

#Add significance indicator: AURKA/Truncation
x1, x2 = 5.05, 5.25
y, h, col = 2.8 + .3, .3, 'k'
plt.plot([x1, x1, x2, x2], [y+h, y+h, y+h, y+h], lw=1.5, c=col)
plt.text((x1+x2)*.5, y+h, "*", ha='center', va='bottom', color=col, fontsize='23')

fig = boxplt.get_figure()
fig.savefig('p53_trans.png', dpi=300)
plt.show()

## Phosphoproteome abundance of interacting proteins

### This is the same list as before, with some of the end numbers of the proteins removed; less specificity in protein names captures more potentially significant results in the phosphoproteomic data

In [46]:
gene = 'TP53'
phosProtList = ['TGFB1', 'AXIN1', 'EP300', 'HRMT1L2', 'CARM1', 'TAF1', 'ING4', 'CABLES1', 
            'TP73', 'HIPK1', 'HIPK2', 'TP53INP1', 'TP53BP', 'WWOX', 'HCV', 'USP7', 'SYVN1', 'HSP90AB1',
           'CHD8', 'ARMC10', 'BANP', 'CDKN2AIP', 'NUAK1', 'STK11', 'LKB1', 'UHRF2', 'E4F1', 'YWHAZ',
           'MAML1', 'MKRN1', 'PML', 'MDM2', 'FBXO42', 'ATM', 'PP2R2A', 'AURKA', 'DAXX', 'BRD7', 'TRIM24',
           'L3MBTL1', 'GRK5', 'CAK', 'PTK2B', 'PYK2', 'MDM2', 'PRKCG', 'PPIF', 'KAT6A', 'UBC9', 'ZNF385B',
           'ZNF385A', 'ANKRD2', 'RFFL', 'RNF34', 'MTA1', 'COP1', 'CCAR2', 'MORC3', 'POU4F2', 'AFG1L', 'UBD',
           'TAF6', 'HPV', 'UL123', 'E1B-55K', 'BAX', 'FAS', 'BCL2', 'CREBBP', 'CDK2', 'MDM2', 'CDKN2A', 'ATM',
            'CCNB1', 'TERT', 'IGF1R', 'ALB', 'MMP1']

In [47]:
sites = phos.columns
p_values = []
site_names = []

for protein in phosProtList:
    pattern = re.compile(protein)
    isInList = filter(pattern.search, sites)
    if next(isInList, None) is not None:
        phosphositesdf = CPTAC.compare_mutations(phos, protein, gene)
        phosphositesdf = phosphositesdf.loc[phosphositesdf['Patient_Type'] == 'Tumor'].drop('Patient_Type', axis = 1)
        for site in phosphositesdf.columns:
            if (site is not 'Mutation'):
                sitedf = CPTAC.compare_mutations(phos, site, gene)
                mutateddf = sitedf.loc[sitedf['Mutation'] == 'Missense_Mutation'].dropna(axis=0)
                wtdf = sitedf.loc[sitedf['Mutation'] == 'Wildtype'].dropna(axis=0)
                ttest = scipy.stats.ttest_ind(mutateddf[site], wtdf[site])
                p_values.append(ttest[1])
                site_names.append(site)     
                
areSignificant = statsmodels.stats.multitest.fdrcorrection(p_values)[0]
pvalues = statsmodels.stats.multitest.fdrcorrection(p_values)[1]
significant_sites = np.array(site_names)[np.array(areSignificant)]
significant_pvalues = np.array(pvalues)[np.array(areSignificant)]

print(significant_sites)
print(significant_pvalues)

['TP53BP1-S1431' 'TP53BP1-S1435' 'TP53BP1-S1683' 'TP53BP1-S1706'
 'TP53BP1-S1763' 'TP53BP1-S1764' 'TP53BP1-S867' 'HSP90AB1-S226'
 'HSP90AB1-S255' 'HSP90AB1-S261' 'HSP90AB1-S445' 'CDKN2AIP-S131'
 'CREBBP-T974' 'CDKN2AIP-S131']
[3.38126648e-04 8.74993877e-04 3.86169349e-03 6.45050474e-06
 3.27620938e-04 2.19398978e-02 3.64259923e-04 5.06456035e-03
 1.34927660e-04 4.93202061e-03 1.62167798e-02 1.62167798e-02
 2.19257961e-02 1.62167798e-02]


### Look at truncating mutations

In [48]:
sites = phos.columns
p_values = []
site_names = []

for protein in phosProtList:
    pattern = re.compile(protein)
    isInList = filter(pattern.search, sites)
    if next(isInList, None) is not None:
        phosphositesdf = CPTAC.compare_mutations(phos, protein, gene)
        phosphositesdf = phosphositesdf.loc[phosphositesdf['Patient_Type'] == 'Tumor'].drop('Patient_Type', axis = 1)
        for site in phosphositesdf.columns:
            if (site is not 'Mutation'):
                sitedf = CPTAC.compare_mutations(phos, site, gene)
                mutateddf = sitedf.loc[(sitedf["Mutation"] == "Frame_Shift_Del") | 
                            (sitedf["Mutation"] == "Frame_Shift_Ins") |
                            (sitedf["Mutation"] == 'Nonsense_Mutation')].dropna(axis=0)
                wtdf = sitedf.loc[sitedf['Mutation'] == 'Wildtype'].dropna(axis=0)
                ttest = scipy.stats.ttest_ind(mutateddf[site], wtdf[site])
                p_values.append(ttest[1])
                site_names.append(site)     
                
areSignificant = statsmodels.stats.multitest.fdrcorrection(p_values)[0]
pvalues = statsmodels.stats.multitest.fdrcorrection(p_values)[1]
significant_sites = np.array(site_names)[np.array(areSignificant)]
significant_pvalues = np.array(pvalues)[np.array(areSignificant)]

print(significant_sites)
print(significant_pvalues)

['TAF1D-S206' 'HIPK1-T1027' 'TP53BP1-S1431' 'TP53BP1-S1683'
 'TP53BP1-S1706' 'TP53BP1-S1763' 'TP53BP1-S1764' 'TP53BP1-S867'
 'HSP90AB1-S226' 'HSP90AB1-S255' 'HSP90AB1-S261' 'CHD8-S1420' 'CHD8-S1424'
 'CHD8-S1995' 'CHD8-S549' 'FASN-S2198' 'FASN-T2204' 'BCL2L12-S273']
[0.04492884 0.03343621 0.04492884 0.03343621 0.03271287 0.00202
 0.04503339 0.03558846 0.0272901  0.00202    0.00498404 0.03343621
 0.03558846 0.03343621 0.03271287 0.02017401 0.03271287 0.03343621]


## Missense and truncating

In [50]:
sites = phos.columns
p_values = []
site_names = []

for protein in phosProtList:
    pattern = re.compile(protein)
    isInList = filter(pattern.search, sites)
    if next(isInList, None) is not None:
        phosphositesdf = CPTAC.compare_mutations(phos, protein, gene)
        phosphositesdf = phosphositesdf.loc[phosphositesdf['Patient_Type'] == 'Tumor'].drop('Patient_Type', axis = 1)
        for site in phosphositesdf.columns:
            if (site is not 'Mutation'):
                sitedf = CPTAC.compare_mutations(phos, site, gene)
                mutateddf = sitedf.loc[(sitedf["Mutation"] == "Frame_Shift_Del") | 
                        (sitedf["Mutation"] == "Frame_Shift_Ins") |
                        (sitedf["Mutation"] == 'Nonsense_Mutation') |
                        (sitedf["Mutation"] == 'Missense_Mutation')            ].dropna(axis=0)
                wtdf = sitedf.loc[sitedf['Mutation'] == 'Wildtype'].dropna(axis=0)
                ttest = scipy.stats.ttest_ind(mutateddf[site], wtdf[site])
                p_values.append(ttest[1])
                site_names.append(site)       

areSignificant = statsmodels.stats.multitest.fdrcorrection(p_values)[0]
pvalues = statsmodels.stats.multitest.fdrcorrection(p_values)[1]
significant_sites = np.array(site_names)[np.array(areSignificant)]
significant_pvalues = np.array(pvalues)[np.array(areSignificant)]

print(significant_sites)
print(significant_pvalues)

['TGFB1I1-S192' 'CABLES1-S373' 'CABLES1-T415' 'HIPK1-S1200' 'HIPK1-T1027'
 'TP53BP1-S1109' 'TP53BP1-S1431' 'TP53BP1-S1435' 'TP53BP1-S1683'
 'TP53BP1-S1706' 'TP53BP1-S1763' 'TP53BP1-S1764' 'TP53BP1-S400'
 'TP53BP1-S403' 'TP53BP1-S557' 'TP53BP1-S640' 'TP53BP1-S644'
 'TP53BP1-S645' 'TP53BP1-S776' 'TP53BP1-S867' 'TP53BP1-S998'
 'TP53BP1-T1653' 'HSP90AB1-S226' 'HSP90AB1-S255' 'HSP90AB1-S261'
 'HSP90AB1-S445' 'CHD8-S1420' 'CHD8-S1995' 'CHD8-S2008' 'CHD8-S549'
 'CHD8-S562' 'CHD8-T1982' 'CDKN2AIP-S131' 'DAXX-S680' 'MTA1-T564'
 'FASN-S207' 'FASN-T2204' 'BCL2L12-S273' 'BCL2L13-S444' 'CREBBP-T974'
 'CDKN2AIP-S131' 'RALBP1-S30']
[4.83396943e-02 1.02748279e-02 1.71077907e-02 2.26293623e-02
 3.31176397e-03 5.10409707e-03 1.59419677e-05 3.56867481e-05
 4.05486152e-05 4.91755490e-07 7.77961292e-07 1.04397887e-03
 3.86388492e-02 5.10409707e-03 3.98470422e-02 9.00020989e-03
 4.27340920e-02 1.18523003e-02 3.14587547e-02 1.59419677e-05
 4.91988605e-02 1.07792725e-02 4.05486152e-05 4.91755490e-07
 2.381236

### Look at all mutations

In [49]:
sites = phos.columns
p_values = []
site_names = []

for protein in phosProtList:
    pattern = re.compile(protein)
    isInList = filter(pattern.search, sites)
    if next(isInList, None) is not None:
        phosphositesdf = CPTAC.compare_mutations(phos, protein, gene)
        phosphositesdf = phosphositesdf.loc[phosphositesdf['Patient_Type'] == 'Tumor'].drop('Patient_Type', axis = 1)
        for site in phosphositesdf.columns:
            if (site is not 'Mutation'):
                sitedf = CPTAC.compare_mutations(phos, site, gene)
                mutateddf = sitedf.loc[sitedf["Mutation"] != 'Wildtype'].dropna(axis=0)
                wtdf = sitedf.loc[sitedf['Mutation'] == 'Wildtype'].dropna(axis=0)
                ttest = scipy.stats.ttest_ind(mutateddf[site], wtdf[site])
                p_values.append(ttest[1])
                site_names.append(site)       

areSignificant = statsmodels.stats.multitest.fdrcorrection(p_values)[0]
pvalues = statsmodels.stats.multitest.fdrcorrection(p_values)[1]
significant_sites = np.array(site_names)[np.array(areSignificant)]
significant_pvalues = np.array(pvalues)[np.array(areSignificant)]

print(significant_sites)
print(significant_pvalues)

['TGFB1I1-S137' 'TGFB1I1-S143' 'TGFB1I1-S192' 'TGFB1I1-S68' 'TAF15-S97'
 'CABLES1-S109' 'CABLES1-S290' 'CABLES1-S373' 'CABLES1-S418'
 'CABLES1-T415' 'HIPK1-S1200' 'HIPK1-T1027' 'TP53BP1-S1004'
 'TP53BP1-S1099' 'TP53BP1-S1109' 'TP53BP1-S1431' 'TP53BP1-S1435'
 'TP53BP1-S1683' 'TP53BP1-S1706' 'TP53BP1-S1763' 'TP53BP1-S1764'
 'TP53BP1-S400' 'TP53BP1-S403' 'TP53BP1-S525' 'TP53BP1-S557'
 'TP53BP1-S640' 'TP53BP1-S644' 'TP53BP1-S645' 'TP53BP1-S776'
 'TP53BP1-S867' 'TP53BP1-T1653' 'TP53BP1-T307' 'HSP90AB1-S226'
 'HSP90AB1-S255' 'HSP90AB1-S261' 'HSP90AB1-S445' 'CHD8-S1420' 'CHD8-S1995'
 'CHD8-S2008' 'CHD8-S549' 'CHD8-S562' 'CHD8-T1982' 'CDKN2AIP-S131'
 'STK11IP-S599' 'DAXX-S680' 'TRIM24-S1025' 'TRIM24-S1028' 'ANKRD26-S631'
 'MTA1-T564' 'TAF6-S673' 'FASN-S207' 'FASN-S2198' 'FASN-T2204'
 'BCL2L12-S273' 'BCL2L13-S444' 'CREBBP-S121' 'CREBBP-T974' 'CDKN2AIP-S131'
 'RALBP1-S30']
[4.73928331e-02 4.24683358e-02 1.30612019e-02 1.96788056e-02
 4.87173544e-02 4.04222291e-02 2.81107614e-02 5.42094133e-03
 2