   # TP53: Effects of Mutation on Interacting Proteins

<b>Standard imports for playing with and plotting data frames.</b>

In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns

<b>Import CPTAC data</b>

In [2]:
import CPTAC

Loading Clinical Data...
Loading Proteomics Data...
Loading Transcriptomics Data...
Loading CNA Data...
Loading Phosphoproteomics Data...
Loading Somatic Data...

 ******PLEASE READ******


In [3]:
# Pull the three CPTAC tables that the rest of the notebook works on.
# somatic_mutations: read one gene at a time (somatic_mutations['TP53']) and compared against 1.0 / 0.0 below
somatic_mutations = CPTAC.get_somatic()
# proteomics: columns are looked up by protein name in the proteomics screen
proteomics = CPTAC.get_proteomics()
# phos: columns are selected by regex on protein-name patterns in the phosphoproteomics screen
phos = CPTAC.get_phosphoproteomics()

### List of proteins that interact with TP53 (according to UniProt)

In [4]:
# Candidate TP53 interaction partners screened in the proteomics analysis.
# Each name is listed exactly once so that no protein is counted or tested
# twice downstream ('MDM2' previously appeared two times).
# NOTE(review): 'PP2R2A' looks like a typo for 'PPP2R2A' -- confirm against the source list.
protList = ['AXIN1', 'EP300', 'HRMT1L2', 'CARM1', 'TAF1', 'ING4', 'CABLES1',
            'TP73', 'HIPK1', 'HIPK2', 'TP53INP1', 'WWOX', 'HCV', 'USP7', 'SYVN1', 'HSP90AB1',
            'CHD8', 'ARMC10', 'BANP', 'CDKN2AIP', 'NUAK1', 'STK11', 'LKB1', 'UHRF2', 'E4F1', 'YWHAZ',
            'MAML1', 'MKRN1', 'PML', 'MDM2', 'FBXO42', 'ATM', 'PP2R2A', 'AURKA', 'DAXX', 'BRD7', 'TRIM24',
            'L3MBTL1', 'GRK5', 'CAK', 'PTK2B', 'PYK2', 'PRKCG', 'PPIF', 'KAT6A', 'UBC9', 'ZNF385B',
            'ZNF385A', 'ANKRD2', 'RFFL', 'RNF34', 'MTA1', 'COP1', 'CCAR2', 'MORC3', 'POU4F2', 'AFG1L', 'UBD',
            'TAF6', 'HPV', 'UL123', 'E1B-55K']

### Proteome abundance of proteins that interact with TP53

In [5]:
#Specify the gene you want to analyze
gene = 'TP53'
sigList = []

#Per-sample mutation column for the gene (compared against 1.0 = mutated, 0.0 = wild type below)
genedf = somatic_mutations[gene].to_frame()

#Only proteins that have a column in the proteomics table can be tested;
#dict.fromkeys drops repeated names while keeping the original order
testedProteins = [p for p in dict.fromkeys(protList) if p in proteomics.columns]

#Bonferroni correction since we are testing multiple proteins at once.
#Divide by the number of tests actually performed rather than by the length of the
#candidate list, which also counts proteins that are never tested.
pcutoff = 0.05 / max(len(testedProteins), 1)

#Loop through all interacting proteins to find ones with significant changes in protein levels
#when TP53 is mutated
for protein in testedProteins:
    #Suffix the column name; necessary when the gene and protein have the same name
    proteindfName = protein + " protein"
    abundance = proteomics[protein].rename(proteindfName)

    #Align mutation status and abundance by sample and keep samples that have both values
    cross = genedf.join(abundance, how='inner').dropna(axis=0)
    mutated = cross.loc[cross[gene] == 1.0]
    wt = cross.loc[cross[gene] == 0.0]

    #Two-sample t-test of protein abundance: mutated vs. wild-type samples
    ttest = scipy.stats.ttest_ind(mutated[proteindfName], wt[proteindfName])
    if ttest[1] <= pcutoff:
        sigList.append(protein)
        print("Test for " + protein + ": ")
        print(ttest)

Test for CABLES1: 
Ttest_indResult(statistic=-4.533885225055603, pvalue=1.6398425638878636e-05)
Test for HSP90AB1: 
Ttest_indResult(statistic=3.775761395509814, pvalue=0.000273573248009503)
Test for CHD8: 
Ttest_indResult(statistic=4.65599574118626, pvalue=1.0132380205138297e-05)
Test for AURKA: 
Ttest_indResult(statistic=4.2121494006965365, pvalue=8.335995102753076e-05)
Test for DAXX: 
Ttest_indResult(statistic=4.139121723641875, pvalue=7.383403320014249e-05)


### List of significantly affected proteins

In [6]:
#Proteins whose t-test p-value was at or below the Bonferroni cutoff in the loop above
print(sigList)

['CABLES1', 'HSP90AB1', 'CHD8', 'AURKA', 'DAXX']


### Phosphoproteome abundance of interacting proteins

In [10]:
#This is the same list as before, with some of the end numbers of the proteins removed
#Less specificity in protein names captures more potentially significant results in the phosphoproteomic data
#Each entry is used as a regex against the phosphoproteomics column names, so every
#pattern is listed only once ('TAF' previously appeared twice, for TAF1 and TAF6)

phosProtList = ['AXIN', 'EP', 'HRMT', 'CARM', 'TAF', 'ING', 'CABLES',
                'TP73', 'HIPK', 'TP53INP', 'WWOX', 'HCV', 'USP', 'SYVN', 'HSP90AB1',
                'CHD', 'ARMC', 'BANP', 'CDKN', 'NUAK', 'STK', 'LKB', 'UHRF', 'E4F', 'YWHAZ',
                'MAML', 'MKRN', 'PML', 'MDM', 'FBXO', 'ATM', 'PP2R', 'AURKA', 'DAXX', 'BRD', 'TRIM',
                'L3MBTL', 'GRK', 'CAK', 'PTK', 'PYK', 'PRKCG', 'PPIF', 'KAT', 'UBC', 'ZNF',
                'ANKRD', 'RFFL', 'RNF', 'MTA', 'COP', 'CCAR', 'MORC', 'POU4F', 'AFG1L', 'UBD',
                'HPV', 'UL1', 'E1B']

In [13]:
#Specify gene of interest
gene = 'TP53'
sigPhosResults = []

#Select all the phosphorylation sites on the proteins listed above in ONE pass.
#The patterns are OR-ed into a single regex so a column is picked up once even when
#several patterns (or a repeated pattern) match it; adding the sites pattern by pattern
#would sum such a column's values once per matching pattern.
#NOTE(review): DataFrame.filter searches the pattern anywhere in the column name, so a
#short pattern such as 'EP' also selects names that merely contain it -- confirm intent.
if phosProtList:
    sitePattern = '|'.join('(?:' + protein + ')' for protein in phosProtList)
    sites = phos.filter(regex=sitePattern)
else:
    sites = phos.iloc[:, :0]  #no patterns -> no sites (an empty regex would match everything)

#Never let a site column overwrite the mutation column, and keep the alphabetical
#column order so results are reported in a stable order
sites = sites.drop(columns=[gene], errors='ignore').sort_index(axis=1)

#Build the dataframe: mutation column plus one column per selected site, aligned by sample
genedf = somatic_mutations[gene].to_frame().join(sites, how='outer')

mutated = genedf.loc[genedf[gene] == 1.0]
wt = genedf.loc[genedf[gene] == 0.0]

#Bonferroni correction for all the sites we are testing at once
#(the gene column itself is not a test, so it is not counted)
pcutoff = 0.05 / max(len(sites.columns), 1)

#Test each location one by one and print significant results
for loc in sites.columns:
    #Drop samples with no measurement at this site before testing
    ttest = scipy.stats.ttest_ind(mutated[loc].dropna(), wt[loc].dropna())
    if ttest[1] <= pcutoff:
        sigPhosResults.append(loc)
        print('Results for ' + loc + ': ')
        print(ttest)

Results for CHD4-S1570: 
Ttest_indResult(statistic=4.526056608333135, pvalue=2.4823987834174773e-05)
Results for FBXO4-S12: 
Ttest_indResult(statistic=-4.650620461853634, pvalue=1.0350987175380954e-05)
Results for HSP90AB1-S226: 
Ttest_indResult(statistic=4.83818141889424, pvalue=4.875355910124854e-06)
Results for HSP90AB1-S255: 
Ttest_indResult(statistic=5.523615626841696, pvalue=2.7326572771063515e-07)
Results for HSP90AB1-S261: 
Ttest_indResult(statistic=5.578991724475906, pvalue=2.147645387372064e-07)
Results for RNF219-S210: 
Ttest_indResult(statistic=4.518769532753358, pvalue=1.8262572900673308e-05)
Results for TRIM2-S402: 
Ttest_indResult(statistic=-5.126605963336591, pvalue=1.7944947572117115e-06)
Results for USP39-S82: 
Ttest_indResult(statistic=4.90729235100044, pvalue=4.270008173957158e-06)
Results for USP43-S625: 
Ttest_indResult(statistic=-4.480420316051711, pvalue=2.01997291739303e-05)
Results for USP47-S1013: 
Ttest_indResult(statistic=-4.355314891888761, pvalue=3.271581

### List of significantly affected phosphorylation sites

In [14]:
#Phosphorylation sites whose t-test p-value was at or below the Bonferroni cutoff in the loop above
print(sigPhosResults)

['CHD4-S1570', 'FBXO4-S12', 'HSP90AB1-S226', 'HSP90AB1-S255', 'HSP90AB1-S261', 'RNF219-S210', 'TRIM2-S402', 'USP39-S82', 'USP43-S625', 'USP47-S1013', 'ZNF318-S1043', 'ZNF318-S1243', 'ZNF326-S478', 'ZNF638-S1401']
