# TP53 Hotspot Phosphoproteomic Analysis

In [1]:
import pandas as pd
import numpy as np
import re
import scipy.stats
import statsmodels.stats.multitest
import matplotlib.pyplot as plt
import seaborn as sns
import CPTAC

Loading CPTAC data:
Loading Dictionary...
Loading Clinical Data...
Loading Proteomics Data...
Loading Transcriptomics Data...
Loading CNA Data...
Loading Phosphoproteomics Data...
Loading Somatic Mutation Data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter CPTAC.embargo() to open the webpage for more details.


In [2]:
CPTAC.version()

'0.1.3'

In [2]:
somatic = CPTAC.get_somatic()
proteomics = CPTAC.get_proteomics()
phos = CPTAC.get_phosphoproteomics()
gene = 'TP53'
protein = 'TP53'

In [3]:
#A list of mutations present in the hotspot according to our Hotspot3D output
hotspot_mutations = ['p.P250L', 'p.R248W', 'p.S241C', 'p.S241del', 'p.K164E', 'p.R249S', 'p.R273H', 'p.R282W',
                    'p.S127Y', 'p.G244D', 'p.A159P']
mutated_hotspot = somatic.loc[(somatic['Location'].isin(hotspot_mutations)) & (somatic['Gene'] == gene)]
hotspot_patients = mutated_hotspot['Clinical_Patient_Key']

### List of proteins that interact with TP53 (according to Uniprot and String)

In [4]:
protList = ['AIMP2', 'ARIH2', 'ARRB1', 'ASH2L', 'ATG7', 'AXIN1', 'BANP', 'BCL2', 'BCL2L1', 'BCR', 'BHLHE40', 'BRCA2',
           'BRD7', 'BTBD2', 'BTRC', 'CABLES1', 'CARM1', 'CCDC106', 'CDKN1A', 'CEBPB', 'CHD8', 'CREBBP', 'CSE1L', 'CSNK2A1', 'CUL7',
           'CUL9', 'CXXC1', 'DAXX', 'DDX17', 'DDX5', 'DROSHA', 'DUSP26', 'DVL2', 'E6', 'EP300', 'ETS2', 'FBXO11',
           'FOXO3', 'GSK3B', 'GTF2H1', 'HDAC1', 'HIPK1', 'HIPK2', 'HMGB1', 'HNRNPK', 'HRMT1L2', 'HSP82', 'HSPA1L', 'HSPA9', 'HSPB1',
           'HTT', 'HUWE1', 'IFI16', 'IFI205B', 'ING4', 'IKBKB', 'IP6K2', 'JMJD6', 'KAT5', 'KAT8', 'KDM1A', 'KMT2E', 'LAMA4',
           'MAGEA2B', 'MAGEC2', 'MAP1B', 'MAPK11', 'MAPKAPK5', 'MDM2', 'MDM4', 'MKRN1', 'MPDZ', 'MT1A', 'NCL', 'NCOR2',
           'NFYA', 'NFYB', 'NOC2L', 'NOL3', 'NPM1', 'NR0B2', 'NR4A1', 'NRDC', 'NSP1', 'NUAK1', 'NUMB', 'OTUB1', 
           'PARD3', 'PARP1', 'PBK', 'PHB', 'PIAS1', 'PIAS2', 'PIAS4', 'PIN1', 'PLK1', 'PML', 'PPIF', 'PPP1CC',
           'PPP1R13L', 'PPP2R1A', 'PPP2R5C', 'PRKCD', 'PSME3', 'PTK2', 'RAD51', 'RBPJ', 'RCHY1', 'RFWD3', 'RING1',
           'RPS3', 'RYBP', 'S100A1', 'S100A2', 'S100A4', 'S100B', 'SAFB', 'SETD7', 'SFN', 'SIN3A', 'SIRT1', 'SMAD2',
           'SMYD2', 'SNAT1', 'SOX4', 'SP1', 'SREBF2', 'SRPK1', 'SUMO1', 'SYVN1', 'TAF1', 'TBP', 'TCF4', 'TOE1', 'TP53BP1',
           'TP53BP2', 'TP53INP1', 'TP63', 'TPT1', 'TRIM24', 'TWIST1', 'UBC', 'UBE3A', 'UHRF2', 'USP42', 'USP7', 'VDR', 'VRK1',
           'WRN', 'WWOX', 'XPO1', 'XRCC6', 'YWHAG', 'YWHAZ', 'ZNF420', 'ATM', 'BAX', 'BCL2L11', 'CCNA1', 'CCNA2', 'CCNB1', 
           'CCNB2', 'CCND1', 'CCND3', 'CCNE1', 'CCNE2', 'CDC25A', 'CDC6', 'CDK2', 'CDKN1B', 'CDKN2A', 'CHEK1', 
           'CHEK2', 'CITED2', 'CKS1B', 'MCM5', 'MRE11A', 'PCNA', 'RB1', 'ARMC10', 'BANP', 'CDKN2AIP', 'NUAK1', 'STK11',
           'LKB1', 'UHRF2', 'E4F1', 'MAML1', 'MKRN1', 'PML', 'FBXO42', 'ATM', 'PP2A', 'PPP2R5C', 'PPP2R2A', 'AURKA', 'DAXX',
           'BRD7', 'TRIM24', 'L3MBTL1', 'GRK5', 'CAK', 'CDK7', 'MAT1', 'CDK5', 'AURKB', 'SETD2', 'UHRF2', 'NOC2L', 'PTK2',
           'FAK1', 'PTK2B', 'PYK2', 'PRKCG', 'PPIF', 'SNAI1', 'KAT6A', 'ANKRD2', 'RFFL', 'RNF34', 'MTA1', 'COP1', 'CCAR2',
           'MORC3', 'POU4F2', 'NOP53', 'AFG1L', 'UBD', 'TAF6', 'FATS', 'C10ORF90']

#Remove possible duplicates
protList = list(set(protList))

## All mutations vs wildtype

In [6]:
sites = phos.columns
p_values = []
site_names = []

for protein in protList:
    pattern = re.compile(protein)
    isInList = filter(pattern.search, sites)
    if next(isInList, None) is not None:
        phosphositesdf = CPTAC.compare_mutations(phos, protein, gene)
        phosphositesdf = phosphositesdf.loc[phosphositesdf['Patient_Type'] == 'Tumor'].drop('Patient_Type', axis=1)
        phosphositesdf['Within_Hotspot'] = False
        phosphositesdf.loc[hotspot_patients, 'Within_Hotspot'] = True
        phosphositesdf.loc[phosphositesdf['Mutation'] == 'Wildtype', 'Within_Hotspot'] = 'Wildtype'
        phosphositesdf.drop('Mutation', axis=1)
        for site in phosphositesdf.columns:
            if (site != 'Within_Hotspot' and site != 'Mutation'):
                sitedf = phosphositesdf[[site, 'Within_Hotspot']].dropna(axis=0)
                mutated = sitedf.loc[sitedf['Within_Hotspot'] != 'Wildtype']
                wt = sitedf.loc[sitedf['Within_Hotspot'] == 'Wildtype']
                ttest = scipy.stats.ttest_ind(mutated[site], wt[site])
                p_values.append(ttest[1])
                site_names.append(site)
                
areSignificant = statsmodels.stats.multitest.fdrcorrection(p_values)[0]
pvals = statsmodels.stats.multitest.fdrcorrection(p_values)[1]
significant_proteins = np.array(site_names)[np.array(areSignificant)]
significant_pvals = np.array(pvals)[np.array(areSignificant)]
significant_vals = dict(zip(significant_proteins, significant_pvals))
significant_vals

{'NPM1-S70': 0.038878505182900966,
 'CREBBP-T974': 0.009217027484735641,
 'TP53BP1-S1004': 0.032625891703090396,
 'TP53BP1-S1431': 0.0008643140003929224,
 'TP53BP1-S1435': 0.0008643140003929224,
 'TP53BP1-S1683': 0.0009882801246996684,
 'TP53BP1-S1706': 7.899210519341453e-05,
 'TP53BP1-S1763': 6.901183518862966e-07,
 'TP53BP1-S1764': 0.0026881513503528814,
 'TP53BP1-S403': 0.032625891703090396,
 'TP53BP1-S640': 0.04139829057587989,
 'TP53BP1-S867': 0.0030048347848826654,
 'FOXO3-S43': 0.0016182401268920261,
 'CLASP1-S572': 0.012982087946777021,
 'CTDSP1-T74': 0.032625891703090396,
 'LASP1-T104': 0.032933589316956446,
 'SP100-S157': 0.00010674105443492965,
 'SP100-S223': 0.004112207588010277,
 'SP100-S228': 0.0026881513503528814,
 'SP100-S231': 4.679164997306459e-05,
 'SP110-S244': 0.0026881513503528814,
 'SP110-S380': 0.004112207588010277,
 'USP11-S648': 0.009217027484735641,
 'CHD8-S549': 0.030286647174017287,
 'PARP1-S782': 0.014140455703340167,
 'BCL2L12-S273': 0.01595262598239566,


## Mutations within hotspot vs wildtype

In [7]:
sites = phos.columns
p_values = []
site_names = []

for protein in protList:
    pattern = re.compile(protein)
    isInList = filter(pattern.search, sites)
    if next(isInList, None) is not None:
        phosphositesdf = CPTAC.compare_mutations(phos, protein, gene)
        phosphositesdf = phosphositesdf.loc[phosphositesdf['Patient_Type'] == 'Tumor'].drop('Patient_Type', axis=1)
        phosphositesdf['Within_Hotspot'] = False
        phosphositesdf.loc[hotspot_patients, 'Within_Hotspot'] = True
        phosphositesdf.loc[phosphositesdf['Mutation'] == 'Wildtype', 'Within_Hotspot'] = 'Wildtype'
        phosphositesdf.drop('Mutation', axis=1)
        for site in phosphositesdf.columns:
            if (site != 'Within_Hotspot' and site != 'Mutation'):
                sitedf = phosphositesdf[[site, 'Within_Hotspot']].dropna(axis=0)
                mutated = sitedf.loc[sitedf['Within_Hotspot'] == True]
                wt = sitedf.loc[sitedf['Within_Hotspot'] == 'Wildtype']
                ttest = scipy.stats.ttest_ind(mutated[site], wt[site])
                p_values.append(ttest[1])
                site_names.append(site)
                
areSignificant = statsmodels.stats.multitest.fdrcorrection(p_values)[0]
pvals = statsmodels.stats.multitest.fdrcorrection(p_values)[1]
significant_proteins = np.array(site_names)[np.array(areSignificant)]
significant_pvals = np.array(pvals)[np.array(areSignificant)]
significant_vals = dict(zip(significant_proteins, significant_pvals))
significant_vals

{'HUWE1-S3808': 0.023035547134485423,
 'HUWE1-S3818': 0.022558704623591275,
 'PIAS2-S499': 0.04594812104477181,
 'CREBBP-T974': 0.040748452536135515,
 'TP53BP1-S1431': 0.0011162671079661052,
 'TP53BP1-S1435': 0.001501726880253013,
 'TP53BP1-S1683': 0.021475996183742276,
 'TP53BP1-S1706': 0.0002414574679697111,
 'TP53BP1-S1763': 0.0005262666326208138,
 'TP53BP1-S227': 0.01490006977908907,
 'TP53BP1-S867': 0.005189706313447883,
 'FOXO3-S43': 0.019025958067335855,
 'CLASP1-S572': 0.005185397859035873,
 'SP100-S157': 0.009311601609255192,
 'SP100-S223': 0.001681146675364001,
 'SP100-S228': 0.01084611970435412,
 'SP100-S231': 0.0002414574679697111,
 'SP110-S244': 0.00022715594852941022,
 'SP110-S380': 0.013026843951616126,
 'USP11-S648': 0.0219367226210614,
 'RYBP-S190': 0.003466463721526021,
 'RYBP-S193': 0.007259414870953549,
 'NCL-S619': 2.8857022530986562e-05,
 'NCL-S67': 0.0002414574679697111,
 'NOC2L-S49': 0.03537019612696443}

## Mutations outside of hotspot vs wildtype

In [8]:
sites = phos.columns
p_values = []
site_names = []

for protein in protList:
    pattern = re.compile(protein)
    isInList = filter(pattern.search, sites)
    if next(isInList, None) is not None:
        phosphositesdf = CPTAC.compare_mutations(phos, protein, gene)
        phosphositesdf = phosphositesdf.loc[phosphositesdf['Patient_Type'] == 'Tumor'].drop('Patient_Type', axis=1)
        phosphositesdf['Within_Hotspot'] = False
        phosphositesdf.loc[hotspot_patients, 'Within_Hotspot'] = True
        phosphositesdf.loc[phosphositesdf['Mutation'] == 'Wildtype', 'Within_Hotspot'] = 'Wildtype'
        phosphositesdf.drop('Mutation', axis=1)
        for site in phosphositesdf.columns:
            if (site != 'Within_Hotspot' and site != 'Mutation'):
                sitedf = phosphositesdf[[site, 'Within_Hotspot']].dropna(axis=0)
                mutated = sitedf.loc[sitedf['Within_Hotspot'] == False]
                wt = sitedf.loc[sitedf['Within_Hotspot'] == 'Wildtype']
                ttest = scipy.stats.ttest_ind(mutated[site], wt[site])
                p_values.append(ttest[1])
                site_names.append(site)
                
areSignificant = statsmodels.stats.multitest.fdrcorrection(p_values)[0]
pvals = statsmodels.stats.multitest.fdrcorrection(p_values)[1]
significant_proteins = np.array(site_names)[np.array(areSignificant)]
significant_pvals = np.array(pvals)[np.array(areSignificant)]
significant_vals = dict(zip(significant_proteins, significant_pvals))
significant_vals

{'TP53BP1-S1763': 3.730212327832176e-05,
 'SP100-S157': 0.01926525300150349,
 'NCL-S67': 0.01926525300150349}

## Mutations outside of hotspot vs mutations within hotspot

In [9]:
sites = phos.columns
p_values = []
site_names = []

for protein in protList:
    pattern = re.compile(protein)
    isInList = filter(pattern.search, sites)
    if next(isInList, None) is not None:
        phosphositesdf = CPTAC.compare_mutations(phos, protein, gene)
        phosphositesdf = phosphositesdf.loc[phosphositesdf['Patient_Type'] == 'Tumor'].drop('Patient_Type', axis=1)
        phosphositesdf['Within_Hotspot'] = False
        phosphositesdf.loc[hotspot_patients, 'Within_Hotspot'] = True
        phosphositesdf.loc[phosphositesdf['Mutation'] == 'Wildtype', 'Within_Hotspot'] = 'Wildtype'
        phosphositesdf.drop('Mutation', axis=1)
        for site in phosphositesdf.columns:
            if (site != 'Within_Hotspot' and site != 'Mutation'):
                sitedf = phosphositesdf[[site, 'Within_Hotspot']].dropna(axis=0)
                mutated = sitedf.loc[sitedf['Within_Hotspot'] == False]
                wt = sitedf.loc[sitedf['Within_Hotspot'] == True]
                ttest = scipy.stats.ttest_ind(mutated[site], wt[site])
                p_values.append(ttest[1])
                site_names.append(site)
                
areSignificant = statsmodels.stats.multitest.fdrcorrection(p_values)[0]
pvals = statsmodels.stats.multitest.fdrcorrection(p_values)[1]
significant_proteins = np.array(site_names)[np.array(areSignificant)]
significant_pvals = np.array(pvals)[np.array(areSignificant)]
significant_vals = dict(zip(significant_proteins, significant_pvals))
significant_vals

{}