In [1]:
import pandas as pd
import numpy as np
import re
import scipy.stats
import statsmodels.stats.multitest
import matplotlib.pyplot as plt
import seaborn as sns
import CPTAC.Endometrial as CPTAC

Welcome to the CPTAC data service package. This import contains
information about the package. In order to access a specific data set,
import a CPTAC subfolder by either 'import CPTAC.DataName' or 'from
CPTAC import DataName'.
Loading Endometrial CPTAC data:
Loading Dictionary...
Loading Clinical Data...
Loading Acetylation Proteomics Data...
Loading Proteomics Data...
Loading Transcriptomics Data...
Loading CNA Data...
Loading Phosphoproteomics Data...
Loading Somatic Mutation Data...

 ******PLEASE READ******
CPTAC is a community resource project and data are made available
rapidly after generation for community research use. The embargo
allows exploring and utilizing the data, but the data may not be in a
publication until July 1, 2019. Please see
https://proteomics.cancer.gov/data-portal/about/data-use-agreement or
enter embargo() to open the webpage for more details.


In [3]:
gene='TP53'
somatic = CPTAC.get_somatic()
proteomics = CPTAC.get_proteomics()
transcriptomics = CPTAC.get_transcriptomics()

# A list of mutations present in the hotspot according to our Hotspot3D output
hotspot_mutations = ['p.P250L', 'p.R248W', 'p.S241C', 'p.241del','p.K164E', 'p.R249S', 'p.R273H', 'p.R282W',
                    'p.S127Y', 'p.G244D', 'p.A159P']
mutated_hotspot = somatic.loc[(somatic['Location'].isin(hotspot_mutations)) & (somatic['Gene'] == gene)]

# Get a list of patient identifiers for those who have mutations in the hotspot
hotspot_patients_both_keys = mutated_hotspot[['Clinical_Patient_Key', 'Patient_Id']]
hotspot_patients = hotspot_patients_both_keys['Clinical_Patient_Key']
hotspot_patients

586      S001
1249     S006
1335     S008
3962     S016
4609     S020
44776    S071
44808    S072
46588    S082
49326    S096
49407    S097
Name: Clinical_Patient_Key, dtype: object

# All TP53 Mutants vs. Wildtype TP53

In [10]:
#bonferroni correction
pcutoff = 0.05 / len(proteomics.columns)

proteomics_all_mut_wt_tstat = {}

# Go through every protein in our proteomics dataframe
for protein in proteomics.columns:
    if (protein != 'idx') and (protein != gene):
        # Create dataframe with mutation status of TP53 with proteomics of the given protein
        p53_mutations_protein = CPTAC.compare_mutations(proteomics, protein, gene)
        # Create a column showing the location of TP53 mutation
        p53_mutations_protein['Within_Hotspot'] = False
        p53_mutations_protein.loc[hotspot_patients, 'Within_Hotspot'] = True
        p53_mutations_protein.loc[p53_mutations_protein['Mutation'] == 'Wildtype', 'Within_Hotspot'] = 'Wildtype'
        p53_mutations_protein = p53_mutations_protein.loc[p53_mutations_protein['Sample_Status'] == 'Tumor'].dropna(axis=0)
        # Perform the ttest if there is enough data for that protein
        mutated_tp53 = p53_mutations_protein.loc[p53_mutations_protein['Within_Hotspot'] != 'Wildtype']
        wt_tp53 = p53_mutations_protein.loc[p53_mutations_protein['Within_Hotspot'] == 'Wildtype']
        if len(mutated_tp53) < 2: continue
        ttest = scipy.stats.ttest_ind(mutated_tp53[protein], wt_tp53[protein])
        # Print significant values
        if ttest[1] <= pcutoff:
            proteomics_all_mut_wt_tstat[protein] = ttest[0]

In [11]:
transcriptomics_all_mut_wt_tstat = {}
pcutoff = 0.05 / len(transcriptomics.columns)

# Go through every protein in our proteomics dataframe
for protein in transcriptomics.columns:
    if (protein != 'idx') and (protein != gene):
        # Create dataframe with mutation status of TP53 with proteomics of the given protein
        p53_mutations_protein = CPTAC.compare_mutations(transcriptomics, protein, gene)
        # Create a column showing the location of TP53 mutation
        p53_mutations_protein['Within_Hotspot'] = False
        p53_mutations_protein.loc[hotspot_patients, 'Within_Hotspot'] = True
        p53_mutations_protein.loc[p53_mutations_protein['Mutation'] == 'Wildtype', 'Within_Hotspot'] = 'Wildtype'
        p53_mutations_protein = p53_mutations_protein.loc[p53_mutations_protein['Sample_Status'] == 'Tumor'].dropna(axis=0)
        # Perform the ttest if there is enough data for that protein
        mutated_tp53 = p53_mutations_protein.loc[p53_mutations_protein['Within_Hotspot'] != 'Wildtype']
        wt_tp53 = p53_mutations_protein.loc[p53_mutations_protein['Within_Hotspot'] == 'Wildtype']
        if len(mutated_tp53) < 2: continue
        ttest = scipy.stats.ttest_ind(mutated_tp53[protein], wt_tp53[protein])
        # Print significant values
        if ttest[1] <= pcutoff:
            transcriptomics_all_mut_wt_tstat[protein] = ttest[0]

In [13]:
proteomics_shared_tstat = {}
proteomics_unique_tstat = {}

transcriptomics_shared_tstat = {}
transcriptomics_unique_tstat = {}

shared_identical_direction = list()
shared_opposite_direction = list()


for protein in proteomics_all_mut_wt_tstat:
    if protein in transcriptomics_all_mut_wt_tstat:
        proteomics_shared_tstat[protein] = proteomics_all_mut_wt_tstat[protein]
        transcriptomics_shared_tstat[protein] = transcriptomics_all_mut_wt_tstat[protein]

for protein in proteomics_all_mut_wt_tstat:
    if protein not in transcriptomics_all_mut_wt_tstat:
        proteomics_unique_tstat[protein] = proteomics_all_mut_wt_tstat[protein]
        
for protein in transcriptomics_all_mut_wt_tstat:
    if protein not in proteomics_all_mut_wt_tstat:
        transcriptomics_unique_tstat[protein] = transcriptomics_all_mut_wt_tstat[protein]

for protein in proteomics_shared_tstat:
    if proteomics_shared_tstat[protein] > 0 and transcriptomics_shared_tstat[protein] > 0:
        shared_identical_direction.append(protein)
    if proteomics_shared_tstat[protein] < 0 and transcriptomics_shared_tstat[protein] < 0:
        shared_identical_direction.append(protein)
    else:
        shared_opposite_direction.append(protein)
        
print('There are ' + str(len(shared_identical_direction)) + ' shared correlations in the same direction')
print(shared_identical_direction)

print('There are ' + str(len(shared_opposite_direction)) + ' shared correlations in the opposite direction')
print(shared_opposite_direction)

print('There are ' + str(len(proteomics_unique_tstat)) + ' unique proteomic correlations not found in the transcriptome')
print('There are ' + str(len(transcriptomics_unique_tstat)) + ' unique transcriptomic correlations not found in the proteome')

There are 75 shared correlations in the same direction
['ABCF1', 'ABT1', 'ALG2', 'ARFIP1', 'ARFIP2', 'ATAD2', 'CA8', 'CAD', 'CBS', 'CDKAL1', 'CIP2A', 'CMPK2', 'CMSS1', 'CMTR1', 'CSE1L', 'DDX27', 'DOCK5', 'EIF2AK2', 'EIF4G1', 'EPS8', 'FBXO4', 'FMN1', 'FXR1', 'GFM1', 'GMPS', 'GRB7', 'HELZ2', 'HERC5', 'IDNK', 'IFIT1', 'IFIT2', 'IRF2BPL', 'ISG15', 'KIF13B', 'KIF2C', 'KIFC1', 'KLHDC7A', 'KLRG2', 'KRI1', 'LSG1', 'LSM14B', 'MRGBP', 'MTHFD2', 'MX1', 'MX2', 'MYO5C', 'NGDN', 'NOL10', 'NPDC1', 'NUP153', 'OAS3', 'OGFR', 'PRKCI', 'PRPF6', 'PTPN3', 'RAE1', 'RIOK1', 'SENP2', 'SIAE', 'SLC4A1AP', 'SNX7', 'SP110', 'STAT1', 'STAT2', 'TESC', 'TOPBP1', 'TPD52L2', 'TPX2', 'TRIP13', 'UBE2C', 'USP39', 'VPS36', 'XPO5', 'YTHDF1', 'ZGPAT']
There are 57 shared correlations in the opposite direction
['ABCF1', 'ABT1', 'ATAD2', 'CAD', 'CBS', 'CDKAL1', 'CIP2A', 'CMPK2', 'CMSS1', 'CMTR1', 'CSE1L', 'DDX27', 'EIF2AK2', 'EIF4G1', 'FXR1', 'GFM1', 'GMPS', 'GRB7', 'HELZ2', 'HERC5', 'IFIT1', 'IFIT2', 'IRF2BPL', 'ISG15', 'KIF

# TP53 Hotspot Mutants vs. Wildtype TP53

In [None]:
proteomics_htspt_wt_tstat = {}
pcutoff = 0.05 / len(proteomics.columns)

for protein in proteomics.columns:
    if (protein != 'idx') and (protein != gene):
        # Create dataframe with mutation status of TP53 with proteomics of the given protein
        p53_mutations_protein = CPTAC.compare_mutations(proteomics, protein, gene)
        # Create a column showing the location of TP53 mutation
        p53_mutations_protein['Within_Hotspot'] = False
        p53_mutations_protein.loc[hotspot_patients, 'Within_Hotspot'] = True
        p53_mutations_protein.loc[p53_mutations_protein['Mutation'] == 'Wildtype', 'Within_Hotspot'] = 'Wildtype'
        p53_mutations_protein = p53_mutations_protein.loc[p53_mutations_protein['Sample_Status'] == 'Tumor'].dropna(axis=0)
        # Perform the ttest
        hotspot_tp53 = p53_mutations_protein.loc[p53_mutations_protein['Within_Hotspot'] == True]
        other_mut_tp53 = p53_mutations_protein.loc[p53_mutations_protein['Within_Hotspot'] == 'Wildtype']
        if len(hotspot_tp53) < 2 or len(other_mut_tp53) < 2: continue
        ttest = scipy.stats.ttest_ind(hotspot_tp53[protein], other_mut_tp53[protein])
        # Print significant values
        if ttest[1] <= pcutoff:
            proteomics_htspt_wt_tstat[protein] = ttest[0]

In [None]:
transcriptomics_htspt_wt_tstat = {}
pcutoff = 0.05 / len(transcriptomics.columns)

for protein in transcriptomics.columns:
    if (protein != 'idx') and (protein != gene):
        # Create dataframe with mutation status of TP53 with proteomics of the given protein
        p53_mutations_protein = CPTAC.compare_mutations(transcriptomics, protein, gene)
        # Create a column showing the location of TP53 mutation
        p53_mutations_protein['Within_Hotspot'] = False
        p53_mutations_protein.loc[hotspot_patients, 'Within_Hotspot'] = True
        p53_mutations_protein.loc[p53_mutations_protein['Mutation'] == 'Wildtype', 'Within_Hotspot'] = 'Wildtype'
        p53_mutations_protein = p53_mutations_protein.loc[p53_mutations_protein['Sample_Status'] == 'Tumor'].dropna(axis=0)
        # Perform the ttest
        hotspot_tp53 = p53_mutations_protein.loc[p53_mutations_protein['Within_Hotspot'] == True]
        other_mut_tp53 = p53_mutations_protein.loc[p53_mutations_protein['Within_Hotspot'] == 'Wildtype']
        if len(hotspot_tp53) < 2 or len(other_mut_tp53) < 2: continue
        ttest = scipy.stats.ttest_ind(hotspot_tp53[protein], other_mut_tp53[protein])
        # Print significant values
        if ttest[1] <= pcutoff:
            transcriptomics_htspt_wt_tstat[protein] = ttest[0]

In [None]:
proteomics_shared_tstat_2 = {}
proteomics_unique_tstat_2 = {}

transcriptomics_shared_tstat_2 = {}
transcriptomics_unique_tstat_2 = {}

shared_identical_direction_2 = list()
shared_opposite_direction_2 = list()


for protein in proteomics_htspt_wt_tstat:
    if protein in transcriptomics_all_mut_wt_tstat:
        proteomics_shared_tstat_2[protein] = proteomics_htspt_wt_tstat[protein]
        transcriptomics_shared_tstat_2[protein] = transcriptomics_htspt_wt_tstat[protein]

for protein in proteomics_htspt_wt_tstat:
    if protein not in transcriptomics_htspt_wt_tstat:
        proteomics_unique_tstat_2[protein] = proteomics_htspt_wt_tstat[protein]
        
for protein in transcriptomics_htspt_wt_tstat:
    if protein not in proteomics_htspt_tstat:
        transcriptomics_unique_tstat_2[protein] = transcriptomics_htspt_wt_tstat[protein]

for protein in proteomics_shared_tstat_2:
    if proteomics_shared_tstat_2[protein] > 0 and transcriptomics_shared_tstat_2[protein] > 0:
        shared_identical_direction_2.append(protein)
    if proteomics_shared_tstat_2[protein] < 0 and transcriptomics_shared_tstat_2[protein] < 0:
        shared_identical_direction_2.append(protein)
    else:
        shared_opposite_direction_2.append(protein)
        
print('There are ' + str(len(shared_identical_direction_2)) + ' shared correlations in the same direction')
print(str(shared_identical_direction_2) + '\n')
print
print('There are ' + str(len(shared_opposite_direction_2)) + ' shared correlations in the opposite direction')
print(str(shared_opposite_direction_2) + '\n')

print('There are ' + str(len(proteomics_unique_tstat_2)) + ' unique proteomic correlations not found in the transcriptome')
print('There are ' + str(len(transcriptomics_unique_tstat_2)) + ' unique transcriptomic correlations not found in the proteome')