In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import statsmodels.stats.multitest
import operator




def wrap_ttest(df, label_column, comparison_columns=None, alpha=.05, return_all=False, correction_method='bonferroni', mincount=3, pval_return_corrected=True):
    try:
        '''Verify precondition that label column exists and has exactly 2 unique values'''
        label_values = df[label_column].unique()
        if len(label_values) != 2:
            print("Incorrectly Formatted Dataframe! Label column must have exactly 2 unique values.")
            return None
        
        '''Partition dataframe into two sets, one for each of the two unique values from the label column'''
        partition1 = df.loc[df[label_column] == label_values[0]]
        partition2 = df.loc[df[label_column] == label_values[1]]

        '''If no comparison columns specified, use all columns except the specified labed column'''
        if not comparison_columns:
            comparison_columns = list(df.columns)
            comparison_columns.remove(label_column)

        '''Determine the number of real valued columns on which we will do t-tests'''
        number_of_comparisons = len(comparison_columns)


        '''Store comparisons and p-values in two arrays'''
        comparisons = []
        pvals = []
        
        '''Loop through each comparison column, perform the t-test, and record the p-val'''
#         import pdb; pdb.set_trace()
        for column in comparison_columns:
            if len(partition1[column].dropna(axis=0)) <= mincount:
                continue
            elif len(partition2[column].dropna(axis=0)) <= mincount:
                continue
            else:
                stat, pval = scipy.stats.ttest_ind(partition1[column].dropna(axis=0), partition2[column].dropna(axis=0))
                comparisons.append(column)
                pvals.append(pval)
                
        '''Correct for multiple testing to determine if each comparison meets the new cutoff'''
        results = statsmodels.stats.multitest.multipletests(pvals=pvals, alpha=alpha, method=correction_method)
        reject = results[0]

        '''Format results in a pandas dataframe'''
        results_df = pd.DataFrame(columns=['Comparison','P_Value'])

        '''If return all, add all comparisons and p-values to dataframe'''
        if return_all:
            if pval_return_corrected:
                results_df['Comparison'] = comparisons
                results_df['P_Value'] = results[1]

            else:
                results_df['Comparison'] = comparisons
                results_df['P_Value'] = pvals

            '''Else only add significant comparisons'''
        else:
            for i in range(0, len(reject)):
                if reject[i]:
                    if pval_return_corrected:
                        results_df = results_df.append({'Comparison':comparisons[i],'P_Value':results[1][i]}, ignore_index=True)
                    else:
                        results_df = results_df.append({'Comparison':comparisons[i],'P_Value':pvals[i]}, ignore_index=True)


        '''Sort dataframe by ascending p-value'''
        results_df = results_df.sort_values(by='P_Value', ascending=True)
        results_df = results_df.reset_index(drop=True)

        '''If results df is not empty, return it, else return None'''
        if len(results_df) > 0:
            return results_df
        else:
            return None


    except:
        print("Incorrectly Formatted Dataframe!")
        return None

  import pandas.util.testing as tm


In [2]:
import cptac
import cptac.utils as u
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



In [3]:
br = cptac.Brca()

                                         

# Dealing with duplicates

In [4]:
phos = br.get_phosphoproteomics()
phos = cptac.utils.reduce_multiindex(phos, "Database_ID")
phos = cptac.utils.reduce_multiindex(phos, flatten=True)
phos.head()



Name,A2M_S710_VGFYEsDVMGR,AAAS_S495_IAHIPLYFVNAQFPRFsPVLGR,AAAS_S541_AQEPPAGGGGSIHDLPLFTETSPTSAPWDPLPGPPPVLPHsPHSHL,AAED1_S12_QVsGAAALVPAPSGPDSGQPLAAAVAELPVLDAR,AAGAB_S310S311_AFWMAIGGDRDEIEGLssDEEH,AAGAB_S311_AFWMAIGGDRDEIEGLSsDEEH,AAK1_S14_EQGGsGLGSGSSGGGGSTSGLGSGYIGR,AAK1_S18_REQGGSGLGsGSSGGGGSTSGLGSGYIGR,AAK1_S21_REQGGSGLGSGSsGGGGSTSGLGSGYIGR,AAK1_S618T620S623_VGsLtPPsSPK,...,ZZEF1_S2526_sLRLEEQSAK,ZZZ3_S113_RQTEPVsPVLK,ZZZ3_S314_IVTACLPVEHVNQLTTEPATGPFSETQSSLRDsEEEVDVVGDSSASK,ZZZ3_S381_YTLRTsPR,ZZZ3_S391_AAPTRGsPTK,ZZZ3_S397_NSsPYRENGQFEENNLSPNETNATVSDNVSQSPTNPGEISQNEK,ZZZ3_S397T428N429_NSsPYRENGQFEENNLSPNETNATVSDNVSQSPtnPGEISQNEK,ZZZ3_S606_VGLPARPKsPLDPK,ZZZ3_S82_ESWVsPR,ZZZ3_S89_GLsSSEK
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CPT000814,,1.9431,,,0.0127,-0.4495,,-1.1852,-0.8333,0.0863,...,-1.7098,0.0735,0.2238,-0.6702,-8.8556,,-0.8493,-0.1744,-0.0273,-6.8916
CPT001846,,0.2274,,0.5472,0.6895,-0.768,,0.6895,0.3903,,...,,-0.8199,,-0.879,-4.0222,,1.1806,-1.27,-0.8983,-3.6228
X01BR001,,-2.2853,-0.8967,4.1225,0.4842,-1.2458,-0.031,0.8805,0.7448,1.0649,...,,0.1092,,,-1.6299,-0.2017,-0.3953,-2.1328,-0.8815,-2.1191
X01BR008,,1.3714,,-0.0636,-0.6224,-1.2732,-1.1351,-0.8652,0.0031,0.2173,...,0.711,0.3824,,,1.3333,0.784,0.1078,0.4824,,-0.9748
X01BR009,,0.2682,,,-0.6207,-2.3561,-0.3078,0.3785,0.699,-0.1082,...,-1.4189,-0.7779,,,0.5267,-0.6661,0.5539,-0.3728,,-0.8354


In [5]:
# phos_all_levels

In [6]:
duplicated_filter = phos.columns.duplicated(keep=False)
duplicated_columns = phos.loc[:,duplicated_filter]
no_duplicate_columns = phos.loc[:,~duplicated_filter]
# duplicated_columns

In [7]:
#Deal With Duplicates
#get the databaseID
phos_all_levels = br.get_phosphoproteomics(tissue_type="tumor")
phos_all_levels = cptac.utils.reduce_multiindex(phos_all_levels, flatten=True)
#filter for just the duplicated ones
duplicate_columns = phos_all_levels.loc[:,duplicated_filter]
duplicate_columns

mut = br.get_genotype_all_vars("PIK3CA")

joined = mut.join(duplicate_columns)

#select samples containing hotspot mutations
br_hotspot_df = joined[joined.Location.str.contains('E542K') | 
                    joined.Location.str.contains('E545K') |
                    joined.Location.str.contains('H1047R')]
br_hotspot_df


#select samples containing hotspot mutations
wildtype  = joined.loc[joined.Mutation == "Wildtype_Tumor"]
brca_hotspot = br_hotspot_df.append(wildtype)
brca_hotspot 

#drop unncessary columns
filter_prot_and_mutations = brca_hotspot.drop(columns = ["Mutation_Status", "Location"])
filter_prot_and_mutations

gene_name_only = duplicated_columns.columns.str.split('_', 1).str[0]
gene_name_only


selected_dup = []
for duplicate_name in gene_name_only:
    #select only one gene at a time
    one_gene_df = filter_prot_and_mutations.loc[:, (filter_prot_and_mutations.columns.str.contains(duplicate_name))
                                            |(filter_prot_and_mutations.columns.str.contains("Mutation"))]
    #Run t-test and keep the one with the min pvalue
    gene_duplicate = wrap_ttest(one_gene_df, "Mutation", return_all=True, pval_return_corrected=False)
    min_row = gene_duplicate.loc[gene_duplicate.P_Value == gene_duplicate.P_Value.min()]
    name = min_row['Comparison'][0]
    selected_dup.append(name)

    
phos_all_levels = br.get_phosphoproteomics(tissue_type="tumor")
phos_all_levels = cptac.utils.reduce_multiindex(phos_all_levels, flatten=True)
no_duplicate = phos_all_levels.loc[:,~duplicated_filter] #filter for just the non duplicated ones
no_duplicate=list(no_duplicate.columns)
    
    
selected_all= selected_dup+no_duplicate

#use the list of selected genes to filter dataframe
phos_all_levels = br.get_phosphoproteomics(tissue_type="tumor")
phos_all_levels = cptac.utils.reduce_multiindex(phos_all_levels, flatten=True)
selected_phos = phos_all_levels[phos_all_levels.columns.intersection(selected_all)]



In [8]:
#get mutation_tyoe
mut_type = br.get_genotype_all_vars('PIK3CA')

#join proteomics and mutation type
joined = mut_type.join(selected_phos)

#select samples containing hotspot mutations
br_hotspot_df = joined[joined.Location.str.contains('E542K') | 
                    joined.Location.str.contains('E545K') |
                    joined.Location.str.contains('H1047R')]


#join wildtype proteomics to missense mutation proteomics 
wildtype  = joined.loc[joined.Mutation == "Wildtype_Tumor"]
brca_hotspot = br_hotspot_df.append(wildtype)

#drop unncessary columns
brca_prot_and_mutations = brca_hotspot.drop(columns = ["Mutation_Status", "Location"])
#drop databaseID identifier
gene_site_peptide = ["Mutation"]
for name in brca_prot_and_mutations.columns:
    if name != "Mutation":
        gene = name.split('_', 3)[0]
        site = name.split('_', 3)[1]
        peptide = name.split('_', 3)[2]
        name = "_".join([gene,site,peptide])
        gene_site_peptide.append(name)
gene_site_peptide
brca_prot_and_mutations.columns=gene_site_peptide
brca_prot_and_mutations



Unnamed: 0_level_0,Mutation,A2M_S710_VGFYEsDVMGR,AAAS_S495_IAHIPLYFVNAQFPRFsPVLGR,AAAS_S541_AQEPPAGGGGSIHDLPLFTETSPTSAPWDPLPGPPPVLPHsPHSHL,AAED1_S12_QVsGAAALVPAPSGPDSGQPLAAAVAELPVLDAR,AAGAB_S310S311_AFWMAIGGDRDEIEGLssDEEH,AAGAB_S311_AFWMAIGGDRDEIEGLSsDEEH,AAK1_S14_EQGGsGLGSGSSGGGGSTSGLGSGYIGR,AAK1_S18_REQGGSGLGsGSSGGGGSTSGLGSGYIGR,AAK1_S21_REQGGSGLGSGSsGGGGSTSGLGSGYIGR,...,ZZEF1_S2526_sLRLEEQSAK,ZZZ3_S113_RQTEPVsPVLK,ZZZ3_S314_IVTACLPVEHVNQLTTEPATGPFSETQSSLRDsEEEVDVVGDSSASK,ZZZ3_S381_YTLRTsPR,ZZZ3_S391_AAPTRGsPTK,ZZZ3_S397_NSsPYRENGQFEENNLSPNETNATVSDNVSQSPTNPGEISQNEK,ZZZ3_S397T428N429_NSsPYRENGQFEENNLSPNETNATVSDNVSQSPtnPGEISQNEK,ZZZ3_S606_VGLPARPKsPLDPK,ZZZ3_S82_ESWVsPR,ZZZ3_S89_GLsSSEK
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CPT001846,Missense_Mutation,,0.2274,,0.5472,0.6895,-0.7680,,0.6895,0.3903,...,,-0.8199,,-0.8790,-4.0222,,1.1806,-1.2700,-0.8983,-3.6228
X11BR004,Missense_Mutation,,-0.6013,,,-0.3988,0.8755,,0.9013,0.1331,...,-0.1368,-0.5278,,-3.4636,-2.5823,-0.7879,-0.0852,-1.1710,-2.4652,1.2765
X11BR013,Missense_Mutation,,1.9429,,,0.7240,-0.1044,-1.5777,-0.8241,-0.4826,...,,0.3839,,-1.4186,1.2913,0.3513,0.9022,0.2887,-0.0800,0.0466
X11BR014,Missense_Mutation,-0.1369,-0.5455,0.1734,,-0.6037,-0.2204,,1.2039,0.9041,...,,-0.0086,1.1756,,0.2584,,-0.4486,0.3165,,-0.0295
X11BR022,Missense_Mutation,,-0.7772,,,0.7644,-0.1852,0.1228,-1.0389,0.4308,...,,0.3418,,-0.2348,-0.4726,-0.0602,-0.4914,-0.6522,-0.8473,1.1032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X15BR003,Wildtype_Tumor,,0.1412,,,0.7846,0.2412,,0.2997,0.6861,...,-0.4191,-1.1826,-0.2929,-0.5500,-3.7809,,-3.2222,0.3367,-0.7932,-2.5849
X18BR004,Wildtype_Tumor,,-2.8546,,,1.0444,1.3914,,-0.7864,-2.3060,...,,-1.4103,,,-1.4984,-0.7584,-0.2429,-0.5785,,-2.1222
X18BR006,Wildtype_Tumor,,-1.3923,,1.9846,0.5373,-0.2949,,0.6568,1.6890,...,,0.3135,1.2631,1.0263,,,-0.0603,0.3135,,-0.9230
X18BR017,Wildtype_Tumor,-0.3119,-0.1950,0.1855,0.9188,0.3897,0.4729,0.7524,-0.3734,0.5938,...,2.6471,-1.1582,,,-0.5359,,-0.8966,-1.1344,-2.0362,-1.5348


In [9]:
brca_prot_and_mutations.loc[:,brca_prot_and_mutations.columns.duplicated()]

Unnamed: 0_level_0,ACIN1_T682_DSSTSYTETKDPSSGQEVAtPPVPQLQVCEPK,APBB2_S229_PNRPQSSPEDGQVATVSSsPETK,ATXN2L_S496_SAAPAPISASCPEPPIGSAVPTSSASIPVTSSVSDPGVGSISPAsPK,CAST_S294_YRELLAKPIGPDDAIDALSSDFTCGsPTAAGK,COBLL1_S655_DAAIQTTPsCNSFDGK,FGA_S588_QFTSSTsYNRGDSTFESK,LIMCH1_S523_ASVLDTSMSAGSGsPSK,NUP98_N906S917_TAPLPPASQTTPLQMALnGKPAPPPQSQsPEVEQLGR,PRKAG2_T165_TSGLSSSPStPTQVTK,RBM26_T614_EGSTQQLQtTSPK,SNX17_S407_VGGTLRRsDSQQAVK,SPAG9_S318_NVSTGsAENEEK,WAPL_S98_CSSYsESSEAAQLEEVTSVLEANSK,XRN1_S1633_EAQSSQATPVQTSQPDSSNIVKVsPR,ZC3H14_S343_TGSISSSVSVPAKPERRPsLPPSK,ZEB1_S323_TSQCSSPSLSASPGsPTRPQIR
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
CPT001846,,-1.4654,,1.2735,,3.0532,-1.6162,0.9176,,-0.3530,-1.3050,-1.2096,,-0.7946,-1.0021,-0.2226
X11BR004,1.2109,-0.5199,-2.0960,-2.6696,,2.1935,-0.3115,1.0085,-0.2599,,-1.4806,-0.2936,-1.3814,0.7008,-0.2301,0.7524
X11BR013,,,-1.7791,1.8599,-0.6051,,-2.5150,0.3132,0.2356,-2.1124,,-1.5573,,-1.0227,1.2505,0.0275
X11BR014,0.1644,-1.9474,,-5.8475,,-1.9176,-0.9198,-0.1712,0.4090,0.2643,,-2.3367,0.0496,-0.7468,0.8654,1.3888
X11BR022,,,0.6276,-2.9279,0.7918,,-2.2606,0.3127,-0.7532,-0.4332,,-1.8226,,-0.1715,1.2863,0.5472
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X15BR003,,-1.8306,-3.6070,-1.7891,0.0458,-0.2652,0.1504,0.3428,1.3911,-0.8747,0.9539,-2.1385,,,-0.3652,
X18BR004,,,0.0531,-0.1549,,-2.1835,2.4440,-1.0059,,0.2138,,0.2266,-0.9459,0.0595,-0.7979,-1.5558
X18BR006,,,-0.1993,2.8299,0.0136,-2.8656,,-0.1472,-0.2841,-0.3188,2.8907,0.2222,,,-0.2493,1.3609
X18BR017,0.3600,0.9664,-0.9541,1.0754,0.8574,-2.6466,,,,1.6918,-0.6845,-0.4606,,-0.6786,0.6394,1.7691


In [10]:
#ttest for all
# wrap_ttest(brca_prot_and_mutations, "Mutation", return_all=True, pval_return_corrected=True)

# For the 62

In [11]:
protein_list_from_paper = [
'TRIM28',
'ARHGEF35',
'RPS6KA5',
'EIF2AK4', 
'SRRT', 
'BCL9',
'TP53BP1',
'FAM21A', 
'MAPT', 
'TSC22D3', 
'HEATR3', 
'MED26',
'ACIN1', 
'HUWE1', 
'GTF3C1',
'ZC3H14',
'KRT8',
'MAPT',
'SPAG7',
'ARID4B',
'METTL3',
'MAPT',
'MAPT',
'MAPT',
'UBXN2B',
'RAVER1',
'RTF1',
'RXRA',
'PHF20L1',
'WWC3',
'WHSC2',
'GTF2F1',
'RPS6KA5',
'UBXN7',
'BCL2L11',
'NOSIP',
'MPG',
'PPP6R2',
'KHDRBS1',
'FIP1L1',
'STAT3',
'RAPH1',
'NFIA',
'HUWE1',
'DDX17',
'TSC22D3',
'NFIA',
'KIAA1468',
'ANXA11',
'RAVER1',
'FAM83H',
'WIZ',
'DTNBP1',
'C1ORF174',
'RBM5',
'INTS1',
'RBM10',
'KCTD1',
'TAF3',
'PRR12'
]

In [12]:
#select the ones that are from the paper
phos = br.get_phosphoproteomics(tissue_type="tumor")
genefilter = phos.columns.get_level_values("Name").isin(protein_list_from_paper)
phos_62 = phos[phos.columns[genefilter]]
phos_62 = cptac.utils.reduce_multiindex(phos_62, flatten=True)
phos_62

#join mutation
#get mutation_tyoe
mut_type = br.get_genotype_all_vars('PIK3CA')

#join proteomics and mutation type
joined = mut_type.join(phos_62)

#select samples containing hotspot mutations
br_hotspot_df = joined[joined.Location.str.contains('E542K') | 
                    joined.Location.str.contains('E545K') |
                    joined.Location.str.contains('H1047R')]


#join wildtype proteomics to missense mutation proteomics 
wildtype  = joined.loc[joined.Mutation == "Wildtype_Tumor"]
brca_hotspot = br_hotspot_df.append(wildtype)

#drop unncessary columns
phos_62 = brca_hotspot.drop(columns = ["Mutation_Status", "Location"])
phos_62



Name,Mutation,ACIN1_S1004_SGVSITIDDPVRTAQVPsPPRGK_NP_055792.1|NP_001158286.1|NP_001158287.1|NP_001158288.1|NP_001158289.1,ACIN1_S1104_GLLVDRPsETK_NP_055792.1|NP_001158286.1|NP_001158287.1|NP_001158288.1|NP_001158289.1,ACIN1_S1161_sEREWDRDK_NP_055792.1|NP_001158286.1|NP_001158287.1|NP_001158288.1|NP_001158289.1,ACIN1_S166S169M173_EAAELEEAsAEsEDEmIHPEGVASLLPPDFQSSLERPELELSR_NP_055792.1|NP_001158286.1,ACIN1_S169_EAAELEEASAEsEDEMIHPEGVASLLPPDFQSSLERPELELSR_NP_055792.1|NP_001158286.1,ACIN1_S208_KSSsISEEK_NP_055792.1|NP_001158286.1,ACIN1_S208S216_SSsISEEKGDsDDEK_NP_055792.1|NP_001158286.1|NP_001158287.1,ACIN1_S210S216_SSSIsEEKGDsDDEKPR_NP_055792.1|NP_001158286.1|NP_001158287.1,ACIN1_S216_SSSISEEKGDsDDEK_NP_055792.1|NP_001158286.1|NP_001158287.1,...,ZC3H14_S421_ISPPIKEEETKGDsVEK_NP_079100.2|NP_001153575.1|NP_001313239.1|NP_001153576.1|NP_001313236.1|NP_001313225.1|NP_001313241.1|NP_001313228.1|NP_001313226.1|NP_001313244.1|NP_001313230.1|NP_001313224.1|NP_001313229.1|NP_001313227.1|NP_001313231.1|NP_997543.1|NP_997544.1|NP_001313234.1|NP_001313243.1|NP_001313232.1|NP_001313237.1|NP_001313233.1|NP_001313238.1|NP_001313240.1,ZC3H14_S515_DLVQPDKPAsPK_NP_079100.2|NP_001153575.1|NP_001313228.1|NP_001313226.1|NP_001313244.1|NP_001313230.1|NP_001313229.1|NP_001313231.1|NP_001313239.1|NP_001153576.1|NP_001313236.1|NP_001313225.1|NP_001313241.1|NP_001313234.1|NP_001313243.1,ZC3H14_S527_FIVTLDGVPsPPGYMSDQEEDMCFEGMKPVNQTAASNK_NP_079100.2|NP_001153575.1|NP_001313239.1|NP_001153576.1|NP_001313236.1|NP_001313225.1|NP_001313241.1|NP_001313228.1|NP_001313226.1|NP_001313244.1|NP_001313230.1|NP_001313229.1|NP_001313231.1|NP_001313234.1|NP_001313243.1,ZC3H14_S581_QLEDPNGSFsNAEMSELSVAQKPEK_NP_079100.2|NP_001153575.1|NP_001313229.1|NP_001313231.1|NP_001313234.1|NP_001313243.1|NP_001313236.1|NP_001313225.1|NP_001313226.1|NP_001313244.1|NP_001313230.1,ZC3H14_S620_NGDECAYHHPIsPCK_NP_079100.2|NP_001153575.1|NP_001313239.1|NP_001153576.1|NP_001313236.1|NP_001313225.1|NP_001313241.1|NP_001313228.1|NP_001313226.1|NP_001313244.1|NP_001313230.1|NP_001313224.1|NP_001313229.1|NP_001313227.1|NP_001313242.1|NP_001313235.1|NP_001313237.1|NP_001313233.1|NP_001313238.1|NP_001313231.1|NP_997543.1|NP_997544.1|NP_001313234.1|NP_001313243.1|NP_001313232.1|NP_001313240.1|NP_997545.2,ZC3H14_S665_RIPVLsPKPAVAPPAPPSSSQLCR_NP_079100.2|NP_001313239.1|NP_001313236.1|NP_001313241.1|NP_001313226.1|NP_001313230.1|NP_001313224.1|NP_001313229.1|NP_001313227.1|NP_997544.1|NP_001313234.1|NP_001313242.1|NP_001313237.1|NP_001313238.1,ZC3H14_S665_RIPVLsPKPVAPPAPPSSSQLCR_NP_001153575.1|NP_001153576.1|NP_001313225.1|NP_001313228.1|NP_001313244.1|NP_001313231.1|NP_997543.1|NP_001313243.1|NP_001313232.1|NP_001313235.1|NP_001313233.1|NP_001313240.1|NP_997545.2,ZC3H14_T389_TRtSQEELLAEVVQGQSR_NP_079100.2|NP_001153575.1|NP_001313239.1|NP_001153576.1|NP_001313236.1|NP_001313225.1|NP_001313241.1|NP_001313227.1|NP_001313231.1|NP_997543.1|NP_997544.1|NP_001313234.1|NP_001313243.1|NP_001313232.1|NP_001313237.1|NP_001313233.1|NP_001313238.1|NP_001313240.1|NP_001313224.1|NP_001313229.1|NP_001313228.1|NP_001313226.1|NP_001313244.1|NP_001313230.1,ZC3H14_T405_TSQEELLAEVVQGQSRtPR_NP_001153576.1|NP_001313236.1|NP_001313225.1|NP_001313241.1|NP_001313228.1|NP_001313226.1|NP_001313231.1|NP_997543.1|NP_997544.1|NP_001313234.1|NP_001313243.1|NP_001313232.1|NP_001313244.1|NP_001313230.1|NP_001313224.1|NP_001313229.1|NP_001313227.1|NP_001313237.1|NP_001313233.1|NP_001313238.1|NP_001313240.1|NP_079100.2|NP_001153575.1|NP_001313239.1,ZC3H14_Y531S533M539_FIVTLDGVPSPPGyMsDQEEDmCFEGMKPVNQTAASNK_NP_079100.2|NP_001153575.1|NP_001313239.1|NP_001153576.1|NP_001313236.1|NP_001313225.1|NP_001313241.1|NP_001313228.1|NP_001313226.1|NP_001313244.1|NP_001313230.1|NP_001313229.1|NP_001313231.1|NP_001313234.1|NP_001313243.1
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CPT001846,Missense_Mutation,-1.2012,,-0.9032,-0.5388,-0.0899,-3.2789,-1.7695,,-2.3486,...,,-0.1152,0.4362,0.4651,-1.2301,1.4219,-0.0416,0.0682,-0.4000,1.1179
X11BR004,Missense_Mutation,-0.4822,,-0.0197,-0.8792,,0.3812,-1.3655,,0.2760,...,,0.8954,-1.7942,0.5142,0.1847,-0.6628,0.3376,,0.3674,-2.2746
X11BR013,Missense_Mutation,0.2220,0.1227,-0.3235,-0.2038,,1.3579,-1.5845,,-0.1385,...,,0.2710,,,0.0914,0.3254,-0.7955,0.2180,,0.8872
X11BR014,Missense_Mutation,-0.3158,0.5895,,,,2.2091,-0.4709,-0.4888,-0.8468,...,-1.1003,0.2375,-0.0638,,-0.9154,0.5984,-0.3531,,1.2248,0.2166
X11BR022,Missense_Mutation,-0.2621,-1.0800,0.6618,,,-1.1981,-0.1886,,0.5608,...,,0.8500,,,-0.5274,0.7508,0.8808,3.7382,,-0.4555
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X15BR003,Wildtype_Tumor,0.0950,-1.6028,2.0715,3.8278,,-5.5588,-2.3371,,1.0694,...,,0.2120,0.2012,0.4768,-0.6392,0.6507,0.5568,1.3634,-0.0912,0.1843
X18BR004,Wildtype_Tumor,0.4320,,-0.5530,-1.5749,,-0.9102,0.3516,0.3644,-0.1051,...,,0.8071,-0.2327,0.1207,-1.6833,,,-2.2830,-0.7099,-0.1013
X18BR006,Wildtype_Tumor,-0.1732,0.2831,-0.5514,,,-1.5444,0.5134,0.7329,0.3852,...,,0.3418,0.4830,,0.1244,-0.2928,0.4721,-0.4666,0.9980,0.5112
X18BR017,Wildtype_Tumor,-0.0642,,-0.2941,,2.7383,,0.7960,,0.2331,...,,0.2807,0.3382,1.4936,-1.2335,0.4452,0.6612,2.3657,0.3798,0.0151


In [13]:
wrap_ttest(phos_62, "Mutation", correction_method="fdr_bh", return_all=True, pval_return_corrected=True)

Unnamed: 0,Comparison,P_Value
0,NFIA_S300S319_sPGSGSQSSGWHEVEPGMPsPTTLK_NP_001...,0.630395
1,RXRA_T13_HFLPLDFStQVNSSLTSPTGR_NP_002948.1,0.909843
2,TRIM28_S612_LASPSGSTSSGLEVVAPEGTsAPGGGPGTLDDSA...,0.909843
3,RPS6KA5_S376_LFQGYsFVAPSILFK_NP_004746.2|NP_00...,0.909843
4,ACIN1_S478_ELLVSQHTVQLVGGLsPLSSPSDTK_NP_055792...,0.909843
...,...,...
782,PHF20L1_S435_VSSPsPATDGK_NP_057102.4|NP_001264...,0.993286
783,RBM5_S621_GLVAAYsGDSDNEEELVERLESEEEK_NP_005769.1,0.993286
784,HUWE1_S1077_LCVGsPVR_NP_113584.3,0.993286
785,GTF3C1_S1068_NSSTDQGsDEEGSLQK_NP_001511.2|NP_0...,0.993286


In [14]:
wrap_ttest(phos_62, "Mutation", correction_method="fdr_bh", return_all=True, pval_return_corrected=False)

Unnamed: 0,Comparison,P_Value
0,NFIA_S300S319_sPGSGSQSSGWHEVEPGMPsPTTLK_NP_001...,0.000801
1,MED26_T426_KLtFDPMTR_NP_004822.2,0.003963
2,RAPH1_S1012_FTPPAESGsPSKETLPPPAAPPKPGK_NP_9987...,0.004372
3,ZC3H14_T389_TRtSQEELLAEVVQGQSR_NP_079100.2|NP_...,0.006517
4,MED26_S337_LELLPSAEsPVCWLEQPESHQR_NP_004822.2,0.006888
...,...,...
782,BCL2L11_S118_STQTPsPPCQAFNHYLSAMASMR_NP_619527...,0.990935
783,PRR12_S1135_SRPALsPLGDIDFCPPNPGPDGPRR_NP_065770.1,0.991299
784,ARID4B_S675_LSKPPFQTNPsPEMVSKLDLTDAK_NP_001193...,0.991439
785,TP53BP1_S640_sEALSSVLDQEEAMEIK_NP_001135452.1|...,0.992812


# Check individual

In [15]:
import seaborn as sns
import matplotlib.pyplot as plt

def plotting(df, gene, site):
    plt.rcParams['figure.figsize']=(11.7,8.5) #size of plot
    sns.set(font_scale = 1.2)

    boxplot = sns.boxplot(x='Mutation', y=site, data = df, showfliers = False,
                         order=["Wildtype_Tumor", "Missense_Mutation"])  

    boxplot.set_title(gene)

    boxplot = sns.stripplot(x='Mutation', y=site, data = df, jitter = True, 
                               color = ".3", dodge = True,
                           order=["Wildtype_Tumor", "Missense_Mutation"])

    boxplot.set(xlabel = "Mutation Type", ylabel = gene+"_proteomics")


    plt.show()
    plt.clf()
    plt.close()
    

In [16]:
def get_df(gene_list, site=None):
    #select the ones that are from the paper
    phos = br.get_phosphoproteomics(tissue_type="tumor")
    genefilter = phos.columns.get_level_values("Name").isin(gene_list)
    phos_62 = phos[phos.columns[genefilter]]
    phos_62 = cptac.utils.reduce_multiindex(phos_62, flatten=True)
    phos_62

    #join mutation
    #get mutation_tyoe
    mut_type = br.get_genotype_all_vars('PIK3CA')

    #join proteomics and mutation type
    joined = mut_type.join(phos_62)

    #select samples containing hotspot mutations
    br_hotspot_df = joined[joined.Location.str.contains('E542K') | 
                        joined.Location.str.contains('E545K') |
                        joined.Location.str.contains('H1047R')]


    #join wildtype proteomics to missense mutation proteomics 
    wildtype  = joined.loc[joined.Mutation == "Wildtype_Tumor"]
    brca_hotspot = br_hotspot_df.append(wildtype)

    #drop unncessary columns
    phos_62 = brca_hotspot.drop(columns = ["Mutation_Status", "Location"])
    
    df = phos_62
    site_name = ["Mutation"]
    for name in df:
        if name != "Mutation":
            both = "_".join([name.split('_', 3)[0],name.split('_', 3)[1]])
            site_name.append(both)
#             site_name.append(name.split('_', 3)[1]
            
    df.columns = site_name
    
    if site != None:
        df = df[["Mutation", site]]
    
    return(df)

In [17]:
gene = "PRR12"
# site = gene+"_"+ "S1012"
site = None
df = get_df([gene], site)
# df = df.loc[:,~df.columns.duplicated(keep="first")]
df.head()



Unnamed: 0_level_0,Mutation,PRR12_S1061,PRR12_S1077,PRR12_S1135,PRR12_S1308,PRR12_S1361,PRR12_S1381S1382,PRR12_S1382,PRR12_S1568,PRR12_S1921,...,PRR12_S915S917,PRR12_S917,PRR12_T1192,PRR12_T1304S1308,PRR12_T1561,PRR12_T1561S1568,PRR12_T1705,PRR12_T224,PRR12_T686,PRR12_T738
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CPT001846,Missense_Mutation,0.2865,-0.1985,0.8693,0.0959,0.0332,0.2033,-0.3119,-1.5463,-1.2555,...,-1.1722,-0.8392,-1.7405,0.0573,,-0.0368,-1.6114,,0.6775,0.254
X11BR004,Missense_Mutation,0.0994,-1.9133,-0.4802,0.1927,-2.2905,-0.3869,-1.2405,0.0518,-0.4207,...,,1.5663,-0.772,-0.4802,0.3177,-1.4568,-1.0042,-1.2464,,-1.3615
X11BR013,Missense_Mutation,-0.1385,-0.771,0.3894,0.3145,0.5553,-0.2119,-0.1181,0.3866,1.5906,...,,-0.2622,0.1186,0.0466,1.2709,1.4491,0.6247,0.9825,1.3498,1.3348
X11BR014,Missense_Mutation,,1.1428,-1.145,0.4582,,,,-0.2099,0.5268,...,-1.0362,-1.8117,0.1465,,,-0.6976,-0.9542,-0.2994,0.9026,0.4269
X11BR022,Missense_Mutation,1.2076,-0.3203,0.6464,0.3966,0.9989,0.6601,-0.9311,-0.991,-0.2211,...,,-0.2981,0.0783,0.1673,-0.0928,0.3367,-0.3015,0.566,2.6722,-0.2006


In [18]:
wrap_ttest(df, "Mutation", correction_method="fdr_bh", return_all=True, pval_return_corrected=False)

Unnamed: 0,Comparison,P_Value
0,PRR12_T1705,0.074739
1,PRR12_T686,0.110006
2,PRR12_S386,0.156855
3,PRR12_S651,0.181909
4,PRR12_S1381S1382,0.220347
5,PRR12_S1921,0.256275
6,PRR12_S865,0.335848
7,PRR12_S677,0.357598
8,PRR12_S917,0.38324
9,PRR12_S444,0.42257


In [19]:
plotting(df, 'RAPH1', 'RAPH1_S1012')

ValueError: Could not interpret input 'RAPH1_S1012'

# Split hotspots

In [None]:
#pick a gene
gene = "RAPH1"
site = gene+"_"+ "S1012"
site=None
phos_s_df = get_df([gene], site)
phos_s_df

In [None]:
#get the location
phos_split_df = phos_s_df.drop(columns="Mutation")
mut = br.get_genotype_all_vars("PIK3CA")
joined = mut.join(phos_split_df)
wildtype = joined.loc[joined["Mutation"]=="Wildtype_Tumor"]
# wildtype
wildtype['type'] = "wildtype_tumor"

# allsnps = joined.loc[joined['Mutation']=="Missense_Mutation"]
# allsnps['type'] = "all_snps"

#select samples containing hotspot mutations
br_hotspot_df = joined[joined.Location.str.contains('E542K') | 
                    joined.Location.str.contains('E545K')|
                      joined.Location.str.contains('H1047R')]


all_hotspots = br_hotspot_df.drop(columns=['Location','Mutation_Status'])
all_hotspots["type"] = "all_hotspots"

#select samples containing hotspot mutations
br_hotspot_df = joined[joined.Location.str.contains('E542K') | 
                    joined.Location.str.contains('E545K')]


brca_hotspot_helical = br_hotspot_df.drop(columns=['Location','Mutation_Status'])
brca_hotspot_helical["type"] = "helical"
brca_hotspot_helical

#select samples containing hotspot mutations
br_hotspot_df = joined[joined.Location.str.contains('H1047R')]

#join wildtype proteomics to missense mutation proteomics 
brca_hotspot_kinase = br_hotspot_df.drop(columns=['Location','Mutation_Status'])
brca_hotspot_kinase["type"] = "kinase"
brca_hotspot_kinase

#make the all missense vs hotspots dataframe plot
all_vs_hot =  pd.concat([wildtype, all_hotspots])
# all_vs_hot =  pd.concat([all_vs_hot, all_hotspots])

with_split = pd.concat([all_vs_hot, brca_hotspot_kinase])
with_split = pd.concat([with_split, brca_hotspot_helical])
with_split

In [None]:
#select for a specific site
gene_site = 'RAPH1_S1012'
gene_df = with_split[["Mutation", "type", gene_site]]
gene_df["type"].unique()

In [None]:
df = gene_df
gene =gene_site

plt.rcParams['figure.figsize']=(11.7,8.5) #size of plot
sns.set(font_scale = 1.2)

boxplot = sns.boxplot(x='type', y=gene, data = df, showfliers = False)  

boxplot.set_title(gene)

boxplot = sns.stripplot(x='type', y=gene, data = df, jitter = True, 
                           color = ".3", dodge = True)

boxplot.set(xlabel = "Mutation Type", ylabel = gene+"_proteomics")


plt.show()
plt.clf()
plt.close()