In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

  import pandas.util.testing as tm


In [2]:
# Instantiate the three cptac dataset objects used throughout this notebook
# (endometrial, BRCA, colon), in the same order as before.
en, br, cl = cptac.Endometrial(), cptac.Brca(), cptac.Colon()

                                                

In [3]:
# Gene whose hotspot-mutated samples are compared against wildtype tumors below.
gene = "PIK3CA"

# Endometrial

In [4]:
# Hotspot substitutions passed to the genotype lookup.
hotspots = ['E542K', 'E545K', 'H1047R']
mut = en.get_genotype_all_vars(gene, mutation_hotspot=hotspots)

# Tumor phosphoproteomics, with the column levels flattened into single labels
# (e.g. AAAS_S495) in one step.
phos = en.reduce_multiindex(
    en.get_phosphoproteomics(tissue_type="tumor"),
    flatten=True,
)
phos



Name,AAAS_S495,AAAS_S541,AAAS_Y485,AACS_S618,AAED1_S12,AAGAB_S310,AAGAB_S311,AAK1_S14,AAK1_S18,AAK1_S20,...,ZZZ3_S397,ZZZ3_S411,ZZZ3_S420,ZZZ3_S424,ZZZ3_S426,ZZZ3_S468,ZZZ3_S89,ZZZ3_T415,ZZZ3_T418,ZZZ3_Y399
C3L-00006,,,,-0.881,-1.810,,,,-0.2420,-0.2420,...,0.18400,,,,-0.20500,,,,,
C3L-00008,,,,,0.084,,,-1.1100,-0.3830,-1.0900,...,-0.17100,,,-0.393,-0.17100,,0.29,,0.16050,-0.06350
C3L-00032,-0.202,,,,-1.880,,,,0.3820,-0.0416,...,,,,,,,,,,
C3L-00090,-0.002,,-0.4070,,,,,,,-0.5550,...,0.13970,,,,-0.55900,,,,,0.29800
C3L-00098,0.556,-0.0461,,,0.941,,0.429,0.3620,0.6970,-0.0529,...,-0.15875,,,0.196,0.06175,,,,,-0.29000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01520,0.398,,-0.0901,,,,,,,-0.0845,...,-0.14750,,,,-0.07520,,,,,-0.13800
C3N-01521,0.750,0.7040,,,0.209,,-0.229,0.2215,-0.1200,-0.5100,...,0.33600,,,0.442,0.42200,,,,,0.27000
C3N-01537,0.526,,,,-0.967,,,0.4700,-0.3570,-0.3140,...,-0.05860,,-0.559,,0.30900,,,,,
C3N-01802,,,,,0.180,,,-0.2250,0.7010,0.1400,...,-0.13200,,,-0.920,-0.13200,,0.00,,-0.04685,0.20165


In [5]:
# Step 1 - attach each sample's genotype columns to its phosphosite values
joined = mut.join(phos)

# Select samples whose mutation Location mentions one of the hotspots.
# A single alternation pattern replaces three OR-ed masks; na=False makes a
# missing Location count as "no match" instead of poisoning the boolean mask.
en_hotspot_df = joined[joined.Location.str.contains('E542K|E545K|H1047R', na=False)]
wildtype = joined.loc[joined.Mutation == "Wildtype_Tumor"]

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows
# in the same order on every pandas version.
endo_hotspot = pd.concat([en_hotspot_df, wildtype])
endo_hotspot = endo_hotspot.drop(columns=["Mutation_Status", "Location"])

# Keep two values to compare
compare = ['Wildtype_Tumor', 'Missense_Mutation']
get = endo_hotspot['Mutation'].isin(compare)
missense_wt = endo_hotspot[get]

# Step 2 - per-site medians for each group
missense = missense_wt[missense_wt.Mutation == "Missense_Mutation"]  # all missense samples
wt = missense_wt[missense_wt.Mutation == "Wildtype_Tumor"]  # all wildtype samples
# numeric_only=True skips the string 'Mutation' column explicitly; newer pandas
# raises instead of silently dropping non-numeric columns.
missense_med = missense.median(numeric_only=True)  # median across samples, one value per site
wt_med = wt.median(numeric_only=True)
missense

Name,Mutation,AAAS_S495,AAAS_S541,AAAS_Y485,AACS_S618,AAED1_S12,AAGAB_S310,AAGAB_S311,AAK1_S14,AAK1_S18,...,ZZZ3_S397,ZZZ3_S411,ZZZ3_S420,ZZZ3_S424,ZZZ3_S426,ZZZ3_S468,ZZZ3_S89,ZZZ3_T415,ZZZ3_T418,ZZZ3_Y399
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00006,Missense_Mutation,,,,-0.881,-1.81,,,,-0.242,...,0.184,,,,-0.205,,,,,
C3L-00032,Missense_Mutation,-0.202,,,,-1.88,,,,0.382,...,,,,,,,,,,
C3L-00362,Missense_Mutation,,,,0.844,0.279,,,,-0.355,...,0.0805,,,,-0.109,,,,,
C3L-00601,Missense_Mutation,0.594,,,0.0,-0.982,-0.276,,,-0.109,...,0.134,,,0.32,0.32,,,,,
C3L-00605,Missense_Mutation,-0.162,,0.14,,,,,,,...,0.189,,,,0.386,,,,,0.596
C3L-00921,Missense_Mutation,-0.525,,,0.894,-1.4,,-0.255,0.271,0.761,...,-0.144,,,,-0.22,,,0.336,,-0.131
C3L-00947,Missense_Mutation,0.46,,,,-0.265,,,,0.608,...,0.483,,,,0.483,,,,,
C3N-00323,Missense_Mutation,0.532,,,,-0.0342,,,,-0.447,...,0.628,,0.2,-0.806,-0.806,,,,0.635,0.2
C3N-00324,Missense_Mutation,-0.791,,,,,,,,0.281,...,-0.3435,,,,-0.0994,,,,,
C3N-00383,Missense_Mutation,-0.0591,,,,-1.77,,,,-0.533,...,,,,,,,,,,


In [6]:
# Step 2 (continued) - wildtype median minus missense median for every phosphosite column
endo_d = {site: wt_med[site] - missense_med[site] for site in phos}
median_diff = pd.DataFrame.from_dict(
    endo_d,
    orient='index',
    columns=['Difference_In_Median'],
)

# Step 3 - run wrap_ttest on every column after the leading 'Mutation' label column
cols = missense_wt.columns[1:].tolist()
t_test = u.wrap_ttest(missense_wt, 'Mutation', cols, return_all = True).set_index('Comparison')

# Step 4 - median difference and p-value side by side, one row per site
endo_df = median_diff.join(t_test)
endo_df

  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  reject = pvals <= alphacBonf
  pvals_corrected[pvals_corrected>1] = 1


Unnamed: 0,Difference_In_Median,P_Value
AAAS_S495,0.257295,0.650738
AAAS_S541,-0.061000,
AAAS_Y485,-0.300000,
AACS_S618,-0.019150,0.980243
AAED1_S12,0.378000,0.168105
...,...,...
ZZZ3_S468,,
ZZZ3_S89,,
ZZZ3_T415,-0.031250,
ZZZ3_T418,-0.725175,


# BRCA

In [45]:
# Hotspot substitutions passed to the genotype lookup.
hotspots = ['E542K', 'E545K', 'H1047R']
mut = br.get_genotype_all_vars(gene, mutation_hotspot=hotspots)

# Tumor phosphoproteomics; column levels are reduced in the next cell.
phos = br.get_phosphoproteomics(tissue_type="tumor")




In [46]:
# Drop the Database_ID and Peptide levels, then flatten the remaining levels
# into single column labels.
dropped = br.reduce_multiindex(
    br.reduce_multiindex(phos, ["Database_ID", "Peptide"]),
    flatten=True,
)

# Flag every flattened label that occurs more than once (keep=False marks all copies).
fil = dropped.columns.get_level_values('Name').duplicated(keep=False)

# Duplicated sites keep all of their original levels, so they are taken from phos
# by position-matched mask; unique sites use the flattened labels.
duplicates = phos[phos.columns[fil]]
no_duplicates = dropped[dropped.columns[~fil]]

duplicates



Name,ACAP2,ACAP2,ACIN1,ACIN1,ACIN1,ACIN1,ACIN1,ACIN1,ACSS2,ACSS2,...,ZC3H14,ZC3H14,ZC3H14,ZC3H14,ZCCHC8,ZCCHC8,ZEB1,ZEB1,ZNF414,ZNF414
Site,S581,S581,S657,S657,S825,S825,T682,T682,S267,S267,...,S343,S343,S665,S665,S427,S427,S323,S323,S296,S296
Peptide,ESLPSTVsANSLYEPEGER,SNDSGIQQSSDDGRESLPSTVsANSLYEPEGER,SLsPGVSR,SLsPGVSRDSSTSYTETK,IsVVSATK,IsVVSTK,DSSTSYTETKDPSSGQEVAtPPVPQLQVCEPK,DSSTSYTETKDPSSGQEVAtPPVPQLQVCEPK,AELGMGDSTSQsPPIK,AELGMGDSTSQsPPIKR,...,TGSISSSVSVPAKPERRPsLPPSK,TGSISSSVSVPAKPERRPsLPPSK,RIPVLsPKPAVAPPAPPSSSQLCR,RIPVLsPKPVAPPAPPSSSQLCR,SSSHSSPGsPK,SSSHSSPGsPKK,TSQCSSPSLSASPGsPTRPQIR,TSQCSSPSLSASPGsPTRPQIR,SQGAGSsPR,SQGAGSsPRRPQGGSDAPSGACR
Database_ID,NP_036419.3,NP_036419.3,NP_055792.1|NP_001158286.1|NP_001158287.1,NP_055792.1|NP_001158286.1,NP_055792.1|NP_001158287.1|NP_001158288.1|NP_001158289.1,NP_001158286.1,NP_001158286.1|NP_001158287.1|NP_055792.1,NP_055792.1|NP_001158286.1|NP_001158287.1,NP_001070020.2|NP_061147.1|NP_001229322.1,NP_001070020.2|NP_061147.1|NP_001229322.1,...,NP_079100.2|NP_001153575.1|NP_001313239.1|NP_001153576.1|NP_001313236.1|NP_001313225.1|NP_001313241.1|NP_001313227.1|NP_001313231.1|NP_997543.1|NP_997544.1|NP_001313234.1|NP_001313228.1|NP_001313226.1|NP_001313244.1|NP_001313230.1|NP_001313224.1|NP_001313229.1|NP_001313243.1|NP_001313232.1|NP_001313242.1|NP_001313235.1|NP_001313237.1|NP_001313233.1|NP_001313238.1|NP_001313240.1,NP_079100.2|NP_001153575.1|NP_001313239.1|NP_001153576.1|NP_001313236.1|NP_001313225.1|NP_001313241.1|NP_001313229.1|NP_001313227.1|NP_001313231.1|NP_997543.1|NP_997544.1|NP_001313234.1|NP_001313228.1|NP_001313226.1|NP_001313244.1|NP_001313230.1|NP_001313224.1|NP_001313243.1|NP_001313232.1|NP_001313242.1|NP_001313235.1|NP_001313237.1|NP_001313233.1|NP_001313238.1|NP_001313240.1,NP_079100.2|NP_001313239.1|NP_001313236.1|NP_001313241.1|NP_001313226.1|NP_001313230.1|NP_001313224.1|NP_001313229.1|NP_001313227.1|NP_997544.1|NP_001313234.1|NP_001313242.1|NP_001313237.1|NP_001313238.1,NP_001153575.1|NP_001153576.1|NP_001313225.1|NP_001313228.1|NP_001313244.1|NP_001313231.1|NP_997543.1|NP_001313243.1|NP_001313232.1|NP_001313235.1|NP_001313233.1|NP_001313240.1|NP_997545.2,NP_060082.2,NP_060082.2,NP_001167565.1|NP_001167564.1|NP_001167566.1|NP_001310603.1|NP_001310604.1|NP_001310607.1|NP_001310567.1|NP_001167567.1|NP_110378.3|NP_001310605.1|NP_001310606.1|NP_001121600.1,NP_001167567.1|NP_110378.3|NP_001310605.1|NP_001310606.1|NP_001121600.1|NP_001167565.1|NP_001167564.1|NP_001167566.1|NP_001310603.1|NP_001310604.1|NP_001310607.1|NP_001310567.1,NP_001139647.1|NP_115746.2,NP_115746.2
CPT000814,0.4781,-0.7949,-4.1136,-3.4466,-1.6218,-2.7829,,-0.5246,-0.9789,-2.5174,...,-1.7561,,0.8268,1.1035,,,,-1.8201,,
CPT001846,0.2503,-1.0130,-1.5451,-1.5511,-0.9719,-0.7861,,-0.2830,-0.6075,-0.7740,...,-1.0021,,1.4219,-0.0416,,,-0.2226,1.0117,-2.4053,
X01BR001,,,-0.5980,-1.3631,0.2342,1.4307,0.5208,-0.9150,1.2478,-0.3328,...,-1.0004,,-0.6925,0.2617,-4.4618,-3.0199,0.1184,,-1.8036,-3.5824
X01BR008,,-0.0350,0.0348,0.6682,0.6015,1.0317,0.7745,-0.2049,-0.7256,-1.6097,...,0.3412,,0.7698,0.4015,1.7777,,,0.3046,0.4221,
X01BR009,,0.3861,0.6113,0.1563,-1.2738,-0.6842,0.3286,0.9288,0.4027,-0.7885,...,-0.9094,,0.5100,0.2122,1.0981,,,0.3135,0.7217,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X21BR001,,0.1204,1.3387,0.6292,0.4579,0.3471,0.5478,-0.8384,-0.1686,-0.9093,...,0.4977,,0.8767,-0.3365,0.2934,,,0.4302,-0.2482,
X21BR002,,0.2408,0.6930,0.2026,0.6821,1.5682,0.6328,-0.5523,-0.5814,-1.3618,...,0.7277,,-0.0363,0.1898,0.9027,,,0.8370,-0.1694,
X21BR010,1.1750,-0.1676,0.4896,1.1858,1.5798,2.4067,-0.3992,-0.5157,-0.0451,-1.5553,...,1.1934,0.4601,-0.1777,-0.7836,-2.7910,-1.6227,2.1904,2.1453,-0.9308,
X22BR005,,-0.5270,0.6536,-1.0170,0.1316,0.6809,0.2388,-0.2604,,-0.7868,...,-0.9897,0.3345,-0.0074,-0.1213,,2.2421,1.0433,,-0.8187,


In [9]:
#FOR NO DUPLICATES
# Median difference and t-test p-value for every BRCA site with a unique flattened label.

#  Step 1 - Create dataframe in order to do comparisons with wrap_ttest
mut_type = br.get_genotype_all_vars(gene)
joined = mut_type.join(no_duplicates)

# Select samples whose mutation Location mentions one of the hotspots.
# na=False makes a missing Location count as "no match".
br_hotspot_df = joined[joined.Location.str.contains('E542K|E545K|H1047R', na=False)]
wildtype = joined.loc[joined.Mutation == "Wildtype_Tumor"]

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows
# in the same order on every pandas version.
braca_hotspot = pd.concat([br_hotspot_df, wildtype])
braca_hotspot = braca_hotspot.drop(columns=["Mutation_Status", "Location"])

# Keep two values to compare
compare = ['Wildtype_Tumor', 'Missense_Mutation']
get = braca_hotspot['Mutation'].isin(compare)
missense_wt = braca_hotspot[get]

# Step 2 - get the difference in medians
missense = missense_wt[missense_wt.Mutation == "Missense_Mutation"]  # all missense samples
wt = missense_wt[missense_wt.Mutation == "Wildtype_Tumor"]  # all wildtype samples
# numeric_only=True skips the string 'Mutation' column explicitly; newer pandas
# raises instead of silently dropping non-numeric columns.
missense_med = missense.median(numeric_only=True)  # median across samples, one value per site
wt_med = wt.median(numeric_only=True)

# Wildtype median minus missense median for every unique site
no_dup_d = {prot: wt_med[prot] - missense_med[prot] for prot in no_duplicates}
median_diff = pd.DataFrame.from_dict(no_dup_d, orient='index', columns=['Difference_In_Median'])

# Step 3 - run wrap_ttest on every column after the leading 'Mutation' label column
cols = list(missense_wt.columns[1:])
t_test = u.wrap_ttest(missense_wt, 'Mutation', cols, return_all = True)
t_test = t_test.set_index('Comparison')

# Step 4 - join median and p value together
no_dup_df = median_diff.join(t_test)
no_dup_df

  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  reject = pvals <= alphacBonf
  pvals_corrected[pvals_corrected>1] = 1


Unnamed: 0,Difference_In_Median,P_Value
A2M_S710,0.27610,0.747631
AAAS_S495,-0.22110,0.119525
AAAS_S541,0.35720,0.638884
AAED1_S12,-0.33260,0.159187
AAGAB_S310S311,0.44315,0.121689
...,...,...
ZZZ3_S397,-0.13435,0.761187
ZZZ3_S397T428N429,-0.39230,0.071512
ZZZ3_S606,0.18190,0.376978
ZZZ3_S82,0.01380,0.459838


In [10]:
#DEALING WITH DUPLICATES
# Some sites are reported under more than one peptide / database ID. Every version is
# tested, the version with the smallest p-value is kept per site, and those rows are
# added to the unique-site table to build brca_df.
# Expects `duplicates` and `no_dup_df` to be the BRCA objects from the cells above
# (the colon section reuses the same variable names).

# Flatten into a new name so re-running this cell never flattens an already-flat frame.
duplicates_flat = br.reduce_multiindex(duplicates, flatten=True)

#  Step 1 - Create dataframe in order to do comparisons with wrap_ttest
mut_type = br.get_genotype_all_vars(gene)
joined = mut_type.join(duplicates_flat)

# Select samples whose mutation Location mentions one of the hotspots.
# na=False makes a missing Location count as "no match".
br_hotspot_df = joined[joined.Location.str.contains('E542K|E545K|H1047R', na=False)]
wildtype = joined.loc[joined.Mutation == "Wildtype_Tumor"]

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows in the same order.
brace_hotspot = pd.concat([br_hotspot_df, wildtype])
brace_hotspot = brace_hotspot.drop(columns=["Mutation_Status", "Location"])

# Keep two values to compare
compare = ['Wildtype_Tumor', 'Missense_Mutation']
get = brace_hotspot['Mutation'].isin(compare)
missense_wt = brace_hotspot[get]

# Step 2 - get the difference in medians
missense = missense_wt[missense_wt.Mutation == "Missense_Mutation"]  # all missense samples
wt = missense_wt[missense_wt.Mutation == "Wildtype_Tumor"]  # all wildtype samples
# numeric_only=True skips the string 'Mutation' column explicitly.
missense_med = missense.median(numeric_only=True)
wt_med = wt.median(numeric_only=True)

duplicates_d = {prot: wt_med[prot] - missense_med[prot] for prot in duplicates_flat}
median_diff = pd.DataFrame.from_dict(duplicates_d, orient='index', columns=['Difference_In_Median'])

# Step 3 - run wrap_ttest on every column after the leading 'Mutation' label column
cols = list(missense_wt.columns[1:])
t_test = u.wrap_ttest(missense_wt, 'Mutation', cols, return_all = True)
t_test = t_test.set_index('Comparison')

# Step 4 - join median and p value together
dup_df = median_diff.join(t_test)

# Step 5 - for each site keep the version with the smallest p-value.
# The site key is the first two '_'-separated fields (Name_Site) of the flattened label.
# Plain string splitting is used instead of str.contains because the labels contain
# regex metacharacters ('|' and '.') in their database IDs.
dup_sites = ['_'.join(label.split('_')[:2]) for label in dup_df.index]
best_dups = (
    dup_df.assign(new_index=dup_sites)
    .dropna(subset=['P_Value'])          # versions without a p-value cannot be ranked
    .sort_values('P_Value')
    .drop_duplicates('new_index', keep='first')
    .set_index('new_index')
)

# Step 6 - add the selected rows to the unique-site results.
# Previously each loop iteration overwrote new_no_dup_df and the result was never used,
# so brca_df silently contained no duplicated sites at all.
# Selecting the two result columns also ignores a stray 'new_index' column left on
# no_dup_df by an earlier run of the old version of this cell.
unique_sites = no_dup_df[['Difference_In_Median', 'P_Value']].copy()
unique_sites.index = pd.Index(
    ['_'.join(label.split('_')[:2]) for label in unique_sites.index],
    name='new_index',
)
new_no_dup_df = pd.concat([unique_sites, best_dups]).sort_index()

# Table indexed by Name_Site so it can be joined with the Endo and Colon tables.
brca_df = new_no_dup_df
brca_df




Name,Mutation,Location,Mutation_Status,ACAP2_S581_ESLPSTVsANSLYEPEGER_NP_036419.3,ACAP2_S581_SNDSGIQQSSDDGRESLPSTVsANSLYEPEGER_NP_036419.3,ACIN1_S657_SLsPGVSR_NP_055792.1|NP_001158286.1|NP_001158287.1,ACIN1_S657_SLsPGVSRDSSTSYTETK_NP_055792.1|NP_001158286.1,ACIN1_S825_IsVVSATK_NP_055792.1|NP_001158287.1|NP_001158288.1|NP_001158289.1,ACIN1_S825_IsVVSTK_NP_001158286.1,ACIN1_T682_DSSTSYTETKDPSSGQEVAtPPVPQLQVCEPK_NP_001158286.1|NP_001158287.1|NP_055792.1,...,ZC3H14_S343_TGSISSSVSVPAKPERRPsLPPSK_NP_079100.2|NP_001153575.1|NP_001313239.1|NP_001153576.1|NP_001313236.1|NP_001313225.1|NP_001313241.1|NP_001313227.1|NP_001313231.1|NP_997543.1|NP_997544.1|NP_001313234.1|NP_001313228.1|NP_001313226.1|NP_001313244.1|NP_001313230.1|NP_001313224.1|NP_001313229.1|NP_001313243.1|NP_001313232.1|NP_001313242.1|NP_001313235.1|NP_001313237.1|NP_001313233.1|NP_001313238.1|NP_001313240.1,ZC3H14_S343_TGSISSSVSVPAKPERRPsLPPSK_NP_079100.2|NP_001153575.1|NP_001313239.1|NP_001153576.1|NP_001313236.1|NP_001313225.1|NP_001313241.1|NP_001313229.1|NP_001313227.1|NP_001313231.1|NP_997543.1|NP_997544.1|NP_001313234.1|NP_001313228.1|NP_001313226.1|NP_001313244.1|NP_001313230.1|NP_001313224.1|NP_001313243.1|NP_001313232.1|NP_001313242.1|NP_001313235.1|NP_001313237.1|NP_001313233.1|NP_001313238.1|NP_001313240.1,ZC3H14_S665_RIPVLsPKPAVAPPAPPSSSQLCR_NP_079100.2|NP_001313239.1|NP_001313236.1|NP_001313241.1|NP_001313226.1|NP_001313230.1|NP_001313224.1|NP_001313229.1|NP_001313227.1|NP_997544.1|NP_001313234.1|NP_001313242.1|NP_001313237.1|NP_001313238.1,ZC3H14_S665_RIPVLsPKPVAPPAPPSSSQLCR_NP_001153575.1|NP_001153576.1|NP_001313225.1|NP_001313228.1|NP_001313244.1|NP_001313231.1|NP_997543.1|NP_001313243.1|NP_001313232.1|NP_001313235.1|NP_001313233.1|NP_001313240.1|NP_997545.2,ZCCHC8_S427_SSSHSSPGsPK_NP_060082.2,ZCCHC8_S427_SSSHSSPGsPKK_NP_060082.2,ZEB1_S323_TSQCSSPSLSASPGsPTRPQIR_NP_001167565.1|NP_001167564.1|NP_001167566.1|NP_001310603.1|NP_001310604.1|NP_001310607.1|NP_001310567.1|NP_001167567.1|NP_110378.3|N
P_001310605.1|NP_001310606.1|NP_001121600.1,ZEB1_S323_TSQCSSPSLSASPGsPTRPQIR_NP_001167567.1|NP_110378.3|NP_001310605.1|NP_001310606.1|NP_001121600.1|NP_001167565.1|NP_001167564.1|NP_001167566.1|NP_001310603.1|NP_001310604.1|NP_001310607.1|NP_001310567.1,ZNF414_S296_SQGAGSsPR_NP_001139647.1|NP_115746.2,ZNF414_S296_SQGAGSsPRRPQGGSDAPSGACR_NP_115746.2
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CPT000814,Amplification,Amplification,Single_mutation,0.4781,-0.7949,-4.1136,-3.4466,-1.6218,-2.7829,,...,-1.7561,,0.8268,1.1035,,,,-1.8201,,
CPT001846,Missense_Mutation,p.E542K,Multiple_mutation,0.2503,-1.0130,-1.5451,-1.5511,-0.9719,-0.7861,,...,-1.0021,,1.4219,-0.0416,,,-0.2226,1.0117,-2.4053,
X01BR001,Wildtype_Tumor,No_mutation,Wildtype_Tumor,,,-0.5980,-1.3631,0.2342,1.4307,0.5208,...,-1.0004,,-0.6925,0.2617,-4.4618,-3.0199,0.1184,,-1.8036,-3.5824
X01BR008,Wildtype_Tumor,No_mutation,Wildtype_Tumor,,-0.0350,0.0348,0.6682,0.6015,1.0317,0.7745,...,0.3412,,0.7698,0.4015,1.7777,,,0.3046,0.4221,
X01BR009,Amplification,Amplification,Single_mutation,,0.3861,0.6113,0.1563,-1.2738,-0.6842,0.3286,...,-0.9094,,0.5100,0.2122,1.0981,,,0.3135,0.7217,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X21BR001,Amplification,Amplification,Single_mutation,,0.1204,1.3387,0.6292,0.4579,0.3471,0.5478,...,0.4977,,0.8767,-0.3365,0.2934,,,0.4302,-0.2482,
X21BR002,Missense_Mutation,p.H1047R,Multiple_mutation,,0.2408,0.6930,0.2026,0.6821,1.5682,0.6328,...,0.7277,,-0.0363,0.1898,0.9027,,,0.8370,-0.1694,
X21BR010,Missense_Mutation,p.H1047R,Multiple_mutation,1.1750,-0.1676,0.4896,1.1858,1.5798,2.4067,-0.3992,...,1.1934,0.4601,-0.1777,-0.7836,-2.7910,-1.6227,2.1904,2.1453,-0.9308,
X22BR005,Missense_Mutation,p.E365K,Single_mutation,,-0.5270,0.6536,-1.0170,0.1316,0.6809,0.2388,...,-0.9897,0.3345,-0.0074,-0.1213,,2.2421,1.0433,,-0.8187,


# Colon

In [49]:
# Hotspot substitutions passed to the genotype lookup.
hotspots = ['E542K', 'E545K', 'H1047R']
mut = cl.get_genotype_all_vars(gene, mutation_hotspot=hotspots)

# Tumor phosphoproteomics; the column levels are left as-is here and reduced in the next cell.
phos = cl.get_phosphoproteomics(tissue_type="tumor")
phos



Name,AAAS,AAAS,AAAS,AAED1,AAGAB,AAGAB,AAK1,AAK1,AAK1,AAK1,...,ZZEF1,ZZEF1,ZZEF1,ZZEF1,ZZEF1,ZZZ3,ZZZ3,ZZZ3,ZZZ3,ZZZ3
Site,S495,S525,S541,S12,S310,S311,S20,S21,S26,S618,...,S1501,S1518,S1537,S1540,T1521,S113,S391,S606,S90,S91
Database_ID,Q9NRG9,Q9NRG9,Q9NRG9,Q7RTV5,Q6PD74,Q6PD74,Q2M2I8,Q2M2I8,Q2M2I8,Q2M2I8,...,O43149,O43149,O43149,O43149,O43149,Q8IYH5,Q8IYH5,Q8IYH5,Q8IYH5,Q8IYH5
01CO005,,,-0.240,-0.460,,,-0.231,,,,...,,-0.6750,-1.404,-1.404,,-0.572,,0.205,,
01CO006,-0.3650,,,-0.424,-0.015,-0.015,,-0.485,,,...,,-0.2875,0.222,0.222,-0.701,0.624,,,,
01CO008,0.7250,0.137,0.137,,,,,,,,...,-0.147,-0.1470,,,,,,,,-0.030
01CO013,0.2265,,,-1.278,0.403,0.075,-0.223,-0.701,,,...,-0.041,0.0430,0.554,0.554,0.127,1.263,,,,
01CO014,0.5600,,,-0.382,,,,-0.259,,,...,,-0.1290,-0.919,-0.919,,0.032,,0.026,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21CO006,0.0550,,,0.515,,,,,,,...,,0.1970,-0.339,-0.339,0.136,,,,,
21CO007,,,,0.099,,,,-0.567,,,...,0.070,0.0410,-0.077,-0.077,,0.167,,0.235,0.547,
22CO004,0.6220,,,-0.308,,,,-0.137,,,...,,-0.2215,0.193,0.193,-0.347,,,0.418,,
22CO006,0.6460,0.007,0.007,,,,,,,,...,0.026,0.0260,,,,,,,,-0.226


In [60]:
# Drop the Database_ID level, then flatten the remaining levels into single column labels.
dropped = cl.reduce_multiindex(
    cl.reduce_multiindex(phos, 'Database_ID'),
    flatten=True,
)

# Flag every flattened label that occurs more than once (keep=False marks all copies).
fil = dropped.columns.get_level_values('Name').duplicated(keep=False)

# For duplicates we want to keep all levels, so grab them from phos instead of dropped;
# unique sites use the flattened labels.
duplicates = phos[phos.columns[fil]]
no_duplicates = dropped[dropped.columns[~fil]]

duplicates



Name,ACIN1,ACIN1,ADD1,ADD1,ARHGAP17,ARHGAP17,ARHGEF12,ARHGEF12,ARL6IP4,ARL6IP4,...,THEMIS,THEMIS,TJP1,TJP1,TMPO,TMPO,TPD52L2,TPD52L2,ZC3H14,ZC3H14
Site,S825,S825,S600,S600,S497,S497,S41,S41,S231,S231,...,S584,S584,S912,S912,S184,S184,S96,S96,S665,S665
Database_ID,Q9UKV3,Q9UKV3-5,P35611,P35611-2,Q68EM7,Q68EM7-2,Q9NZN5,Q9NZN5-2,Q66PJ3,Q66PJ3-2,...,Q8N1K5,Q8N1K5-4,Q07157,Q07157-2,P42166,P42167,O43399,O43399-2,Q6PJT7,Q6PJT7-2
01CO005,-1.5870,-0.880,,,,,,,,,...,-1.407,,-1.897,-1.878,-0.412,-0.1240,,,-1.258,
01CO006,0.4880,0.516,,,-0.597,0.311,,0.048,0.324,,...,,0.002,-1.079,-0.549,-1.244,-0.0870,-0.748,,,
01CO008,0.1075,-0.549,,,,,,-0.527,,,...,,,0.225,0.116,0.466,0.8870,,,,
01CO013,0.7230,0.370,,,,,,-0.174,,,...,0.457,-0.618,-0.566,-0.665,-1.290,-0.7400,-0.627,,,
01CO014,-1.5430,-0.444,,,,-0.467,,,,,...,,,-0.181,-0.899,-0.119,-0.4815,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21CO006,0.0870,-0.026,,,,,,0.195,,,...,,,-0.259,0.326,0.135,-0.4340,,,,
21CO007,-0.0170,,,,,,,,-0.625,0.216,...,,,0.203,-0.510,0.188,0.0855,,,0.486,
22CO004,0.6310,0.523,,,,,,0.264,,-0.297,...,,,0.399,-0.839,0.512,-0.0350,,,,
22CO006,-0.0065,-0.114,,,,,,0.321,,,...,,,-0.596,0.820,-0.044,-0.0200,,,,


In [59]:
#FOR NO DUPLICATES
# Median difference and t-test p-value for every colon site with a unique flattened label.

#  Step 1 - Create dataframe in order to do comparisons with wrap_ttest
mut_type = cl.get_genotype_all_vars(gene)
joined = mut_type.join(no_duplicates)

# Select samples whose mutation Location mentions one of the hotspots.
# na=False makes a missing Location count as "no match".
cl_hotspot_df = joined[joined.Location.str.contains('E542K|E545K|H1047R', na=False)]
wildtype = joined.loc[joined.Mutation == "Wildtype_Tumor"]

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows
# in the same order on every pandas version.
colon_hotspot = pd.concat([cl_hotspot_df, wildtype])
colon_hotspot = colon_hotspot.drop(columns=["Mutation_Status", "Location"])

# Keep two values to compare (this cohort labels the mutation class 'nonsynonymous SNV')
compare = ['Wildtype_Tumor', 'nonsynonymous SNV']
get = colon_hotspot['Mutation'].isin(compare)
missense_wt = colon_hotspot[get]

# Step 2 - get the difference in medians
missense = missense_wt[missense_wt.Mutation == "nonsynonymous SNV"]  # all mutated samples
wt = missense_wt[missense_wt.Mutation == "Wildtype_Tumor"]  # all wildtype samples
# numeric_only=True skips the string 'Mutation' column explicitly; newer pandas
# raises instead of silently dropping non-numeric columns.
missense_med = missense.median(numeric_only=True)  # median across samples, one value per site
wt_med = wt.median(numeric_only=True)

# Wildtype median minus mutated median for every unique site
no_dup_d = {prot: wt_med[prot] - missense_med[prot] for prot in no_duplicates}
median_diff = pd.DataFrame.from_dict(no_dup_d, orient='index', columns=['Difference_In_Median'])

# Step 3 - run wrap_ttest on every column after the leading 'Mutation' label column
cols = list(missense_wt.columns[1:])
t_test = u.wrap_ttest(missense_wt, 'Mutation', cols, return_all = True)
t_test = t_test.set_index('Comparison')

# Step 4 - join median and p value together
no_dup_df = median_diff.join(t_test)
no_dup_df

  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  reject = pvals <= alphacBonf
  pvals_corrected[pvals_corrected>1] = 1


Unnamed: 0,Difference_In_Median,P_Value
AAAS_S495,0.12500,0.356935
AAAS_S525,0.53400,
AAAS_S541,0.41475,0.159858
AAED1_S12,-0.07200,0.945226
AAGAB_S310,,
...,...,...
ZZZ3_S113,-0.14900,0.733776
ZZZ3_S391,-0.04600,
ZZZ3_S606,-0.11600,0.458431
ZZZ3_S90,,


In [62]:
#DEALING WITH DUPLICATES
# Some sites are reported under more than one database ID. Every version is tested,
# the version with the smallest p-value is kept per site, and those rows are added
# to the unique-site table to build colon_df.
# Expects `duplicates` and `no_dup_df` to be the colon objects from the cells above
# (the BRCA section reuses the same variable names).

# Flatten into a new name so re-running this cell never flattens an already-flat frame.
duplicates_flat = cl.reduce_multiindex(duplicates, flatten=True)

#  Step 1 - Create dataframe in order to do comparisons with wrap_ttest
mut_type = cl.get_genotype_all_vars(gene)
joined = mut_type.join(duplicates_flat)

# Select samples whose mutation Location mentions one of the hotspots.
# na=False makes a missing Location count as "no match".
cl_hotspot_df = joined[joined.Location.str.contains('E542K|E545K|H1047R', na=False)]
wildtype = joined.loc[joined.Mutation == "Wildtype_Tumor"]

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows in the same order.
colon_hotspot = pd.concat([cl_hotspot_df, wildtype])
colon_hotspot = colon_hotspot.drop(columns=["Mutation_Status", "Location"])

# Keep two values to compare (this cohort labels the mutation class 'nonsynonymous SNV')
compare = ['Wildtype_Tumor', 'nonsynonymous SNV']
get = colon_hotspot['Mutation'].isin(compare)
missense_wt = colon_hotspot[get]

# Step 2 - get the difference in medians
missense = missense_wt[missense_wt.Mutation == "nonsynonymous SNV"]  # all mutated samples
wt = missense_wt[missense_wt.Mutation == "Wildtype_Tumor"]  # all wildtype samples
# numeric_only=True skips the string 'Mutation' column explicitly.
missense_med = missense.median(numeric_only=True)
wt_med = wt.median(numeric_only=True)

duplicates_d = {prot: wt_med[prot] - missense_med[prot] for prot in duplicates_flat}
median_diff = pd.DataFrame.from_dict(duplicates_d, orient='index', columns=['Difference_In_Median'])

# Step 3 - run wrap_ttest on every column after the leading 'Mutation' label column
cols = list(missense_wt.columns[1:])
t_test = u.wrap_ttest(missense_wt, 'Mutation', cols, return_all = True)
t_test = t_test.set_index('Comparison')

# Step 4 - join median and p value together
dup_df = median_diff.join(t_test)

# Step 5 - for each site keep the version with the smallest p-value.
# The site key is the first two '_'-separated fields (Name_Site) of the flattened label.
# Plain string splitting is used instead of str.contains so that a label such as
# X_S1_Q1 cannot also match X_S1_Q1-2 and no part of a label is read as a regex.
dup_sites = ['_'.join(label.split('_')[:2]) for label in dup_df.index]
best_dups = (
    dup_df.assign(new_index=dup_sites)
    .dropna(subset=['P_Value'])          # versions without a p-value cannot be ranked
    .sort_values('P_Value')
    .drop_duplicates('new_index', keep='first')
    .set_index('new_index')
)

# Step 6 - add the selected rows to the unique-site results.
# Previously each loop iteration overwrote new_no_dup_df and the result was never used,
# so colon_df silently contained no duplicated sites at all.
# Selecting the two result columns also ignores a stray 'new_index' column left on
# no_dup_df by an earlier run of the old version of this cell.
unique_sites = no_dup_df[['Difference_In_Median', 'P_Value']].copy()
unique_sites.index = pd.Index(
    ['_'.join(label.split('_')[:2]) for label in unique_sites.index],
    name='new_index',
)
new_no_dup_df = pd.concat([unique_sites, best_dups]).sort_index()

# Table indexed by Name_Site so it can be joined with the Endo and BRCA tables.
colon_df = new_no_dup_df
colon_df



  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  reject = pvals <= alphacBonf
  pvals_corrected[pvals_corrected>1] = 1


Unnamed: 0_level_0,Difference_In_Median,P_Value
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1
AAAS_S495,0.12500,0.356935
AAAS_S525,0.53400,
AAAS_S541,0.41475,0.159858
AAED1_S12,-0.07200,0.945226
AAGAB_S310,,
...,...,...
ZZZ3_S113,-0.14900,0.733776
ZZZ3_S391,-0.04600,
ZZZ3_S606,-0.11600,0.458431
ZZZ3_S90,,


# Join the Endometrial, BRCA, and Colon results together


In [63]:
# Clear the index name that set_index("new_index") left on brca_df, so the
# joined tables below carry an unnamed site index.
brca_df=brca_df.rename_axis(None)

In [64]:
# Index-aligned join that keeps every BRCA row (DataFrame.join defaults to a left join);
# column names shared by both tables get a cohort suffix.
cohort_suffixes = {'lsuffix': '_Brca', 'rsuffix': '_Endo'}
brca_endo = brca_df.join(endo_df, **cohort_suffixes)
brca_endo

Unnamed: 0,Difference_In_Median_Brca,P_Value_Brca,Difference_In_Median_Endo,P_Value_Endo
A2M_S710,0.27610,0.747631,,
AAAS_S495,-0.22110,0.119525,0.257295,0.650738
AAAS_S541,0.35720,0.638884,-0.061000,
AAED1_S12,-0.33260,0.159187,0.378000,0.168105
AAGAB_S310S311,0.44315,0.121689,,
...,...,...,...,...
ZZZ3_S397,-0.13435,0.761187,0.062100,0.529028
ZZZ3_S397T428N429,-0.39230,0.071512,,
ZZZ3_S606,0.18190,0.376978,,
ZZZ3_S82,0.01380,0.459838,,


In [65]:
# Index-aligned join of the colon results onto the BRCA/endometrial table
# (DataFrame.join defaults to a left join, so only rows already present are kept).
# NOTE(review): rsuffix is applied only to column names present in both tables. The left
# columns already end in _Brca/_Endo, so nothing overlaps and the colon columns come
# through as plain 'Difference_In_Median' and 'P_Value' -- the significance filter below
# relies on that plain 'P_Value' name.
final_table = brca_endo.join(colon_df, rsuffix='_colon')
final_table

Unnamed: 0,Difference_In_Median_Brca,P_Value_Brca,Difference_In_Median_Endo,P_Value_Endo,Difference_In_Median,P_Value
A2M_S710,0.27610,0.747631,,,,
AAAS_S495,-0.22110,0.119525,0.257295,0.650738,0.12500,0.356935
AAAS_S541,0.35720,0.638884,-0.061000,,0.41475,0.159858
AAED1_S12,-0.33260,0.159187,0.378000,0.168105,-0.07200,0.945226
AAGAB_S310S311,0.44315,0.121689,,,,
...,...,...,...,...,...,...
ZZZ3_S397,-0.13435,0.761187,0.062100,0.529028,,
ZZZ3_S397T428N429,-0.39230,0.071512,,,,
ZZZ3_S606,0.18190,0.376978,,,-0.11600,0.458431
ZZZ3_S82,0.01380,0.459838,,,,


In [66]:
# Keep every site whose p-value is at or below 0.05 in at least one cohort.
# Missing p-values compare as False, so a site needs at least one real hit.
pval_cols = ['P_Value_Brca', 'P_Value_Endo', 'P_Value']
is_significant = final_table[pval_cols].le(.05).any(axis=1)
significant = final_table.loc[is_significant]
significant

Unnamed: 0,Difference_In_Median_Brca,P_Value_Brca,Difference_In_Median_Endo,P_Value_Endo,Difference_In_Median,P_Value
AAGAB_S311,0.30040,0.704996,0.457000,0.569183,-0.6655,0.014602
AAK1_T606,0.24840,0.512914,0.570100,0.002949,-0.0205,0.814122
AAK1_T674,-0.44810,0.480089,0.573000,0.002438,-0.2455,0.950929
AATF_S153,0.69170,0.025235,-0.085200,0.177239,0.2540,0.230845
AATF_S316S320S321,0.30915,0.038469,,,,
...,...,...,...,...,...,...
ZRANB2_S83,0.00595,0.797180,-0.198100,0.033825,,
ZRANB2_T90,0.16985,0.643431,-3.050000,0.029035,,
ZRSR2_S384,-0.19190,0.975425,-0.213977,0.006471,0.2300,0.676926
ZYX_S290,-0.14015,0.473074,0.295000,0.032401,,


In [68]:
# Write the filtered table to a CSV file in the current working directory.
out_path = "phosphoproteomics_trans.csv"
significant.to_csv(out_path)