In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

  import pandas.util.testing as tm


# Load cancer data

In [2]:
en = cptac.Endometrial()
br = cptac.Brca()
cl = cptac.Colon()

                                                

In [5]:
mut = en.get_genotype_all_vars("PIK3CA")
mut



Name,Mutation,Location,Mutation_Status
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C3L-00006,Missense_Mutation,p.E545K,Single_mutation
C3L-00008,Wildtype_Tumor,No_mutation,Wildtype_Tumor
C3L-00032,Missense_Mutation,p.E545K,Single_mutation
C3L-00090,Wildtype_Tumor,No_mutation,Wildtype_Tumor
C3L-00098,Amplification,Amplification,Single_mutation
...,...,...,...
C3N-01520,Missense_Mutation,p.E726K,Single_mutation
C3N-01521,Amplification,Amplification,Single_mutation
C3N-01537,Wildtype_Tumor,No_mutation,Wildtype_Tumor
C3N-01802,Wildtype_Tumor,No_mutation,Wildtype_Tumor


In [9]:
len(mut.loc[mut["Mutation"]=="Missense_Mutation"])
len(mut.index)

95

In [10]:
prot = en.get_proteomics(tissue_type="tumor")
joined = mut.join(prot)
joined.head()

Name,Mutation,Location,Mutation_Status,A1BG,A2M,A2ML1,A4GALT,AAAS,AACS,AADAT,...,ZSWIM8,ZSWIM9,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00006,Missense_Mutation,p.E545K,Single_mutation,-1.18,-0.863,-0.802,0.222,0.256,0.665,1.28,...,-0.0877,,0.0229,0.109,,-0.332,-0.433,-1.02,-0.123,-0.0859
C3L-00008,Wildtype_Tumor,No_mutation,Wildtype_Tumor,-0.685,-1.07,-0.684,0.984,0.135,0.334,1.3,...,-0.0356,,0.363,1.07,0.737,-0.564,-0.00461,-1.13,-0.0757,-0.473
C3L-00032,Missense_Mutation,p.E545K,Single_mutation,-0.528,-1.32,0.435,,-0.24,1.04,-0.0213,...,0.00112,-0.145,0.0105,-0.116,,0.151,-0.074,-0.54,0.32,-0.419
C3L-00090,Wildtype_Tumor,No_mutation,Wildtype_Tumor,-1.67,-1.19,-0.443,0.243,-0.0993,0.757,0.74,...,0.0725,-0.0552,-0.0714,0.0933,0.156,-0.398,-0.0752,-0.797,-0.0301,-0.467
C3L-00098,Amplification,Amplification,Single_mutation,-0.374,-0.0206,-0.537,0.311,0.375,0.0131,-1.1,...,-0.176,,-1.22,-0.562,0.937,-0.646,0.207,-1.85,-0.176,0.0513


In [14]:
missense_mutations = joined.loc[joined["Mutation"] == "Missense_Mutation"]
wildtype  = joined.loc[joined.Mutation == "Wildtype_Tumor"]
endo_missense = missense_mutations.append(wildtype)
endo_missense

Name,Mutation,Location,Mutation_Status,A1BG,A2M,A2ML1,A4GALT,AAAS,AACS,AADAT,...,ZSWIM8,ZSWIM9,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00006,Missense_Mutation,p.E545K,Single_mutation,-1.180,-0.8630,-0.8020,0.222,0.2560,0.6650,1.2800,...,-0.08770,,0.0229,0.109,,-0.332,-0.4330,-1.020,-0.12300,-0.0859
C3L-00032,Missense_Mutation,p.E545K,Single_mutation,-0.528,-1.3200,0.4350,,-0.2400,1.0400,-0.0213,...,0.00112,-0.1450,0.0105,-0.116,,0.151,-0.0740,-0.540,0.32000,-0.4190
C3L-00139,Missense_Mutation,p.Q546P,Multiple_mutation,-0.467,0.3700,-0.3390,,0.4340,0.0358,-0.1750,...,-0.67500,0.2390,0.1400,1.070,0.6070,0.486,0.1690,-0.632,-0.20300,-0.0685
C3L-00143,Missense_Mutation,p.R115L,Single_mutation,-1.120,-1.3100,0.9120,0.418,-0.0768,0.8460,-0.1210,...,-0.00212,,-1.1900,-1.270,-1.2700,-0.222,-0.3200,-0.620,0.36300,-0.4630
C3L-00156,Missense_Mutation,p.L628R,Multiple_mutation,-0.232,-0.0910,0.6180,1.630,-0.2500,1.3300,-1.7200,...,0.06520,-0.2360,0.5010,0.352,0.0839,-0.428,-0.2120,-1.340,0.31700,-0.9430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01211,Wildtype_Tumor,No_mutation,Wildtype_Tumor,-0.636,-0.3710,0.9730,,0.6370,-0.1660,-0.1220,...,-0.41600,-0.7510,0.1790,0.658,0.4700,0.270,-0.1480,-0.139,-0.02990,0.1310
C3N-01217,Wildtype_Tumor,No_mutation,Wildtype_Tumor,-0.743,-1.3700,,,-0.1930,0.6940,0.3380,...,0.18100,,0.1960,0.274,,-0.501,-0.0468,0.653,-0.04640,-0.1600
C3N-01219,Wildtype_Tumor,No_mutation,Wildtype_Tumor,-0.295,-0.0589,0.4900,-0.109,-0.0495,0.0374,0.1020,...,-0.29000,0.0457,0.2460,0.589,0.3930,-0.199,-0.1590,-1.090,0.06270,-0.3790
C3N-01537,Wildtype_Tumor,No_mutation,Wildtype_Tumor,-0.290,-0.3200,3.1700,-0.907,0.0317,-0.0425,,...,-0.17900,-0.5210,0.0523,0.515,0.5850,0.138,-0.1900,-0.966,-0.00627,-0.2490


In [15]:
prot_and_mutations = endo_missense.drop(columns = ["Mutation_Status", "Location"])
prot_and_mutations.head()

Name,Mutation,A1BG,A2M,A2ML1,A4GALT,AAAS,AACS,AADAT,AAED1,AAGAB,...,ZSWIM8,ZSWIM9,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00006,Missense_Mutation,-1.18,-0.863,-0.802,0.222,0.256,0.665,1.28,-0.339,0.412,...,-0.0877,,0.0229,0.109,,-0.332,-0.433,-1.02,-0.123,-0.0859
C3L-00032,Missense_Mutation,-0.528,-1.32,0.435,,-0.24,1.04,-0.0213,-0.0479,0.419,...,0.00112,-0.145,0.0105,-0.116,,0.151,-0.074,-0.54,0.32,-0.419
C3L-00139,Missense_Mutation,-0.467,0.37,-0.339,,0.434,0.0358,-0.175,0.181,0.116,...,-0.675,0.239,0.14,1.07,0.607,0.486,0.169,-0.632,-0.203,-0.0685
C3L-00143,Missense_Mutation,-1.12,-1.31,0.912,0.418,-0.0768,0.846,-0.121,,-0.311,...,-0.00212,,-1.19,-1.27,-1.27,-0.222,-0.32,-0.62,0.363,-0.463
C3L-00156,Missense_Mutation,-0.232,-0.091,0.618,1.63,-0.25,1.33,-1.72,-0.889,0.633,...,0.0652,-0.236,0.501,0.352,0.0839,-0.428,-0.212,-1.34,0.317,-0.943


# Endometrial

## Filter out hotspot mutations
Hotspots are:
E542K
E545K
H1047R


In [28]:
#### Get the mutation type, and ptorteomics for PIK3CA

en.get_genotype_all_vars(gene)

len(mut.loc[mut["Mutation"]=="Missense_Mutation"])



1

In [22]:
prot = en.get_proteomics(tissue_type="tumor")



#### Join mutation type and proteomics together

In [23]:
joined = mut.join(prot)
joined.head()

Name,Mutation,Location,Mutation_Status,A1BG,A2M,A2ML1,A4GALT,AAAS,AACS,AADAT,...,ZSWIM8,ZSWIM9,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00006,Wildtype_Tumor,No_mutation,Wildtype_Tumor,-1.18,-0.863,-0.802,0.222,0.256,0.665,1.28,...,-0.0877,,0.0229,0.109,,-0.332,-0.433,-1.02,-0.123,-0.0859
C3L-00008,Wildtype_Tumor,No_mutation,Wildtype_Tumor,-0.685,-1.07,-0.684,0.984,0.135,0.334,1.3,...,-0.0356,,0.363,1.07,0.737,-0.564,-0.00461,-1.13,-0.0757,-0.473
C3L-00032,Wildtype_Tumor,No_mutation,Wildtype_Tumor,-0.528,-1.32,0.435,,-0.24,1.04,-0.0213,...,0.00112,-0.145,0.0105,-0.116,,0.151,-0.074,-0.54,0.32,-0.419
C3L-00090,Wildtype_Tumor,No_mutation,Wildtype_Tumor,-1.67,-1.19,-0.443,0.243,-0.0993,0.757,0.74,...,0.0725,-0.0552,-0.0714,0.0933,0.156,-0.398,-0.0752,-0.797,-0.0301,-0.467
C3L-00098,Wildtype_Tumor,No_mutation,Wildtype_Tumor,-0.374,-0.0206,-0.537,0.311,0.375,0.0131,-1.1,...,-0.176,,-1.22,-0.562,0.937,-0.646,0.207,-1.85,-0.176,0.0513


#### Select samples containing missense mutations

In [24]:
# en_hotspot_df = joined[joined.Location.str.contains('E542K') | 
#                     joined.Location.str.contains('E545K') |
#                     joined.Location.str.contains('H1047R')]
missense_mutations = joined.loc[joined["Mutation"] == "Missense_Mutation"]
wildtype  = joined.loc[joined.Mutation == "Wildtype_Tumor"]

#### Join the proteomics for the wildtype mutations to the hotspot mutations

In [25]:
c# wildtype  = joined.loc[joined.Mutation == "Wildtype_Tumor"]

# endo_hotspot = en_hotspot_df.append(wildtype)
endo_missense

Name,Mutation,Location,Mutation_Status,A1BG,A2M,A2ML1,A4GALT,AAAS,AACS,AADAT,...,ZSWIM8,ZSWIM9,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-01311,Missense_Mutation,p.Y891C,Single_mutation,-0.261,0.0677,-1.2400,-0.5110,0.1310,0.1210,1.3100,...,0.08220,-0.6450,0.1940,0.0633,0.440,-0.397,-0.30800,-0.484,-0.14400,-0.1340
C3L-00006,Wildtype_Tumor,No_mutation,Wildtype_Tumor,-1.180,-0.8630,-0.8020,0.2220,0.2560,0.6650,1.2800,...,-0.08770,,0.0229,0.1090,,-0.332,-0.43300,-1.020,-0.12300,-0.0859
C3L-00008,Wildtype_Tumor,No_mutation,Wildtype_Tumor,-0.685,-1.0700,-0.6840,0.9840,0.1350,0.3340,1.3000,...,-0.03560,,0.3630,1.0700,0.737,-0.564,-0.00461,-1.130,-0.07570,-0.4730
C3L-00032,Wildtype_Tumor,No_mutation,Wildtype_Tumor,-0.528,-1.3200,0.4350,,-0.2400,1.0400,-0.0213,...,0.00112,-0.1450,0.0105,-0.1160,,0.151,-0.07400,-0.540,0.32000,-0.4190
C3L-00090,Wildtype_Tumor,No_mutation,Wildtype_Tumor,-1.670,-1.1900,-0.4430,0.2430,-0.0993,0.7570,0.7400,...,0.07250,-0.0552,-0.0714,0.0933,0.156,-0.398,-0.07520,-0.797,-0.03010,-0.4670
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01510,Wildtype_Tumor,No_mutation,Wildtype_Tumor,-0.269,0.9440,1.2000,-0.1550,0.2660,-0.5480,-0.1050,...,-0.20400,,-0.5830,0.9130,0.249,-0.515,-0.15100,-1.390,-0.12100,-0.4260
C3N-01520,Wildtype_Tumor,No_mutation,Wildtype_Tumor,-1.070,-0.7120,0.0462,-0.0471,0.2270,1.3500,1.2100,...,-0.06990,-0.4010,0.5570,0.8270,0.348,-0.187,-0.10700,-0.830,0.06200,-0.5280
C3N-01537,Wildtype_Tumor,No_mutation,Wildtype_Tumor,-0.290,-0.3200,3.1700,-0.9070,0.0317,-0.0425,,...,-0.17900,-0.5210,0.0523,0.5150,0.585,0.138,-0.19000,-0.966,-0.00627,-0.2490
C3N-01802,Wildtype_Tumor,No_mutation,Wildtype_Tumor,0.266,1.3900,-0.0655,0.4700,0.3980,-0.1340,0.4610,...,-0.14000,,-0.0122,0.2500,0.553,0.387,0.06420,-0.437,0.10400,-0.4980


#### Drop unncecesary columns
The resulting dataframe is what we will be working with from this point forward. It contains the mutations type and the proteomics for each gene.

In [26]:
prot_and_mutations = endo_missense.drop(columns = ["Mutation_Status", "Location"])
prot_and_mutations.head()

Name,Mutation,A1BG,A2M,A2ML1,A4GALT,AAAS,AACS,AADAT,AAED1,AAGAB,...,ZSWIM8,ZSWIM9,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-01311,Missense_Mutation,-0.261,0.0677,-1.24,-0.511,0.131,0.121,1.31,-0.0531,-0.255,...,0.0822,-0.645,0.194,0.0633,0.44,-0.397,-0.308,-0.484,-0.144,-0.134
C3L-00006,Wildtype_Tumor,-1.18,-0.863,-0.802,0.222,0.256,0.665,1.28,-0.339,0.412,...,-0.0877,,0.0229,0.109,,-0.332,-0.433,-1.02,-0.123,-0.0859
C3L-00008,Wildtype_Tumor,-0.685,-1.07,-0.684,0.984,0.135,0.334,1.3,0.139,1.33,...,-0.0356,,0.363,1.07,0.737,-0.564,-0.00461,-1.13,-0.0757,-0.473
C3L-00032,Wildtype_Tumor,-0.528,-1.32,0.435,,-0.24,1.04,-0.0213,-0.0479,0.419,...,0.00112,-0.145,0.0105,-0.116,,0.151,-0.074,-0.54,0.32,-0.419
C3L-00090,Wildtype_Tumor,-1.67,-1.19,-0.443,0.243,-0.0993,0.757,0.74,-0.929,0.229,...,0.0725,-0.0552,-0.0714,0.0933,0.156,-0.398,-0.0752,-0.797,-0.0301,-0.467


## Run T-test and difference of Median for each cancer type

#### Get the median proteomic expression among all samples, for each gene.
This is done for missense mutations and wildtype separately


In [9]:
missense = prot_and_mutations[prot_and_mutations.Mutation == "Missense_Mutation"] #get all missense_mutations
wt = prot_and_mutations[prot_and_mutations.Mutation == "Wildtype_Tumor"] #get all wildtype mutations
missense_med = missense.median() #get the medain of the missense. This will give the median expression among all samples, for each gene
wt_med = wt.median() #get the median of the wildtype

#### Get the medain difference between missense and wildtype for each gene.
This is done by (for each gene) subtracting the median of the missense mutations from the median of the wildtype.
This means that if the difference is positive the proteomics of the wildtype have higher expression than the missense mutation. If the difference is negative then the missense mutation has a higher expression than the wildtype.

In [10]:
endo_d = {}

for gene in prot_and_mutations:
    if gene == "Mutation": continue
    dif_endo =  missense_med[gene] - wt_med[gene]
    endo_d[gene] = dif_endo

median_diff = pd.DataFrame.from_dict(endo_d, orient='index', columns=['Difference_In_Median'])
median_diff.head()

Unnamed: 0,Difference_In_Median
A1BG,0.071
A2M,0.103
A2ML1,0.47135
A4GALT,0.22545
AAAS,0.0741


In [11]:
len(prot_and_mutations.loc[prot_and_mutations['Mutation']=="Missense_Mutation"])

43

In [17]:
prot_and_mutations

Name,Mutation,A1BG,A2M,A2ML1,A4GALT,AAAS,AACS,AADAT,AAED1,AAGAB,...,ZSWIM8,ZSWIM9,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00006,Missense_Mutation,-1.180,-0.8630,-0.8020,0.222,0.2560,0.6650,1.2800,-0.3390,0.4120,...,-0.08770,,0.0229,0.109,,-0.332,-0.4330,-1.020,-0.12300,-0.0859
C3L-00032,Missense_Mutation,-0.528,-1.3200,0.4350,,-0.2400,1.0400,-0.0213,-0.0479,0.4190,...,0.00112,-0.1450,0.0105,-0.116,,0.151,-0.0740,-0.540,0.32000,-0.4190
C3L-00139,Missense_Mutation,-0.467,0.3700,-0.3390,,0.4340,0.0358,-0.1750,0.1810,0.1160,...,-0.67500,0.2390,0.1400,1.070,0.6070,0.486,0.1690,-0.632,-0.20300,-0.0685
C3L-00143,Missense_Mutation,-1.120,-1.3100,0.9120,0.418,-0.0768,0.8460,-0.1210,,-0.3110,...,-0.00212,,-1.1900,-1.270,-1.2700,-0.222,-0.3200,-0.620,0.36300,-0.4630
C3L-00156,Missense_Mutation,-0.232,-0.0910,0.6180,1.630,-0.2500,1.3300,-1.7200,-0.8890,0.6330,...,0.06520,-0.2360,0.5010,0.352,0.0839,-0.428,-0.2120,-1.340,0.31700,-0.9430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01211,Wildtype_Tumor,-0.636,-0.3710,0.9730,,0.6370,-0.1660,-0.1220,0.2320,0.3790,...,-0.41600,-0.7510,0.1790,0.658,0.4700,0.270,-0.1480,-0.139,-0.02990,0.1310
C3N-01217,Wildtype_Tumor,-0.743,-1.3700,,,-0.1930,0.6940,0.3380,0.0989,-0.0137,...,0.18100,,0.1960,0.274,,-0.501,-0.0468,0.653,-0.04640,-0.1600
C3N-01219,Wildtype_Tumor,-0.295,-0.0589,0.4900,-0.109,-0.0495,0.0374,0.1020,-0.3210,0.1830,...,-0.29000,0.0457,0.2460,0.589,0.3930,-0.199,-0.1590,-1.090,0.06270,-0.3790
C3N-01537,Wildtype_Tumor,-0.290,-0.3200,3.1700,-0.907,0.0317,-0.0425,,-0.2730,-0.0747,...,-0.17900,-0.5210,0.0523,0.515,0.5850,0.138,-0.1900,-0.966,-0.00627,-0.2490


In [16]:
u.wrap_ttest(prot_and_mutations, 'Mutation')



  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  reject = pvals <= alphacBonf
  pvals_corrected[pvals_corrected>1] = 1


In [12]:
import pandas as pd
import numpy as np
import scipy.stats
import statsmodels.stats.multitest
import operator
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

In [21]:
 prot_and_mutations["Mutation"].unique()



array(['Missense_Mutation', 'Wildtype_Tumor'], dtype=object)

In [13]:
label_values = prot_and_mutations["Mutation"].unique()

'''Partition dataframe into two sets, one for each of the two unique values from the label column'''
partition1 = prot_and_mutations.loc[prot_and_mutations["Mutation"] == label_values[0]]
partition2 = prot_and_mutations.loc[prot_and_mutations["Mutation"] == label_values[1]]

'''If no comparison columns specified, use all columns except the specified labed column'''
comparison_columns = list(prot_and_mutations.columns)
comparison_columns.remove("Mutation")

number_of_comparisons = len(comparison_columns)

'''Store comparisons and p-values in two arrays'''
comparisons = []
pvals = []

'''Loop through each comparison column, perform the t-test, and record the p-val'''
times_through = 0
for column in comparison_columns:  
    times_through += 1
    if len(partition1[column].dropna(axis=0)) <= 1:
#         comparison_columns.remove(column)
        continue
    elif len(partition2[column].dropna(axis=0)) <= 1:
#         comparison_columns.remove(column)
        continue
    else:
        stat, pval = scipy.stats.ttest_ind(partition1[column].dropna(axis=0), partition2[column].dropna(axis=0))
        comparisons.append(column)
        pvals.append(pval)
    
'''Correct for multiple testing to determine if each comparison meets the new cutoff'''
results = statsmodels.stats.multitest.multipletests(pvals=pvals, alpha=.05, method='fdr_bh')
reject = results[0]


'''Format results in a pandas dataframe'''
results_df = pd.DataFrame(columns=['Comparison','P_Value'])

for i in range(0, len(reject)):
    if reject[i]:
        results_df = results_df.append({'Comparison':comparisons[i],'P_Value':pvals[i]}, ignore_index=True)


'''Sort dataframe by ascending p-value'''
results_df = results_df.sort_values(by='P_Value', ascending=True)
results_df = results_df.reset_index(drop=True)

In [14]:
min(pvals)

0.0006916028868550783

In [16]:
import altair as alt
alt.data_transformers.disable_max_rows()
alt.data_transformers.enable('json')
alt.Chart(pd.DataFrame({"p_val":pvals})).mark_bar().encode(
    x=alt.X("p_val:Q",bin=alt.Bin(step=0.05)),
    y="count()"
)

#### Do a t-test for every gene.

In [19]:
genes = list(prot_and_mutations.columns[1:])
t_test = u.wrap_ttest(prot_and_mutations, 'Mutation')
# t_test = t_test.set_index('Comparison')
t_test

#### Join difference in median and t-test p value into the same dataframe

In [None]:
# Step 4 join median and p value together
endo_df = median_diff.join(t_test)
endo_df.head()

# Brca
The Brca data uses different database_IDs, giving us multiple proteomic values for each gene. To deal with this, we will first separate genes that have multiple proteomics values recoreded (multiple database_IDs) from ones that don't.

## Dealing with multiple database_IDs

#### Separate genes with multiple database_IDs from genes with multiple database_IDs

In [None]:
proteomics = br.get_proteomics(tissue_type="tumor")
fil = proteomics.columns.get_level_values('Name').duplicated(False)
duplicates = proteomics[proteomics.columns[fil]]
no_duplicates = proteomics[proteomics.columns[~fil]]
no_duplicates

### Dealing with genes that don't have multiple database_IDs
These will be processed the same way the data for endometrial was.

In [None]:
#FOR NO DUPLICATES
#Flatten the multiindex (since there are no duplicate gene names, this won't be a problem)
no_duplicates = cptac.utils.reduce_multiindex(no_duplicates, flatten=True)

#get mutation_tyoe
mut_type = br.get_genotype_all_vars('PIK3CA')

#join proteomics and mutation type
joined = mut_type.join(no_duplicates)

missense_mutations = joined.loc[joined["Mutation"] == "Missense_Mutation"]
wildtype  = joined.loc[joined.Mutation == "Wildtype_Tumor"]
brca_missense = missense_mutations.append(wildtype)


#drop unncessary columns
prot_and_mutations = brca_missense.drop(columns = ["Mutation_Status", "Location"])

# Get the difference in medians
missense = prot_and_mutations[prot_and_mutations.Mutation == "Missense_Mutation"] #get all missense_mutations
wt = prot_and_mutations[prot_and_mutations.Mutation == "Wildtype_Tumor"] #getn all wildtype
missense_med = missense.median() #get the medain of the missense. This will give the median expression among all samples, for each gene
wt_med = wt.median() #get the median of the wildtype


no_dup_d = {}

for prot in no_duplicates:
    dif_brca = missense_med[prot] - wt_med[prot]
    no_dup_d[prot] = dif_brca

median_diff = pd.DataFrame.from_dict(no_dup_d, orient='index', columns=['Difference_In_Median'])

#Do a t test for every gene and report the p-value
genes = list(prot_and_mutations.columns[1:])
t_test = u.wrap_ttest(prot_and_mutations, 'Mutation', correction_method="fdr_bh")
t_test = t_test.set_index('Comparison')

#join median and p value together
no_dup_df = median_diff.join(t_test)
no_dup_df.head()

## Dealing with genes that do have multiple database_IDs

#### Get t-test results and median difference

In [None]:
#DEALING WITH DUPLICATES
#flatten the multiindex
duplicates = cptac.utils.reduce_multiindex(duplicates, flatten=True)

# We need to figure which of databaseID we want to use, for each protein
proteomics = br.get_proteomics(tissue_type="tumor")
fil = proteomics.columns.get_level_values('Name').duplicated(False)
duplicates = proteomics[proteomics.columns[fil]]
duplicate_gene_names = list(set(duplicates.columns.get_level_values('Name'))) #get a list of genes names that are duplicated
duplicates = cptac.utils.reduce_multiindex(duplicates, flatten=True)
duplicates

#  Step 1 - Create dataframe in order to do comparisons with wrap_ttest

mut_type = br.get_genotype_all_vars("PIK3CA")


joined = mut_type.join(duplicates)

missense_mutations = joined.loc[joined["Mutation"] == "Missense_Mutation"]
wildtype  = joined.loc[joined.Mutation == "Wildtype_Tumor"]
brca_missense = missense_mutations.append(wildtype)


#drop unncessary columns
prot_and_mutations = brca_missense.drop(columns = ["Mutation_Status", "Location"])


# get the difference in medians
missense = prot_and_mutations[prot_and_mutations.Mutation == "Missense_Mutation"] #get all missense_mutations
wt = prot_and_mutations[prot_and_mutations.Mutation == "Wildtype_Tumor"] #getn all wildtype
missense_med = missense.median() #get the medain of the missense. This will give the median expression among all samples, for each gene
wt_med = wt.median() #get the median of the wildtype


duplicates_d = {}

for prot in duplicates:
    dif_brca =  missense_med[prot] - wt_med[prot]
    duplicates_d[prot] = dif_brca
    

median_diff = pd.DataFrame.from_dict(duplicates_d, orient='index', columns=['Difference_In_Median'])


#step 3 do a t test for every gene and report the p-value
cols = list(prot_and_mutations.columns[1:])
t_test = u.wrap_ttest(prot_and_mutations, 'Mutation', correction_method="fdr_bh")
t_test = t_test.set_index('Comparison')

# Step 4 join median and p value together
dup_df = median_diff.join(t_test)
dup_df.head()

#### Select the database_ID that gives a lower p value from the t-test, and append selected database_ID to the no_duplicate dataframe

In [None]:
#for each duplicated gene name select the one with the min p val and append to no_duplicat df
for name in duplicate_gene_names:
    gene_duplicate = dup_df[dup_df.index.str.contains(name)]
    #select the one with the min p value for t test
    min_row = gene_duplicate[gene_duplicate.P_Value == gene_duplicate.P_Value.min()]
    #append the selected version onto no_duplicate dataframe
    no_dup_df=no_dup_df.append(min_row)
no_dup_df=no_dup_df.sort_index()
no_dup_df.head()

#### Reformat the table by getting rid of databaseIDs so we can join it to Endo and Brca

In [None]:
split = no_dup_df.index.str.split('_', 1).str[0]
no_dup_df['new_index']=split
brca_df = no_dup_df.set_index("new_index")
brca_df.head()

# Colon

In [None]:
#  Step 1 - Create dataframe in order to do comparisons with wrap_ttest
proteomics = cl.get_proteomics(tissue_type="tumor")

prot_list = list(proteomics.columns)

mut_type = cl.get_genotype_all_vars("PIK3CA")


joined = mut_type.join(proteomics)
joined["Mutation"] = joined['Mutation'].replace(['nonsynonymous SNV'], 'Missense_Mutation')


#select samples containing hotspot mutations
missense_mutations = joined.loc[joined["Mutation"] == "Missense_Mutation"]
wildtype  = joined.loc[joined.Mutation == "Wildtype_Tumor"]
colon_missense = missense_mutations.append(wildtype)


#drop unncessary columns
prot_and_mutations = colon_missense.drop(columns = ["Mutation_Status", "Location"])


#get the difference in medians
missense = prot_and_mutations[prot_and_mutations.Mutation == "nonsynonymous SNV"] #get all missense_mutations
wt = prot_and_mutations[prot_and_mutations.Mutation == "Wildtype_Tumor"] #getn all wildtype
missense_med = missense.median() #get the medain of the missense. This will give the median expression among all samples, for each gene
wt_med = wt.median() #get the median of the wildtype


colon_d = {}

for prot in proteomics:
    dif_colon = missense_med[prot] - wt_med[prot]
    colon_d[prot] = dif_colon

median_diff = pd.DataFrame.from_dict(colon_d, orient='index', columns=['Difference_In_Median'])

#do a t test for every gene and report the p-value
cols = list(prot_and_mutations.columns[1:])
t_test = u.wrap_ttest(prot_and_mutations, 'Mutation', correction_method="fdr_bh")
t_test = t_test.set_index('Comparison')

#join median and p value together
colon_df = median_diff.join(t_test)
colon_df.head()

# Combine Endo, Brca, and Colon into one table

#### Combine brca and endo

In [None]:
brca_df=brca_df.rename_axis(None)
brca_endo = brca_df.join(endo_df, lsuffix='_Brca', rsuffix='_Endo')
brca_endo.head()

#### Combine colon

In [None]:
final_table = brca_endo.join(colon_df, rsuffix='_colon')
final_table.head()

#### Only rows that contain a significant pvalue

In [None]:
significant = final_table.loc[(final_table['P_Value_Brca'] <= .05) |
                (final_table['P_Value_Endo'] <= .05) |
                (final_table['P_Value'] <= .05)]
significant.head()

In [None]:
significant.to_csv("~/WhenMutationsDontMatter/PIK3CA/csv_files/proteomics_no_hotspots.csv")