In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import statsmodels.stats.multitest
import operator




def wrap_ttest(df, label_column, comparison_columns=None, alpha=.05, return_all=False, correction_method='bonferroni', mincount=3, pval_return_corrected=True):
    try:
        '''Verify precondition that label column exists and has exactly 2 unique values'''
        label_values = df[label_column].unique()
        if len(label_values) != 2:
            print("Incorrectly Formatted Dataframe! Label column must have exactly 2 unique values.")
            return None
        
        '''Partition dataframe into two sets, one for each of the two unique values from the label column'''
        partition1 = df.loc[df[label_column] == label_values[0]]
        partition2 = df.loc[df[label_column] == label_values[1]]

        '''If no comparison columns specified, use all columns except the specified labed column'''
        if not comparison_columns:
            comparison_columns = list(df.columns)
            comparison_columns.remove(label_column)

        '''Determine the number of real valued columns on which we will do t-tests'''
        number_of_comparisons = len(comparison_columns)

        '''Store comparisons and p-values in two arrays'''
        comparisons = []
        pvals = []
        
        '''Loop through each comparison column, perform the t-test, and record the p-val'''
        
        for column in comparison_columns:
            if len(partition1[column].dropna(axis=0)) <= mincount:
                continue
            elif len(partition2[column].dropna(axis=0)) <= mincount:
                continue
            else:
                stat, pval = scipy.stats.ttest_ind(partition1[column].dropna(axis=0), partition2[column].dropna(axis=0))
                comparisons.append(column)
                pvals.append(pval)
                
        '''Correct for multiple testing to determine if each comparison meets the new cutoff'''
        results = statsmodels.stats.multitest.multipletests(pvals=pvals, alpha=alpha, method=correction_method)
        reject = results[0]

        '''Format results in a pandas dataframe'''
        results_df = pd.DataFrame(columns=['Comparison','P_Value'])

        '''If return all, add all comparisons and p-values to dataframe'''
        if return_all:
            if pval_return_corrected:
                results_df['Comparison'] = comparisons
                results_df['P_Value'] = results[1]

            else:
                results_df['Comparison'] = comparisons
                results_df['P_Value'] = pvals

            '''Else only add significant comparisons'''
        else:
            for i in range(0, len(reject)):
                if reject[i]:
                    if pval_return_corrected:
                        results_df = results_df.append({'Comparison':comparisons[i],'P_Value':results[1][i]}, ignore_index=True)
                    else:
                        results_df = results_df.append({'Comparison':comparisons[i],'P_Value':pvals[i]}, ignore_index=True)


        '''Sort dataframe by ascending p-value'''
        results_df = results_df.sort_values(by='P_Value', ascending=True)
        results_df = results_df.reset_index(drop=True)

        '''If results df is not empty, return it, else return None'''
        if len(results_df) > 0:
            return results_df
        else:
            return None


    except:
        print("Incorrectly Formatted Dataframe!")
        return None

  import pandas.util.testing as tm


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u



# Load cancer data

In [3]:
en = cptac.Endometrial()
br = cptac.Brca()
cl = cptac.Colon()

                                                

In [4]:
gene = 'PIK3CA'

# Endometrial

## Filter out hotspot mutations
Hotspots are:
E542K
E545K
H1047R


#### Get the mutation type, and proteomics for PIK3CA

In [5]:
proteomics = en.get_proteomics(tissue_type="tumor")
mut_type = en.get_genotype_all_vars("PIK3CA")



#### Join mutation type and proteomics together

In [6]:
joined = mut_type.join(proteomics)

#### Select samples containing hotspot mutations

In [7]:
en_hotspot_df = joined[joined.Location.str.contains('E542') | 
                    joined.Location.str.contains('E545') |
                    joined.Location.str.contains('H1047')]

#### Join the proteomics for the wildtype mutations to the hotspot mutations

In [8]:
wildtype  = joined.loc[joined.Mutation == "Wildtype_Tumor"]

endo_hotspot = en_hotspot_df.append(wildtype)

#### Drop unncecesary columns
The resulting dataframe is what we will be working with from this point forward. It contains the mutations type and the proteomics for each gene.

In [9]:
endo_prot_and_mutations = endo_hotspot.drop(columns = ["Mutation_Status", "Location"])
# endo_prot_and_mutations

## Run T-test and difference of Median for each cancer type

#### Get the median proteomic expression among all samples, for each gene.
This is done for missense mutations and wildtype separately


In [10]:
missense = endo_prot_and_mutations[endo_prot_and_mutations.Mutation == "Missense_Mutation"] #get all missense_mutations
wt = endo_prot_and_mutations[endo_prot_and_mutations.Mutation == "Wildtype_Tumor"] #get all wildtype mutations
missense_med = missense.median() #get the medain of the missense. This will give the median expression among all samples, for each gene
wt_med = wt.median() #get the median of the wildtype

#### Get the medain difference between missense and wildtype for each gene.
This is done by (for each gene) subtracting the median of the missense mutations from the median of the wildtype.
This means that if the difference is positive the proteomics of the wildtype have higher expression than the missense mutation. If the difference is negative then the missense mutation has a higher expression than the wildtype.

In [11]:
endo_d = {}

for gene in endo_prot_and_mutations:
    if gene == "Mutation": continue
    dif_endo =  missense_med[gene] - wt_med[gene]
    endo_d[gene] = dif_endo

median_diff = pd.DataFrame.from_dict(endo_d, orient='index', columns=['Difference_In_Median'])
median_diff.head()

Unnamed: 0,Difference_In_Median
A1BG,0.0185
A2M,-0.0635
A2ML1,1.08085
A4GALT,0.19145
AAAS,0.1561


In [12]:
endo_prot_and_mutations

Name,Mutation,A1BG,A2M,A2ML1,A4GALT,AAAS,AACS,AADAT,AAED1,AAGAB,...,ZSWIM8,ZSWIM9,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00006,Missense_Mutation,-1.18,-0.863,-0.802,0.222,0.256,0.665,1.28,-0.339,0.412,...,-0.0877,,0.0229,0.109,,-0.332,-0.433,-1.02,-0.123,-0.0859
C3L-00032,Missense_Mutation,-0.528,-1.32,0.435,,-0.24,1.04,-0.0213,-0.0479,0.419,...,0.00112,-0.145,0.0105,-0.116,,0.151,-0.074,-0.54,0.32,-0.419
C3L-00362,Missense_Mutation,-0.924,-0.445,1.57,-0.678,0.173,0.436,0.227,-0.00248,0.479,...,-0.0496,,0.108,-0.153,,0.109,-0.336,-0.822,-0.0338,0.121
C3L-00601,Missense_Mutation,-0.454,-0.242,,,0.258,0.219,-0.249,-1.33,0.204,...,-0.158,,0.667,1.3,0.441,0.13,-0.0659,-0.923,-0.163,0.112
C3L-00605,Missense_Mutation,-0.24,0.594,3.4,0.154,0.0932,0.283,-0.0789,-0.611,0.37,...,-0.0511,-0.0136,0.214,0.683,0.433,-0.208,-0.347,-0.911,0.0692,-0.232
C3L-00918,Missense_Mutation,-0.698,-0.537,,,-0.0854,0.335,0.0685,0.123,0.273,...,0.209,,0.124,0.588,,-0.568,-0.222,-0.148,0.0622,-0.231
C3L-00921,Missense_Mutation,-1.19,-1.19,3.66,1.27,-0.105,-0.0021,0.0813,-0.691,-0.295,...,0.097,0.254,0.151,0.285,-0.239,-0.0681,-0.218,-0.0873,-0.127,0.374
C3L-00947,Missense_Mutation,-0.485,0.917,0.304,-0.00675,0.178,-0.423,0.224,-0.247,0.0787,...,0.0429,-0.212,0.169,0.521,-0.147,0.354,0.17,-1.01,-0.11,0.131
C3N-00323,Missense_Mutation,-0.817,-1.13,,,0.102,0.371,0.644,0.281,0.114,...,-0.0222,,-0.279,0.423,,0.294,-0.13,-0.106,-0.299,0.385
C3N-00324,Missense_Mutation,-0.449,-0.44,1.15,,-0.645,0.316,0.0809,-0.47,0.804,...,0.0449,0.0452,-0.152,-0.197,0.295,-0.188,-0.265,0.25,-0.113,-0.302


#### Do a t-test for every gene.

In [13]:
genes = list(endo_prot_and_mutations.columns[1:])
n = wrap_ttest(endo_prot_and_mutations, 'Mutation', correction_method="fdr_bh")
n

In [14]:
genes = list(endo_prot_and_mutations.columns[1:])
n = wrap_ttest(endo_prot_and_mutations, 'Mutation', correction_method="fdr_bh", return_all=True, pval_return_corrected=True)
n

Unnamed: 0,Comparison,P_Value
0,A1BG,0.995678
1,PPIA,0.995678
2,PPIAL4G,0.995678
3,PPIB,0.995678
4,PPIC,0.995678
...,...,...
10380,DIAPH3,0.999795
10381,POLD4,0.999852
10382,MASTL,0.999852
10383,BAZ1B,0.999876


#### Join difference in median and t-test p value into the same dataframe

In [15]:
# # Step 4 join median and p value together
# endo_df = median_diff.join(t_test)
# endo_df.head()

# BRCA

In [16]:
proteomics = br.get_proteomics(tissue_type="tumor")
fil = proteomics.columns.get_level_values('Name').duplicated(False)
duplicates = proteomics[proteomics.columns[fil]]
no_duplicates = proteomics[proteomics.columns[~fil]]
# duplicates

In [17]:
#DEALING WITH DUPLICATES
#flatten the multiindex
duplicates = cptac.utils.reduce_multiindex(duplicates, flatten=True)

# We need to figure which of databaseID we want to use, for each protein
proteomics = br.get_proteomics(tissue_type="tumor")
fil = proteomics.columns.get_level_values('Name').duplicated(False)
duplicates = proteomics[proteomics.columns[fil]]
duplicate_gene_names = list(set(duplicates.columns.get_level_values('Name'))) #get a list of genes names that are duplicated
duplicates = cptac.utils.reduce_multiindex(duplicates, flatten=True)
duplicates

#  Step 1 - Create dataframe in order to do comparisons with wrap_ttest

mut_type = br.get_genotype_all_vars("PIK3CA")
joined = mut_type.join(duplicates)

#select samples containing hotspot mutations
br_hotspot_df = joined[joined.Location.str.contains('E542K') | 
                    joined.Location.str.contains('E545K') |
                    joined.Location.str.contains('H1047R')]


#join wildtype proteomics to missense mutation proteomics 
wildtype  = joined.loc[joined.Mutation == "Wildtype_Tumor"]
brca_hotspot = br_hotspot_df.append(wildtype)

#drop unncessary columns
filter_prot_and_mutations = brca_hotspot.drop(columns = ["Mutation_Status", "Location"])


#step 3 do a t test for every gene and report the p-value
cols = list(filter_prot_and_mutations.columns[1:])
t_test = u.wrap_ttest(filter_prot_and_mutations, 'Mutation', cols, correction_method= "fdr_bh", return_all=True)

# t_test



In [18]:
#for each duplicated gene name select the one with the min p val and append[ to no_duplicat df

selected_dup = []
for name in duplicate_gene_names:
    gene_duplicate = t_test[t_test.Comparison.str.contains(name)]
    #select the one with the min p value for t test
#     import pdb; pdb.set_trace()
    min_row = gene_duplicate.loc[gene_duplicate.P_Value == gene_duplicate.P_Value.min()]
    name = min_row['Comparison'].values
    selected_dup.append(name)

selected= []
no_dup = cptac.utils.reduce_multiindex(no_duplicates, flatten=True)
no_dup.columns
for name in no_dup.columns:
    selected.append(name)

In [19]:
#use the list of selected genes to filter dataframe
proteomics = br.get_proteomics(tissue_type="tumor")
proteomics = cptac.utils.reduce_multiindex(proteomics, flatten=True)
selected_prot = proteomics[proteomics.columns.intersection(selected)]

#get mutation_tyoe
mut_type = br.get_genotype_all_vars('PIK3CA')

#join proteomics and mutation type
joined = mut_type.join(selected_prot)

#select samples containing hotspot mutations
br_hotspot_df = joined[joined.Location.str.contains('E542K') | 
                    joined.Location.str.contains('E545K') |
                    joined.Location.str.contains('H1047R')]


#join wildtype proteomics to missense mutation proteomics 
wildtype  = joined.loc[joined.Mutation == "Wildtype_Tumor"]
brca_hotspot = br_hotspot_df.append(wildtype)

#drop unncessary columns
brca_prot_and_mutations = brca_hotspot.drop(columns = ["Mutation_Status", "Location"])
#drop databaseID identifier
split = brca_prot_and_mutations.columns.str.split('_', 1).str[0]
brca_prot_and_mutations.columns=split

#remove duplicate columns
brca_prot_and_mutations = brca_prot_and_mutations.loc[:,~brca_prot_and_mutations.columns.duplicated()]
brca_prot_and_mutations 




Name,Mutation,A1BG,A2M,A2ML1,AAAS,AACS,AADAT,AAED1,AAGAB,AAK1,...,ZSCAN31,ZSWIM8,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CPT001846,Missense_Mutation,1.3964,1.3302,-5.0948,0.7674,-1.6845,,2.1022,-0.5814,0.2916,...,-0.7592,0.4711,0.6018,0.2062,-0.2137,-2.1219,0.0860,2.5814,-0.2852,-0.1074
X11BR004,Missense_Mutation,1.3627,1.8899,-0.5023,-0.6201,0.8102,,-1.1417,-0.1994,0.3278,...,1.8955,0.3530,0.8214,1.2056,-0.6622,-0.5864,-0.0059,1.3178,0.4372,-1.0408
X11BR013,Missense_Mutation,0.7803,-0.6980,-3.5729,0.7011,-1.8474,-2.7737,-0.0788,0.2499,-1.4275,...,,1.0010,-0.6068,0.8283,-1.0147,1.4330,0.2379,-1.0891,-0.2156,-0.3116
X11BR014,Missense_Mutation,0.9126,-0.2126,-8.9169,-0.6406,-1.9189,-2.4744,-2.0662,-0.2834,0.4592,...,1.2953,-0.1190,0.1474,-1.5504,-3.3162,-0.5357,1.4172,1.8508,0.2182,-0.5697
X11BR022,Missense_Mutation,1.9095,1.4993,-4.9660,-0.5247,-0.2128,-3.1250,-0.2501,1.9603,-0.1348,...,,-0.3823,-0.0501,0.1364,-0.6264,-0.8129,-1.1180,-0.5281,-0.6841,0.0143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X15BR003,Wildtype_Tumor,2.4710,1.0447,1.6000,-0.2389,-0.2619,,1.2958,0.1298,0.3695,...,-0.3656,-1.0569,0.8718,-0.4324,-0.7942,-2.4072,1.2313,0.6114,-0.6675,-0.8818
X18BR004,Wildtype_Tumor,-0.4652,-0.2157,-3.6000,-0.3496,-0.9725,,,0.2205,-0.7924,...,-2.0780,0.9906,0.3147,0.0305,-0.2289,,0.2189,-0.3281,0.4254,-0.1496
X18BR006,Wildtype_Tumor,0.0999,0.7852,-8.1266,-0.5701,-2.1021,,-0.5732,0.4928,-0.2838,...,-0.3722,0.0816,0.0086,0.3131,-0.2260,-0.6128,0.7913,0.8553,-0.0371,-0.3874
X18BR017,Wildtype_Tumor,0.8934,1.6886,-6.0217,-0.0481,0.1299,,,1.5536,0.5748,...,-0.2404,0.6150,0.2906,0.2303,-0.3553,0.4313,1.0197,1.0025,1.8005,-0.5533


In [20]:
len(brca_prot_and_mutations.loc[brca_prot_and_mutations["Mutation"]=="Missense_Mutation"])

18

In [21]:
n = wrap_ttest(brca_prot_and_mutations, 'Mutation', correction_method= "fdr_bh", pval_return_corrected=True, return_all=True)
n.head(15)

Unnamed: 0,Comparison,P_Value
0,HSPE1-MOB4,0.011107
1,UBE4A,0.086181
2,IDH3A,0.091678
3,HSPE1,0.091678
4,ACAP2,0.091678
5,OXSM,0.112433
6,PUS1,0.115105
7,HSPD1,0.169127
8,CD320,0.169127
9,MRPS6,0.169127


In [22]:
n = wrap_ttest(brca_prot_and_mutations, 'Mutation', correction_method= "fdr_bh", pval_return_corrected=True, return_all=True)
n

Unnamed: 0,Comparison,P_Value
0,HSPE1-MOB4,0.011107
1,UBE4A,0.086181
2,IDH3A,0.091678
3,HSPE1,0.091678
4,ACAP2,0.091678
...,...,...
9431,A2M,0.999473
9432,MAX,0.999599
9433,RASIP1,0.999642
9434,ARHGEF17,0.999816


In [23]:
# b_mut = brca_prot_and_mutations.loc[brca_prot_and_mutations["Mutation"]=="Missense_Mutation"]
b_mut=brca_prot_and_mutations
full = b_mut.dropna(axis=1)
full
len(brca_prot_and_mutations.loc[brca_prot_and_mutations["Mutation"]=="Wildtype_Tumor"])

53

In [24]:
#now run ttest
n = wrap_ttest(brca_prot_and_mutations, 'Mutation', correction_method= "fdr_bh", pval_return_corrected=True)
n

Unnamed: 0,Comparison,P_Value
0,HSPE1-MOB4,0.011107


In [25]:
#let make a box plot. 


# Colon

In [26]:
#  Step 1 - Create dataframe in order to do comparisons with wrap_ttest
proteomics = cl.get_proteomics(tissue_type="tumor")

prot_list = list(proteomics.columns)

mut_type = cl.get_genotype_all_vars("PIK3CA")


joined = mut_type.join(proteomics)

#select samples containing hotspot mutations
cl_hotspot_df = joined[joined.Location.str.contains('E542K') | 
                    joined.Location.str.contains('E545K') |
                    joined.Location.str.contains('H1047R')]

#join wildtype proteomics to missense mutation proteomics 
wildtype  = joined.loc[joined.Mutation == "Wildtype_Tumor"]
colon_hotspot = cl_hotspot_df.append(wildtype)

#drop unncessary columns
col_prot_and_mutations = colon_hotspot.drop(columns = ["Mutation_Status", "Location"])

col_prot_and_mutations["Mutation"] = col_prot_and_mutations['Mutation'].replace(['nonsynonymous SNV'], 'Missense_Mutation')
col_prot_and_mutations



Name,Mutation,A1BG,A1CF,A2M,AAAS,AACS,AAGAB,AAK1,AAMDC,AAMP,...,ZNHIT6,ZNRD1,ZNRF2,ZPR1,ZRANB2,ZW10,ZWILCH,ZWINT,ZYX,ZZEF1
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
05CO028,Missense_Mutation,-1.340,0.343,-1.6600,0.5040,0.1620,0.3190,-0.2980,-0.2150,-0.1050,...,,,-0.0595,0.1690,0.1450,0.1970,-0.3990,,-0.7540,-0.4230
05CO032,Missense_Mutation,-1.220,-0.575,-0.5960,0.2990,-0.0254,0.7640,0.1040,-0.6940,0.3560,...,0.0105,,-0.2850,-0.1480,0.0751,0.0503,0.2300,,-0.5900,-0.0644
09CO005,Missense_Mutation,-1.490,-0.200,-1.2800,-0.1220,0.2730,0.2230,0.3590,-1.1200,-0.0215,...,,,-0.2660,0.0078,0.5310,-0.3550,-0.6400,,-0.4300,-0.3340
11CO027,Missense_Mutation,-1.090,0.236,-1.0400,0.3070,0.0530,0.4660,-0.0733,-0.0244,0.8510,...,,,,0.3930,0.7610,0.3730,,,0.0261,-0.1030
11CO052,Missense_Mutation,-0.280,-0.623,-0.9320,-0.0443,0.0090,0.0958,0.1710,-0.3130,0.0981,...,0.1510,,0.2730,0.2830,0.1690,0.1180,0.2930,,-0.1450,-0.0604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20CO001,Wildtype_Tumor,0.629,-0.068,0.0734,-0.6540,0.2130,0.1550,0.5650,-0.0406,-0.1540,...,,,-0.1340,0.4150,-0.0765,-0.0385,,,0.4260,0.5190
20CO003,Wildtype_Tumor,-0.123,-1.070,-0.4520,0.2660,-0.2280,0.0484,0.1710,0.7850,0.3000,...,,,,-0.0415,0.0830,-0.0374,-0.1320,,0.6810,-0.2970
22CO004,Wildtype_Tumor,-1.280,0.518,-0.8720,0.2580,0.4240,0.2980,-0.1500,-0.6580,0.0066,...,,,0.4490,-0.0616,0.3850,0.1160,-0.0437,,-0.8840,0.0088
22CO006,Wildtype_Tumor,0.515,-1.210,-0.0283,0.1780,-0.3030,0.1500,0.3990,-1.0000,0.2060,...,,,-0.5610,0.4740,0.0836,-0.2510,0.1760,,0.0584,-0.3010


In [27]:
col_prot_and_mutations.loc[col_prot_and_mutations["Mutation"]=="Missense_Mutation"]

Name,Mutation,A1BG,A1CF,A2M,AAAS,AACS,AAGAB,AAK1,AAMDC,AAMP,...,ZNHIT6,ZNRD1,ZNRF2,ZPR1,ZRANB2,ZW10,ZWILCH,ZWINT,ZYX,ZZEF1
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
05CO028,Missense_Mutation,-1.34,0.343,-1.66,0.504,0.162,0.319,-0.298,-0.215,-0.105,...,,,-0.0595,0.169,0.145,0.197,-0.399,,-0.754,-0.423
05CO032,Missense_Mutation,-1.22,-0.575,-0.596,0.299,-0.0254,0.764,0.104,-0.694,0.356,...,0.0105,,-0.285,-0.148,0.0751,0.0503,0.23,,-0.59,-0.0644
09CO005,Missense_Mutation,-1.49,-0.2,-1.28,-0.122,0.273,0.223,0.359,-1.12,-0.0215,...,,,-0.266,0.0078,0.531,-0.355,-0.64,,-0.43,-0.334
11CO027,Missense_Mutation,-1.09,0.236,-1.04,0.307,0.053,0.466,-0.0733,-0.0244,0.851,...,,,,0.393,0.761,0.373,,,0.0261,-0.103
11CO052,Missense_Mutation,-0.28,-0.623,-0.932,-0.0443,0.009,0.0958,0.171,-0.313,0.0981,...,0.151,,0.273,0.283,0.169,0.118,0.293,,-0.145,-0.0604
11CO062,Missense_Mutation,-0.0901,-0.0789,0.34,-1.23,0.519,0.713,-0.084,-0.934,-0.0101,...,,,,0.227,0.204,-0.113,,,0.0222,-0.0726
16CO006,Missense_Mutation,0.178,0.0866,-0.525,-0.129,-0.145,0.219,-0.129,-0.198,0.0203,...,,,-0.515,0.284,0.0238,0.0339,0.157,,0.259,-0.0582
16CO012,Missense_Mutation,,,,,,,,,,...,,,,,,,,,,
20CO004,Missense_Mutation,-1.87,0.461,-1.85,-0.157,1.29,0.133,-0.223,-0.246,0.235,...,0.543,,-0.197,-0.124,-0.283,0.0343,-0.376,,-0.503,-0.522


In [28]:
#get the difference in medians
missense = col_prot_and_mutations[col_prot_and_mutations.Mutation == "Missense_Mutation"] #get all missense_mutations
wt = col_prot_and_mutations[col_prot_and_mutations.Mutation == "Wildtype_Tumor"] #getn all wildtype
missense_med = missense.median() #get the medain of the missense. This will give the median expression among all samples, for each gene
wt_med = wt.median() #get the median of the wildtype


colon_d = {}

for prot in proteomics:
    dif_colon = missense_med[prot] - wt_med[prot]
    colon_d[prot] = dif_colon

median_diff = pd.DataFrame.from_dict(colon_d, orient='index', columns=['Difference_In_Median'])

#do a t test for every gene and report the p-value
cols = list(col_prot_and_mutations.columns[1:])
t_test = wrap_ttest(col_prot_and_mutations, 'Mutation', cols, correction_method= "fdr_bh")
# t_test = t_test.set_index('Comparison')
t_test

In [29]:
# t_test.head(30)

In [30]:
#join median and p value together
colon_df = median_diff.join(t_test)
colon_df.head()

TypeError: 'NoneType' object is not iterable

# Combine Endo, Brca, and Colon into one table

#### Combine brca and endo

In [None]:
brca_df=brca_df.rename_axis(None)
brca_endo = brca_df.join(endo_df, lsuffix='_Brca', rsuffix='_Endo')
brca_endo.head()

#### Combine colon

In [None]:
final_table = brca_endo.join(colon_df, rsuffix='_colon')
final_table.head()

#### Only rows that contain a significant pvalue

In [None]:
significant = final_table.loc[(final_table['P_Value_Brca'] <= .05) |
                (final_table['P_Value_Endo'] <= .05) |
                (final_table['P_Value'] <= .05)]
significant.head()

In [None]:
significant.to_csv("~/WhenMutationsDontMatter/PIK3CA/csv_files/proteomics.csv")