# Create tables with p-value and change in medians for multiple cancers 

Create a dataframe with p-value results from t-tests for all proteins (trans gene proteomics when PTEN has cnv deletions compared to PTEN wildtype). The dataframe also includes the change in medians between deletions and wildtype. Prepare these tables for further analysis by creating csv files.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

import plot_utils as pu

In [2]:
# Returns a dataframe with proteomics and mutation type

# all_prot: list of trans genes

def all_prot_format_df(cancer_object, all_prot, gene_in = 'PTEN'):
    mut_type = cancer_object.get_genotype_all_vars(gene_in)
    
    if cancer_object is not l:
        # Keep only tumor samples from proteomics
        prot_and_mutations = cancer_object.join_omics_to_mutations(
            mutations_genes = [gene_in], omics_df_name = 'proteomics', omics_genes = all_prot)
        # Reduce a multiindex 
        if isinstance(prot_and_mutations.keys(), pd.core.indexes.multi.MultiIndex):
            prot_and_mutations = cancer_object.reduce_multiindex(prot_and_mutations, levels_to_drop = 1)
        prot_and_mutations = prot_and_mutations[prot_and_mutations.Sample_Status == "Tumor"] # drop Normal samples

        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        mut_type = mut_type[['Mutation']] 
        prot_df = prot_and_mutations.iloc[:,:-4] # Keep only proteomics
        merged = prot_df.join(mut_type)
        
        # Keep only Wildtype and deletion
        compare = ['Wildtype_Tumor','Deletion']
        get = merged['Mutation'].isin(compare)
        del_wt = merged[get]
    
    
    # Luad has no somatic mutations for PTEN which changes some things
    else: 
        # get_genotype_all_vars add cnv data under the column PTEN
        mut_type = mut_type.drop(columns= gene_in)
        # different code because no somatic mutation data for pten (can't join to somatic mutations)
        omics = l.join_omics_to_omics(df1_name = 'CNV', df2_name='proteomics',genes1= gene_in, 
            genes2= all_prot)
        omics = l.reduce_multiindex(omics, levels_to_drop = 1, flatten = True)
        omics = omics.drop(columns='PTEN_CNV')
        # Get only tumor samples
        p = l.get_proteomics(tissue_type='tumor')
        tumor_ids = list(p.index)
        get = omics.index.isin(tumor_ids)
        omics = omics[get]
        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        merged = omics.join(mut_type) 
        # Keep only No_Mutation (wildtype) and deletion
        compare = ['No_Mutation','Deletion']
        get = merged['Mutation'].isin(compare)
        del_wt = merged[get]
        del_wt['Mutation'] = np.where(
            del_wt['Mutation'] == 'No_Mutation', 'Wildtype_Tumor', 'Deletion')

    return del_wt


In [3]:
'''
Params
cancer_object: Object. Variable for the loaded cancer data set.
cancer_name: Str. name to add to the created dataframe.
del_wt_df: DataFrame. Only samples with deletions and wildtype for PTEN. 
all_prot_list: List. All proteins in proteomics data frame. 

Returns a dataframe with the difference in medians between proteomics with PTEN wt and del (wt - del). 
'''

def get_change_in_medians_df(cancer_object, cancer_name, del_wt_df, all_prot_list):
    d = del_wt_df[del_wt_df.Mutation == "Deletion"]
    wt = del_wt_df[del_wt_df.Mutation == "Wildtype_Tumor"]
    del_med = d.median()
    wt_med = wt.median()

    med_dict = {}
    

    for prot in all_prot_list:
        dif = wt_med[prot+'_proteomics'] - del_med[prot+'_proteomics']
        med_dict[prot+'_proteomics'] = dif
        
    df = pd.DataFrame.from_dict(med_dict, orient='index', columns=[cancer_name+'_Median'])
    df = df.reset_index().rename(columns={'index':'Proteomics'})
    
    return df
        

#  Step 1: Create Data frames with p-values and differences in median

Each cancer needs a data frame containing only samples that have PTEN cnv deletions and PTEN wildtype with trans proteomics. Use wrap_ttest to run many T-tests for all genes in the proteomics data frame. Use get_change_in_medians_df to create the data frame with change in median values. Merge both data frames. 

Load in cancer data sets from cptac. 

In [4]:
g = cptac.Gbm()
en = cptac.Endometrial()
h = cptac.Hnscc()
l = cptac.Luad()
ls = cptac.Lscc()
o = cptac.Ovarian()
col = cptac.Colon()
b = cptac.Brca()

Checking that endometrial index is up-to-date...



Checking that lscc index is up-to-date...       



                                            

# Gbm

Part 1: Format data frame using all_prot_format_df. This returns a dataframe with proteomics and a Mutation column containing only cnv deletions and wiltype for PTEN. 

In [5]:
gene = 'PTEN'
g_prot = g.get_proteomics()
g_prot_list = list(g_prot.columns)

g_del_wt = all_prot_format_df(g, g_prot_list)
g_del_wt.head()



Name,A1BG_proteomics,A2M_proteomics,AAAS_proteomics,AACS_proteomics,AADAT_proteomics,AAED1_proteomics,AAGAB_proteomics,AAK1_proteomics,AAMDC_proteomics,AAMP_proteomics,...,ZSWIM8_proteomics,ZW10_proteomics,ZWILCH_proteomics,ZWINT_proteomics,ZXDC_proteomics,ZYG11B_proteomics,ZYX_proteomics,ZZEF1_proteomics,ZZZ3_proteomics,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00104,0.07763,0.487228,-0.254208,-0.144373,0.551881,-0.025276,-0.467451,-0.089511,-0.078806,0.329603,...,-0.047437,-0.105908,-0.347076,,0.459635,0.079452,-0.784983,-0.488441,0.16799,Deletion
C3L-00365,-0.145975,0.798796,0.184242,-0.470603,,0.390211,0.245466,-0.609998,0.118625,-0.086927,...,0.161975,-0.213093,0.235571,,0.107421,0.048724,0.138403,-0.290141,0.405037,Deletion
C3L-00674,0.821991,1.09647,-0.094421,-0.106304,0.084578,0.176402,-0.248151,0.014061,-0.699773,-0.638462,...,-0.065534,-0.306717,0.879991,,0.883564,-0.172222,0.011876,-0.131889,-0.503581,Deletion
C3L-00677,-0.064567,0.129385,0.047751,-0.118187,0.237434,,0.303847,0.322163,-0.555479,-0.363414,...,-0.254535,0.463653,0.58023,0.503044,-0.604986,0.178077,-0.720059,-0.150197,-0.268715,Deletion
C3L-01040,-0.763691,-1.031834,-0.217194,-0.695701,0.184173,-0.474816,-0.051789,0.344842,-0.642746,0.068863,...,-0.092502,0.010639,-0.465079,,-0.500083,0.112651,1.00466,-0.230304,-0.102416,Deletion


Part 2: Run T-tests. Create a data frame of just significant comparisons and another data frame for all comparisons. A moderately stringent correction is used: FDR_BH

In [6]:
cols = list(g_del_wt.columns[:-1])

# Get only sig genes
g_sig = u.wrap_ttest(g_del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
# Create list of sig genes
if g_sig is not None:
    g_sig_list = list(g_sig.Comparison)
else: 
    g_sig_list = None
print('significant pvals: \n',g_sig)

# Get all pvals
g = u.wrap_ttest(g_del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
g_pval = g.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Gbm_P_Value'})

  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1


significant pvals: 
               Comparison       P_Value
0       ARMH3_proteomics  5.396032e-11
1        CUTC_proteomics  1.593480e-10
2     PIP4K2A_proteomics  1.009419e-09
3        CUL2_proteomics  1.122076e-09
4        GDI2_proteomics  1.302273e-09
...                  ...           ...
1882  GOLGA7B_proteomics  8.446816e-03
1883     TEX2_proteomics  8.452685e-03
1884     MAFF_proteomics  8.463737e-03
1885   LRPAP1_proteomics  8.466030e-03
1886  PPP2R2B_proteomics  8.466960e-03

[1887 rows x 2 columns]


  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1


Part 3: Create change in medians data frame.

In [7]:
g_med = get_change_in_medians_df(g, "Gbm", g_del_wt, g_prot_list)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Part 4: Merge the data frame with p-values and the data frame with difference in medians.

In [8]:
g_merged = g_pval.merge(g_med, on='Proteomics',how='outer')
g_merged

Unnamed: 0,Proteomics,Gbm_P_Value,Gbm_Median
0,ARMH3_proteomics,5.396032e-11,0.405134
1,CUTC_proteomics,1.593480e-10,0.553255
2,PIP4K2A_proteomics,1.009419e-09,0.838882
3,CUL2_proteomics,1.122076e-09,0.586396
4,GDI2_proteomics,1.302273e-09,0.610188
...,...,...,...
11136,ZNF805_proteomics,,
11137,ZNF813_proteomics,,
11138,ZNF814_proteomics,,0.265936
11139,ZNF888_proteomics,,


# Repeat for other cancers.

# Ovarian

In [9]:
#o = cptac.Ovarian()

                                            

In [10]:
gene = 'PTEN'
o_prot = o.get_proteomics()
o_prot = o.reduce_multiindex(o_prot, levels_to_drop = 1)
o_prot_list = list(o_prot.columns)

o_del_wt = all_prot_format_df(o, o_prot_list)
# Drop duplicate columns - FIX
o_del_wt = o_del_wt.loc[:,~o_del_wt.columns.duplicated()]



Name,A1BG_proteomics,A2M_proteomics,A2ML1_proteomics,AAAS_proteomics,AACS_proteomics,AAGAB_proteomics,AAK1_proteomics,AAMDC_proteomics,AAMP_proteomics,AAR2_proteomics,...,ZSCAN32_proteomics,ZSWIM8_proteomics,ZW10_proteomics,ZWILCH_proteomics,ZWINT_proteomics,ZYG11B_proteomics,ZYX_proteomics,ZZEF1_proteomics,ZZZ3_proteomics,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01OV007,0.133634,0.765120,-0.778849,0.118236,0.112740,0.054942,0.313055,-0.786270,-0.384680,0.064017,...,-0.323516,0.074209,0.232999,-0.195793,,-0.145053,0.590217,0.019414,,Wildtype_Tumor
01OV017,-0.432786,-0.461906,-0.533497,0.041800,0.604114,0.351294,0.546605,0.603935,-0.235374,0.059084,...,0.032004,0.016276,0.587911,0.574339,,0.075401,-0.437564,-0.208831,,Deletion
01OV018,-0.013943,0.065842,,-0.010514,0.541935,0.169826,-0.023032,-0.057549,-0.397627,0.199581,...,0.002237,0.206993,0.111037,1.020321,,-0.245864,0.008411,0.130504,-0.367433,Wildtype_Tumor
01OV023,1.234203,1.512082,-0.703050,0.181152,0.087719,0.226861,-0.100766,-0.516022,-0.456512,-0.257629,...,,-0.529948,0.034713,0.149523,-0.313370,-0.507473,0.221818,0.278313,-1.008998,Deletion
01OV026,-0.482872,0.343872,-1.150936,0.315155,0.097235,-0.160208,-0.125165,-0.768776,-0.372368,0.450086,...,-0.752819,-0.181027,0.179612,0.007854,-0.106049,0.377063,-0.295592,0.011997,-0.236003,Wildtype_Tumor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26OV002,1.942785,2.028395,-0.506808,-0.237500,-0.767353,-0.094263,0.164654,-0.120120,-0.560682,0.116122,...,0.291665,-0.205511,0.280838,1.143053,0.545983,0.045314,-0.463719,-0.321074,0.103304,Deletion
26OV008,-0.576189,-1.723434,2.023556,0.066470,0.576987,0.725690,0.610382,-0.704039,0.121085,0.534117,...,0.021077,0.054952,0.247094,0.365626,,0.101908,0.338000,-0.429887,-0.578476,Wildtype_Tumor
26OV009,0.072513,0.594745,0.609137,0.584056,0.041583,-0.269243,-0.461148,-0.297156,-0.027213,0.596118,...,-0.578555,0.533873,0.440283,0.738610,,0.482615,-0.685922,-0.286718,-0.542423,Wildtype_Tumor
26OV011,-0.835139,-0.861639,-0.834907,-0.019997,-0.381831,-0.186141,0.373510,0.337066,-0.046594,0.236359,...,-0.112791,-0.659868,0.076714,-0.229787,-0.320269,0.549351,0.014579,0.092864,-0.189991,Wildtype_Tumor


T-tests

In [11]:
cols = list(o_del_wt.columns[:-1])

# Get only sig sites
o_sig = u.wrap_ttest(o_del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if o_sig is not None:
    o_sig_list = list(o_sig.Comparison)
else: 
    o_sig_list = None
print('significant pvals: \n',o_sig)

# Get all pvals
o = u.wrap_ttest(o_del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
o_pval = o.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Ov_P_Value'})

  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1
  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


significant pvals: 
          Comparison       P_Value
0  MMS19_proteomics  1.058914e-08


  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1


Unnamed: 0,Proteomics,Ov_P_Value
0,MMS19_proteomics,1.058914e-08
1,RACK1_proteomics,8.573522e-05
2,PI4K2A_proteomics,9.310383e-05
3,WAPL_proteomics,1.724680e-04
4,IDE_proteomics,2.011475e-04
...,...,...
10072,SAXO2_proteomics,
10073,SH3GL3_proteomics,
10074,SLC22A3_proteomics,
10075,SSX2B_proteomics,


In [12]:
o_med = get_change_in_medians_df(o, "Ov", o_del_wt, o_prot_list)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [13]:
o_merged = o_pval.merge(o_med, on='Proteomics',how='inner')
o_merged

Unnamed: 0,Proteomics,Ov_P_Value,Ov_Median
0,MMS19_proteomics,1.058914e-08,0.324897
1,RACK1_proteomics,8.573522e-05,0.191250
2,PI4K2A_proteomics,9.310383e-05,0.298399
3,WAPL_proteomics,1.724680e-04,0.218775
4,IDE_proteomics,2.011475e-04,0.208025
...,...,...,...
10072,SAXO2_proteomics,,0.099993
10073,SH3GL3_proteomics,,-0.843461
10074,SLC22A3_proteomics,,-0.783325
10075,SSX2B_proteomics,,


# Breast

In [15]:
gene = 'PTEN'
b_prot = b.get_proteomics()
b_prot = b.reduce_multiindex(b_prot, levels_to_drop = 1)
b_prot_list = list(b_prot.columns)

b_del_wt = all_prot_format_df(b, b_prot_list)
b_del_wt = b_del_wt.loc[:,~b_del_wt.columns.duplicated()]
b_del_wt.head()



Name,A1BG_proteomics,A2M_proteomics,A2ML1_proteomics,AAAS_proteomics,AACS_proteomics,AADAT_proteomics,AAED1_proteomics,AAGAB_proteomics,AAK1_proteomics,AAMDC_proteomics,...,ZSWIM8_proteomics,ZW10_proteomics,ZWILCH_proteomics,ZWINT_proteomics,ZXDC_proteomics,ZYG11B_proteomics,ZYX_proteomics,ZZEF1_proteomics,ZZZ3_proteomics,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CPT000814,-0.6712,-0.2075,2.7959,1.3969,-1.0899,,1.6708,-0.3484,-0.4756,-0.7299,...,-0.6536,0.3384,2.1169,1.391,-2.123,0.9136,-0.8082,-1.4793,0.9136,Wildtype_Tumor
CPT001846,1.3964,1.3302,-5.0948,0.7674,-1.6845,,2.1022,-0.5814,0.2916,-2.2857,...,0.4711,0.6018,0.2062,-0.2137,-2.1219,0.086,2.5814,-0.2852,-0.1074,Wildtype_Tumor
X01BR001,2.0219,1.6269,-3.2943,0.3352,-1.0739,1.2255,0.2754,-1.1187,-0.0534,-0.2519,...,0.2306,-0.301,0.3395,-0.5316,,0.4996,0.7622,-1.5607,0.0256,Wildtype_Tumor
X01BR009,1.2556,3.4489,2.8043,-0.2956,-1.7261,,,-2.0471,-0.3547,-0.8298,...,-0.2596,0.1898,-0.501,-0.4189,0.308,0.5057,0.2181,-0.2288,-0.275,Wildtype_Tumor
X01BR010,-0.3843,-1.0239,-5.3604,0.3739,3.1422,7.6788,0.1961,-1.2437,-0.0015,-0.0953,...,0.0874,1.6506,-2.0636,-1.2733,-1.2338,-0.7794,1.5518,-0.041,-0.4756,Deletion


In [16]:
cols = list(b_del_wt.columns[:-1])

# Get only sig sites
b_sig = u.wrap_ttest(b_del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if b_sig is not None:
    b_sig_list = list(b_sig.Comparison)
else: 
    b_sig_list = None
print('significant pvals: \n',b_sig)

# Get all pvals
b_pval = u.wrap_ttest(b_del_wt, 'Mutation', cols, return_all = True)
b_pval = b_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Brca_P_Value'}

significant pvals: 
         Comparison   P_Value
0  PTEN_proteomics  0.000004


In [17]:
b_med = get_change_in_medians_df(b, "Brca", b_del_wt, b_prot_list)

In [18]:
b_merged = b_pval.merge(b_med, on='Proteomics',how='outer')
b_merged

Unnamed: 0,Proteomics,Brca_P_Value,Brca_Median
0,PTEN_proteomics,0.000004,1.00800
1,EIF4H_proteomics,0.000025,-0.41295
2,MIEF1_proteomics,0.000031,-0.84135
3,TMSB10_proteomics,0.000043,-0.87630
4,DRG1_proteomics,0.000065,-0.75135
...,...,...,...
9766,POLR3E_proteomics,0.999356,0.09685
9767,IGFBP7_proteomics,0.999595,-0.01140
9768,AAMP_proteomics,0.999613,-0.15430
9769,TRA2B_proteomics,0.999771,0.17180


# Colon

In [20]:
gene = 'PTEN'
c_prot = col.get_proteomics()
c_prot_list = list(c_prot.columns)

c_del_wt = all_prot_format_df(col, c_prot_list)
c_del_wt = c_del_wt.loc[:,~c_del_wt.columns.duplicated()]



In [21]:
cols = list(c_del_wt.columns[:-1])

# Get only sig sites
c_sig = u.wrap_ttest(c_del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if c_sig is not None:
    c_sig_list = list(c_sig.Comparison)
else: 
    c_sig_list = None
print('significant pvals: \n',c_sig)

# Get all pvals
c_pval = u.wrap_ttest(c_del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
c_pval = c_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Colon_P_Value'})

  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1


significant pvals: 
 None


In [22]:
c_med = get_change_in_medians_df(col, "Colon", c_del_wt, c_prot_list)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [23]:
c_merged = c_pval.merge(c_med, on='Proteomics',how='outer')
c_merged

Unnamed: 0,Proteomics,Colon_P_Value,Colon_Median
0,DFFA_proteomics,0.000037,0.19930
1,WAPL_proteomics,0.000097,0.19605
2,SEC14L2_proteomics,0.000136,0.66800
3,GBF1_proteomics,0.000164,0.12630
4,STK11IP_proteomics,0.000239,0.31250
...,...,...,...
8062,ZNF841_proteomics,,-0.78250
8063,ZNF888_proteomics,,
8064,ZNHIT6_proteomics,,
8065,ZNRD1_proteomics,,0.06800


# Hnscc

In [24]:
gene = 'PTEN'
h_prot = h.get_proteomics()
h_prot_list = list(h_prot.columns)

h_del_wt = all_prot_format_df(h, h_prot_list)
h_del_wt.head()



Name,A1BG_proteomics,A1CF_proteomics,A2M_proteomics,A2ML1_proteomics,A4GALT_proteomics,AAAS_proteomics,AACS_proteomics,AADAC_proteomics,AADAT_proteomics,AAED1_proteomics,...,ZW10_proteomics,ZWILCH_proteomics,ZWINT_proteomics,ZXDA_proteomics,ZXDC_proteomics,ZYG11B_proteomics,ZYX_proteomics,ZZEF1_proteomics,ZZZ3_proteomics,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00977,27.725342,19.056377,28.565472,26.929429,,24.21734,24.746903,21.413714,,18.187642,...,24.739946,23.345919,19.219742,,,22.473319,27.870637,24.543298,20.070938,Wildtype_Tumor
C3L-00987,28.152905,,29.374443,27.872815,,24.288701,24.731336,24.530248,,,...,24.575403,22.715326,,,19.438877,22.350913,27.694608,24.194437,17.471477,Deletion
C3L-00994,28.348186,18.058554,30.252145,25.85458,,24.150865,24.325959,21.295667,,19.525432,...,24.531751,22.636623,19.005228,,,22.780357,28.117156,24.429272,20.055226,Wildtype_Tumor
C3L-00995,28.004445,,29.267877,28.182014,,24.292617,24.87866,19.727526,,18.660637,...,24.719581,22.47525,,,19.361772,22.385058,28.565526,24.713502,,Wildtype_Tumor
C3L-00997,27.735214,,28.724642,26.751857,18.426501,24.348787,25.381833,21.636534,,,...,24.692211,23.13092,,,19.610571,22.782499,28.117587,24.374482,,Wildtype_Tumor


In [25]:
cols = list(h_del_wt.columns[:-1])

# Get only sig sites
h_sig = u.wrap_ttest(h_del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if h_sig is not None:
    h_sig_list = list(h_sig.Comparison)
else: 
    h_sig_list = None
print('significant pvals: \n',h_sig)

# Get all pvals
h_pval = u.wrap_ttest(h_del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
h_pval = h_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Hnscc_P_Value'})

  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1


significant pvals: 
              Comparison       P_Value
0        NLN_proteomics  1.113430e-09
1     AKR1C3_proteomics  1.032977e-08
2     AKR1C1_proteomics  6.931627e-08
3       ADI1_proteomics  1.601880e-07
4      EPHX1_proteomics  3.899394e-07
..                  ...           ...
523    FLAD1_proteomics  2.232897e-03
524    GSTM4_proteomics  2.234528e-03
525   ELOVL5_proteomics  2.239241e-03
526  SUPT16H_proteomics  2.241138e-03
527  ANKRD28_proteomics  2.243254e-03

[528 rows x 2 columns]


  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1


In [26]:
h_med = get_change_in_medians_df(h, "Hnscc", h_del_wt, h_prot_list)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [27]:
h_merged = h_pval.merge(h_med, on='Proteomics',how='outer')
#del_wt[['PREX2_proteomics','Mutation']].dropna() # in median, not in pval (only 1 value for deletion)
h_merged

Unnamed: 0,Proteomics,Hnscc_P_Value,Hnscc_Median
0,NLN_proteomics,1.113430e-09,-0.431666
1,AKR1C3_proteomics,1.032977e-08,-0.951905
2,AKR1C1_proteomics,6.931627e-08,-1.961262
3,ADI1_proteomics,1.601880e-07,-0.513198
4,EPHX1_proteomics,3.899394e-07,-0.871688
...,...,...,...
11739,ZNF664_proteomics,,
11740,ZNF836_proteomics,,
11741,ZNF862_proteomics,,
11742,ZSCAN31_proteomics,,


# Luad

In [28]:
gene = 'PTEN'
l_prot = l.get_proteomics()
l_prot = l.reduce_multiindex(l_prot, levels_to_drop = 1)
l_prot_list = list(l_prot.columns)

l_del_wt = all_prot_format_df(l, l_prot_list)
l_del_wt = l_del_wt.loc[:,~l_del_wt.columns.duplicated()]
l_del_wt.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Name,A1BG_proteomics,A2M_proteomics,AAAS_proteomics,AACS_proteomics,AADAC_proteomics,AADAT_proteomics,AAED1_proteomics,AAGAB_proteomics,AAMDC_proteomics,AAMP_proteomics,...,ZSWIM9_proteomics,ZW10_proteomics,ZWILCH_proteomics,ZWINT_proteomics,ZXDC_proteomics,ZYG11B_proteomics,ZYX_proteomics,ZZEF1_proteomics,ZZZ3_proteomics,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00001,-2.5347,-3.4057,0.1572,-1.1998,-1.6826,,,-0.8179,-0.8053,-0.1899,...,,0.2992,-1.3607,,,0.6527,-0.9694,-1.184,-2.5284,Deletion
C3L-00009,-0.5627,-1.7945,1.0054,-0.3624,-4.4887,0.0079,0.2157,1.3342,0.0645,0.6427,...,-0.5098,-0.1622,0.9828,0.5633,-1.462,-1.069,0.7674,0.5066,0.4311,Wildtype_Tumor
C3L-00080,-1.9422,-2.3782,0.194,0.192,-2.2655,,-1.6626,0.2149,-0.7593,0.6113,...,,-0.2795,0.6613,,0.9659,-0.3442,-1.648,1.2872,-0.7301,Deletion
C3L-00083,2.1636,3.1227,-0.3044,-1.7183,-3.2851,-1.8216,3.6147,-0.4863,-1.2387,-0.4946,...,-1.6769,-0.5897,-0.8129,,0.9399,-0.2465,0.3157,0.6547,,Wildtype_Tumor
C3L-00093,-1.0022,-0.9632,0.819,0.2556,-11.1252,,-0.1696,0.2911,-0.4459,-0.1518,...,,0.695,-0.1625,1.8536,-2.299,0.4293,-0.5876,-0.4991,-0.3077,Wildtype_Tumor


In [29]:
cols = list(l_del_wt.columns[:-2])

# Get only sig sites
l_sig = u.wrap_ttest(l_del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if l_sig is not None:
    l_sig_list = list(l_sig.Comparison)
else: 
    l_sig_list = None
print('significant pvals: \n',l_sig)

# Get all pvals
l_pval = u.wrap_ttest(l_del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
l_pval = l_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Luad_P_Value'})
# isoforms for some proteins

  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1


significant pvals: 
               Comparison       P_Value
0        ATE1_proteomics  2.851800e-07
1       SHOC2_proteomics  6.424816e-07
2        EGFR_proteomics  9.121167e-07
3      LANCL2_proteomics  1.742749e-06
4        GGCT_proteomics  2.854181e-06
..                   ...           ...
103    FERMT3_proteomics  4.804371e-04
104  ARHGAP15_proteomics  4.875693e-04
105    CRYBG2_proteomics  5.063478e-04
106     GNAI2_proteomics  5.076782e-04
107     CIAO1_proteomics  5.181887e-04

[108 rows x 2 columns]


  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1


Unnamed: 0,Proteomics,Luad_P_Value
0,ATE1_proteomics,2.851800e-07
1,SHOC2_proteomics,6.424816e-07
2,EGFR_proteomics,9.121167e-07
3,LANCL2_proteomics,1.742749e-06
4,GGCT_proteomics,2.854181e-06
...,...,...
10310,COG6_proteomics,9.997105e-01
10311,PLD1_proteomics,9.997256e-01
10312,CHMP6_proteomics,9.997659e-01
10313,FHL3_proteomics,9.999702e-01


In [30]:
#Isoform code - fix later
#del_wt.replace(to_replace = r'_NP_.*$', value = '', regex = True)

In [31]:
l_med = get_change_in_medians_df(l, "Luad", l_del_wt, l_prot_list)

In [32]:
l_merged = l_pval.merge(l_med, on='Proteomics',how='outer')
l_merged

Unnamed: 0,Proteomics,Luad_P_Value,Luad_Median
0,ATE1_proteomics,2.851800e-07,0.88365
1,SHOC2_proteomics,6.424816e-07,0.60600
2,EGFR_proteomics,9.121167e-07,-2.32970
3,LANCL2_proteomics,1.742749e-06,-1.21475
4,GGCT_proteomics,2.854181e-06,-2.12715
...,...,...,...
10311,PLD1_proteomics,9.997256e-01,-0.43875
10312,CHMP6_proteomics,9.997659e-01,0.05220
10313,FHL3_proteomics,9.999702e-01,0.06610
10314,UST_proteomics,,-2.49145


# Lscc

In [33]:
gene = 'PTEN'
ls_prot = ls.get_proteomics()
ls_prot = ls.reduce_multiindex(ls_prot, levels_to_drop = 1)
ls_prot_list = list(ls_prot.columns)

ls_del_wt = all_prot_format_df(ls, ls_prot_list)
ls_del_wt = ls_del_wt.loc[:,~ls_del_wt.columns.duplicated()]



Name,A1BG_proteomics,A2M_proteomics,A2ML1_proteomics,A4GALT_proteomics,AAAS_proteomics,AACS_proteomics,AADAC_proteomics,AAED1_proteomics,AAGAB_proteomics,AAK1_proteomics,...,ZSWIM9_proteomics,ZW10_proteomics,ZWILCH_proteomics,ZWINT_proteomics,ZXDC_proteomics,ZYG11B_proteomics,ZYX_proteomics,ZZEF1_proteomics,ZZZ3_proteomics,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00081,-3.1101,-6.4486,-1.1714,0.266,0.7587,-0.5222,-6.24,0.7268,0.9413,0.0487,...,,0.524,0.8369,,1.8136,-0.1484,-0.783,-0.5483,-0.5135,Deletion
C3L-00415,-2.6522,-4.5841,6.3064,,0.2618,2.7781,6.6933,-0.2847,-0.0385,-0.2198,...,0.6812,0.2781,-0.5418,,0.7894,-0.6121,-1.5266,0.1942,-0.0953,Wildtype_Tumor
C3L-00445,-0.6754,-3.6368,-5.2459,2.5505,0.2253,1.7921,-0.9698,-1.808,1.7646,-0.1414,...,,0.5222,1.4278,2.7775,1.4203,0.7517,-0.8849,-0.7053,0.7442,Deletion
C3L-00568,-1.9178,-3.4357,-6.3256,-0.9436,0.6271,2.2628,-5.112,-0.0101,1.589,0.6312,...,0.0548,1.4713,2.5429,0.6555,-0.3754,-0.5175,-0.8949,0.3471,0.4404,Wildtype_Tumor
C3L-00603,-1.7088,-1.6799,-4.8142,,0.837,1.4581,-2.0167,-1.5169,0.1562,0.2286,...,-0.1679,1.3386,3.0732,0.4694,1.3476,-0.0376,-1.2996,0.1218,0.5527,Wildtype_Tumor


In [52]:
cols = list(ls_del_wt.columns[:-2])

# Get only sig sites
ls_sig = u.wrap_ttest(ls_del_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if ls_sig is not None:
    ls_sig_list = list(ls_sig.Comparison)
else: 
    ls_sig_list = None
print('significant pvals: \n',ls_sig)

# Get all pvals
ls_pval = u.wrap_ttest(ls_del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
ls_pval = ls_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Lscc_P_Value'})
#ls_pval # isoforms for some proteins

significant pvals: 
              Comparison       P_Value
0       RFC3_proteomics  4.340033e-07
1      CPSF2_proteomics  1.052148e-06
2        SLK_proteomics  4.777268e-06
3      CPSF1_proteomics  5.111559e-06
4      WDR33_proteomics  6.166593e-06
..                  ...           ...
282   FIP1L1_proteomics  1.292708e-03
283    HAUS7_proteomics  1.298369e-03
284   SPOUT1_proteomics  1.307289e-03
285   PGRMC1_proteomics  1.314746e-03
286  OTULINL_proteomics  1.318396e-03

[287 rows x 2 columns]


In [35]:
ls_med = get_change_in_medians_df(ls, "Lscc", ls_del_wt, ls_prot_list)

In [36]:
ls_merged = ls_pval.merge(ls_med, on='Proteomics',how='outer')
ls_merged

Unnamed: 0,Proteomics,Luad_P_Value,Lscc_Median
0,RFC3_proteomics,4.340033e-07,-0.71930
1,CPSF2_proteomics,1.052148e-06,-0.42080
2,SLK_proteomics,4.777268e-06,0.67420
3,CPSF1_proteomics,5.111559e-06,-0.47950
4,WDR33_proteomics,6.166593e-06,-0.49690
...,...,...,...
10858,CRTAP_proteomics,9.989572e-01,-0.14375
10859,VEZT_proteomics,9.996351e-01,-0.22535
10860,SPG7_proteomics,9.997930e-01,-0.01210
10861,KLHL15_proteomics,9.999643e-01,0.05330


# Endometrial

The Endometrial data set does not have enough cnv deletions to perform a t-test, however the data set does have enough truncation type mutations (nonsense and frame shifts). Different code is needed to create the data frame for Endometrial.

In [37]:
prot = en.get_proteomics()
p = list(prot.columns)

mut_type = en.get_genotype_all_vars(gene)

# merge cnv with genotype all mut type
prot_and_mutations = en.join_omics_to_mutations(
    mutations_genes = [gene], omics_df_name = 'proteomics', omics_genes = p)
prot_and_mutations = prot_and_mutations[prot_and_mutations.Sample_Status == "Tumor"] # drop Normal samples
ip_df = prot_and_mutations.iloc[:,:-4] #drop mutation, location cols
mut_type = mut_type[['Mutation']] 
merged = ip_df.join(mut_type) # merge mutation col from function (includes cnv)

# Keep two values to compare
compare = ['Wildtype_Tumor','Nonsense_Mutation','Frame_Shift_Ins','Frame_Shift_Del']
get = merged['Mutation'].isin(compare)
trunc_wt = merged[get]
trunc_wt['Mutation'] = np.where(
                trunc_wt['Mutation'] == 'Wildtype_Tumor', 'Wildtype_Tumor', 'Truncation')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [38]:
cols = list(trunc_wt.columns[:-1])

# Get only sig sites
e_sig = u.wrap_ttest(trunc_wt, 'Mutation', cols, correction_method = 'fdr_bh')
if e_sig is not None:
    e_sig_list = list(e_sig.Comparison)
else: 
    e_sig_list = None
print('significant pvals: \n',e_sig)

# Get all pvals
e_pval = u.wrap_ttest(trunc_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
e_pval = e_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'En_P_Value'})

  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1


significant pvals: 
             Comparison       P_Value
0     NOL10_proteomics  8.999599e-08
1      ABT1_proteomics  3.141336e-07
2     UTP25_proteomics  4.308134e-07
3    TOPBP1_proteomics  4.885533e-07
4      RIF1_proteomics  9.004982e-07
..                 ...           ...
397   ACTR5_proteomics  1.788167e-03
398   RSPH3_proteomics  1.789386e-03
399   COPS5_proteomics  1.793000e-03
400   DDAH1_proteomics  1.822621e-03
401   UCHL1_proteomics  1.825674e-03

[402 rows x 2 columns]


  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1


Unnamed: 0,Proteomics,En_P_Value
0,NOL10_proteomics,8.999599e-08
1,ABT1_proteomics,3.141336e-07
2,UTP25_proteomics,4.308134e-07
3,TOPBP1_proteomics,4.885533e-07
4,RIF1_proteomics,9.004982e-07
...,...,...
10994,ZNF586_proteomics,
10995,ZNF630_proteomics,
10996,ZNF772_proteomics,
10997,ZNF79_proteomics,


Changes in median with adaption to trunctation mutations.

In [39]:
t = trunc_wt[trunc_wt.Mutation == "Truncation"]
wt = trunc_wt[trunc_wt.Mutation == "Wildtype_Tumor"]
trunc_med = t.median()
wt_med = wt.median()

en_d = {}

for prot in p:
    dif = wt_med[prot+'_proteomics'] - trunc_med[prot+'_proteomics']
    en_d[prot+'_proteomics'] = dif

en_med = pd.DataFrame.from_dict(en_d, orient='index', columns=['En_Median'])
en_med = en_med.reset_index().rename(columns={'index':'Proteomics'})

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Unnamed: 0,Proteomics,En_Median
0,A1BG_proteomics,0.06100
1,A2M_proteomics,-0.07400
2,A2ML1_proteomics,-0.22300
3,A4GALT_proteomics,-0.03995
4,AAAS_proteomics,0.19200
...,...,...
10994,ZXDC_proteomics,0.42800
10995,ZYG11B_proteomics,0.19420
10996,ZYX_proteomics,-0.21600
10997,ZZEF1_proteomics,-0.05320


In [40]:
e_merged = e_pval.merge(en_med, on='Proteomics',how='inner')
e_merged

Unnamed: 0,Proteomics,En_P_Value,En_Median
0,NOL10_proteomics,8.999599e-08,0.44200
1,ABT1_proteomics,3.141336e-07,0.48510
2,UTP25_proteomics,4.308134e-07,0.73450
3,TOPBP1_proteomics,4.885533e-07,0.61795
4,RIF1_proteomics,9.004982e-07,0.54640
...,...,...,...
10994,ZNF586_proteomics,,
10995,ZNF630_proteomics,,-0.99250
10996,ZNF772_proteomics,,
10997,ZNF79_proteomics,,


# Step 2: Merge all cancer data frames

Merge the data frames with p-values and changes in median into one large data frame.

In [41]:
n = g_merged.merge(h_merged, on='Proteomics',how='outer')
n = n.merge(l_merged, on='Proteomics',how='outer')
n = n.merge(ls_merged, on='Proteomics',how='outer')
n= n.merge(b_merged, on='Proteomics',how='outer')
n = n.merge(o_merged, on='Proteomics',how='outer')
n = n.merge(e_merged, on='Proteomics',how='outer')
all_df = n.merge(c_merged, on='Proteomics',how='outer')
all_df


Unnamed: 0,Proteomics,Gbm_P_Value,Gbm_Median,Hnscc_P_Value,Hnscc_Median,Luad_P_Value_x,Luad_Median,Luad_P_Value_y,Lscc_Median,Brca_P_Value,Brca_Median,Ov_P_Value,Ov_Median,En_P_Value,En_Median,Colon_P_Value,Colon_Median
0,ARMH3_proteomics,5.396032e-11,0.405134,0.000558,0.135941,0.017908,0.37845,0.000252,0.42085,,,,,0.001113,-0.1503,,
1,CUTC_proteomics,1.593480e-10,0.553255,0.000189,0.314307,0.000744,1.01995,0.000841,0.55710,0.008295,0.48080,0.060164,0.159424,0.082868,0.2533,0.411506,0.08210
2,PIP4K2A_proteomics,1.009419e-09,0.838882,0.001544,0.224713,0.560996,-0.04535,0.020074,0.00965,0.203952,0.07435,0.389187,-0.052955,0.201771,-0.1690,0.199847,-0.10135
3,CUL2_proteomics,1.122076e-09,0.586396,0.400554,-0.017886,0.612583,-0.05695,0.001998,0.36965,0.198043,-0.17195,0.786184,-0.033091,0.864381,-0.0232,0.349065,0.07690
4,GDI2_proteomics,1.302273e-09,0.610188,0.630985,-0.001559,,,0.013276,0.43580,0.354712,-0.14995,0.569417,0.067389,0.061660,-0.1286,0.990512,0.04450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14463,UGT2B15_proteomics,,,,,,,,,,,,,,,,-1.04600
14464,WTH3DI_proteomics,,,,,,,,,,,,,,,,0.40100
14465,ZIC4_proteomics,,,,,,,,,,,,,,,,-0.00200
14466,ZNF419_proteomics,,,,,,,,,,,,,,,,-0.38150


In [42]:
# Keep only genes significant in > 1 cancer
all_sig = []

sig_lists = [g_sig_list, e_sig_list, b_sig_list, o_sig_list, c_sig_list, h_sig_list, l_sig_list]

for next_list in sig_lists:
    if next_list is not None:
        all_sig.append(next_list)
        

flat_list = [item for sublist in all_sig for item in sublist]
sig = set(flat_list) # remove duplicates

bool_df = all_df['Proteomics'].isin(sig)
sig_df = all_df[bool_df]
print(len(sig_df))
sig_df

2584


Unnamed: 0,Proteomics,Gbm_P_Value,Gbm_Median,Hnscc_P_Value,Hnscc_Median,Luad_P_Value_x,Luad_Median,Luad_P_Value_y,Lscc_Median,Brca_P_Value,Brca_Median,Ov_P_Value,Ov_Median,En_P_Value,En_Median,Colon_P_Value,Colon_Median
0,ARMH3_proteomics,5.396032e-11,0.405134,0.000558,0.135941,0.017908,0.37845,0.000252,0.42085,,,,,0.001113,-0.1503,,
1,CUTC_proteomics,1.593480e-10,0.553255,0.000189,0.314307,0.000744,1.01995,0.000841,0.55710,0.008295,0.48080,0.060164,0.159424,0.082868,0.2533,0.411506,0.08210
2,PIP4K2A_proteomics,1.009419e-09,0.838882,0.001544,0.224713,0.560996,-0.04535,0.020074,0.00965,0.203952,0.07435,0.389187,-0.052955,0.201771,-0.1690,0.199847,-0.10135
3,CUL2_proteomics,1.122076e-09,0.586396,0.400554,-0.017886,0.612583,-0.05695,0.001998,0.36965,0.198043,-0.17195,0.786184,-0.033091,0.864381,-0.0232,0.349065,0.07690
4,GDI2_proteomics,1.302273e-09,0.610188,0.630985,-0.001559,,,0.013276,0.43580,0.354712,-0.14995,0.569417,0.067389,0.061660,-0.1286,0.990512,0.04450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12981,SUV39H1_proteomics,,,,,,,0.652278,-0.30735,0.169685,-0.37210,,,0.000411,0.7950,,
13236,RSPH3_proteomics,,,,,0.336275,0.81260,,,,,,,0.001789,-1.5200,,
13251,ARMC3_proteomics,,,,,0.367448,0.57430,0.066333,-1.78295,,,,,0.000933,-1.0990,,
13275,CCDC57_proteomics,,,,,0.420048,-0.11070,0.406049,-0.05745,0.621414,0.10775,0.819617,0.087283,0.001480,-0.8445,,


In [43]:
sig_df = sig_df.replace(to_replace = r'_proteomics$', value = '', regex = True)
sig_df = sig_df.set_index('Proteomics')

In [44]:
# check sig_df

sig_df.loc[sig_df.index == 'MAFF']


Unnamed: 0_level_0,Gbm_P_Value,Gbm_Median,Hnscc_P_Value,Hnscc_Median,Luad_P_Value_x,Luad_Median,Luad_P_Value_y,Lscc_Median,Brca_P_Value,Brca_Median,Ov_P_Value,Ov_Median,En_P_Value,En_Median,Colon_P_Value,Colon_Median
Proteomics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
MAFF,0.008464,-0.202893,0.673107,0.060547,0.187607,-0.2822,0.426798,0.0787,0.078518,-0.4992,0.968044,-0.121901,0.741379,0.0704,0.200121,0.3235


In [45]:
sig_df.to_csv('sig_pval_all_proteins.csv')

In [46]:
# see sig in cancer
cancer = ['Gbm','En','Brca','Ov','Colon','Hnscc','Luad']
i = 0
for next_list in sig_lists:

    print(cancer[i], ':')
    if next_list is not None: 
        print(len(next_list),'\n')
    #print(next_list, '\n')
    if (i < 6):
        i += 1
    

Gbm :
1887 

En :
402 

Brca :
1 

Ov :
1 

Colon :
Hnscc :
528 

Luad :
108 



# HeatMap df

In [47]:
# Create long df for heat map

cancer = ['Gbm','Hnscc','Luad','Brca','Ov','En','Colon']
merged_dfs = [g_merged,h_merged,l_merged,b_merged,o_merged,e_merged,c_merged]

merged = pd.DataFrame()
i = 0
for c in cancer:
    m = merged_dfs[i]
    m2 = m.assign(Cancer = c)
    m2 = m2.rename(columns={c+'_P_Value': 'P_Value'})
    m2 = m2.rename(columns={c+'_Median': 'Medians'})
    merged = merged.append(m2) 
    if i < 6:
        i += 1

# Keep genes with at least one sig ttest
bool_df2 = merged['Proteomics'].isin(sig)
plot_df = merged[bool_df2]
plot_df = plot_df.replace(to_replace = r'_proteomics$', value = '', regex = True)

# log p-vals for right scale in plot (bigger circle, smaller pval)
plot_df['size'] = plot_df['P_Value'].apply(lambda x: -1*(np.log(x)/10))
plot_df = plot_df.set_index('Proteomics')
plot_df#when df of certain cancer doesnt have gene, not added for that cancer since appending(blank in graph, like nan)

Unnamed: 0_level_0,P_Value,Medians,Cancer,size
Proteomics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ARMH3,5.396032e-11,0.405134,Gbm,2.364277
CUTC,1.593480e-10,0.553255,Gbm,2.255993
PIP4K2A,1.009419e-09,0.838882,Gbm,2.071389
CUL2,1.122076e-09,0.586396,Gbm,2.060808
GDI2,1.302273e-09,0.610188,Gbm,2.045916
...,...,...,...,...
WWP1,,0.120300,Colon,
ZCCHC9,,-0.406000,Colon,
ZDHHC2,,,Colon,
ZNF260,,-0.052700,Colon,


In [48]:
# CHECK
plot_df.loc[plot_df.index == 'DOCK1']


Unnamed: 0_level_0,P_Value,Medians,Cancer,size
Proteomics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DOCK1,5.308552e-07,0.444503,Gbm,1.444878
DOCK1,0.2598816,0.112847,Hnscc,0.134753
DOCK1,0.01314539,0.5691,Luad,0.433168
DOCK1,0.05616896,0.34005,Brca,0.287939
DOCK1,0.003248402,0.296576,Ov,0.572959
DOCK1,0.2539366,0.117,En,0.137067
DOCK1,0.0786318,0.0447,Colon,0.254298


In [49]:
plot_df.to_csv('heat_map_df.csv')

# Keep PosNeg Genes df

In [50]:
def HasPosNeg(row):
    hasPos = False
    hasNeg= False

    for item in row:
        if pd.isnull(item):
            continue
        if item < -0.3:
            hasNeg = True
        if item > 0.3:
            hasPos = True
           
    if hasPos & hasNeg:
        return True
    return False


In [51]:
sig_df = sig_df.replace(to_replace = r'_proteomics$', value = '', regex = True)


only_med = sig_df.drop(columns= ['Gbm_P_Value','Hnscc_P_Value','Luad_P_Value',
                     'Brca_P_Value','Ov_P_Value','En_P_Value','Colon_P_Value'])
only_med

KeyError: "['Luad_P_Value'] not found in axis"

In [None]:
only_med["Pos_Neg"] = only_med.apply(HasPosNeg, axis = 1)

pn = only_med.loc[only_med['Pos_Neg'] == True]
pn_genes = list(pn.index) # list of genes that have posNeg

get = plot_df.index.isin(pn_genes)
plot_df2 = plot_df[get]

plot_df2['size'] = plot_df2['P_Value'].apply(lambda x: -1*(np.log(x)/10))
plot_df2

In [None]:
# CHECK
plot_df2.loc[plot_df2.index == 'MAFF']


In [None]:
plot_df2.to_csv('pos_neg_df.csv')