# Create tables with p-value and change in medians for multiple cancers 

Create a dataframe with p-value results from t-tests for all proteins (trans gene proteomics when PTEN has cnv deletions compared to PTEN wildtype). The dataframe also includes the change in medians between deletions and wildtype. Prepare these tables for further analysis by creating csv files.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

import plot_utils as pu
root = R'~\Github\WhenMutationsDontMatter\PTEN\Step_3_trans_effect\csv'

In [2]:
# Returns a dataframe with proteomics and mutation type

# all_prot: list of trans genes

def all_prot_format_df(cancer_object, all_prot, gene_in = 'PTEN', utils = u):
    mut_type = cancer_object.get_genotype_all_vars(gene_in)
    
    if cancer_object.get_cancer_type() not in ('luad'):
        # Keep only tumor samples from proteomics
        prot_and_mutations = cancer_object.join_omics_to_mutations(
            mutations_genes = [gene_in], omics_df_name = 'proteomics', omics_genes = all_prot)
        # Reduce a multiindex 
        if isinstance(prot_and_mutations.keys(), pd.core.indexes.multi.MultiIndex):
            prot_and_mutations = utils.reduce_multiindex(prot_and_mutations, levels_to_drop = 1)
        prot_and_mutations = prot_and_mutations[prot_and_mutations.Sample_Status == "Tumor"] # drop Normal samples

        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        mut_type = mut_type[['Mutation']] 
        prot_df = prot_and_mutations.iloc[:,:-4] # Keep only proteomics
        merged = prot_df.join(mut_type)
        
        # Keep only Wildtype and deletion
        compare = ['Wildtype_Tumor','Deletion']
        get = merged['Mutation'].isin(compare)
        del_wt = merged[get]
    
    
    # Luad has no somatic mutations for PTEN which changes some things
    else: 
        # get_genotype_all_vars add cnv data under the column PTEN
        mut_type = mut_type.drop(columns= gene_in)
        # different code because no somatic mutation data for pten (can't join to somatic mutations)
        omics = cancer_object.join_omics_to_omics(df1_name = 'CNV', df2_name='proteomics',genes1= gene_in, 
            genes2= all_prot)
        omics = utils.reduce_multiindex(omics, levels_to_drop = 1, flatten = True)
        omics = omics.drop(columns='PTEN_CNV')
        # Get only tumor samples
        p = cancer_object.get_proteomics(tissue_type='tumor')
        tumor_ids = list(p.index)
        get = omics.index.isin(tumor_ids)
        omics = omics[get]
        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        merged = omics.join(mut_type) 
        # Keep only No_Mutation (wildtype) and deletion
        compare = ['No_Mutation','Deletion']
        get = merged['Mutation'].isin(compare)
        del_wt = merged[get]
        del_wt['Mutation'] = np.where(
            del_wt['Mutation'] == 'No_Mutation', 'Wildtype_Tumor', 'Deletion')

    return del_wt


In [3]:
'''
Params
cancer_object: Object. Variable for the loaded cancer data set.
cancer_name: Str. name to add to the created dataframe.
del_wt_df: DataFrame. Only samples with deletions and wildtype for PTEN. 
all_prot_list: List. All proteins in proteomics data frame. 

Returns a dataframe with the difference in medians between proteomics with PTEN wt and del (wt - del). 
'''

def get_change_in_medians_df(cancer_object, cancer_name, del_wt_df, all_prot_list):
    d = del_wt_df[del_wt_df.Mutation == "Deletion"]
    wt = del_wt_df[del_wt_df.Mutation == "Wildtype_Tumor"]
    del_med = d.median()
    wt_med = wt.median()

    med_dict = {}
    
    # Correlation: + is mutant up compared to wt, - is mutant down
    for prot in all_prot_list:
        if prot+'_proteomics' in del_med.index and prot+'_proteomics' in wt_med.index:
            dif = del_med[prot+'_proteomics'] - wt_med[prot+'_proteomics']
            med_dict[prot+'_proteomics'] = dif
        
    df = pd.DataFrame.from_dict(med_dict, orient='index', columns=[cancer_name+'_Median'])
    df = df.reset_index().rename(columns={'index':'Proteomics'})
    
    return df
        

#  Step 1: Create Data frames with p-values and differences in median

Each cancer needs a data frame containing only samples that have PTEN cnv deletions and PTEN wildtype with trans proteomics. Use wrap_ttest to run many T-tests for all genes in the proteomics data frame. Use get_change_in_medians_df to create the data frame with change in median values. Merge both data frames. 

Load in cancer data sets from cptac. 

In [4]:
g = cptac.Gbm()
en = cptac.Endometrial()
h = cptac.Hnscc()
l = cptac.Luad()
ls = cptac.Lscc()
o = cptac.Ovarian()
col = cptac.Colon()
b = cptac.Brca()

Checking that endometrial index is up-to-date...



Checking that luad index is up-to-date...       



Checking that lscc index is up-to-date...



version 3scc v3.2.......                 
Checking that ovarian index is up-to-date...



                                            

# Gbm

Part 1: Format data frame using all_prot_format_df. This returns a dataframe with proteomics and a Mutation column containing only PTEN cnv deletions and wildtype tumors. 

In [5]:
gene = 'PTEN'
g_prot = g.get_proteomics()
g_prot_list = list(g_prot.columns)

g_del_wt = all_prot_format_df(g, g_prot_list)
g_del_wt = g_del_wt.dropna(axis='columns', thresh = 10)
g_del_wt.head()



Name,A1BG_proteomics,A2M_proteomics,AAAS_proteomics,AACS_proteomics,AADAT_proteomics,AAED1_proteomics,AAGAB_proteomics,AAK1_proteomics,AAMDC_proteomics,AAMP_proteomics,...,ZSWIM8_proteomics,ZW10_proteomics,ZWILCH_proteomics,ZWINT_proteomics,ZXDC_proteomics,ZYG11B_proteomics,ZYX_proteomics,ZZEF1_proteomics,ZZZ3_proteomics,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00104,0.07763,0.487228,-0.254208,-0.144373,0.551881,-0.025276,-0.467451,-0.089511,-0.078806,0.329603,...,-0.047437,-0.105908,-0.347076,,0.459635,0.079452,-0.784983,-0.488441,0.16799,Deletion
C3L-00365,-0.145975,0.798796,0.184242,-0.470603,,0.390211,0.245466,-0.609998,0.118625,-0.086927,...,0.161975,-0.213093,0.235571,,0.107421,0.048724,0.138403,-0.290141,0.405037,Deletion
C3L-00674,0.821991,1.09647,-0.094421,-0.106304,0.084578,0.176402,-0.248151,0.014061,-0.699773,-0.638462,...,-0.065534,-0.306717,0.879991,,0.883564,-0.172222,0.011876,-0.131889,-0.503581,Deletion
C3L-00677,-0.064567,0.129385,0.047751,-0.118187,0.237434,,0.303847,0.322163,-0.555479,-0.363414,...,-0.254535,0.463653,0.58023,0.503044,-0.604986,0.178077,-0.720059,-0.150197,-0.268715,Deletion
C3L-01040,-0.763691,-1.031834,-0.217194,-0.695701,0.184173,-0.474816,-0.051789,0.344842,-0.642746,0.068863,...,-0.092502,0.010639,-0.465079,,-0.500083,0.112651,1.00466,-0.230304,-0.102416,Deletion


Part 2: Run T-tests. Create a data frame of just significant comparisons and another data frame for all comparisons. A moderately stringent correction is used (FDR_BH).

In [6]:
g_cols = list(g_del_wt.columns[:-1])

# Get only sig genes
g_sig = u.wrap_ttest(g_del_wt, 'Mutation', g_cols, correction_method = 'fdr_bh')
# Create list of sig genes
if g_sig is not None:
    g_sig_list = list(g_sig.Comparison)
else: 
    g_sig_list = None
print('significant pvals: \n',g_sig)

# Get all pvals
g_pval = u.wrap_ttest(g_del_wt, 'Mutation', g_cols, return_all = True, correction_method = 'fdr_bh')
g_pval = g_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Gbm_P_Value'})

  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1


significant pvals: 
                Comparison       P_Value
0        ARMH3_proteomics  5.396032e-11
1         CUTC_proteomics  1.593480e-10
2      PIP4K2A_proteomics  1.009419e-09
3         CUL2_proteomics  1.122076e-09
4         GDI2_proteomics  1.302273e-09
...                   ...           ...
1895  KIAA1522_proteomics  8.588845e-03
1896      NOB1_proteomics  8.592904e-03
1897      MSH3_proteomics  8.603975e-03
1898      FPR2_proteomics  8.615010e-03
1899      FBP2_proteomics  8.625413e-03

[1900 rows x 2 columns]


  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1


Part 3: Create the differences in median df. (median of wildtype tumors - median of cnv deletions)

In [7]:
g_med = get_change_in_medians_df(g, "Gbm", g_del_wt, g_prot_list)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Part 4: Merge the p-values and the differences in median dfs.

In [8]:
g_merged = g_pval.merge(g_med, on='Proteomics',how='outer')
print('total_proteins_tested =', len(g_cols))

# Create csv
root = R'~\Github\WhenMutationsDontMatter\PTEN\Step_3_trans_effect\csv'
g_merged.to_csv(root+R'\gbm_pval_medians.csv',index=False)
g_merged

total_proteins_tested = 11009


Unnamed: 0,Proteomics,Gbm_P_Value,Gbm_Median
0,ARMH3_proteomics,5.396032e-11,-0.405134
1,CUTC_proteomics,1.593480e-10,-0.553255
2,PIP4K2A_proteomics,1.009419e-09,-0.838882
3,CUL2_proteomics,1.122076e-09,-0.586396
4,GDI2_proteomics,1.302273e-09,-0.610188
...,...,...,...
11004,YBX2_proteomics,,
11005,ZNF354C_proteomics,,
11006,ZNF432_proteomics,,
11007,ZNF805_proteomics,,


# Repeat for other cancers.

# Ovarian

In [9]:
gene = 'PTEN'
o_prot = o.get_proteomics()
o_prot = u.reduce_multiindex(o_prot, levels_to_drop = 1)
o_prot_list = list(o_prot.columns)

o_del_wt = all_prot_format_df(o, o_prot_list)
o_del_wt = o_del_wt.dropna(axis='columns', thresh = 10)



There are isoforms in ovarian. Create unique columns names by adding a number to differentiate. 

In [10]:
cols = pd.Series(o_del_wt.columns[:])

for dup in cols[cols.duplicated()].unique(): 
    cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

# rename the columns with the cols list.
o_del_wt.columns=cols

In [11]:
o_cols = list(o_del_wt.columns[:-1])

# Get only sig sites
o_sig = u.wrap_ttest(o_del_wt, 'Mutation', o_cols, correction_method = 'fdr_bh')
if o_sig is not None:
    o_sig_list = list(o_sig.Comparison)
else: 
    o_sig_list = None
print('significant pvals: \n',o_sig)

# Get all pvals
o_pval = u.wrap_ttest(o_del_wt, 'Mutation', o_cols, return_all = True, correction_method = 'fdr_bh')
o_pval = o_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Ov_P_Value'})

significant pvals: 
          Comparison       P_Value
0  MMS19_proteomics  1.058914e-08


In [12]:
o_med = get_change_in_medians_df(o, "Ov", o_del_wt, o_prot_list)

In [13]:
o_merged = o_pval.merge(o_med, on='Proteomics',how='outer')
print('total_proteins_tested =', len(o_cols))

# Create csv
o_merged.to_csv(root+R'\ov_pval_medians.csv',index=False)
o_merged

total_proteins_tested = 10601


Unnamed: 0,Proteomics,Ov_P_Value,Ov_Median
0,MMS19_proteomics,1.058914e-08,-0.324897
1,RACK1_proteomics,8.573522e-05,-0.191250
2,PI4K2A_proteomics,9.310383e-05,-0.298399
3,WAPL_proteomics,1.724680e-04,-0.218775
4,IDE_proteomics,2.011475e-04,-0.208025
...,...,...,...
10596,LRP1_proteomics,9.993474e-01,0.126014
10597,DDT_proteomics,9.994146e-01,-0.222938
10598,S100A11_proteomics,9.994208e-01,-0.039314
10599,NKIRAS2_proteomics,9.994558e-01,-0.002824


# Breast

In [14]:
gene = 'PTEN'
b_prot = b.get_proteomics()
b_prot = u.reduce_multiindex(b_prot, levels_to_drop = 1)
b_prot_list = list(b_prot.columns)

b_del_wt = all_prot_format_df(b, b_prot_list)
b_del_wt = b_del_wt.dropna(axis='columns', thresh = 10)



In [15]:
# Differentiate duplicate column names
cols = pd.Series(b_del_wt.columns[:])

for dup in cols[cols.duplicated()].unique(): 
    cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

# rename the columns with the cols list.
b_del_wt.columns=cols

In [16]:
b_cols = list(b_del_wt.columns[:-1])

# Get only sig sites
b_sig = u.wrap_ttest(b_del_wt, 'Mutation', b_cols, correction_method = 'fdr_bh')
if b_sig is not None:
    b_sig_list = list(b_sig.Comparison)
else: 
    b_sig_list = None
print('significant pvals: \n',b_sig)

# Get all pvals
b_pval = u.wrap_ttest(b_del_wt, 'Mutation', b_cols, return_all = True)
b_pval = b_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Brca_P_Value'})

significant pvals: 
         Comparison   P_Value
0  PTEN_proteomics  0.000004


In [17]:
b_med = get_change_in_medians_df(b, "Brca", b_del_wt, b_prot_list)

In [18]:
b_merged = b_pval.merge(b_med, on='Proteomics',how='outer')
print('total_proteins_tested =', len(b_cols))

# Create csv
b_merged.to_csv(root+R'\brca_pval_medians.csv',index=False)
b_merged.head()

total_proteins_tested = 10107


Unnamed: 0,Proteomics,Brca_P_Value,Brca_Median
0,PTEN_proteomics,4e-06,-1.008
1,EIF4H_proteomics,2.5e-05,0.41295
2,MIEF1_proteomics,3.1e-05,0.84135
3,TMSB10_proteomics,4.3e-05,0.8763
4,DRG1_proteomics,6.5e-05,0.75135


# Colon

In [19]:
gene = 'PTEN'
c_prot = col.get_proteomics()
c_prot_list = list(c_prot.columns)

c_del_wt = all_prot_format_df(col, c_prot_list)
c_del_wt = c_del_wt.dropna(axis='columns', thresh=10)



In [20]:
# Differentiate duplicate column names
cols = pd.Series(c_del_wt.columns[:])

for dup in cols[cols.duplicated()].unique(): 
    cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

# rename the columns with the cols list.
c_del_wt.columns=cols

In [21]:
c_cols = list(c_del_wt.columns[:-1])

# Get only sig sites
c_sig = u.wrap_ttest(c_del_wt, 'Mutation', c_cols, correction_method = 'fdr_bh')
if c_sig is not None:
    c_sig_list = list(c_sig.Comparison)
else: 
    c_sig_list = None
print('significant pvals: \n',c_sig)

# Get all pvals
c_pval = u.wrap_ttest(c_del_wt, 'Mutation', c_cols, return_all = True, correction_method = 'fdr_bh')
c_pval = c_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Colon_P_Value'})

  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1


significant pvals: 
 None


In [22]:
c_med = get_change_in_medians_df(col, "Colon", c_del_wt, c_prot_list)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [23]:
c_merged = c_pval.merge(c_med, on='Proteomics',how='outer')
print('total_proteins_tested =', len(c_cols))

# Create csv
c_merged.to_csv(root+R'\colon_pval_medians.csv',index=False)
c_merged

total_proteins_tested = 7739


Unnamed: 0,Proteomics,Colon_P_Value,Colon_Median
0,DFFA_proteomics,0.000037,-0.19930
1,WAPL_proteomics,0.000097,-0.19605
2,SEC14L2_proteomics,0.000136,-0.66800
3,GBF1_proteomics,0.000164,-0.12630
4,STK11IP_proteomics,0.000239,-0.31250
...,...,...,...
7734,WWP1_proteomics,,-0.12030
7735,ZBTB20_proteomics,,0.52450
7736,ZCCHC9_proteomics,,0.40600
7737,ZIC4_proteomics,,0.00200


# Hnscc

In [24]:
gene = 'PTEN'
h_prot = h.get_proteomics()
h_prot_list = list(h_prot.columns)

h_del_wt = all_prot_format_df(h, h_prot_list)
h_del_wt = h_del_wt.dropna(axis='columns', how='all')
h_del_wt = h_del_wt.dropna(axis = 'columns',thresh = 10)



In [25]:
h_cols = list(h_del_wt.columns[:-1])

# Get only sig sites
h_sig = u.wrap_ttest(h_del_wt, 'Mutation', h_cols, correction_method = 'fdr_bh')
if h_sig is not None:
    h_sig_list = list(h_sig.Comparison)
else: 
    h_sig_list = None
print('significant pvals: \n',h_sig)

# Get all pvals
h_pval = u.wrap_ttest(h_del_wt, 'Mutation', h_cols, return_all = True, correction_method = 'fdr_bh')
h_pval = h_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Hnscc_P_Value'})

  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1


significant pvals: 
             Comparison       P_Value
0       NLN_proteomics  1.113430e-09
1    AKR1C3_proteomics  1.032977e-08
2    AKR1C1_proteomics  6.931627e-08
3      ADI1_proteomics  1.601880e-07
4     EPHX1_proteomics  3.899394e-07
..                 ...           ...
528   HIP1R_proteomics  2.294252e-03
529    GCSH_proteomics  2.294435e-03
530   DHCR7_proteomics  2.307549e-03
531    POGZ_proteomics  2.311247e-03
532   TRAF1_proteomics  2.340517e-03

[533 rows x 2 columns]


  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1


In [26]:
h_med = get_change_in_medians_df(h, "Hnscc", h_del_wt, h_prot_list)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [27]:
h_merged = h_pval.merge(h_med, on='Proteomics',how='outer')
print('total_proteins_tested =', len(h_cols))

# Create csv
h_merged.to_csv(root+R'\hnscc_pval_medians.csv',index=False)
h_merged

total_proteins_tested = 11325


Unnamed: 0,Proteomics,Hnscc_P_Value,Hnscc_Median
0,NLN_proteomics,1.113430e-09,0.431666
1,AKR1C3_proteomics,1.032977e-08,0.951905
2,AKR1C1_proteomics,6.931627e-08,1.961262
3,ADI1_proteomics,1.601880e-07,0.513198
4,EPHX1_proteomics,3.899394e-07,0.871688
...,...,...,...
11320,ZNF180_proteomics,,-0.897906
11321,ZNF213_proteomics,,0.420543
11322,ZNF507_proteomics,,0.322312
11323,ZNF611_proteomics,,0.291550


# Luad

In [28]:
gene = 'PTEN'
l_prot = l.get_proteomics()
l_prot = u.reduce_multiindex(l_prot, levels_to_drop = 1)
l_prot_list = list(l_prot.columns)

l_del_wt = all_prot_format_df(l, l_prot_list)
l_del_wt = l_del_wt.dropna(axis='columns', thresh=10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [29]:
# Differentiate duplicate column names
cols = pd.Series(l_del_wt.columns[:])

for dup in cols[cols.duplicated()].unique(): 
    cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

# rename the columns with the cols list.
l_del_wt.columns=cols

In [30]:
l_cols = list(l_del_wt.columns[:-2])

# Get only sig sites
l_sig = u.wrap_ttest(l_del_wt, 'Mutation', l_cols, correction_method = 'fdr_bh')
if l_sig is not None:
    l_sig_list = list(l_sig.Comparison)
else: 
    l_sig_list = None
print('significant pvals: \n',l_sig)

# Get all pvals
l_pval = u.wrap_ttest(l_del_wt, 'Mutation', l_cols, return_all = True, correction_method = 'fdr_bh')
l_pval = l_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Luad_P_Value'})

  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1


significant pvals: 
               Comparison       P_Value
0        ATE1_proteomics  2.851800e-07
1       SHOC2_proteomics  6.424816e-07
2        EGFR_proteomics  9.121167e-07
3      LANCL2_proteomics  1.742749e-06
4        GGCT_proteomics  2.854181e-06
..                   ...           ...
106    FERMT3_proteomics  4.804371e-04
107  ARHGAP15_proteomics  4.875693e-04
108    CRYBG2_proteomics  5.063478e-04
109     GNAI2_proteomics  5.076782e-04
110     CIAO1_proteomics  5.181887e-04

[111 rows x 2 columns]


  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1


In [31]:
l_med = get_change_in_medians_df(l, "Luad", l_del_wt, l_prot_list)

In [32]:
l_merged = l_pval.merge(l_med, on='Proteomics',how='outer')
print('total_proteins_tested =', len(l_cols))

# Create csv
l_merged.to_csv(root+R'\luad_pval_medians.csv',index=False)
l_merged.head()

total_proteins_tested = 10698


Unnamed: 0,Proteomics,Luad_P_Value,Luad_Median
0,ATE1_proteomics,2.8518e-07,-0.88365
1,SHOC2_proteomics,6.424816e-07,-0.606
2,EGFR_proteomics,9.121167e-07,2.3297
3,LANCL2_proteomics,1.742749e-06,1.21475
4,GGCT_proteomics,2.854181e-06,2.12715


# Lscc

In [33]:
gene = 'PTEN'
ls_prot = ls.get_proteomics()
ls_prot = u.reduce_multiindex(ls_prot, levels_to_drop = 1)
ls_prot_list = list(ls_prot.columns)

ls_del_wt = all_prot_format_df(ls, ls_prot_list)
ls_del_wt = ls_del_wt.dropna(axis='columns', thresh=10)



In [34]:
# Differentiate duplicate column names
cols = pd.Series(ls_del_wt.columns[:])

for dup in cols[cols.duplicated()].unique(): 
    cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

# rename the columns with the cols list.
ls_del_wt.columns=cols

In [35]:
ls_cols = list(ls_del_wt.columns[:-2])

# Get only sig sites
ls_sig = u.wrap_ttest(ls_del_wt, 'Mutation', ls_cols, correction_method = 'fdr_bh')
if ls_sig is not None:
    ls_sig_list = list(ls_sig.Comparison)
else: 
    ls_sig_list = None
print('significant pvals: \n', ls_sig)

# Get all pvals
ls_pval = u.wrap_ttest(ls_del_wt, 'Mutation', ls_cols, return_all = True, correction_method = 'fdr_bh')
ls_pval = ls_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Lscc_P_Value'})

significant pvals: 
           Comparison       P_Value
0   ATAD1_proteomics  5.610837e-10
1   BTAF1_proteomics  1.962971e-07
2  VPS26A_proteomics  1.220741e-05
3    PTEN_proteomics  1.473964e-05


In [36]:
ls_med = get_change_in_medians_df(ls, "Lscc", ls_del_wt, ls_prot_list)

In [37]:
ls_merged = ls_pval.merge(ls_med, on='Proteomics',how='outer')
print('total_proteins_tested =', len(ls_cols))

# Create csv
ls_merged.to_csv(root+R'\lscc_pval_medians.csv',index=False)
ls_merged.head()

total_proteins_tested = 11574


Unnamed: 0,Proteomics,Lscc_P_Value,Lscc_Median
0,ATAD1_proteomics,5.610837e-10,-1.34185
1,BTAF1_proteomics,1.962971e-07,-0.59625
2,VPS26A_proteomics,1.220741e-05,-0.453
3,PTEN_proteomics,1.473964e-05,-0.57975
4,KIF20B_proteomics,2.233989e-05,-1.13105


# Endometrial

The Endometrial data set does not have enough cnv deletions to perform a t-test, however the data set does have enough truncation type mutations (nonsense and frame shifts). Different code is needed to create the data frame for Endometrial.

In [38]:
gene = 'PTEN'
prot = en.get_proteomics()
e_prot_list = list(prot.columns)

mut_type = en.get_genotype_all_vars(gene)

# merge cnv with genotype all mut type
prot_and_mutations = en.join_omics_to_mutations(
    mutations_genes = [gene], omics_df_name = 'proteomics', omics_genes = e_prot_list)
prot_and_mutations = prot_and_mutations[prot_and_mutations.Sample_Status == "Tumor"] # drop Normal samples
ip_df = prot_and_mutations.iloc[:,:-4] #drop mutation, location cols
mut_type = mut_type[['Mutation']] 
merged = ip_df.join(mut_type) # merge mutation col from function (includes cnv)

# Keep two values to compare
compare = ['Wildtype_Tumor','Nonsense_Mutation','Frame_Shift_Ins','Frame_Shift_Del']
get = merged['Mutation'].isin(compare)
trunc_wt = merged[get]
trunc_wt['Mutation'] = np.where(
                trunc_wt['Mutation'] == 'Wildtype_Tumor', 'Wildtype_Tumor', 'Truncation')
trunc_wt = trunc_wt.dropna(axis = 'columns',thresh = 10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [39]:
e_cols = list(trunc_wt.columns[:-1])

# Get only sig sites
e_sig = u.wrap_ttest(trunc_wt, 'Mutation', e_cols, correction_method = 'fdr_bh')
if e_sig is not None:
    e_sig_list = list(e_sig.Comparison)
else: 
    e_sig_list = None
print('significant pvals: \n',e_sig)

# Get all pvals
e_pval = u.wrap_ttest(trunc_wt, 'Mutation', e_cols, return_all = True, correction_method = 'fdr_bh')
e_pval = e_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'En_P_Value'})

  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1


significant pvals: 
              Comparison       P_Value
0      NOL10_proteomics  8.999599e-08
1       ABT1_proteomics  3.141336e-07
2      UTP25_proteomics  4.308134e-07
3     TOPBP1_proteomics  4.885533e-07
4       RIF1_proteomics  9.004982e-07
..                  ...           ...
407     ORC5_proteomics  1.865479e-03
408    ARNTL_proteomics  1.903651e-03
409    RBM19_proteomics  1.916007e-03
410  PPP2R2A_proteomics  1.917242e-03
411     FBP1_proteomics  1.925357e-03

[412 rows x 2 columns]


  **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  reject = pvals_sorted <= ecdffactor*alpha
  pvals_corrected[pvals_corrected>1] = 1
  pvals_corrected[pvals_corrected>1] = 1


Differences in median with adaption to trunctation mutations.

In [40]:
t = trunc_wt[trunc_wt.Mutation == "Truncation"]
wt = trunc_wt[trunc_wt.Mutation == "Wildtype_Tumor"]
trunc_med = t.median()
wt_med = wt.median()

en_d = {}

for prot in e_prot_list:
    if prot+'_proteomics' in trunc_med.index and prot+'_proteomics' in wt_med.index:
        dif = trunc_med[prot+'_proteomics'] - wt_med[prot+'_proteomics']
        en_d[prot+'_proteomics'] = dif

en_med = pd.DataFrame.from_dict(en_d, orient='index', columns=['En_Median'])
en_med = en_med.reset_index().rename(columns={'index':'Proteomics'})

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [41]:
e_merged = e_pval.merge(en_med, on='Proteomics',how='outer')
print('total_proteins_tested =', len(e_cols))

# Create csv
e_merged.to_csv(root+R'\endo_pval_medians.csv',index=False)
e_merged

total_proteins_tested = 10693


Unnamed: 0,Proteomics,En_P_Value,En_Median
0,NOL10_proteomics,8.999599e-08,-0.44200
1,ABT1_proteomics,3.141336e-07,-0.48510
2,UTP25_proteomics,4.308134e-07,-0.73450
3,TOPBP1_proteomics,4.885533e-07,-0.61795
4,RIF1_proteomics,9.004982e-07,-0.54640
...,...,...,...
10688,TMEM50A_proteomics,,0.63100
10689,TSPAN4_proteomics,,-0.39950
10690,USP27X_proteomics,,-0.33050
10691,ZNF586_proteomics,,


# Get a list of significant genes in at least one cancer

In [42]:
# Keep only genes significant in > 1 cancer
all_sig = []

sig_lists = [g_sig_list, e_sig_list, b_sig_list, o_sig_list, c_sig_list, h_sig_list, l_sig_list, ls_sig_list]

for next_list in sig_lists:
    if next_list is not None:
        all_sig.append(next_list)
        

flat_list = [item for sublist in all_sig for item in sublist] #change list of lists, to just one list
sig = list(set(flat_list)) # remove duplicates
print('Number of significant in >= 1 cancer:', len(sig))

Number of significant in >= 1 cancer: 2611


In [43]:
s = pd.Series(sig)
s = s.replace(to_replace = r'_proteomics$', value = '', regex = True)
s.to_csv(root+R'\list_sig_one_cancer.csv', index=False)

# Get a list of significant genes in multiple cancers

In [44]:
# sig in multiple (more than 1) cancers
mult = list(set(i for i in flat_list if flat_list.count(i) > 1)) # Keep genes sig in more than 1 cancer
print('Number of significant in mult cancers:', len(mult))


Number of significant in mult cancers: 311


In [45]:
m = pd.Series(mult)
m = m.replace(to_replace = r'_proteomics$', value = '', regex = True)
m.to_csv(root+R'\list_sig_multiple_cancers.csv', index=False)

# Number of significant comparisons for each cancer

In [46]:
sig_dict = {'Gbm': g_sig_list, 'En': e_sig_list, 'Brca': b_sig_list, 'Ov': o_sig_list, 'Colon': c_sig_list,
            'Hnscc': h_sig_list, 'Luad': l_sig_list, 'Lscc': ls_sig_list}
print('Number of significant tests:\n')
i = 0
for cancer in sig_dict:

    print(cancer, ':')
    if sig_dict[cancer] is not None: 
        print(len(sig_dict[cancer]))
    print('\n')
    if (i < 6):
        i += 1
    

Number of significant tests:

Gbm :
1900


En :
412


Brca :
1


Ov :
1


Colon :


Hnscc :
533


Luad :
111


Lscc :
4




In [47]:
protein_dict = {'Gbm': len(g_cols), 'En': len(e_cols), 'Brca': len(b_cols), 'Ov': len(o_cols), 'Colon': len(c_cols),
            'Hnscc': len(h_cols), 'Luad': len(l_cols), 'Lscc': len(ls_cols)}

print('Information about total proteins tested (dropped if < 10 samples with nonNaN data)\n')
i = 0
for cancer in protein_dict:

    print(cancer, ':')
    if protein_dict[cancer] is not None: 
        print('total_proteins_tested:', protein_dict[cancer])
    print('\n')
    if (i < 6):
        i += 1

Information about total proteins tested (dropped if < 10 samples with nonNaN data)

Gbm :
total_proteins_tested: 11009


En :
total_proteins_tested: 10693


Brca :
total_proteins_tested: 10107


Ov :
total_proteins_tested: 10601


Colon :
total_proteins_tested: 7739


Hnscc :
total_proteins_tested: 11325


Luad :
total_proteins_tested: 10698


Lscc :
total_proteins_tested: 11574


