# Create tables with p-value and change in medians for multiple cancers 

Create a dataframe with p-value results from t-tests for all proteins (trans gene proteomics when PTEN has cnv deletions compared to PTEN wildtype). The dataframe also includes the change in medians between deletions and wildtype. Prepare these tables for further analysis by creating csv files.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u

import plot_utils as pu

In [2]:
# Returns a dataframe with proteomics and mutation type

# all_prot: list of trans genes

def all_prot_format_df(cancer_object, all_prot, gene_in = 'PTEN'):
    mut_type = cancer_object.get_genotype_all_vars(gene_in)
    
    if cancer_object.get_cancer_type() not in ('luad'):
        # Keep only tumor samples from proteomics
        prot_and_mutations = cancer_object.join_omics_to_mutations(
            mutations_genes = [gene_in], omics_df_name = 'proteomics', omics_genes = all_prot)
        # Reduce a multiindex 
        if isinstance(prot_and_mutations.keys(), pd.core.indexes.multi.MultiIndex):
            prot_and_mutations = cancer_object.reduce_multiindex(prot_and_mutations, levels_to_drop = 1)
        prot_and_mutations = prot_and_mutations[prot_and_mutations.Sample_Status == "Tumor"] # drop Normal samples

        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        mut_type = mut_type[['Mutation']] 
        prot_df = prot_and_mutations.iloc[:,:-4] # Keep only proteomics
        merged = prot_df.join(mut_type)
        
        # Keep only Wildtype and deletion
        compare = ['Wildtype_Tumor','Deletion']
        get = merged['Mutation'].isin(compare)
        del_wt = merged[get]
    
    
    # Luad has no somatic mutations for PTEN which changes some things
    else: 
        # get_genotype_all_vars add cnv data under the column PTEN
        mut_type = mut_type.drop(columns= gene_in)
        # different code because no somatic mutation data for pten (can't join to somatic mutations)
        omics = l.join_omics_to_omics(df1_name = 'CNV', df2_name='proteomics',genes1= gene_in, 
            genes2= all_prot)
        omics = l.reduce_multiindex(omics, levels_to_drop = 1, flatten = True)
        omics = omics.drop(columns='PTEN_CNV')
        # Get only tumor samples
        p = l.get_proteomics(tissue_type='tumor')
        tumor_ids = list(p.index)
        get = omics.index.isin(tumor_ids)
        omics = omics[get]
        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        merged = omics.join(mut_type) 
        # Keep only No_Mutation (wildtype) and deletion
        compare = ['No_Mutation','Deletion']
        get = merged['Mutation'].isin(compare)
        del_wt = merged[get]
        del_wt['Mutation'] = np.where(
            del_wt['Mutation'] == 'No_Mutation', 'Wildtype_Tumor', 'Deletion')

    return del_wt


In [3]:
'''
Params
cancer_object: Object. Variable for the loaded cancer data set.
cancer_name: Str. name to add to the created dataframe.
del_wt_df: DataFrame. Only samples with deletions and wildtype for PTEN. 
all_prot_list: List. All proteins in proteomics data frame. 

Returns a dataframe with the difference in medians between proteomics with PTEN wt and del (wt - del). 
'''

def get_change_in_medians_df(cancer_object, cancer_name, del_wt_df, all_prot_list):
    d = del_wt_df[del_wt_df.Mutation == "Deletion"]
    wt = del_wt_df[del_wt_df.Mutation == "Wildtype_Tumor"]
    del_med = d.median()
    wt_med = wt.median()

    med_dict = {}
    
    # Correlation: + is mutant up compared to wt, - is mutant down
    for prot in all_prot_list:
        dif = del_med[prot+'_proteomics'] - wt_med[prot+'_proteomics']
        med_dict[prot+'_proteomics'] = dif
        
    df = pd.DataFrame.from_dict(med_dict, orient='index', columns=[cancer_name+'_Median'])
    df = df.reset_index().rename(columns={'index':'Proteomics'})
    
    return df
        

#  Step 1: Create Data frames with p-values and differences in median

Each cancer needs a data frame containing only samples that have PTEN cnv deletions and PTEN wildtype with trans proteomics. Use wrap_ttest to run many T-tests for all genes in the proteomics data frame. Use get_change_in_medians_df to create the data frame with change in median values. Merge both data frames. 

Load in cancer data sets from cptac. 

In [4]:
g = cptac.Gbm()
en = cptac.Endometrial()
h = cptac.Hnscc()
l = cptac.Luad()
ls = cptac.Lscc()
o = cptac.Ovarian()
col = cptac.Colon()
b = cptac.Brca()

Checking that endometrial index is up-to-date...



Checking that luad index is up-to-date...       



Checking that lscc index is up-to-date...



Checking that ovarian index is up-to-date...



                                            

In [5]:
test_gene = 'RFC2'

# Gbm

Part 1: Format data frame using all_prot_format_df. This returns a dataframe with proteomics and a Mutation column containing only PTEN cnv deletions and wildtype tumors. 

In [6]:
gene = 'PTEN'
g_del_wt = all_prot_format_df(g, [test_gene])
g_del_wt = g_del_wt.dropna(axis='columns', how='all')
g_del_wt.head()



Name,RFC2_proteomics,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
C3L-00104,0.473474,Deletion
C3L-00365,0.422714,Deletion
C3L-00674,-0.416313,Deletion
C3L-00677,0.21758,Deletion
C3L-01040,0.189868,Deletion


In [7]:
cols = [test_gene+'_proteomics']

# Get all pvals
g = u.wrap_ttest(g_del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
g_pval = g.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Gbm_P_Value'})

Tests

In [50]:
def t_get_change_in_medians_df(cancer_object, cancer_name, del_wt_df, all_prot_list):
    d = del_wt_df[del_wt_df.Mutation == "Deletion"]
    wt = del_wt_df[del_wt_df.Mutation == "Wildtype_Tumor"]
    del_med = d.median()
    wt_med = wt.median()

    med_dict = {}
    
    # Correlation: + is mutant up compared to wt, - is mutant down
    for prot in all_prot_list:
        dif = del_med[prot+'_proteomics'] - wt_med[prot+'_proteomics'] 
        med_dict[prot+'_proteomics'] = dif
        print( del_med[prot+'_proteomics'],' - ',wt_med[prot+'_proteomics'],'=',med_dict[prot+'_proteomics'])
        
    df = pd.DataFrame.from_dict(med_dict, orient='index', columns=[cancer_name+'_Median'])
    df = df.reset_index().rename(columns={'index':'Proteomics'})
    
    return df

In [51]:
# tests
print(g_pval.loc[g_pval['Proteomics']==test_gene+'_proteomics'], '\n')
d = t_get_change_in_medians_df(g, "Gbm", g_del_wt, [test_gene])
d

        Proteomics  Gbm_P_Value
0  RFC2_proteomics      0.00058 

0.164952195028982  -  -0.213301629910476 = 0.378253824939458


Unnamed: 0,Proteomics,Gbm_Median
0,RFC2_proteomics,0.378254


# Repeat for other cancers.

# Ovarian

In [40]:
gene = 'PTEN'

o_del_wt = all_prot_format_df(o, [test_gene])
# Drop duplicate columns - FIX
o_del_wt = o_del_wt.loc[:,~o_del_wt.columns.duplicated()]

AttributeError: 'DataFrame' object has no attribute 'get_genotype_all_vars'

In [11]:
cols = [test_gene+'_proteomics']

# Get all pvals
o = u.wrap_ttest(o_del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
o_pval = o.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Ov_P_Value'})

In [41]:
# tests
print(o_pval.loc[o_pval['Proteomics']==test_gene+'_proteomics'],'\n')
d = t_get_change_in_medians_df(o, "test", o_del_wt, [test_gene])
d

        Proteomics  Ov_P_Value
0  RFC2_proteomics    0.381046 

0.0764600125  -  0.20315113550000002 = -0.12669112300000002


Unnamed: 0,Proteomics,test_Median
0,RFC2_proteomics,0.126691


# Breast

In [13]:
gene = 'PTEN'

b_del_wt = all_prot_format_df(b, [test_gene])
b_del_wt = b_del_wt.loc[:,~b_del_wt.columns.duplicated()]
b_del_wt.head()



Name,RFC2_proteomics,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
CPT000814,0.5262,Wildtype_Tumor
CPT001846,-0.4629,Wildtype_Tumor
X01BR001,-0.8732,Wildtype_Tumor
X01BR009,-1.3717,Wildtype_Tumor
X01BR010,-1.0634,Deletion


In [14]:
# Get all pvals
b_pval = u.wrap_ttest(b_del_wt, 'Mutation', cols, return_all = True)
b_pval = b_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Brca_P_Value'})

In [42]:
# tests
print(b_pval.loc[b_pval['Proteomics']==test_gene+'_proteomics'],'\n')
d = t_get_change_in_medians_df(b, "test", b_del_wt, [test_gene])
d

        Proteomics  Brca_P_Value
0  RFC2_proteomics      0.548167 

-0.26095  -  -0.4465 = 0.18555


Unnamed: 0,Proteomics,test_Median
0,RFC2_proteomics,-0.18555


# Colon

In [16]:
c_del_wt = all_prot_format_df(col, [test_gene])
c_del_wt = c_del_wt.loc[:,~c_del_wt.columns.duplicated()]



In [17]:
# Get all pvals
c_pval = u.wrap_ttest(c_del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
c_pval = c_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Colon_P_Value'})

In [43]:
# tests
print(c_pval.loc[c_pval['Proteomics']==test_gene+'_proteomics'],'\n')
d = t_get_change_in_medians_df(col, "test", c_del_wt, [test_gene])
d

        Proteomics  Colon_P_Value
0  RFC2_proteomics       0.219967 

0.00805  -  0.0883 = -0.08025


Unnamed: 0,Proteomics,test_Median
0,RFC2_proteomics,0.08025


# Hnscc

In [19]:
h_del_wt = all_prot_format_df(h, [test_gene])
h_del_wt.head()



Name,RFC2_proteomics,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
C3L-00977,24.809899,Wildtype_Tumor
C3L-00987,24.42335,Deletion
C3L-00994,24.301302,Wildtype_Tumor
C3L-00995,24.104034,Wildtype_Tumor
C3L-00997,24.6555,Wildtype_Tumor


In [20]:
# Get all pvals
h_pval = u.wrap_ttest(h_del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
h_pval = h_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Hnscc_P_Value'})

In [44]:
# tests
print(h_pval.loc[h_pval['Proteomics']==test_gene+'_proteomics'],'\n')
d = t_get_change_in_medians_df(h, "test", h_del_wt, [test_gene])
d

        Proteomics  Hnscc_P_Value
0  RFC2_proteomics       0.000632 

24.979897583001797  -  24.608463481258603 = 0.37143410174319413


Unnamed: 0,Proteomics,test_Median
0,RFC2_proteomics,-0.371434


# Luad

In [22]:
l_del_wt = all_prot_format_df(l, [test_gene])
l_del_wt = l_del_wt.loc[:,~l_del_wt.columns.duplicated()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [23]:
# Get all pvals
l_pval = u.wrap_ttest(l_del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
l_pval = l_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Luad_P_Value'})
# isoforms for some proteins

In [45]:
# tests
print(l_pval.loc[l_pval['Proteomics']==test_gene+'_proteomics'],'\n')
d = t_get_change_in_medians_df(l, "test", l_del_wt, [test_gene])
d

        Proteomics  Luad_P_Value
0  RFC2_proteomics      0.317144 

0.4792  -  0.18975 = 0.28945


Unnamed: 0,Proteomics,test_Median
0,RFC2_proteomics,-0.28945


# Lscc

In [25]:
ls_del_wt = all_prot_format_df(ls, [test_gene])
#ls_del_wt = ls_del_wt.loc[:,~ls_del_wt.columns.duplicated()]



In [26]:
# Get all pvals
ls_pval = u.wrap_ttest(ls_del_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
ls_pval = ls_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Lscc_P_Value'})
#ls_pval # isoforms for some proteins

In [46]:
# tests
print(ls_pval.loc[ls_pval['Proteomics']==test_gene+'_proteomics'], '\n')
d = t_get_change_in_medians_df(ls, "test", ls_del_wt, [test_gene])
d

        Proteomics  Lscc_P_Value
0  RFC2_proteomics      0.000013 

1.1843  -  0.1048 = 1.0795


Unnamed: 0,Proteomics,test_Median
0,RFC2_proteomics,-1.0795


# Endometrial

The Endometrial data set does not have enough cnv deletions to perform a t-test, however the data set does have enough truncation type mutations (nonsense and frame shifts). Different code is needed to create the data frame for Endometrial.

In [28]:
mut_type = en.get_genotype_all_vars(gene)

# merge cnv with genotype all mut type
prot_and_mutations = en.join_omics_to_mutations(
    mutations_genes = [gene], omics_df_name = 'proteomics', omics_genes = [test_gene])
prot_and_mutations = prot_and_mutations[prot_and_mutations.Sample_Status == "Tumor"] # drop Normal samples
ip_df = prot_and_mutations.iloc[:,:-4] #drop mutation, location cols
mut_type = mut_type[['Mutation']] 
merged = ip_df.join(mut_type) # merge mutation col from function (includes cnv)

# Keep two values to compare
compare = ['Wildtype_Tumor','Nonsense_Mutation','Frame_Shift_Ins','Frame_Shift_Del']
get = merged['Mutation'].isin(compare)
trunc_wt = merged[get]
trunc_wt['Mutation'] = np.where(
                trunc_wt['Mutation'] == 'Wildtype_Tumor', 'Wildtype_Tumor', 'Truncation')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [29]:
# Get all pvals
e_pval = u.wrap_ttest(trunc_wt, 'Mutation', cols, return_all = True, correction_method = 'fdr_bh')
e_pval = e_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'En_P_Value'})

In [49]:
#test
t = trunc_wt[trunc_wt.Mutation == "Truncation"]
wt = trunc_wt[trunc_wt.Mutation == "Wildtype_Tumor"]
trunc_med = t.median()
wt_med = wt.median()

en_d = {}

print(e_pval.loc[e_pval['Proteomics'] == test_gene+'_proteomics'], '\n')

for prot in [test_gene]:
    dif = trunc_med[prot+'_proteomics'] - wt_med[prot+'_proteomics']
    en_d[prot+'_proteomics'] = dif
    print(trunc_med[prot+'_proteomics'],'-', wt_med[prot+'_proteomics'],'=',en_d[prot+'_proteomics'])
    

en_med = pd.DataFrame.from_dict(en_d, orient='index', columns=['En_Median'])
en_med = en_med.reset_index().rename(columns={'index':'Proteomics'})
en_med

        Proteomics  En_P_Value
0  RFC2_proteomics    0.000651 

-0.10800000000000001 - 0.264 = -0.372


Unnamed: 0,Proteomics,En_Median
0,RFC2_proteomics,-0.372
