# Make tables: calculate p-values and differential expressions for each cancer 

Create a dataframe with p-value results from t-tests for all phosphosites of the selected trans proteins (phosphoproteomics when KRAS has a hotspot missense mutation compared to KRAS wildtype). The dataframe also includes the change in medians between missense mutations and wildtype. Prepare these tables for further analysis by creating csv files.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u
import plot_utils as p

In [2]:
import warnings
# Silence every warning for the rest of the notebook session.
# NOTE(review): this also hides deprecation warnings from pandas and cptac.
warnings.filterwarnings('ignore')

In [3]:
def all_prot_format_df(cancer_object, all_prot, gene_in = 'KRAS', utils = u):
    """Return tumor phosphoproteomics joined with the mutation type of gene_in.

    Params
    cancer_object: Object. Variable for the loaded cancer data set.
    all_prot: List. Trans genes whose phosphosites are requested.
    gene_in: Str. Gene whose mutation status defines the two groups.
    utils: Module. Provides reduce_multiindex (defaults to cptac.utils).

    Returns a dataframe with one column per phosphosite followed by
    'Mutation' and '<gene_in>_Location' as the last two columns. Only samples
    that are 'Wildtype_Tumor' or 'Missense' and whose location is a G12, G13
    or Q61 hotspot (or 'No_mutation') are kept. The '_phosphoproteomics'
    suffix is removed from the column names.
    """
    location_col = gene_in + '_Location'

    # Collapse both spellings of a missense mutation into a single label.
    # Series.replace returns a new object, so this does not depend on an
    # in-place edit of a column selection taking effect.
    mut_type = cancer_object.get_genotype_all_vars(gene_in)
    mut_type = mut_type['Mutation'].replace(
        {'Missense_Mutation': 'Missense', 'nonsynonymous SNV': 'Missense'}).to_frame()

    # Keep only tumor samples from phosphoproteomics
    prot_and_mutations = cancer_object.join_omics_to_mutations(
        mutations_genes = [gene_in], omics_df_name = 'phosphoproteomics', omics_genes = all_prot, tissue_type = 'tumor')

    # Reduce multiindex keeping phosphosite
    if isinstance(prot_and_mutations.columns, pd.MultiIndex):
        cancer_type = cancer_object.get_cancer_type()
        if cancer_type in ('endometrial', 'colon'):
            prot_and_mutations = utils.reduce_multiindex(prot_and_mutations, flatten=True)
        elif cancer_type == 'luad':
            prot_and_mutations = utils.reduce_multiindex(prot_and_mutations, levels_to_drop = [2,3], flatten = True)
        prot_and_mutations = rename_duplicate_cols(prot_and_mutations) # make unique cols

    # Keep certain missense mutations. Each location entry is an iterable of
    # values; join it into one string so it can be searched.
    prot_and_mutations[location_col] = [','.join(map(str, loc)) for loc in prot_and_mutations[location_col]]
    hotspots = ['G12', 'G13', 'Q61', 'No_mutation']
    # One combined mask selects each sample at most once, even when its
    # location string mentions more than one hotspot.
    in_hotspots = prot_and_mutations[location_col].str.contains('|'.join(hotspots), regex=True, na=False)
    hotspots_wt = prot_and_mutations[in_hotspots]

    # Merge Mutation column from get_genotype_all_vars (includes cnv) with phosphoproteomics
    # NOTE(review): assumes the last four columns are the non-omics columns
    # added by join_omics_to_mutations - confirm against the cptac version in use.
    prot_df = hotspots_wt.iloc[:, :-4] # Keep only phosphoproteomics
    merged = prot_df.join(mut_type)
    merged = merged.join(hotspots_wt[location_col])

    # Keep only wildtype and missense samples
    compare = ['Wildtype_Tumor', 'Missense']
    mut_wt = merged[merged['Mutation'].isin(compare)]

    mut_wt = mut_wt.rename(columns={col: re.sub(r'_phosphoproteomics', '', col) for col in mut_wt.columns.tolist()})
    return mut_wt


In [4]:
def get_change_in_medians_df(cancer_object, cancer_name, del_wt_df, all_prot_list):
    """Return the per-column difference in medians, Missense minus Wildtype_Tumor.

    Params
    cancer_object: Object. Variable for the loaded cancer data set (not used here).
    cancer_name: Str. Name used as the prefix of the '<cancer_name>_Median' column.
    del_wt_df: DataFrame. Omics columns plus a 'Mutation' column holding
        'Missense' and 'Wildtype_Tumor' labels.
    all_prot_list: List. Column names to report; names without a median in
        both groups are skipped.

    Returns a dataframe with columns 'Phospho' and '<cancer_name>_Median'.
    A positive value means the mutant median is higher than wildtype.
    """
    mutant = del_wt_df[del_wt_df.Mutation == "Missense"]
    wildtype = del_wt_df[del_wt_df.Mutation == "Wildtype_Tumor"]
    # The frame also carries string columns (e.g. 'Mutation'); restrict the
    # median to numeric columns so it does not raise on them.
    mutant_med = mutant.median(numeric_only=True)
    wt_med = wildtype.median(numeric_only=True)

    # Correlation: + is mutant up compared to wt, - is mutant down
    med_dict = {
        prot: mutant_med[prot] - wt_med[prot]
        for prot in all_prot_list
        if prot in mutant_med.index and prot in wt_med.index
    }

    df = pd.DataFrame.from_dict(med_dict, orient='index', columns=[cancer_name+'_Median'])
    df = df.reset_index().rename(columns={'index':'Phospho'})

    return df
        

In [5]:
def rename_duplicate_cols(df):
    """Make the column names of df unique and return df.

    The first occurrence of a repeated name is kept as is; every later
    occurrence gets '_i' appended, where i counts up from 1. The columns of
    df are replaced in place.
    """
    names = pd.Series(df.columns[:])
    repeated = names[names.duplicated()].unique()

    for base in repeated:
        hits = names == base
        positions = names[hits].index.values.tolist()
        n_hits = sum(hits)
        # Occurrence 0 keeps the bare name, occurrence k becomes '<name>_k'.
        names[positions] = [base if k == 0 else base + '_' + str(k) for k in range(n_hits)]

    # Install the de-duplicated names on the frame.
    df.columns = names
    return df

In [6]:
def find_few_data_genes(df, mincount, omics_cols = None, binary_labels = ('Missense','Wildtype_Tumor'),
    binary_col = 'Mutation', gene = 'KRAS'):
    """Return the columns that do not have enough data in both groups.

    Params
    df: DataFrame containing omics data and a binary column.
    mincount: Int. A column is listed when either group has mincount or
        fewer non-NaN values (the cutoff is inclusive).
    omics_cols: Names of columns to check. Defaults to every column of df
        except binary_col. The list passed in is not modified.
    binary_labels: The two values of binary_col that define the groups.
    binary_col: Name of the column holding the group labels.
    gene: Not used; kept so existing calls keep working.

    Returns: List of column names with not enough data. Also prints how many
    columns were listed out of the number checked.
    """
    # Separate into binary groups
    label_1, label_2 = binary_labels[0], binary_labels[1]
    partition1 = df.loc[df[binary_col] == label_1]
    partition2 = df.loc[df[binary_col] == label_2]

    # Get list of columns; build a new list so the caller's list is left untouched
    if omics_cols is None:
        omics_cols = list(df.columns)
    omics_cols = [c for c in omics_cols if c != binary_col]

    # Collect columns with too few non-NaN values in either group
    not_enough_data = [
        c for c in omics_cols
        if len(partition1[c].dropna(axis='rows')) <= mincount
        or len(partition2[c].dropna(axis='rows')) <= mincount
    ]

    print('Num genes with not enough data: ', len(not_enough_data), '/', len(omics_cols))
    return not_enough_data

#  Step 1: Create data frames with p-values and differential expressions

Each cancer needs a data frame containing only samples that have KRAS hotspot missense mutations or KRAS wildtype, together with the trans phosphoproteomics. Use wrap_ttest to run t-tests for all phosphosite columns in the data frame. Use get_change_in_medians_df to create the data frame with differential expression values. Merge both data frames. 

Load in cancer data sets from cptac. 

In [2]:
# Load the endometrial, LUAD and colon datasets from cptac.
# These three names are referenced when cancer_objects is built below.
en = cptac.Endometrial()
l = cptac.Luad()
col = cptac.Colon()

                                                

In [None]:
# Get MAPK pathway proteins
# NOTE(review): the same three statements are repeated at the top of the
# p-value cell further down; this cell is marked as not executed (In [None]).
prot = u.get_proteins_in_pathways('MAPK Signaling Pathway', 'wikipathways')
prot_list = list(prot.member)
print('Num interacting proteins:', len(prot_list))

# Calculate p-values and differential expressions

In [8]:
# Short label (used in column names and csv file names) -> loaded dataset
cancer_objects = {
    'Endo': en,
    'Colon': col,
    'Luad': l,
}

In [9]:
# Total number of phosphoproteomics columns available in each cancer
total_phospho = {
    name: len(dataset.get_phosphoproteomics().columns)
    for name, dataset in cancer_objects.items()
}

In [10]:
import os

# Get MAPK pathway proteins
prot = u.get_proteins_in_pathways('MAPK Signaling Pathway', 'wikipathways')
prot_list = list(prot.member)
print('Num interacting proteins:', len(prot_list))

gene = 'KRAS'
merged_dfs = {}

# to_csv does not create missing folders, so make sure the output folder exists
out_dir = 'csv/Single_Cancer/'
os.makedirs(out_dir, exist_ok=True)

for cancer in cancer_objects:
    print(cancer)

    # Format df for t-test
    mut_wt = all_prot_format_df(cancer_objects[cancer], prot_list, gene_in = gene)
    mut_wt = mut_wt.dropna(axis = 'columns', how = 'all') #drop cols with only nan

    # T-test on every column except the last two, which all_prot_format_df
    # fills with the mutation label and the mutation location
    cols_in = list(mut_wt.columns[:-2])
    print('cols tested: ', len(cols_in), '/', total_phospho[cancer], 'of total')

    # Get all pvals
    min_num = 5
    all_pval = u.wrap_ttest(mut_wt, 'Mutation', cols_in, return_all = True, correction_method = 'fdr_bh', mincount = min_num)
    all_pval = all_pval.rename(columns = {'Comparison': 'Phospho','P_Value': cancer+'_P_Value'})

    # Get difference in medians
    delta_median_df = get_change_in_medians_df(cancer_objects[cancer], cancer, mut_wt, cols_in)

    # Merge pval_df and delta_median_df; the outer merge keeps sites present in only one table
    pval_medians_df = all_pval.merge(delta_median_df, on='Phospho', how='outer')
    pval_medians_df = pval_medians_df.replace(to_replace = r'_phosphoproteomics', value = '', regex = True)
    merged_dfs[cancer] = pval_medians_df # testing purposes

    # Create csv
    pval_medians_df.to_csv(out_dir+cancer+'_phospho_MAPK.csv', index=False)
    print(pval_medians_df.head(), '\n')

Num interacting proteins: 246
Endo
cols tested:  1375 / 73212 of total
      Phospho  Endo_P_Value  Endo_Median
0  MKNK2_S220      0.001348      0.94700
1  NFKB2_S858      0.001348      0.69285
2   RAF1_T330      0.002399      1.12066
3   TAB1_S378      0.002916      0.46350
4  MKNK1_S401      0.003520      0.47800 

Colon
cols tested:  729 / 31339 of total
              Phospho  Colon_P_Value  Colon_Median
0  TGFBR2_S553_P37173       0.176137        0.4155
1    RELA_S238_Q04206       0.229576       -0.4200
2     NF1_S864_P21359       0.980298        0.0255
3    NF1_S2543_P21359       0.980298        0.2695
4    NF1_S2515_P21359       0.980298       -0.0775 

Luad
cols tested:  867 / 40971 of total
          Phospho  Luad_P_Value  Luad_Median
0      SOS1_S1161      0.000055      1.30250
1  MKNK1_S209S214      0.015097      1.49000
2      MKNK2_S220      0.015097      1.33870
3       FLNB_S316      0.015097      0.98345
4      FLNB_T2585      0.015097      1.00035 



# Get number of significant phosphosites in each cancer

In [11]:
# The p-values from wrap_ttest are already corrected, so every cancer uses the same 0.05 cutoff
all_sig = []

for cancer, df in merged_dfs.items():
    passes_cutoff = df[cancer+'_P_Value'] < 0.05
    sig_df = df.loc[passes_cutoff]
    print(cancer, 'sig comparisons:', len(sig_df))
    sig_list = list(sig_df['Phospho'])
    all_sig.append(sig_list)

# Flatten the per-cancer lists into one list
flat_list = []
for sublist in all_sig:
    flat_list.extend(sublist)

# Keep a single copy of any site that is significant in several cancers
sig = list(set(flat_list))
print('\nNumber of significant comparisons in at least 1 cancer:', len(sig))

Endo sig comparisons: 17
Colon sig comparisons: 0
Luad sig comparisons: 20

Number of significant comparisons in at least 1 cancer: 34


In [12]:
# Find sites that are significant in both Endo and Luad
e_df = merged_dfs['Endo']
e_sig = e_df[e_df['Endo_P_Value'] < 0.05]
e_list = e_sig['Phospho'].tolist()

luad_df = merged_dfs['Luad']
luad_sig = luad_df[luad_df['Luad_P_Value'] < 0.05]
luad_list = luad_sig['Phospho'].tolist()

# Walk the Endo list so the result keeps Endo's ordering
luad_lookup = set(luad_list)
both = [site for site in e_list if site in luad_lookup]
print('Shared sites:', len(both))
both

Shared sites: 3


['MKNK2_S220', 'NFKB2_S858', 'SOS1_S1161']

# Get a list of significant phosphosites in at least one cancer

In [13]:
# sig was built from a set, so its order can change between runs;
# sort it so the csv rows are written in a reproducible order
s = pd.Series(sorted(sig))
print('Number of significant genes in at least one cancer:', len(s))
s.to_csv('csv/list_sig_one_cancer_phospho_MAPK.csv', index=False)

Number of significant genes in at least one cancer: 34


# Get a list of significant phosphosites in multiple cancers

In [14]:
from collections import Counter

# sig in multiple cancers
# Count every site once instead of rescanning flat_list for each element,
# and sort so the result has a reproducible order
site_counts = Counter(flat_list)
mult = sorted(site for site, times_sig in site_counts.items() if times_sig > 1) # Keep sites sig in more than 1 cancer
print('Number of significant genes in mult cancers:', len(mult))

Number of significant genes in mult cancers: 3


In [15]:
# Drop a trailing '_phosphoproteomics' suffix, then save the multi-cancer list
m = pd.Series(mult).replace(to_replace = r'_phosphoproteomics$', value = '', regex = True)
m.to_csv('csv/list_sig_mult_cancers_phospho_MAPK.csv', index=False)