# Make tables: calculate p-values and differential expressions for each cancer 

Create a dataframe with p-value results from t-tests for all phosphorylation sites (trans phosphoproteomics when KRAS has a hotspot missense mutation compared to KRAS wildtype). The dataframe also includes the differential expression (difference in medians). Prepare these tables for further analysis by creating csv files.

In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import re

import cptac
import cptac.utils as u
import plot_utils as p

In [2]:
import warnings
# Install a blanket 'ignore' filter: every warning raised after this cell is suppressed.
# NOTE(review): this also hides pandas/cptac deprecation notices - presumably done to keep cell output short; confirm.
warnings.filterwarnings('ignore')

In [3]:
def phospho_format_df(cancer_object, all_prot, gene_in = 'KRAS', utils = u):
    '''
    Build a tumor-only phosphoproteomics table annotated with the mutation status of one gene.

    Params
    cancer_object: Object of the loaded cancer data set.
    all_prot: List of trans proteins used to get phosphoproteomics data.
    gene_in: String. Gene used to get mutation status for the Mutation col.
    utils: utils package from cptac (used for reduce_multiindex).

    Returns a dataframe with trans phosphoproteomics, a 'Mutation' column and a
    '<gene_in>_Location' column. Only samples whose location string contains a hotspot
    (G12, G13, Q61) or 'No_mutation', and whose Mutation label is 'Missense' or
    'Wildtype_Tumor', are kept. Each sample appears once.
    '''
    location_col = gene_in + '_Location'

    # Collapse both spellings of a missense call into the single label 'Missense'.
    # Working on a new frame avoids chained in-place assignment on a column selection.
    genotype = cancer_object.get_genotype_all_vars(gene_in)
    mut_type = genotype[['Mutation']].replace(
        {'Missense_Mutation': 'Missense', 'nonsynonymous SNV': 'Missense'})

    # Keep only tumor samples from phosphoproteomics
    prot_and_mutations = cancer_object.join_omics_to_mutations(
        mutations_genes = [gene_in], omics_df_name = 'phosphoproteomics', omics_genes = all_prot, tissue_type = 'tumor')

    # Reduce a multiindex
    if isinstance(prot_and_mutations.columns, pd.MultiIndex):
        cancer_type = cancer_object.get_cancer_type()
        if cancer_type in ('endometrial', 'colon'):
            prot_and_mutations = utils.reduce_multiindex(prot_and_mutations, flatten=True)
        elif cancer_type == 'luad':  # equality, not a substring test against the string 'luad'
            prot_and_mutations = utils.reduce_multiindex(prot_and_mutations, levels_to_drop = [2,3], flatten = True)
        prot_and_mutations = rename_duplicate_cols(prot_and_mutations)

    # Each location entry is iterable (one item per mutation); flatten it to a comma separated string
    prot_and_mutations[location_col] = [
        ','.join(map(str, locations)) for locations in prot_and_mutations[location_col]]

    # Keep hotspot missense mutations and samples without a mutation. A single mask keeps
    # every sample exactly once, even when its location string names more than one hotspot.
    hotspots = ['G12', 'G13', 'Q61', 'No_mutation']
    keep = prot_and_mutations[location_col].str.contains('|'.join(hotspots), regex = True, na = False)
    hotspots_wt = prot_and_mutations[keep]

    # Merge Mutation column from get_genotype_all_vars (includes cnv) with phosphoproteomics
    # Dropping the last 4 columns assumes they are the mutation columns added by the join - TODO confirm
    prot_df = hotspots_wt.iloc[:,:-4]
    merged = prot_df.join(mut_type)
    merged = merged.join(hotspots_wt[location_col])

    # Keep only wildtype tumors and missense mutations
    compare = ['Wildtype_Tumor','Missense']
    mut_wt = merged[merged['Mutation'].isin(compare)]

    mut_wt = mut_wt.rename(columns={col: re.sub(r'_phosphoproteomics', '', col) for col in mut_wt.columns.tolist()})
    return mut_wt


In [4]:
def get_change_in_medians_df(cancer_object, cancer_name, mut_wt_df, all_prot_list):
    '''
    Compute the difference in medians (Missense - Wildtype_Tumor) for each requested column.

    Params
    cancer_object: Object. Variable for the loaded cancer data set (not used by the calculation).
    cancer_name: Str. name to add to the created dataframe.
    mut_wt_df: DataFrame. Formatted df with a 'Mutation' column labelling each sample
        as 'Missense' or 'Wildtype_Tumor'.
    all_prot_list: List. Column names to report; names without a numeric column in mut_wt_df are skipped.

    Returns a df with columns 'Phospho' and '<cancer_name>_Median' (KRAS hotspot - wt).
    A pos value represents an increase in phosphorylation abundance with mutant KRAS.
    A neg value represents a decrease in phosphorylation abundance with mutant KRAS.
    '''
    is_missense = mut_wt_df.Mutation == "Missense"
    is_wildtype = mut_wt_df.Mutation == "Wildtype_Tumor"

    # numeric_only skips the string label columns (e.g. 'Mutation'); without it
    # pandas >= 2.0 raises a TypeError instead of silently dropping them.
    miss_med = mut_wt_df[is_missense].median(numeric_only=True)
    wt_med = mut_wt_df[is_wildtype].median(numeric_only=True)

    med_dict = {
        prot: miss_med[prot] - wt_med[prot]
        for prot in all_prot_list
        if prot in miss_med.index and prot in wt_med.index
    }

    df = pd.DataFrame.from_dict(med_dict, orient='index', columns=[cancer_name+'_Median'])
    return df.reset_index().rename(columns={'index':'Phospho'})
        

In [5]:
def rename_duplicate_cols(df):
    ''' Make the column names of df unique.
    The first occurrence of a repeated name is kept; each later occurrence gets
    '_i' appended, where the int i starts at 1 and increases by one per duplicate.
    The columns of df are replaced in place and the same df is returned. '''
    names = pd.Series(df.columns[:])
    repeated = names[names.duplicated()].unique()

    for label in repeated:
        # positions of every column carrying this repeated name
        positions = names[names == label].index.values.tolist()
        replacements = [label if k == 0 else label + '_' + str(k) for k in range(len(positions))]
        names[positions] = replacements

    # rename the columns with the de-duplicated names
    df.columns = names
    return df

#  Step 1: Create data frames with p-values and differential expressions

Each cancer needs a data frame containing only tumor samples that either have a KRAS hotspot missense mutation or are KRAS wildtype, together with their trans phosphoproteomics. Use wrap_ttest to run t-tests for all phosphorylation sites in that data frame. Use get_change_in_medians_df to create the data frame with differential expression values. Merge both data frames. 

Load in cancer data sets from cptac. 

In [6]:
# Instantiate one cptac dataset object per cancer; these names are collected into cancer_objects below.
en = cptac.Endometrial()
l = cptac.Luad()  # NOTE(review): single-letter 'l' is easy to misread as '1'; renaming would also require updating the cell that builds cancer_objects
col = cptac.Colon()

                                                

# Calculate p-values and differential expressions

In [7]:
# Map the short cancer label used in column and file names to its loaded dataset object
cancer_objects = dict(EC=en, CO=col, LUAD=l)

In [8]:
# Count the proteomics columns of each cancer and pool all column names
total_prot = {}
all_prot = []
for c, cancer_object in cancer_objects.items():
    df = cancer_object.get_proteomics()
    # Multi-level column headers are reduced by dropping level 1
    if isinstance(df.columns, pd.MultiIndex):
        df = u.reduce_multiindex(df, levels_to_drop = 1)
    cols = list(df.columns)
    total_prot[c] = len(cols) # total num of proteomics columns for this cancer
    all_prot.extend(cols)
prot_list = list(set(all_prot)) # unique names across en, luad, colon

In [10]:
import os

gene = 'KRAS'
min_num = 5 # passed to wrap_ttest as mincount
out_dir = 'csv/Single_Cancer'
os.makedirs(out_dir, exist_ok = True) # to_csv does not create missing directories
merged_dfs = {} # per-cancer result tables, read again by the significance cells below

print('All proteins:', len(prot_list))

for cancer in cancer_objects:
    print(cancer)
    # Format df for t-test (pass the gene explicitly instead of relying on the default)
    mut_wt = phospho_format_df(cancer_objects[cancer], prot_list, gene_in = gene)
    mut_wt = mut_wt.dropna(axis = 'columns', how = 'all') # drop columns with no data at all

    # T-test on every column except the last two (Mutation and the gene's Location column)
    cols_in = list(mut_wt.columns[:-2])
    print('cols tested: ', len(cols_in))

    # Get all pvals (return_all keeps non-significant comparisons; fdr_bh correction requested)
    all_pval = u.wrap_ttest(mut_wt, 'Mutation', cols_in, return_all = True, correction_method = 'fdr_bh', mincount = min_num)
    all_pval = all_pval.rename(columns = {'Comparison': 'Phospho','P_Value': cancer+'_P_Value'})

    # Get difference in medians
    delta_median_df = get_change_in_medians_df(cancer_objects[cancer], cancer, mut_wt, cols_in)

    # Merge pval_df and delta_median_df
    pval_medians_df = all_pval.merge(delta_median_df, on='Phospho', how='outer')
    pval_medians_df = pval_medians_df.replace(to_replace = r'_phosphoproteomics', value = '', regex = True)
    merged_dfs[cancer] = pval_medians_df

    # Create csv
    pval_medians_df.to_csv(out_dir+'/'+cancer+'_all_phospho.csv', index=False)
    print(pval_medians_df.head(), '\n')

All proteins: 12336
EC
cols tested:  68028
        Phospho  EC_P_Value  EC_Median
0   DTNBP1_S297    0.000028    0.85025
1   IGF2R_S2479    0.000045    1.03850
2   DTNBP1_S300    0.000045    0.93113
3     TPR_S1662    0.000073    1.08325
4  PLEKHS1_S185    0.000100    0.84150 

CO
cols tested:  30241
                Phospho  CO_P_Value  CO_Median
0  KIAA1468_S180_Q9P260    0.096680    -0.6265
1      IRS2_T404_Q9Y4H2    0.229934    -0.9090
2      IRS2_S620_Q9Y4H2    0.334754    -0.6590
3      TAP1_T545_Q03518    0.813854    -0.7525
4      IRS2_S736_Q9Y4H2    0.813854    -0.3880 

LUAD
cols tested:  39669
         Phospho  LUAD_P_Value  LUAD_Median
0      KRT8_S477      0.000079      2.82370
1  KRT8_S475S478      0.000079      4.51545
2      EEF1B2_S8      0.000079      2.11240
3    DNMBP_S1436      0.000244      1.63160
4     CTBP2_S905      0.000310      1.22980 



# Get number of significant genes in each cancer

In [None]:
# Using corrected pvals from wrap_ttest, each cancer's cutoff is 0.05
all_sig = []

for cancer, df in merged_dfs.items():
    sig_df = df.loc[df[cancer+'_P_Value'] < 0.05]
    print(cancer, 'sig comparisons:', len(sig_df))
    # The site names live in the 'Phospho' column (set when the tables were built)
    all_sig.append(list(sig_df['Phospho']))

flat_list = [item for sublist in all_sig for item in sublist] # change list of lists, to just one list
sig = list(set(flat_list)) # keep only one if site is in multiple cancer sig lists
print('\nNumber of significant comparisons in at least 1 cancer:', len(sig))

# Get a list of significant genes in at least one cancer

In [None]:
# Store the unique significant names as a Series, report how many there are, then write them out
s = pd.Series(sig)
sig_count = len(s)
print('Number of significant genes in at least one cancer:', sig_count)
s.to_csv('csv/list_sig_all_phospho.csv', index=False)

# Get a list of significant genes in multiple cancers

In [None]:
# sig in multiple cancers
mult = list(set(i for i in flat_list if flat_list.count(i) > 1)) # Keep genes sig in more than 1 cancer
print('Number of significant genes in mult cancers:', len(mult))

In [None]:
# Write the names significant in more than one cancer to csv, after removing a trailing '_proteomics'
# NOTE(review): the tables already had '_phosphoproteomics' removed when they were built, so this pattern probably never matches - confirm
m = pd.Series(mult).replace(to_replace = r'_proteomics$', value = '', regex = True)
m.to_csv('csv/list_sig_mult_all_phospho.csv', index=False)

In [None]:
# Display the Series of names significant in more than one cancer
m