# Make tables: calculate p-values and differential expressions for each cancer 

Create a df with FDR p-value results for proteins in the MAPK signaling pathway (t-test of trans phosphoproteomics with KRAS hotspot compared to KRAS wildtype). The df also includes differential expressions. The three cancers with frequent hotspot mutations are included in the df (EC, CO, LUAD).

In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import re

import cptac
import cptac.utils as u
import plot_utils as p

In [2]:
print('cptac version:', cptac.version())

cptac version: 0.8.6


In [3]:
import warnings
warnings.filterwarnings('ignore')

In [48]:
'''
Params
cancer_object: Object. The loaded cancer data set.
all_prot: List. trans proteins used to get proteomics data. 
gene_in: String. Gene used to get mutation status for the Mutation col.
utils: utils package from cptac.  

Returns a dataframe with trans proteomics and mutation status of a specific gene.
'''

def phospho_format_df(cancer_object, all_prot, gene_in = 'PTEN', utils = u):
    cancer_name = cancer_object.get_cancer_type()
    
    mut_type = cancer_object.get_genotype_all_vars(gene_in)
    
    if cancer_name not in ('luad', 'endometrial'):
        # Keep only tumor samples from proteomics
        prot_and_mutations = cancer_object.join_omics_to_mutations(
            mutations_genes = [gene_in], omics_df_name = 'phosphoproteomics', omics_genes = all_prot, tissue_type = 'tumor')
        # Reduce multiindex 
        if cancer_name not in ('colon'):
            prot_and_mutations = utils.reduce_multiindex(prot_and_mutations, levels_to_drop = [2,3], flatten = True) # multi cols: gene, site, peptide, database_ID
        else: 
            prot_and_mutations = utils.reduce_multiindex(prot_and_mutations, levels_to_drop = [2], flatten = True)
        prot_and_mutations = rename_duplicate_cols(prot_and_mutations)
        
        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        mut_type = mut_type[['Mutation']] 
        prot_df = prot_and_mutations.iloc[:,:-4] # Keep only proteomics
        merged = prot_df.join(mut_type)
        
        # Keep only Wildtype and deletion
        compare = ['Wildtype_Tumor','Deletion']
        get = merged['Mutation'].isin(compare)
        mut_wt = merged[get]
    
    
    # Luad has no somatic mutations for PTEN which changes some things
    elif cancer_object.get_cancer_type() in ('luad'): 
        # get_genotype_all_vars add cnv data under the column PTEN
        mut_type = mut_type.drop(columns= gene_in)
        # different code because no somatic mutation data for pten (can't join to somatic mutations)
        omics = cancer_object.join_omics_to_omics(df1_name = 'CNV', df2_name='phosphoproteomics',genes1= gene_in, 
            genes2= all_prot, tissue_type = 'tumor')
        omics = utils.reduce_multiindex(omics, levels_to_drop = [2,3], flatten = True) # luad multi: gene, site, number
        omics = rename_duplicate_cols(omics)
        omics = omics.drop(columns='PTEN_CNV')
       
        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        merged = omics.join(mut_type) 
        # Keep only No_Mutation (wildtype) and deletion
        compare = ['No_Mutation','Deletion']
        get = merged['Mutation'].isin(compare)
        mut_wt = merged[get]
        mut_wt['Mutation'] = np.where(
            mut_wt['Mutation'] == 'No_Mutation', 'Wildtype_Tumor', 'Deletion')
        
    elif cancer_object.get_cancer_type() in ('endometrial'):
        # merge cnv with genotype all mut type
        prot_and_mutations = cancer_object.join_omics_to_mutations(mutations_genes = [gene_in], 
                    omics_df_name = 'phosphoproteomics', omics_genes = all_prot, tissue_type = 'tumor') # drop Normal samples
        prot_and_mutations = utils.reduce_multiindex(prot_and_mutations, flatten = True) # endo multi: gene, site
        
        prot_df = prot_and_mutations.iloc[:,:-4] #drop Mutation and Location cols
        mut_type = mut_type[['Mutation']] # Get Mutation col that includes CNV
        merged = prot_df.join(mut_type) # merge 

        # Create Truncation category and keep truncation and wt
        compare = ['Wildtype_Tumor','Nonsense_Mutation','Frame_Shift_Ins','Frame_Shift_Del']
        get = merged['Mutation'].isin(compare)
        mut_wt = merged[get]
        mut_wt['Mutation'] = np.where(
                        mut_wt['Mutation'] == 'Wildtype_Tumor', 'Wildtype_Tumor', 'Truncation')
    
    mut_wt = mut_wt.rename(columns={col: re.sub(r'_phosphoproteomics', '', col) for col in mut_wt.columns.tolist()})
    return mut_wt


In [51]:
'''
Params
cancer_object: Object. Variable for the loaded cancer data set.
cancer_name: Str. name to add to the created dataframe.
del_wt_df: DataFrame. Formatted df with samples with PTEN los or wt.
all_prot_list: List. All proteins in proteomics data frame. 

Returns a df with differential expression for phosphoproteomics (PTEN loss - wt). 
A pos value represents an increase in phosphorylation abundance with mutant PTEN. 
A neg value represents an decrease in phosphorylation abundance with mutant PTEN.
'''

def get_change_in_medians_df(cancer_object, cancer_name, del_wt_df, all_prot_list):
    if  cancer_object.get_cancer_type() in ('endometrial'):
        d = del_wt_df[del_wt_df.Mutation == "Truncation"]
        wt = del_wt_df[del_wt_df.Mutation == "Wildtype_Tumor"]
    else:
        d = del_wt_df[del_wt_df.Mutation == "Deletion"]
        wt = del_wt_df[del_wt_df.Mutation == "Wildtype_Tumor"]
    del_med = d.median()
    wt_med = wt.median()

    med_dict = {}
    
    # + is mutant up compared to wt, - is mutant down
    for site in all_prot_list:
        if site in del_med.index and site in wt_med.index:
            dif = del_med[site] - wt_med[site]
            med_dict[site] = dif
        
    df = pd.DataFrame.from_dict(med_dict, orient='index', columns=[cancer_name+'_Median'])
    df = df.reset_index().rename(columns={'index':'Phospho'})
    
    return df

In [32]:
''' Adds '_i' to duplicate col names where the int i increases by one with each duplicate
Returns a df with unique column names. '''

def rename_duplicate_cols(df):
    cols = pd.Series(df.columns[:])

    for dup in cols[cols.duplicated()].unique(): 
        cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

    # rename the columns with the cols list.
    df.columns=cols
    return df

In [7]:
'''
df: DataFrame containing omics data and a binary column.
mincount: Int of the minimum num of actual values needed to pass cutoff and not be listed in not_enough_data list. 
omics_cols: Names of columns to check if there is enough data. 

Returns: List of genes with not enough data (num of non-NaN values are less than mincount).
'''

def find_few_data_genes(df, mincount, omics_cols = None, binary_labels = ('Missense','Wildtype_Tumor'), 
    binary_col = 'Mutation', gene = 'KRAS'):
    
    # Separate into binary groups
    label_1 = binary_labels[0]
    label_2 = binary_labels[1]
    partition1 = df.loc[df[binary_col] == label_1]
    partition2 = df.loc[df[binary_col] == label_2]
    
    # Get list of columns
    if omics_cols is None:
        omics_cols = list(df.columns)
        omics_cols.remove(binary_col)
    else:
        set_omics_cols = set(omics_cols)
        if binary_col in (set_omics_cols):
            omics_cols.remove(binary_col)
        
    # Append genes with less data than mincount (for mut or wt) to list
    not_enough_data = []
    for c in omics_cols:
        #print(len(partition1[c].dropna(axis='rows')))
        #print(len(partition2[c].dropna(axis='rows')))
        if len(partition1[c].dropna(axis='rows')) <= mincount: #non-nan vals less than min
            not_enough_data.append(c)
            continue
        elif len(partition2[c].dropna(axis='rows')) <= mincount:
            not_enough_data.append(c)
            continue
        
    
    print('Num genes with not enough data: ', len(not_enough_data), '/', len(omics_cols))        
    return not_enough_data

#  Step 1: Create data frames with p-values and differential expressions

Each cancer needs a data frame containing only samples that have PTEN cnv deletions and PTEN wildtype with trans proteomics. Use wrap_ttest to run t-tests for all genes in the proteomic data frame. Use get_change_in_medians_df to create the data frame with differential expression values. Merge both data frames. 

Load in cancer data sets from cptac. 

In [5]:
g = cptac.Gbm()
en = cptac.Endometrial()
h = cptac.Hnscc()
l = cptac.Luad()
ls = cptac.Lscc()
o = cptac.Ovarian()
col = cptac.Colon()
b = cptac.Brca()

                                                

In [70]:
# Get PIK3CA/AKT pathway proteins
prot = u.get_proteins_in_pathways('PI3K-Akt Signaling Pathway', 'wikipathways')
prot_list = list(prot.member)
prot_list.remove('PTEN')
print('Num interacting proteins:', len(prot_list))

Num interacting proteins: 337


# Calculate p-values and differential expressions

In [36]:
cancer_objects = {'GBM': g, 'HNSCC':h, 'LUAD':l, 'OV':o, 'EC': en, 'LSCC':ls, 'BR':b, 'CO':col}

In [74]:
gene = 'PTEN'
merged_dfs = {}
    
for cancer in cancer_objects:
    print('\n',cancer)
    
    # Format df for t-test
    mut_wt = phospho_format_df(cancer_objects[cancer], prot_list)
    mut_wt = mut_wt.dropna(axis = 'columns', how = 'all') #drop cols with only nan
    
    # T-test
    cols_in = list(mut_wt.columns[:-2])
    print('cols tested: ', len(cols_in))
    
    # Get all pvals
    min_num = 5
    all_pval = u.wrap_ttest(mut_wt, 'Mutation', cols_in, return_all = True, correction_method = 'fdr_bh', mincount = min_num)
    all_pval = all_pval.rename(columns = {'Comparison': 'Phospho','P_Value': cancer+'_P_Value'})
    sig = all_pval.loc[all_pval[cancer+'_P_Value'] < 0.05]
    print('Num sig results:', len(sig))
    
    
    # Get difference in medians
    delta_median_df = get_change_in_medians_df(cancer_objects[cancer], cancer, mut_wt, cols_in)

    # Merge pval_df and delta_median_df
    pval_medians_df = all_pval.merge(delta_median_df, on='Phospho', how='outer')
    pval_medians_df = pval_medians_df.replace(to_replace = r'_phosphoproteomics', value = '', regex = True)
    merged_dfs[cancer] = pval_medians_df # testing purposes
    
    # Create csv
    #pval_medians_df.to_csv('csv/Single_Cancer/'+cancer+'_phospho.csv', index=False)
    print(pval_medians_df.head(), '\n')


 GBM
cols tested:  2074
Num sig results: 8
                 Phospho  GBM_P_Value  GBM_Median
0            SPP1_S276_1     0.008291   -1.622130
1             BRCA1_S114     0.010892    1.109838
2  BRCA1_S1187S1189S1191     0.010892    0.655312
3            BRCA1_S1481     0.046117    0.498223
4       ITGB4_S1454S1457     0.046117    0.373828 


 HNSCC
cols tested:  1176
Num sig results: 0
          Phospho  HNSCC_P_Value  HNSCC_Median
0     BRCA1_S1524       0.070982      0.879135
1  MAPK3_T202Y204       0.161509     -0.582324
2    COL6A3_S1783       0.161509     -0.942868
3      EIF4B_S309       0.161509     -0.451019
4      BRCA1_S632       0.161509      0.330176 


 LUAD
cols tested:  676
Num sig results: 8
       Phospho  LUAD_P_Value  LUAD_Median
0   EGFR_Y1197      0.025553      2.12725
1   EGFR_S1064      0.025553      2.82820
2  PIK3R5_S507      0.025553     -0.98655
3    EGFR_T693      0.025553      1.65610
4   EGFR_Y1172      0.029921      2.66920 


 OV
cols tested:  2442
Nu

# Get number of significant genes in each cancer

In [75]:
# Using corrected pvals from wrap_ttest, each cancer's cutoff is 0.05
all_sig = []

for cancer in merged_dfs:
    df = merged_dfs[cancer]
    sig_df = df.loc[df[cancer+'_P_Value'] < 0.05]
    print(cancer, 'sig comparisons:', len(sig_df))
    #print(sig_df)
    sig_list = list(sig_df['Phospho'])
    all_sig.append(sig_list)
        

flat_list = [item for sublist in all_sig for item in sublist] #change list of lists, to just one list
sig = list(set(flat_list)) # keep only one if gene in multiple cancer sig lists
print('\nNumber of significant comparisons in at least 1 cancer:', len(sig))

GBM sig comparisons: 8
HNSCC sig comparisons: 0
LUAD sig comparisons: 8
OV sig comparisons: 0
EC sig comparisons: 2
LSCC sig comparisons: 0
BR sig comparisons: 0
CO sig comparisons: 0

Number of significant comparisons in at least 1 cancer: 18


In [76]:
# Using corrected pvals from wrap_ttest, each cancer's cutoff is 0.1
all_sig = []

for cancer in merged_dfs:
    df = merged_dfs[cancer]
    sig_df = df.loc[df[cancer+'_P_Value'] < 0.1]
    print(cancer, 'sig comparisons:', len(sig_df))
    #print(sig_df)
    sig_list = list(sig_df['Phospho'])
    all_sig.append(sig_list)
        

flat_list = [item for sublist in all_sig for item in sublist] #change list of lists, to just one list
sig = list(set(flat_list)) # keep only one if gene in multiple cancer sig lists
print('\nNumber of significant comparisons in at least 1 cancer:', len(sig))

GBM sig comparisons: 25
HNSCC sig comparisons: 1
LUAD sig comparisons: 12
OV sig comparisons: 0
EC sig comparisons: 2
LSCC sig comparisons: 0
BR sig comparisons: 0
CO sig comparisons: 0

Number of significant comparisons in at least 1 cancer: 38


In [91]:
# Find common sites

cancer_1 = 'GBM'
cancer_2 = 'LUAD'
a = 0.1

c1_df = merged_dfs[cancer_1]
c1_sig = c1_df.loc[c1_df[cancer_1+'_P_Value'] < a]
c1_list = list(c1_sig.Phospho)

c2_df = merged_dfs[cancer_2]
c2_sig = c2_df.loc[c2_df[cancer_2+'_P_Value'] < a]
c2_list = list(c2_sig.Phospho)

both = [value for value in c1_list if value in c2_list]
print('Shared sites:', len(both))
both

Shared sites: 2


['EGFR_S1064', 'PIK3R1_S83']

# Get a list of significant genes in at least one cancer

In [14]:
s = pd.Series(sig)
print('Number of significant genes in at least one cancer:', len(s))
s.to_csv('csv/list_sig_one_cancer_phospho.csv', index=False)

Number of significant genes in at least one cancer: 62


# Get a list of significant genes in multiple cancers

In [15]:
# sig in multiple cancers
mult = list(set(i for i in flat_list if flat_list.count(i) > 1)) # Keep genes sig in more than 1 cancer
print('Number of significant genes in mult cancers:', len(mult))

Number of significant genes in mult cancers: 4


In [16]:
m = pd.Series(mult)
m = m.replace(to_replace = r'_phosphoproteomics$', value = '', regex = True)
m.to_csv('csv/list_sig_mult_cancers_phospho.csv', index=False)