# Make tables: calculate p-values and differential expressions for each cancer 

Create a dataframe with p-value results from t-tests for all proteins (trans proteomics when PIK3CA has missense mutation compared to PTEN wildtype). The dataframe also includes the change in medians between missense mutation and wildtype.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re
import sys 

import cptac
import cptac.utils as u
import plot_utils as p

  import pandas.util.testing as tm


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
'''
Params
all_prot: list of trans genes
utils: from cptac

Returns a dataframe with trans proteomics and mutation status (PIK3CA missense or wt)
'''

def all_prot_format_df(cancer_object, all_prot, gene_in = 'PIK3CA', utils = u):
    #mut_type = cancer_object.get_genotype_all_vars(gene_in)
    
    # Keep only tumor samples from proteomics
    prot_and_mutations = cancer_object.join_omics_to_mutations(
        mutations_genes = [gene_in], omics_df_name = 'proteomics', omics_genes = all_prot,
        tissue_type = 'tumor') # drop Normal samples
        
    # Reduce a multiindex 
    if isinstance(prot_and_mutations.keys(), pd.core.indexes.multi.MultiIndex):
        prot_and_mutations = utils.reduce_multiindex(prot_and_mutations, levels_to_drop = 1) 

    # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
   
    mut_type["Mutation"] = mut_type['Mutation'].replace(['nonsynonymous SNV'], 'Missense_Mutation')#for colon
    
    hotspot_df = joined[joined.Location.str.contains('E542K') | 
                    joined.Location.str.contains('E545K') |
                    joined.Location.str.contains('H1047R')]
    prot_df = prot_and_mutations.iloc[:,:-4] # Keep only proteomics
    merged = prot_df.join(mut_type)
        
    # Keep only Wildtype and Missense
    compare = ['Wildtype_Tumor','Missense_Mutation']
    get = merged['Mutation'].isin(compare)
    missense_wt = merged[get]
    return missense_wt
 


In [4]:
'''
Params
cancer_object: Object. Variable for the loaded cancer data set.
cancer_name: Str. name to add to the created dataframe.
del_wt_df: DataFrame. Only samples with deletions and wildtype for PIK3CA. 
all_prot_list: List. All proteins in proteomics data frame. 

Returns a dataframe with the difference in medians between proteomics with PIK3CA wt and del (wt - Missense). 
'''

def get_change_in_medians_df(cancer_object, cancer_name, miss_wt_df, all_prot_list):
    m = miss_wt_df[miss_wt_df.Mutation == "Missense_Mutation"]
    wt = miss_wt_df[miss_wt_df.Mutation == "Wildtype_Tumor"]
    miss_med = m.median()
    wt_med = wt.median()

    med_dict = {}
    
    # Correlation: + is mutant up compared to wt, - is mutant down
    for prot in all_prot_list:
        if prot in miss_med.index and prot in wt_med.index:
            dif = miss_med[prot] - wt_med[prot]
            med_dict[prot] = dif
        
    df = pd.DataFrame.from_dict(med_dict, orient='index', columns=[cancer_name+'_Median'])
    df = df.reset_index().rename(columns={'index':'Proteomics'})
    
    return df
        

#  Step 1: Create data frames with p-values and differential expressions

Each cancer needs a data frame containing only samples that have PTEN cnv deletions and PTEN wildtype with trans proteomics. Use wrap_ttest to run t-tests for all genes in the proteomic data frame. Use get_change_in_medians_df to create the data frame with differential expression values. Merge both data frames. 

Load in cancer data sets from cptac. 

In [3]:

en = cptac.Endometrial()
col = cptac.Colon()
b = cptac.Brca()

                                                

# Colon

Part 1: Format data frame using all_prot_format_df. This returns a dataframe with proteomics and a Mutation column containing only PTEN cnv deletions and wildtype tumors. 

In [6]:
gene = 'PIK3CA'
col_prot = col.get_proteomics()
col_prot = col_prot.drop(columns = 'PIK3CA') # cis effect
col_prot_list = list(col_prot.columns)

col_miss_wt = all_prot_format_df(col, col_prot_list)
col_miss_wt.head()


Name,A1BG_proteomics,A1CF_proteomics,A2M_proteomics,AAAS_proteomics,AACS_proteomics,AAGAB_proteomics,AAK1_proteomics,AAMDC_proteomics,AAMP_proteomics,AAR2_proteomics,...,ZNRD1_proteomics,ZNRF2_proteomics,ZPR1_proteomics,ZRANB2_proteomics,ZW10_proteomics,ZWILCH_proteomics,ZWINT_proteomics,ZYX_proteomics,ZZEF1_proteomics,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01CO001,,,,,,,,,,,...,,,,,,,,,,Wildtype_Tumor
01CO005,-1.1,0.318,-0.487,0.0995,0.155,0.169,0.0653,-0.147,0.114,0.341,...,,0.0384,0.0221,0.25,0.0869,0.0331,,-0.733,-0.265,Wildtype_Tumor
01CO006,-1.12,-0.441,-0.347,-0.0029,0.0957,0.396,-0.0363,-0.549,0.22,0.248,...,0.324,,-0.394,0.0846,0.001,-0.345,,-0.658,0.0052,Wildtype_Tumor
01CO013,-1.89,0.112,-0.329,0.67,0.116,0.313,-0.238,-0.274,-0.554,0.27,...,,,-0.0656,-0.295,0.463,-0.448,0.0493,-0.904,-0.158,Wildtype_Tumor
01CO015,-1.62,0.263,-0.976,0.522,-0.273,0.504,-0.318,-0.846,0.2,0.53,...,,-0.73,-0.0554,0.437,0.0566,0.384,,-0.258,-0.343,Wildtype_Tumor


Part 2: Run T-tests. Create a data frame of just significant comparisons and another data frame for all comparisons. A moderately stringent correction is used (FDR_BH).

In [7]:
col_cols = list(col_miss_wt.columns[:-1])

# Get all pvals
col_pval = u.wrap_ttest(col_miss_wt, 'Mutation', col_cols, return_all = True, correction_method = 'fdr_bh', mincount = 5, pval_return_corrected=True)
col_pval = col_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': 'Colon_P_Value'})

Part 3: Create the differential expression df. (median of tumors with PIK3CA wildtype - median of tumors with PIK3CA missense mutation)

In [10]:
col_med = get_change_in_medians_df(col, "Colon", col_miss_wt, col_cols)




Part 4: Merge the p-value and differential expression dfs.

In [11]:
col_merged = col_pval.merge(col_med, on='Proteomics', how='outer')
col_merged = col_merged.replace(to_replace = r'_proteomics', value = '', regex = True)

# Create csv
#en_merged.to_csv('csv/Single_Cancer/Gbm_pval_medians.csv',index=False)
col_merged.head()

Unnamed: 0,Proteomics,Colon_P_Value,Colon_Median
0,ATP6V1D,0.398688,0.3625
1,ATP6V1E1,0.398688,0.2253
2,ABHD16A,0.446472,-0.2352
3,HIBADH,0.446472,-0.4651
4,OGT,0.542868,0.1172


# Repeat for other cancers.

In [12]:
cancer_objects = { 'Brca':b, 'Endo':en}

In [13]:
import warnings
warnings.filterwarnings('ignore')

gene = 'PIK3CA'
merged_dfs = {}
for cancer in cancer_objects:
    prot = cancer_objects[cancer].get_proteomics()
    prot = prot.drop(columns = 'PIK3CA') # cis effect
    
    # Reduce a multiindex 
    if isinstance(prot.keys(), pd.core.indexes.multi.MultiIndex):
        prot = u.reduce_multiindex(prot, levels_to_drop = 1)
    prot_list = list(prot.columns)
    
    # Format df for t-test
    miss_wt = all_prot_format_df(cancer_objects[cancer], prot_list)
    
    # Rename duplicate columns (isoforms)
    col_names = pd.Series(miss_wt.columns[:])
    for dup in col_names[col_names.duplicated()].unique(): 
        col_names[col_names[col_names == dup].index.values.tolist()] = [dup + '_isoform_' + str(i) if i != 0 else dup for i in range(sum(col_names == dup))]
    miss_wt.columns = col_names # rename the columns with the cols list
    
    # T-test
    cols_in = list(miss_wt.columns[:-1])

    # Get all pvals
    min_num = 5
    all_pval = u.wrap_ttest(miss_wt, 'Mutation', cols_in, return_all = True, correction_method = 'fdr_bh', mincount = min_num, pval_return_corrected=True)
    all_pval = all_pval.rename(columns = {'Comparison': 'Proteomics','P_Value': cancer+'_P_Value'})
    
    # Get difference in medians
    delta_median_df = get_change_in_medians_df(cancer_objects[cancer], cancer, miss_wt, cols_in)

    # Merge pval_df and delta_median_df
    pval_medians_df = all_pval.merge(delta_median_df, on='Proteomics', how='outer')
    pval_medians_df = pval_medians_df.replace(to_replace = r'_proteomics', value = '', regex = True)
    merged_dfs[cancer] = pval_medians_df # testing purposes
    
    # Create csv
    #pval_medians_df.to_csv('csv/Single_Cancer/'+cancer+'_pval_medians.csv', index=False)
    print(pval_medians_df.head(), '\n')

   Proteomics  Brca_P_Value  Brca_Median
0     BCL2L13      0.268889      -0.8575
1  HSPE1-MOB4      0.268889      -0.9800
2       CD320      0.403243      -1.6737
3      MRPL38      0.403243      -0.4385
4      PTPMT1      0.403243      -0.8060 

  Proteomics  Endo_P_Value  Endo_Median
0       A1BG      0.999946     0.071000
1      PRMT5      0.999946    -0.063500
2      PRMT6      0.999946     0.161620
3      PRMT7      0.999946    -0.094300
4      PRMT9      0.999946    -0.005094 



# Get number of significant genes in each cancer

In [14]:
# Add colon to merged_dfs dictionary
merged_dfs['Colon'] = col_merged


In [15]:
# Using corrected pvals from wrap_ttest, each cancer's cutoff is 0.05
all_sig = []

for cancer in merged_dfs:
    df = merged_dfs[cancer]
    sig_df = df.loc[df[cancer+'_P_Value'] < 0.05]
    print(cancer, 'sig comparisons:', len(sig_df))
    sig_list = list(sig_df['Proteomics'])
    all_sig.append(sig_list)
        

flat_list = [item for sublist in all_sig for item in sublist] #change list of lists, to just one list
sig = list(set(flat_list)) # keep only one if gene in multiple cancer sig lists
print('\nNumber of significant comparisons in at least 1 cancer:', len(sig))

Brca sig comparisons: 0
Endo sig comparisons: 0
Colon sig comparisons: 0

Number of significant comparisons in at least 1 cancer: 0


In [16]:
# check

In [5]:
wp_pik3ca = u.get_interacting_proteins_wikipathways("PIK3CA")

In [9]:
prot_and_mutations = en.join_omics_to_mutations(
    mutations_genes = ["PIK3CA"], omics_df_name = 'proteomics', omics_genes = wp_pik3ca,
    tissue_type = 'tumor')
prot_and_mutations

Name,NCK1_proteomics,CSNK1A1_proteomics,TTBK1_proteomics,ZYX_proteomics,EPOR_proteomics,PPARGC1A_proteomics,ADCY1_proteomics,GNAI3_proteomics,CREB3L1_proteomics,SCP2_proteomics,...,PIK3C2B_proteomics,ITGA4_proteomics,RASGRF1_proteomics,PPARA_proteomics,CDKN2A_proteomics,FOXC2_proteomics,PIK3CA_Mutation,PIK3CA_Location,PIK3CA_Mutation_Status,Sample_Status
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00006,-0.4620,0.3430,,-1.020,,,,0.125,,0.1020,...,-0.240,-0.1880,,,-0.2680,,[Missense_Mutation],[p.E545K],Single_mutation,Tumor
C3L-00008,-0.4610,0.2870,,-1.130,,,,0.499,,1.0900,...,-0.480,-0.1140,,,0.3820,,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
C3L-00032,0.0208,0.1860,,-0.540,,,,-0.331,,0.6200,...,-0.183,-0.3850,,,-0.0337,,[Missense_Mutation],[p.E545K],Single_mutation,Tumor
C3L-00090,-0.3540,0.2440,,-0.797,,,,0.188,0.126,0.4360,...,-0.046,-0.5050,,,-0.7020,,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
C3L-00098,0.1910,0.5180,,-1.850,,,,-0.222,,-0.3310,...,-0.370,-0.1920,,,-0.1870,,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01520,-0.6480,0.0253,,-0.830,,,,0.211,-0.506,0.6110,...,-0.272,-0.2850,,,-0.0388,,[Missense_Mutation],[p.E726K],Single_mutation,Tumor
C3N-01521,-0.1370,-0.0981,,-1.200,,,,1.000,,1.0900,...,0.179,-0.1640,,,-0.0839,,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
C3N-01537,-0.0256,-0.2050,,-0.966,,,,0.512,-0.323,-0.0861,...,0.191,0.8650,,,2.0000,,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
C3N-01802,-0.2190,0.3770,,-0.437,,,,-0.800,,-0.5510,...,0.428,-0.0389,,,1.2300,,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor


In [12]:
hotspot_df = prot_and_mutations[prot_and_mutations.PIK3CA_Location.str.contains('E542K') | 
                    prot_and_mutations.PIK3CA_Location.str.contains('E545K') |
                    prot_and_mutations.PIK3CA_Location.str.contains('H1047R')|
                    prot_and_mutations.PIK3CA_Mutation.str.contains('Wildtype')]
hotspot_df
 
#prot_df = prot_and_mutations.iloc[:,:-4] # Keep only proteomics
#merged = prot_df.join(hotspot_wt)


TypeError: unhashable type: 'list'

In [11]:
prot_and_mutations

Name,NCK1_proteomics,CSNK1A1_proteomics,TTBK1_proteomics,ZYX_proteomics,EPOR_proteomics,PPARGC1A_proteomics,ADCY1_proteomics,GNAI3_proteomics,CREB3L1_proteomics,SCP2_proteomics,...,PIK3C2B_proteomics,ITGA4_proteomics,RASGRF1_proteomics,PPARA_proteomics,CDKN2A_proteomics,FOXC2_proteomics,PIK3CA_Mutation,PIK3CA_Location,PIK3CA_Mutation_Status,Sample_Status
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00006,-0.4620,0.3430,,-1.020,,,,0.125,,0.1020,...,-0.240,-0.1880,,,-0.2680,,[Missense_Mutation],[p.E545K],Single_mutation,Tumor
C3L-00008,-0.4610,0.2870,,-1.130,,,,0.499,,1.0900,...,-0.480,-0.1140,,,0.3820,,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
C3L-00032,0.0208,0.1860,,-0.540,,,,-0.331,,0.6200,...,-0.183,-0.3850,,,-0.0337,,[Missense_Mutation],[p.E545K],Single_mutation,Tumor
C3L-00090,-0.3540,0.2440,,-0.797,,,,0.188,0.126,0.4360,...,-0.046,-0.5050,,,-0.7020,,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
C3L-00098,0.1910,0.5180,,-1.850,,,,-0.222,,-0.3310,...,-0.370,-0.1920,,,-0.1870,,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01520,-0.6480,0.0253,,-0.830,,,,0.211,-0.506,0.6110,...,-0.272,-0.2850,,,-0.0388,,[Missense_Mutation],[p.E726K],Single_mutation,Tumor
C3N-01521,-0.1370,-0.0981,,-1.200,,,,1.000,,1.0900,...,0.179,-0.1640,,,-0.0839,,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
C3N-01537,-0.0256,-0.2050,,-0.966,,,,0.512,-0.323,-0.0861,...,0.191,0.8650,,,2.0000,,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
C3N-01802,-0.2190,0.3770,,-0.437,,,,-0.800,,-0.5510,...,0.428,-0.0389,,,1.2300,,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
