# Make tables: calculate p-values and differential expressions for each cancer 

Create a dataframe of p-values from t-tests run on every protein (trans proteomics), comparing samples that have a PTEN CNV deletion against PTEN wildtype samples. The dataframe also includes the change in medians between the deletion and wildtype groups. These tables are then written to CSV files for further analysis.

In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import re

import cptac
import cptac.utils as u
import plot_utils as p

In [2]:
# Silence all warnings for the rest of the session to keep cell output clean.
# NOTE(review): the 'ignore' filter is global, so it also hides pandas
# warnings (e.g. chained-assignment warnings) raised by later cells.
import warnings
warnings.filterwarnings('ignore')

In [3]:
def all_prot_format_df(cancer_object, all_prot, gene_in = 'PTEN', utils = u):
    '''
    Params
    cancer_object: Object. Variable for the loaded cancer data set.
    all_prot: List. Trans genes whose proteomics columns are requested.
    gene_in: Str. Gene whose mutation status labels each sample.
    utils: Module. Provides reduce_multiindex (defaults to cptac.utils).

    Returns a dataframe with proteomics and mutation type for tumor samples.
    Only samples labelled wildtype or 'Deletion' are kept, the wildtype label
    is always 'Wildtype_Tumor', and any '_proteomics' text is stripped from
    the column names.
    '''
    mut_type = cancer_object.get_genotype_all_vars(gene_in)

    # Exact comparison: the previous "in ('luad')" was a substring test on a
    # plain string (the parentheses do not make a tuple).
    is_luad = cancer_object.get_cancer_type() == 'luad'

    if not is_luad:
        # Keep only tumor samples from proteomics
        prot_and_mutations = cancer_object.join_omics_to_mutations(
            mutations_genes = [gene_in], omics_df_name = 'proteomics', omics_genes = all_prot, tissue_type = 'tumor')
        # Reduce a multiindex
        if isinstance(prot_and_mutations.columns, pd.MultiIndex):
            prot_and_mutations = utils.reduce_multiindex(prot_and_mutations, levels_to_drop = 1)
            prot_and_mutations = rename_duplicate_cols(prot_and_mutations)

        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        mut_type = mut_type[['Mutation']]
        # Keep only proteomics.
        # NOTE(review): assumes the join appends exactly four non-proteomics
        # columns at the end -- confirm against join_omics_to_mutations.
        prot_df = prot_and_mutations.iloc[:, :-4]
        merged = prot_df.join(mut_type)

        # Keep only Wildtype and deletion
        keep = merged['Mutation'].isin(['Wildtype_Tumor', 'Deletion'])
        mut_wt = merged[keep]

    # Luad has no somatic mutations for PTEN which changes some things
    else:
        # get_genotype_all_vars add cnv data under the column named after the gene
        mut_type = mut_type.drop(columns = gene_in)
        # different code because no somatic mutation data for pten (can't join to somatic mutations)
        omics = cancer_object.join_omics_to_omics(df1_name = 'CNV', df2_name = 'proteomics', genes1 = gene_in,
            genes2 = all_prot, tissue_type = 'tumor')
        omics = utils.reduce_multiindex(omics, levels_to_drop = 1, flatten = True)
        omics = rename_duplicate_cols(omics)
        # Drop the CNV column of the requested gene ('PTEN_CNV' for the
        # default gene_in) instead of a hard-coded PTEN column.
        omics = omics.drop(columns = gene_in + '_CNV')

        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        merged = omics.join(mut_type)
        # Keep only No_Mutation (wildtype) and deletion
        keep = merged['Mutation'].isin(['No_Mutation', 'Deletion'])
        # Copy before assigning so the relabel is applied to an independent
        # frame rather than to a slice of `merged`.
        mut_wt = merged[keep].copy()
        mut_wt['Mutation'] = np.where(
            mut_wt['Mutation'] == 'No_Mutation', 'Wildtype_Tumor', 'Deletion')

    mut_wt = mut_wt.rename(columns={col: re.sub(r'_proteomics', '', col) for col in mut_wt.columns.tolist()})
    return mut_wt


In [4]:
def get_change_in_medians_df(cancer_object, cancer_name, del_wt_df, all_prot_list):
    '''
    Params
    cancer_object: Object. Variable for the loaded cancer data set.
    cancer_name: Str. name to add to the created dataframe.
    del_wt_df: DataFrame. Only samples with deletions and wildtype for PTEN.
        Must contain a 'Mutation' column.
    all_prot_list: List. All proteins in proteomics data frame.

    Returns a dataframe with the difference in medians between the mutant and
    wildtype groups (del - wt): positive means the mutant group is up compared
    to wildtype, negative means it is down. Columns are 'Proteomics' and
    '<cancer_name>_Median'. Proteins missing from either group's medians are
    skipped.
    '''
    # Endometrial compares truncations against wildtype; every other cancer
    # compares deletions. Exact comparison replaces the old substring test
    # "in ('endometrial')".
    if cancer_object.get_cancer_type() == 'endometrial':
        mutant_label = "Truncation"
    else:
        mutant_label = "Deletion"

    d = del_wt_df[del_wt_df.Mutation == mutant_label]
    wt = del_wt_df[del_wt_df.Mutation == "Wildtype_Tumor"]

    # numeric_only=True skips the string 'Mutation' column, which otherwise
    # makes median() raise on recent pandas versions.
    del_med = d.median(numeric_only=True)
    wt_med = wt.median(numeric_only=True)

    # Correlation: + is mutant up compared to wt, - is mutant down
    med_dict = {
        prot: del_med[prot] - wt_med[prot]
        for prot in all_prot_list
        if prot in del_med.index and prot in wt_med.index
    }

    df = pd.DataFrame.from_dict(med_dict, orient='index', columns=[cancer_name+'_Median'])
    df = df.reset_index().rename(columns={'index':'Proteomics'})

    return df
        

In [5]:
def rename_duplicate_cols(df):
    '''
    Params
    df: DataFrame. Its column labels are rewritten in place.

    Makes repeated column names distinct: the first occurrence of a name is
    left alone and each later occurrence gets '_i' appended, where i counts
    up from 1. Returns the same dataframe object that was passed in.
    '''
    names = pd.Series(df.columns[:])
    repeated = names[names.duplicated()].unique()

    # Handle one repeated name at a time, matching against the current state
    # of `names` so earlier renames are visible to later ones.
    for base in repeated:
        positions = names[names == base].index
        for count, pos in enumerate(positions):
            if count:
                names.loc[pos] = base + '_' + str(count)

    # Install the adjusted labels on the dataframe itself.
    df.columns = names
    return df

#  Step 1: Create data frames with p-values and differential expressions

Each cancer needs a data frame containing only the samples that have either a PTEN CNV deletion or PTEN wildtype, together with their trans proteomics. Use wrap_ttest to run t-tests for every protein in the proteomics data frame, use get_change_in_medians_df to create the data frame of differential expression values, and then merge the two data frames.

Load in cancer data sets from cptac. 

In [6]:
# Load one cptac data set at a time: uncomment the line for the cancer to
# analyse. Only the colon data set is loaded in this run.
#g = cptac.Gbm()
#en = cptac.Endometrial()
#h = cptac.Hnscc()
#l = cptac.Luad()
#ls = cptac.Lscc()
#o = cptac.Ovarian()
col = cptac.Colon()
#b = cptac.Brca()

                                          

In [7]:
# Get df with deletions and wt for the gene of interest
cancer = col
gene = 'PTEN'
prot = cancer.get_proteomics()
# NOTE(review): presumably needed for data sets whose proteomics columns are
# a multiindex -- confirm before enabling.
#prot= u.reduce_multiindex(prot, levels_to_drop = 1)
# Drop the gene's own protein (cis effect) so only trans proteins remain.
# Uses `gene` rather than a repeated 'PTEN' literal so the cell follows the
# variable set above.
prot = prot.drop(columns = gene)
prot_list = list(prot.columns)

del_wt = all_prot_format_df(cancer, prot_list, gene_in = gene)
del_wt.head()

Name,A1BG,A1CF,A2M,AAAS,AACS,AAGAB,AAK1,AAMDC,AAMP,AAR2,...,ZNRD1,ZNRF2,ZPR1,ZRANB2,ZW10,ZWILCH,ZWINT,ZYX,ZZEF1,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01CO001,,,,,,,,,,,...,,,,,,,,,,Wildtype_Tumor
01CO005,-1.1,0.318,-0.487,0.0995,0.155,0.169,0.0653,-0.147,0.114,0.341,...,,0.0384,0.0221,0.25,0.0869,0.0331,,-0.733,-0.265,Deletion
01CO006,-1.12,-0.441,-0.347,-0.0029,0.0957,0.396,-0.0363,-0.549,0.22,0.248,...,0.324,,-0.394,0.0846,0.001,-0.345,,-0.658,0.0052,Wildtype_Tumor
01CO008,-1.2,0.16,-1.85,0.119,-0.0924,0.0187,-0.214,0.328,-0.282,-0.348,...,,-0.384,-0.168,0.357,-0.325,0.349,,-0.821,0.2,Wildtype_Tumor
01CO013,-1.89,0.112,-0.329,0.67,0.116,0.313,-0.238,-0.274,-0.554,0.27,...,,,-0.0656,-0.295,0.463,-0.448,0.0493,-0.904,-0.158,Wildtype_Tumor


In [8]:
# Collect the sample IDs whose Mutation label is 'Deletion'
df = del_wt[del_wt['Mutation'].eq('Deletion')]
del_ids = df.index.tolist()
print('total samples with PTEN deletion:', len(del_ids))

total samples with PTEN deletion: 25


In [9]:
# Stage distribution for all samples, then for the PTEN-deletion samples only
clinical = cancer.get_clinical()
#print(clinical.columns)
print('total samples:', '\n')
print(clinical['Stage'].value_counts(), '\n')

only_del = clinical.loc[clinical.index.isin(del_ids)]
print('samples with PTEN loss:')
only_del['Stage'].value_counts()

total samples: 

Stage III    48
Stage II     42
Stage I      12
Stage IV      8
Name: Stage, dtype: int64 

samples with PTEN loss:


Stage II     12
Stage III     8
Stage I       4
Stage IV      1
Name: Stage, dtype: int64

In [10]:
# Gather the IDs of every sample whose clinical stage is one of the initial
# stages, stage by stage in the order given in stage_list
stage_list = ['Stage I', 'Stage II'] # colon
#stage_list = ['I', 'II', 'IA', 'IB', 'IIA', 'IIB'] # LSCC

id_list = []
for stage in stage_list:
    id_list.extend(clinical.index[clinical['Stage'] == stage])
len(id_list)

54

In [11]:
# Restrict the deletion/wildtype table to the initial-stage samples
in_initial_stage = del_wt.index.isin(id_list)
initial = del_wt.loc[in_initial_stage]
initial['Mutation'].value_counts()

Wildtype_Tumor    36
Deletion          16
Name: Mutation, dtype: int64

In [12]:
prot_cols = list(initial.columns[:-1])

# Get all pvals
pval = u.wrap_ttest(initial, 'Mutation', prot_cols, return_all = True, correction_method = 'fdr_bh', mincount = 5)
pval

Unnamed: 0,Comparison,P_Value
0,NAXE,0.090697
1,FAM45A,0.166665
2,FHL2,0.321489
3,NDUFAF3,0.321489
4,LTBP2,0.362640
...,...,...
6784,FAU,0.999204
6785,TRIM22,0.999204
6786,RRM2,0.999204
6787,PGLYRP2,0.999378


In [13]:
# Inspect the per-sample values of NAXE (the first row of the p-value table
# above) alongside each sample's mutation label.
initial[['NAXE', 'Mutation']]

Name,NAXE,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
01CO005,0.147,Deletion
01CO008,-0.252,Wildtype_Tumor
01CO013,0.0459,Wildtype_Tumor
01CO015,0.0692,Deletion
01CO022,-0.47,Wildtype_Tumor
05CO003,-0.148,Wildtype_Tumor
05CO005,,Wildtype_Tumor
05CO011,0.402,Deletion
05CO015,-0.339,Wildtype_Tumor
05CO020,-0.363,Wildtype_Tumor
