# Make tables: calculate p-values and differential expressions for each cancer 

Create a dataframe with p-value results from t-tests for all proteins (trans proteomics when PTEN has cnv deletions compared to PTEN wildtype). The dataframe also includes the change in medians between deletions and wildtype. Prepare these tables for further analysis by creating csv files.

In [6]:
import pandas as pd
import numpy as np
import re

import cptac
import cptac.pancan as pc
import cptac.utils as u
import plot_utils as p

In [2]:
# Print the installed cptac version alongside the results produced below.
print('cptac version:', cptac.version())

cptac version: 0.8.6


In [3]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings are hidden as well.
warnings.filterwarnings('ignore')

In [88]:
def all_prot_format_df(cancer_object, all_prot, gene_in = 'PTEN', utils = u):
    '''
    Build the t-test input table: trans proteomics plus a two-level Mutation column.

    Params
    cancer_object: Object. The loaded cancer data set.
    all_prot: List. trans proteins used to get proteomics data.
    gene_in: String. Gene used to get mutation status for the Mutation col.
    utils: utils package from cptac.

    Returns a dataframe of tumor samples with umich trans proteomics (the
    '_umich_proteomics' suffix is stripped from the column names) and a
    Mutation column holding only 'Wildtype_Tumor' and 'Deletion'.
    '''
    mut_type = cancer_object.get_genotype_all_vars(gene_in, omics_source = 'washu') # washu has cnv

    # Exact comparison: `x in ('pancanluad')` is a substring test against a plain
    # string (the parentheses do not create a tuple), so it only worked by accident.
    if cancer_object.get_cancer_type() != 'pancanluad':
        # Keep only tumor samples from proteomics
        prot_and_mutations = cancer_object.join_omics_to_mutations(
            mutations_genes = [gene_in], omics_df_name = 'proteomics', omics_source = 'umich',
            omics_genes = all_prot, tissue_type = 'tumor')

        # Reduce a multiindex
        if isinstance(prot_and_mutations.columns, pd.MultiIndex):
            prot_and_mutations = utils.reduce_multiindex(prot_and_mutations, levels_to_drop = 1)
            prot_and_mutations = rename_duplicate_cols(prot_and_mutations) # add '_i' to distinguish

        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        mut_type = mut_type[['Mutation']] # get mutation column (created in get_genotype_all_vars)
        # Keep only proteomics.
        # NOTE(review): assumes the join appends exactly 4 trailing mutation columns - confirm.
        prot_df = prot_and_mutations.iloc[:, :-4]
        merged = prot_df.join(mut_type)

        # Keep only Wildtype and deletion
        keep = merged['Mutation'].isin(['Wildtype_Tumor', 'Deletion'])
        mut_wt = merged[keep]

    else:
        # Luad has no somatic mutations for PTEN which changes some things.
        # get_genotype_all_vars add cnv data under the column named after the gene
        mut_type = mut_type.drop(columns = gene_in)
        # different code because no somatic mutation data for pten (can't join to somatic mutations)
        omics = cancer_object.join_omics_to_omics(
            df1_name = 'CNV', df2_name = 'proteomics', df1_source = 'washu',
            df2_source = 'umich', genes1 = gene_in, genes2 = all_prot,
            tissue_type = 'tumor')
        omics = utils.reduce_multiindex(omics, levels_to_drop = 1, flatten = True)
        omics = rename_duplicate_cols(omics)
        # Drop the cis CNV column; built from gene_in instead of a hard-coded 'PTEN'.
        omics = omics.drop(columns = gene_in + '_washu_CNV')

        # Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics
        merged = omics.join(mut_type)
        # Keep only No_Mutation (wildtype) and deletion
        keep = merged['Mutation'].isin(['No_Mutation', 'Deletion'])
        # Explicit copy so the relabelling below writes to an independent frame
        # rather than to a slice of `merged`.
        mut_wt = merged[keep].copy()
        # Relabel so both branches return the same two Mutation levels.
        mut_wt['Mutation'] = np.where(
            mut_wt['Mutation'] == 'No_Mutation', 'Wildtype_Tumor', 'Deletion')

    # Strip the source suffix; duplicate markers such as '_1' stay on the name.
    mut_wt = mut_wt.rename(columns = lambda col: re.sub(r'_umich_proteomics', '', col))
    return mut_wt


In [63]:
def get_change_in_medians_df(cancer_object, cancer_name, del_wt_df, all_prot_list):
    '''
    Compute the per-protein change in median abundance (mutant - wildtype).

    Params
    cancer_object: Object of the loaded cancer data set.
    cancer_name: String. Name to add to the created df.
    del_wt_df: DataFrame. df of only samples with PTEN deletion and wildtype.
    all_prot_list: List. All proteins in proteomics df.

    Returns a df with columns 'Umich_Proteomics' and '<cancer_name>_Median' holding
    the differential expression between proteomics with PTEN del and wt (del - wt).
    Positive means the mutant group is up compared to wt, negative means down.
    Proteins missing from either group's medians are left out.
    '''
    # The 'pancanucec' data set is compared on truncations instead of deletions.
    # Exact comparison: `x in ('pancanucec')` is a substring test on a plain string.
    if cancer_object.get_cancer_type() == 'pancanucec':
        mutant_label = 'Truncation'
    else:
        mutant_label = 'Deletion'

    mutant = del_wt_df[del_wt_df.Mutation == mutant_label]
    wt = del_wt_df[del_wt_df.Mutation == 'Wildtype_Tumor']

    # numeric_only=True skips the string Mutation column; without it recent
    # pandas versions raise instead of silently dropping non-numeric columns.
    mutant_med = mutant.median(numeric_only=True)
    wt_med = wt.median(numeric_only=True)

    med_dict = {
        prot: mutant_med[prot] - wt_med[prot]
        for prot in all_prot_list
        if prot in mutant_med.index and prot in wt_med.index
    }

    df = pd.DataFrame.from_dict(med_dict, orient='index', columns=[cancer_name+'_Median'])
    df = df.reset_index().rename(columns={'index':'Umich_Proteomics'})

    return df
        

In [53]:
def rename_duplicate_cols(df):
    ''' Adds '_i' to duplicate col names where the int i increases by one with each duplicate.
    The first occurrence keeps its name. A suffix is skipped when the resulting name is
    already a column, so the result is unique even if e.g. 'A_1' existed beforehand.
    The columns of df are replaced in place.
    Returns a df with unique column names. '''
    if df.columns.is_unique:
        return df

    taken = set(df.columns)   # every label in use, original or newly generated
    seen = set()              # labels whose first occurrence has been passed
    next_suffix = {}          # label -> next suffix number to try
    new_cols = []

    for label in df.columns:
        if label not in seen:
            seen.add(label)
            new_cols.append(label)
            continue

        i = next_suffix.get(label, 1)
        candidate = f'{label}_{i}'
        while candidate in taken:   # do not reuse a name that already exists
            i += 1
            candidate = f'{label}_{i}'
        next_suffix[label] = i + 1
        taken.add(candidate)
        new_cols.append(candidate)

    # rename the columns with the new labels, keeping the name of the column index.
    df.columns = pd.Index(new_cols, name=df.columns.name)
    return df

#  Step 1: Create data frames with p-values and differential expressions

Each cancer needs a data frame containing only samples that have PTEN cnv deletions and PTEN wildtype with trans proteomics. Use wrap_ttest to run t-tests for all genes in the proteomic data frame. Use get_change_in_medians_df to create the data frame with differential expression values. Merge both data frames. 

Load in cancer data sets from cptac. 

In [17]:
# Load one pancan data set object per cancer type. The short names are the
# handles used by the cells below.
en = pc.PancanUcec()   # endometrial (exported as 'EC')
h = pc.PancanHnscc()   # HNSCC
l = pc.PancanLuad()    # LUAD
ls = pc.PancanLscc()   # LSCC
o = pc.PancanOv()      # OV
r = pc.PancanCcrcc()   # ccRCC
co = pc.PancanCoad()   # CO
g = pc.PancanGbm()     # GBM
b = pc.PancanBrca()    # BR

Loading washuluad v1.0.                          

  exec(code_obj, self.user_global_ns, self.user_ns)


                                                 

In [16]:
# Fetch every pancan data set (the recorded output shows this prompts for a Box login).
# NOTE(review): redownload = True presumably forces a fresh download even when the
# files are already present - confirm against the cptac docs before re-running.
pc.download('all', redownload = True)

Please login to Box on the webpage that was just opened and grant access for cptac to download files through your account. If you accidentally closed the browser window, press Ctrl+C and call the download function again.
                                                 

True

# Gbm

Part 1: Format data frame using all_prot_format_df. This returns a dataframe with proteomics and a Mutation column containing only PTEN cnv deletions and wildtype tumors. 

In [58]:
gene = 'PTEN'

# umich proteomics without PTEN itself (cis effect), leaving only trans proteins.
g_prot = g.get_proteomics(source='umich').drop(columns='PTEN')
# Gene names are level 0 of the column index.
g_prot_list = g_prot.columns.get_level_values(0).tolist()

# Proteomics plus a Mutation column limited to PTEN cnv deletions and wildtype tumors.
g_del_wt = all_prot_format_df(g, g_prot_list)
g_del_wt.head()

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  return array(a, dtype, copy=False, order=order)


Name,ARF5,M6PR,ESRRA,FKBP4,NDUFAF7,FUCA2,DBNDD1,HS3ST1,CYP51A1,USP28,...,AP1S2_3,EED,DDHD1_1,WIZ,ZBTB3,CTNND1_2,WIZ_1,WIZ_2,MSANTD2,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00104,-0.254733,-0.138938,0.662227,-0.074868,0.213529,0.123398,2.105458,,-0.660264,,...,-0.446007,0.070101,0.16009,0.129282,-0.166678,,-0.073448,-0.02902,-0.009463,Deletion
C3L-00365,-0.138512,-0.82452,0.494419,0.043783,-0.001394,0.101477,-0.287232,,-0.402679,0.059975,...,-0.844985,-0.234406,-0.448783,0.078407,,-0.17478,1.80695,0.033808,0.180578,Deletion
C3L-00674,-0.351464,-0.120197,-0.084371,-0.260275,0.096193,-0.360274,0.227499,1.217058,-0.165751,0.010124,...,-0.096328,-0.118506,-0.10769,0.17753,,-1.513869,0.065973,-0.126149,0.465241,Deletion
C3L-00677,-0.062869,0.094198,0.39107,-0.030638,0.742258,-0.417291,-0.013377,,-0.176649,0.535304,...,0.320945,-0.027522,0.104278,0.049948,-0.590267,0.162686,1.96457,0.161229,0.28381,Deletion
C3L-01040,-0.365351,0.070523,-0.472543,-0.255288,0.096844,0.356271,1.18294,,-0.30743,,...,-0.298907,-0.201144,0.440215,0.110757,0.119013,,0.030719,0.066426,0.189187,Deletion


Part 2: Run T-tests. Create a data frame of just significant comparisons and another data frame for all comparisons. A moderately stringent correction is used (FDR_BH).

In [59]:
# Every column except the trailing Mutation label is a protein to test.
g_cols = g_del_wt.columns[:-1].tolist()

# Get all pvals (fdr_bh correction), then relabel the result columns for GBM.
g_pval = u.wrap_ttest(
    g_del_wt, 'Mutation', g_cols,
    return_all=True, correction_method='fdr_bh', mincount=5,
).rename(columns={'Comparison': 'Umich_Proteomics', 'P_Value': 'GBM_P_Value'})

In [60]:
# Display the GBM p-value table returned by wrap_ttest.
g_pval

Unnamed: 0,Umich_Proteomics,GBM_P_Value
0,CUL2,4.572167e-07
1,DOCK1_1,4.572167e-07
2,ATE1_1,2.291619e-06
3,GDI2,2.291619e-06
4,CUTC,2.291619e-06
...,...,...
11568,THAP10,9.996790e-01
11569,ZNF639,9.998768e-01
11570,HSDL2,9.999004e-01
11571,ALDOB,9.999004e-01


Part 3: Create the differential expression df. (median of tumors with PTEN cnv deletions - median of tumors with PTEN wildtype)

In [64]:
# Protein columns only: everything except the trailing Mutation label.
# (`list(g_del_wt[:-1])` sliced off the last ROW and then listed every column,
# Mutation included, instead of dropping the last column.)
g_cols = list(g_del_wt.columns[:-1])
# Change in medians per protein (deletion - wildtype).
g_med = get_change_in_medians_df(g, "GBM", g_del_wt, g_cols)
g_med

Unnamed: 0,Umich_Proteomics,GBM_Median
0,ARF5,0.046849
1,M6PR,-0.104081
2,ESRRA,-0.038006
3,FKBP4,0.104346
4,NDUFAF7,-0.038037
...,...,...
12996,ZBTB3,0.231032
12997,CTNND1_2,0.271572
12998,WIZ_1,0.220366
12999,WIZ_2,0.072386


Part 4: Merge the p-value and differential expression dfs.

In [65]:
# Outer merge keeps proteins that have a median difference but no t-test result
# (and the other way round).
g_merged = g_pval.merge(g_med, on='Umich_Proteomics', how='outer')
# Strip any leftover source suffix. The optional '_umich' part matches the
# '_umich_proteomics' suffix carried by umich-sourced columns, so no stray
# '_umich' is left behind; a bare '_proteomics' is still removed as before.
g_merged = g_merged.replace(to_replace = r'(?:_umich)?_proteomics', value = '', regex = True)

# Create csv
g_merged.to_csv('csv/Single_Cancer/GBM_pval_medians_pancan.csv',index=False)
g_merged.head()

Unnamed: 0,Umich_Proteomics,GBM_P_Value,GBM_Median
0,CUL2,4.572167e-07,-0.458427
1,DOCK1_1,4.572167e-07,-0.358361
2,ATE1_1,2.291619e-06,-0.481942
3,GDI2,2.291619e-06,-0.486754
4,CUTC,2.291619e-06,-0.490992


# Repeat for other cancers.

In [97]:
# Cancer label -> loaded data set, consumed by the per-cancer loop below.
# NOTE(review): currently narrowed to OV only; the trailing comment holds the wider
# set of cancers - confirm which set is intended before regenerating the csv files.
cancer_objects = {'OV':o} #{'HNSCC':h, 'LUAD':l, 'LSCC':ls, 'BR':b, 'CO':co, 'OV':o}

In [98]:
# Scratch cell: steps through the CNV-based branch of all_prot_format_df
# (written for Luad, which has no somatic mutations for PTEN) on the OV data set.

gene_in = 'PTEN'
# NOTE: prot_and_iso is assigned inside the per-cancer loop cell, so that cell
# has to be run before this one.
all_prot = prot_and_iso

mut_type = o.get_genotype_all_vars(gene_in, omics_source = 'washu') # washu has cnv
# get_genotype_all_vars add cnv data under the column named after the gene
mut_type = mut_type.drop(columns= gene_in)
# different code because no somatic mutation data for pten (can't join to somatic mutations)
omics = o.join_omics_to_omics(df1_name = 'CNV', df2_name='proteomics', df1_source = 'washu',
                              df2_source = 'umich', genes1= gene_in, genes2= all_prot,
                              tissue_type = 'tumor')
# `u` is the cptac.utils alias imported at the top; the name `utils` only exists
# as a parameter inside all_prot_format_df and is undefined at notebook level.
omics = u.reduce_multiindex(omics, levels_to_drop = 1, flatten = True)
omics = rename_duplicate_cols(omics)
# Drop the cis CNV column; built from gene_in instead of a hard-coded 'PTEN'.
omics = omics.drop(columns = gene_in + '_washu_CNV')

AttributeError: 'NoneType' object has no attribute 'join_omics_to_mutations'

In [99]:
# Probe for CNV data in the umich source of the OV data set. The recorded output is a
# DataFrameNotIncludedError; the cells above read CNV from the washu source instead.
o.get_CNV('umich')

DataFrameNotIncludedError: CNV dataframe not included in the umichov dataset.

In [85]:
# Merge Mutation column from get_genotype_all_vars (includes cnv) with proteomics.
# DataFrame.join aligns the two frames on their index.
merged = omics.join(mut_type)
# Display the joined table for inspection.
merged

Name,A1BG_umich_proteomics,A1CF_umich_proteomics,A2M_umich_proteomics,A2ML1_umich_proteomics,A4GALT_umich_proteomics,AAAS_umich_proteomics,AACS_umich_proteomics,AADAC_umich_proteomics,AADAT_umich_proteomics,AAGAB_umich_proteomics,...,ZSWIM8_umich_proteomics,ZSWIM9_umich_proteomics,ZW10_umich_proteomics,ZWILCH_umich_proteomics,ZWINT_umich_proteomics,ZXDC_umich_proteomics,ZYG11B_umich_proteomics,ZYX_umich_proteomics,ZZEF1_umich_proteomics,Mutation
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11LU013,-0.522000,0.156352,-0.927449,,,-0.072449,-0.423627,-1.564835,0.307324,0.392956,...,0.230056,,0.469225,-0.110041,0.424662,0.085748,0.052731,-0.070704,-0.398572,Deletion
11LU016,-0.416894,,-1.034393,,,0.042920,0.854008,-1.745078,,0.141026,...,-0.217082,,0.504543,0.690377,0.130886,0.239890,0.073790,-0.128560,-0.063684,Deletion
11LU022,-0.114286,,-0.474074,,,0.184832,0.198384,-3.002410,,0.645675,...,0.018950,,-0.023673,0.370696,0.715778,0.281387,0.055611,-0.136202,-0.148158,No_Mutation
11LU035,-0.138902,,-0.525977,-0.080078,,0.164643,-0.122591,-0.976153,,0.662650,...,-0.092290,-0.125483,0.224966,0.778338,0.704170,-0.320516,-0.085086,-0.098488,-0.209858,No_Mutation
C3L-00001,-0.779792,,-1.054531,,,0.120052,-0.249726,-0.660498,,-0.097150,...,-0.232436,,0.286752,-0.284326,-0.143142,,0.253291,-0.184048,-0.297789,Deletion
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-02582,-0.892348,,-1.200360,,,0.161562,0.884322,-1.422920,,0.457689,...,0.089350,,0.262830,-0.258906,-0.248340,-0.480880,0.330258,-0.518121,-0.292263,Deletion
C3N-02586,-0.417899,,-0.109325,-0.758060,,0.154245,-0.012534,1.446736,,0.144611,...,-0.101379,0.079364,-0.090472,-0.220882,-0.684668,-0.197353,0.301477,-0.281588,-0.164234,Deletion
C3N-02587,-0.309479,-0.252554,-0.525969,,,-0.095018,0.515820,-1.650673,-0.620950,0.104104,...,0.005175,0.131856,0.377512,-0.993616,-0.483909,-0.543046,0.112943,-0.405470,0.088355,No_Mutation
C3N-02588,-0.611550,,-0.982055,,,-0.268940,0.272542,-1.333260,0.570812,0.172053,...,-0.257331,,0.104694,0.136598,0.569153,0.953944,0.288042,-0.265337,-0.224152,No_Mutation


In [89]:
import warnings
warnings.filterwarnings('ignore')

gene = 'PTEN'
merged_dfs = {}
min_num = 5  # mincount passed to wrap_ttest for every cancer

for cancer, cancer_obj in cancer_objects.items():
    print(cancer)

    # umich proteomics without PTEN itself (cis effect)
    prot = cancer_obj.get_proteomics(source='umich')
    prot = prot.drop(columns='PTEN')

    # Reduce a multiindex
    if isinstance(prot.keys(), pd.core.indexes.multi.MultiIndex):
        prot = u.reduce_multiindex(prot, levels_to_drop=1)
    prot_list = list(prot.columns)

    # Format df for t-test
    del_wt = all_prot_format_df(cancer_obj, prot_list)

    # Get list of unique cols for t-test and get_change_in_medians
    prot_and_iso = list(del_wt.columns[:-1])

    # Get all pvals
    all_pval = u.wrap_ttest(
        del_wt, 'Mutation', prot_and_iso,
        return_all=True, correction_method='fdr_bh', mincount=min_num,
    )
    all_pval = all_pval.rename(
        columns={'Comparison': 'Umich_Proteomics', 'P_Value': f'{cancer}_P_Value'})

    # Get change in medians
    delta_median_df = get_change_in_medians_df(cancer_obj, cancer, del_wt, prot_and_iso)

    # Merge pval_df and delta_median_df
    pval_medians_df = all_pval.merge(delta_median_df, on='Umich_Proteomics', how='outer')
    # Kept in memory as well: the significance-count cells read merged_dfs.
    merged_dfs[cancer] = pval_medians_df

    # Create csv
    pval_medians_df.to_csv(f'csv/Single_Cancer/{cancer}_pval_medians_pancan.csv', index=False)
    print(pval_medians_df.head(), '\n')

LUAD
  Umich_Proteomics  LUAD_P_Value  LUAD_Median
0           LANCL2      0.000933     0.386394
1             EGFR      0.002249     0.598962
2             WAPL      0.007815    -0.183739
3           ATE1_1      0.011157    -0.315236
4             CDK4      0.015576     0.172922 

LSCC
  Umich_Proteomics  LSCC_P_Value  LSCC_Median
0         ABRAXAS2      0.000051    -0.195523
1          CCDC186      0.000107    -0.303387
2          DENND10      0.000159    -0.190829
3            SLK_1      0.000159    -0.288777
4             GBF1      0.000159    -0.219999 

BR
  Umich_Proteomics  BR_P_Value  BR_Median
0           RHBDD3    0.759532   0.676329
1         IGHV4-34    0.759532   0.817872
2           CEMIP2    0.759532   0.475073
3           SEMA3F    0.759532  -0.779007
4             ITCH    0.759532   0.272043 

CO


AttributeError: 'NoneType' object has no attribute 'join_omics_to_mutations'

# Endometrial

The Endometrial data set does not have enough cnv deletions to perform a t-test, however the data set does have enough truncation type mutations (nonsense and frameshift). Different code is needed to create the data frame for Endometrial.

In [117]:
gene = 'PTEN'
prot = en.get_proteomics(source = 'umich')
prot = u.reduce_multiindex(prot, levels_to_drop = 1)
prot = prot.drop(columns = gene) # cis effect
e_prot_list = list(prot.columns)

# Get mutations based on priority filter when a sample has multiple mutations
mut_type = en.get_genotype_all_vars(gene, omics_source = 'washu')

# merge cnv with genotype all mut type
prot_and_mutations = en.join_omics_to_mutations(
    mutations_genes = [gene], omics_df_name = 'proteomics', omics_source='umich', omics_genes = e_prot_list, tissue_type = 'tumor') # drop Normal samples
prot_and_mutations = u.reduce_multiindex(prot_and_mutations, levels_to_drop = 1)

prot_df = prot_and_mutations.iloc[:,:-4] #drop Mutation and Location cols
mut_type = mut_type[['Mutation']] # Get Mutation col that includes CNV
merged = prot_df.join(mut_type) # merge

# Create Truncation category and keep truncation and wt
compare = ['Wildtype_Tumor','Nonsense_Mutation','Frame_Shift_Ins','Frame_Shift_Del']
get = merged['Mutation'].isin(compare)
# Explicit copy so the relabelling below writes to an independent frame rather
# than to a slice of `merged`.
trunc_wt = merged[get].copy()
# Everything that is not wildtype at this point is one of the truncating types above.
trunc_wt['Mutation'] = np.where(
                trunc_wt['Mutation'] == 'Wildtype_Tumor', 'Wildtype_Tumor', 'Truncation')
trunc_wt = rename_duplicate_cols(trunc_wt)

In [127]:
# Every column except the trailing Mutation label is a protein to test.
e_cols = trunc_wt.columns[:-1].tolist()

# Get all pvals (fdr_bh correction), then relabel the result columns for EC.
e_pval = u.wrap_ttest(
    trunc_wt, 'Mutation', e_cols,
    return_all=True, correction_method='fdr_bh', mincount=5,
).rename(columns={'Comparison': 'Umich_Proteomics', 'P_Value': 'EC_P_Value'})

Differential expression with adaptation for truncation mutations.

In [132]:
t = trunc_wt[trunc_wt.Mutation == "Truncation"]
wt = trunc_wt[trunc_wt.Mutation == "Wildtype_Tumor"]
# numeric_only=True skips the string Mutation column; without it recent pandas
# versions raise instead of silently dropping non-numeric columns.
trunc_med = t.median(numeric_only=True)
wt_med = wt.median(numeric_only=True)

# Change in medians (truncation - wildtype) for every protein present in both groups.
# A comprehension keeps its variable local, so the notebook-level `prot` DataFrame
# is no longer overwritten by a loop variable of the same name.
en_d = {
    name: trunc_med[name] - wt_med[name]
    for name in e_cols
    if name in trunc_med.index and name in wt_med.index
}

en_med = pd.DataFrame.from_dict(en_d, orient='index', columns=['EC_Median'])
en_med = en_med.reset_index().rename(columns={'index':'Umich_Proteomics'})

In [133]:
# Outer-merge p-values with median differences, then strip the source suffix
# from any string values.
e_merged = (
    e_pval
    .merge(en_med, on='Umich_Proteomics', how='outer')
    .replace(to_replace=r'_umich_proteomics', value='', regex=True)
)

# Create csv
e_merged.to_csv('csv/Single_Cancer/EC_pval_medians_pancan.csv', index=False)
e_merged.head()

Unnamed: 0,Umich_Proteomics,EC_P_Value,EC_Median
0,TJP3,0.002105,0.562653
1,ABT1,0.002105,-0.479251
2,WDR75,0.002105,-0.384194
3,NOL10,0.002105,-0.366391
4,REXO4,0.002105,-0.444003


# Get number of significant genes in each cancer

In [134]:
# Register the separately computed Gbm and En tables next to the loop results.
merged_dfs.update(GBM=g_merged, EC=e_merged)

In [136]:
# Using corrected pvals from wrap_ttest, each cancer's cutoff is 0.05
all_sig = []

for cancer, df in merged_dfs.items():
    sig_df = df.loc[df[cancer+'_P_Value'] < 0.05]
    print(cancer, 'sig comparisons:', len(sig_df))
    sig_list = list(sig_df['Umich_Proteomics'])
    all_sig.append(sig_list)


flat_list = [item for sublist in all_sig for item in sublist] #change list of lists, to just one list
# keep only one if gene in multiple cancer sig lists; sorted so the exported list
# has the same order on every run (plain set iteration order is not stable).
sig = sorted(set(flat_list))
print('\nNumber of significant comparisons in at least 1 cancer:', len(sig))

LUAD sig comparisons: 17
LSCC sig comparisons: 1368
BR sig comparisons: 0
GBM sig comparisons: 603
EC sig comparisons: 490

Number of significant comparisons in at least 1 cancer: 2246


# Get a list of significant genes in at least one cancer

In [137]:
# Export the proteins that are significant in at least one cancer.
s = pd.Series(sig)
print('Number of significant genes in at least one cancer:', s.size)
s.to_csv('csv/list_sig_one_cancer_pancan.csv', index=False)

Number of significant genes in at least one cancer: 2246


# Get a list of significant genes in multiple cancers

In [138]:
# sig in multiple cancers
from collections import Counter

# Count every name once up front instead of calling flat_list.count() per element
# (which rescans the whole list each time). Sorted so the order is stable across runs.
sig_counts = Counter(flat_list)
mult = sorted(name for name, n in sig_counts.items() if n > 1) # Keep genes sig in more than 1 cancer
print('Number of significant genes in mult cancers:', len(mult))

Number of significant genes in mult cancers: 223


In [139]:
# Export the multi-cancer list, dropping a trailing '_proteomics' suffix if present.
m = pd.Series(mult).replace(to_replace=r'_proteomics$', value='', regex=True)
m.to_csv('csv/list_sig_multiple_cancers_pancan.csv', index=False)