# Data munging EGFR: trans effects 

For both the flagship and the harmonized data, this notebook records the Pearson correlation statistics for EGFR proteomics versus all other proteomics in each cancer type. All p-values are FDR-corrected using the Benjamini-Hochberg procedure, and every corrected p-value is recorded. These data are then used in the Make_Supplemental_Tables notebook.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest


import cptac
import cptac.utils as u
import plot_utils as p
import warnings
# Silences every warning for the rest of the session (no category or module filter is given).
warnings.filterwarnings('ignore')

  import pandas.util.testing as tm


In [2]:
def create_prot_list(df):
    """Label duplicate column names in place and return the full column list.

    The notebook describes duplicate proteomics columns as arising from
    multiple isoforms. The first occurrence of a name is left untouched;
    each later occurrence gets a numeric suffix ('_1', '_2', ...) in the
    order the columns appear.

    Parameters
    ----------
    df : pandas.DataFrame
        Proteomics data frame. Its ``columns`` attribute is overwritten in
        place with the relabelled names.

    Returns
    -------
    list
        All column labels of ``df`` after relabelling.

    Notes
    -----
    A pre-existing column that already looks like a generated label
    (e.g. 'X_1' next to two 'X' columns) is not checked for, so such input
    can still end up with a repeated label.
    """
    occurrences = {}
    relabelled = []
    # Single pass: count how often each name has been seen so far and use
    # that count as the suffix for every repeat.
    for name in df.columns:
        seen_before = occurrences.get(name, 0)
        occurrences[name] = seen_before + 1
        relabelled.append(name if seen_before == 0 else name + '_' + str(seen_before))

    # Keep the columns' index name so the frame's metadata is unchanged.
    df.columns = pd.Index(relabelled, name=df.columns.name)

    #get list of proteins
    return df.columns.values.tolist()

In [3]:
def make_trans_df(cancer):
    """Build the EGFR trans-effects table for one cancer dataset.

    Steps:
      1. Get the proteomics table restricted to tumor samples.
      2. If the columns are a MultiIndex, reduce them with
         ``cptac.utils.reduce_multiindex`` (dropping level 1), then label
         any duplicate column names with ``create_prot_list``.
      3. Run ``plot_utils.wrap_pearson_corr`` with EGFR as the reference
         column against every column in the protein list, requesting
         'fdr_bh' correction and all results (not only significant ones).

    Parameters
    ----------
    cancer : cptac dataset object
        e.g. ``cptac.Gbm()``. Must provide ``get_proteomics(tissue_type=...)``.

    Returns
    -------
    Whatever ``wrap_pearson_corr`` returns. Presumably a DataFrame with
    'Comparison', 'Correlation' and 'P_value' columns, since callers in this
    notebook rename the latter two -- TODO confirm against plot_utils.

    Notes
    -----
    The comparison list is every proteomics column, so EGFR itself is among
    the columns it is compared against.
    """
    tumor_prot = cancer.get_proteomics(tissue_type="tumor")

    # Use the public pandas name for the MultiIndex check rather than the
    # private pandas.core path.
    if isinstance(tumor_prot.columns, pd.MultiIndex):
        tumor_prot = u.reduce_multiindex(tumor_prot, levels_to_drop=1)

    # create_prot_list relabels duplicate columns on tumor_prot in place, so
    # the names in the list match the frame handed to wrap_pearson_corr.
    prot_genes_list = create_prot_list(tumor_prot)
    trans_df = p.wrap_pearson_corr(
        tumor_prot,
        "EGFR",
        comparison_columns=prot_genes_list,
        correction_method='fdr_bh',
        return_all=True,
        return_corrected_pvals=True,
    )
    return trans_df
    

In [4]:
# Display the installed cptac version so the run can be tied to a specific release.
cptac.version()   

'0.9.1'

In [5]:
# Load cptac data: one dataset object per cancer type.
# Each object is handed to make_trans_df in the per-cancer cells that follow.
brain = cptac.Gbm()
kidney = cptac.Ccrcc()
ovar = cptac.Ovarian()
colon = cptac.Colon()
brca = cptac.Brca()
luad = cptac.Luad()
hnscc = cptac.Hnscc()
lscc = cptac.Lscc()

                                            

GBM

In [6]:
# Example cell: build the GBM table, tag the statistic columns with the
# cancer type, write it out without the index, then display it.
gbm_df = make_trans_df(brain).rename(
    columns={
        "Correlation": "Correlation_GBM",
        "P_value": "P_value_GBM",
    }
)
gbm_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_GBM', index=False)
gbm_df

Unnamed: 0,Comparison,Correlation_GBM,P_value_GBM
2728,EGFR,1.000000,0.000000e+00
6656,PHLDA1,0.816848,3.507071e-21
3757,GRB2,-0.610889,6.729990e-08
8734,SOCS2,0.562720,3.420388e-06
1528,CDH4,0.559180,3.420388e-06
...,...,...,...
7216,PSMB4,-0.000033,9.998937e-01
7161,PRPF39,0.000123,9.998937e-01
9703,TRAPPC6B,0.000086,9.998937e-01
5351,MKI67,-0.000098,9.998937e-01


Kidney 

In [7]:
# ccRCC: build the table, tag the statistic columns with the cancer type,
# and write it out without the index.
kidney_df = make_trans_df(kidney).rename(
    columns={
        "Correlation": "Correlation_ccRCC",
        "P_value": "P_value_ccRCC",
    }
)
kidney_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_ccRCC', index=False)

Ovarian 

In [8]:
# Ovarian: build the table, tag the statistic columns with the cancer type,
# and write it out without the index.
ovarian_df = make_trans_df(ovar).rename(
    columns={
        "Correlation": "Correlation_OV",
        "P_value": "P_value_OV",
    }
)
ovarian_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_OV', index=False)

Colon

In [9]:
# Colon: build the table, tag the statistic columns with the cancer type,
# and write it out without the index.
colon_df = make_trans_df(colon).rename(
    columns={
        "Correlation": "Correlation_CO",
        "P_value": "P_value_CO",
    }
)
colon_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_CO', index=False)

Brca 

In [10]:
# BRCA: build the table, tag the statistic columns with the cancer type,
# and write it out without the index.
brca_df = make_trans_df(brca).rename(
    columns={
        "Correlation": "Correlation_BR",
        "P_value": "P_value_BR",
    }
)
brca_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_BR', index=False)

Luad

In [11]:
# LUAD: build the table, tag the statistic columns with the cancer type,
# and write it out without the index.
luad_df = make_trans_df(luad).rename(
    columns={
        "Correlation": "Correlation_LUAD",
        "P_value": "P_value_LUAD",
    }
)
luad_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_LUAD', index=False)

Hnscc

In [12]:
# HNSCC: build the table, tag the statistic columns with the cancer type,
# and write it out without the index.
hnscc_df = make_trans_df(hnscc).rename(
    columns={
        "Correlation": "Correlation_HNSCC",
        "P_value": "P_value_HNSCC",
    }
)
hnscc_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_HNSCC', index=False)

Lscc

In [13]:
# LSCC: build the table, tag the statistic columns with the cancer type,
# and write it out without the index.
lscc_df = make_trans_df(lscc).rename(
    columns={
        "Correlation": "Correlation_LSCC",
        "P_value": "P_value_LSCC",
    }
)
lscc_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_LSCC', index=False)

# Harmonized

In [15]:
# Harmonized (pancan) dataset objects, one per cancer type; each is handed to
# make_trans_df in the cells that follow.
# NOTE(review): this import sits mid-notebook rather than in the import cell,
# and the execution count jumps from 13 to 15 here -- confirm the notebook
# still passes Restart & Run All.
import cptac.pancan as pc
pc_g = pc.PancanGbm()
pc_hn = pc.PancanHnscc()
pc_l = pc.PancanLuad()
pc_o = pc.PancanOv()
pc_c = pc.PancanCcrcc()
pc_col = pc.PancanCoad()
pc_b = pc.PancanBrca()
pc_ls = pc.PancanLscc()

                                                 

Gbm

In [16]:
# Example cell (harmonized GBM): build the table, tag the statistic columns
# with the cancer type, write it out without the index, then display it.
pc_gbm_df = make_trans_df(pc_g).rename(
    columns={
        "Correlation": "Correlation_GBM",
        "P_value": "P_value_GBM",
    }
)
pc_gbm_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_GBM_Harmonized', index=False)
pc_gbm_df

Unnamed: 0,Comparison,Correlation_GBM,P_value_GBM
2813,EGFR,1.000000,0.000000e+00
2476,PHLDA1,0.789551,9.649849e-19
11243,CDH4,0.656331,4.744022e-10
5916,GRB2,-0.589008,3.310325e-07
7319,PHLDA3,0.561528,2.006861e-06
...,...,...,...
6514,UBE2H,-0.000084,9.996825e-01
5567,OSBP2,0.000071,9.997027e-01
7294,TMEM181,-0.000058,9.997128e-01
6509,TPRA1,-0.000068,9.997731e-01


Kidney

In [17]:
# Harmonized ccRCC: build the table, tag the statistic columns with the
# cancer type, and write it out without the index.
pc_kidney_df = make_trans_df(pc_c).rename(
    columns={
        "Correlation": "Correlation_ccRCC",
        "P_value": "P_value_ccRCC",
    }
)
pc_kidney_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_ccRCC_Harmonized', index=False)

Ovarian

In [18]:
# Harmonized ovarian: build the table, tag the statistic columns with the
# cancer type, and write it out without the index.
pc_ovarian_df = make_trans_df(pc_o).rename(
    columns={
        "Correlation": "Correlation_OV",
        "P_value": "P_value_OV",
    }
)
pc_ovarian_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_OV_Harmonized', index=False)

Colon

In [19]:
# Harmonized colon: build the table, tag the statistic columns with the
# cancer type, and write it out without the index.
pc_colon_df = make_trans_df(pc_col).rename(
    columns={
        "Correlation": "Correlation_CO",
        "P_value": "P_value_CO",
    }
)
pc_colon_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_CO_Harmonized', index=False)

Brca

In [20]:
# Harmonized BRCA: build the table, tag the statistic columns with the
# cancer type, and write it out without the index.
pc_brca_df = make_trans_df(pc_b).rename(
    columns={
        "Correlation": "Correlation_BR",
        "P_value": "P_value_BR",
    }
)
pc_brca_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_BR_Harmonized', index=False)

Luad

In [21]:
# Harmonized LUAD: build the table, tag the statistic columns with the
# cancer type, and write it out without the index.
pc_luad_df = make_trans_df(pc_l).rename(
    columns={
        "Correlation": "Correlation_LUAD",
        "P_value": "P_value_LUAD",
    }
)
pc_luad_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_LUAD_Harmonized', index=False)

Hnscc

In [22]:
# Harmonized HNSCC: build the table, tag the statistic columns with the
# cancer type, and write it out without the index.
pc_hnscc_df = make_trans_df(pc_hn).rename(
    columns={
        "Correlation": "Correlation_HNSCC",
        "P_value": "P_value_HNSCC",
    }
)
pc_hnscc_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_HNSCC_Harmonized', index=False)

Lscc

In [23]:
# Harmonized LSCC: build the table, tag the statistic columns with the
# cancer type, and write it out without the index.
pc_lscc_df = make_trans_df(pc_ls).rename(
    columns={
        "Correlation": "Correlation_LSCC",
        "P_value": "P_value_LSCC",
    }
)
pc_lscc_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_LSCC_Harmonized', index=False)