# Data munging EGFR: trans effects 

This notebook records the Pearson correlation statistics for EGFR proteomics versus every protein in the proteomics data, for each cancer type. It records all Benjamini-Hochberg FDR-corrected p-values. These results are then used in the Make_Supplemental_Tables notebook. 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest


import cptac
import cptac.utils as u
import plot_utils as p
import warnings
warnings.filterwarnings('ignore')
import cptac.pancan as pc

  import pandas.util.testing as tm


In [2]:
def create_prot_list(df):
    '''
    Label duplicate protein columns and return the list of all column names.

    Duplicate column names (caused by multiple isoforms of the same gene) are
    made unique: the first occurrence keeps its name and each later occurrence
    gets _1, _2, ... appended, in column order. If a suffixed name is already
    taken by another column, the next free number is used instead, so the
    resulting names are always unique.

    @Param df: Data frame of proteomic data (the data frame from get_proteomics).
        NOTE: df.columns is overwritten in place with the de-duplicated names.

    @Return: list of all (de-duplicated) column names, in column order.
    '''
    original_names = list(df.columns)
    taken = set(original_names)   # labels that must not be handed out as a suffixed name
    next_suffix = {}              # name -> suffix number to try for its next repeat
    unique_names = []

    # Single pass over the columns instead of rescanning them once per duplicated name
    for name in original_names:
        if name not in next_suffix:
            # First occurrence keeps the plain name
            next_suffix[name] = 1
            unique_names.append(name)
            continue

        suffix = next_suffix[name]
        label = f"{name}_{suffix}"
        # Skip numbers whose label already exists, otherwise duplicates would survive
        while label in taken:
            suffix += 1
            label = f"{name}_{suffix}"
        next_suffix[name] = suffix + 1
        taken.add(label)
        unique_names.append(label)

    # Rename on the caller's frame (callers use the renamed frame) and keep the columns name
    df.columns = pd.Index(unique_names, name=df.columns.name)

    #get list of proteins
    prot_genes_list = df.columns.values.tolist()
    return prot_genes_list

In [3]:
def make_trans_df(cancer, gene="EGFR", source="umich"):
    '''
    Make a df with all trans effects of one protein for one cancer type.

    @Param cancer: cptac data for a cancer. Ex: brain = pc.PancanGbm()
    @Param gene: proteomics column that all other proteins are compared against.
        Defaults to "EGFR".
    @Param source: data source passed to get_proteomics. Defaults to "umich".

    Step 1: Get proteomic data with only tumor samples and drop one level of the
    column multiindex.
    Step 2: Create the list of proteins with create_prot_list, which extracts the
    column names and labels duplicate columns caused by multiple isoforms.
    Step 3: Use the plot_utils function wrap_pearson_corr on the df to compare the
    proteomics of `gene` to all proteins in the protein list, requesting
    FDR (Benjamini/Hochberg) corrected p values for all comparisons.

    @Return: the data frame returned by wrap_pearson_corr.
    '''
    tumor_prot = cancer.get_proteomics(tissue_type="tumor", source=source)
    tumor_prot = u.reduce_multiindex(tumor_prot, levels_to_drop=1)

    # create_prot_list renames duplicate columns on tumor_prot itself, so the
    # returned names line up with the frame handed to wrap_pearson_corr below.
    prot_genes_list = create_prot_list(tumor_prot)
    trans_df = p.wrap_pearson_corr(
        tumor_prot,
        gene,
        comparison_columns=prot_genes_list,
        correction_method='fdr_bh',
        return_all=True,
        return_corrected_pvals=True,
    )
    return trans_df
    

In [4]:
# Show the installed cptac version so the results can be tied to a package release
cptac.version()   

'0.9.0'

In [5]:
# Load one cptac.pancan dataset object per cancer type. The per-cancer cells
# below pass these short names to make_trans_df.
g = pc.PancanGbm()  # GBM
hn = pc.PancanHnscc()  # HNSCC
l = pc.PancanLuad()  # LUAD
# Ovarian loader is disabled (its analysis cell below is commented out as well)
#o = pc.PancanOv()
c = pc.PancanCcrcc()  # ccRCC (kidney)
# Colon loader is disabled (its analysis cell below is commented out as well)
#col = pc.PancanCoad()
b = pc.PancanBrca()  # BRCA
ls = pc.PancanLscc()  # LSCC
en = pc.PancanUcec()  # UCEC

                                                 

GBM

In [13]:
# Preview the GBM tumor proteomics (umich source) with one column-index level dropped.
# Duplicate gene columns (several isoforms) are still unlabelled at this point.
df = u.reduce_multiindex(
    g.get_proteomics(source="umich", tissue_type="tumor"),
    levels_to_drop=1,
)
df

Name,ARF5,M6PR,ESRRA,FKBP4,NDUFAF7,FUCA2,DBNDD1,HS3ST1,CYP51A1,USP28,...,ETNK1,AP1S2,EED,DDHD1,WIZ,ZBTB3,CTNND1,WIZ,WIZ,MSANTD2
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00104,-0.254733,-0.138938,0.662227,-0.074868,0.213529,0.123398,2.105458,,-0.660264,,...,-0.047874,-0.446007,0.070101,0.160090,0.129282,-0.166678,,-0.073448,-0.029020,-0.009463
C3L-00365,-0.138512,-0.824520,0.494419,0.043783,-0.001394,0.101477,-0.287232,,-0.402679,0.059975,...,0.350165,-0.844985,-0.234406,-0.448783,0.078407,,-0.174780,1.806950,0.033808,0.180578
C3L-00674,-0.351464,-0.120197,-0.084371,-0.260275,0.096193,-0.360274,0.227499,1.217058,-0.165751,0.010124,...,-0.167319,-0.096328,-0.118506,-0.107690,0.177530,,-1.513869,0.065973,-0.126149,0.465241
C3L-00677,-0.062869,0.094198,0.391070,-0.030638,0.742258,-0.417291,-0.013377,,-0.176649,0.535304,...,0.179200,0.320945,-0.027522,0.104278,0.049948,-0.590267,0.162686,1.964570,0.161229,0.283810
C3L-01040,-0.365351,0.070523,-0.472543,-0.255288,0.096844,0.356271,1.182940,,-0.307430,,...,0.098253,-0.298907,-0.201144,0.440215,0.110757,0.119013,,0.030719,0.066426,0.189187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-03183,-0.154099,0.377664,0.230259,-0.160996,-0.345147,0.098204,-0.307796,0.540155,0.365869,0.093833,...,-0.053077,-0.744983,0.661918,-0.397211,-0.136850,,,,0.565946,-0.232044
C3N-03184,0.189578,-0.307571,,0.223307,-0.245702,0.015862,-0.014366,,0.366511,,...,0.141587,0.198692,0.379313,0.257048,0.669888,0.134818,-0.042240,,0.841351,
C3N-03186,0.216536,0.206218,-0.002028,-0.027721,0.022376,0.116589,0.412767,,-0.088390,0.131931,...,0.095028,-0.032430,-0.350445,-0.037268,-0.609430,,-0.274836,,-0.345816,0.383632
C3N-03188,0.020677,0.004811,-0.041030,-0.099233,-0.063813,-0.250591,-0.145725,-0.562148,0.194367,-0.406621,...,0.062744,-0.007752,-0.219871,0.271077,-0.312438,,,,-0.199850,0.252781


In [6]:
# GBM: build the trans-effects table, tag the statistic columns with the cancer
# type, save it for the supplemental-tables notebook, then display it.
gbm_df = make_trans_df(g).rename(
    columns={"Correlation": "Correlation_GBM", "P_value": "P_value_GBM"}
)
gbm_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_GBM', index=False)
gbm_df

Unnamed: 0,Comparison,Correlation_GBM,P_value_GBM
2813,EGFR,1.000000,0.000000e+00
2476,PHLDA1,0.789551,9.649849e-19
11243,CDH4,0.656331,4.744022e-10
5916,GRB2,-0.589008,3.310325e-07
7319,PHLDA3,0.561528,2.006861e-06
...,...,...,...
6514,UBE2H,-0.000084,9.996825e-01
5567,OSBP2,0.000071,9.997027e-01
7294,TMEM181,-0.000058,9.997128e-01
6509,TPRA1,-0.000068,9.997731e-01


In [9]:
# Rows for isoform duplicates: names that contain "_" (the suffix added by
# create_prot_list). The rows are selected directly from gbm_df so this cell
# does not depend on `matching`, which is only defined in a later cell and
# would raise NameError on Restart & Run All.
duplicates = gbm_df[gbm_df.Comparison.str.contains("_", regex=False)]
duplicates

Unnamed: 0,Comparison,Correlation_GBM,P_value_GBM
10706,EPB41L3_3,-0.510933,0.000036
10922,FLOT2_1,0.729748,0.000138
4307,HMGA1_1,-0.477182,0.000149
11302,ALDH7A1_2,0.456890,0.000364
7004,DCLK1_1,0.435497,0.000712
...,...,...,...
10558,EEF1D_1,-0.001091,0.995709
10591,LMO7_1,-0.001372,0.996216
6442,WLS_1,-0.001295,0.996578
8122,EPB41_1,-0.000573,0.997654


In [8]:
# All GBM comparison names, and the subset containing "_" (suffixed isoform duplicates)
gbm_genes = gbm_df["Comparison"].to_list()
matching = list(filter(lambda gene: "_" in gene, gbm_genes))
matching


['EPB41L3_3',
 'FLOT2_1',
 'HMGA1_1',
 'ALDH7A1_2',
 'DCLK1_1',
 'DPF3_1',
 'SRGAP3_1',
 'NFIX_2',
 'DTNA_1',
 'KCND3_1',
 'RALGAPA1_1',
 'DBI_1',
 'ABR_2',
 'ARHGAP27_1',
 'ALDH7A1_1',
 'PGM1_1',
 'ATE1_1',
 'COL6A3_1',
 'PML_4',
 'ZSCAN18_1',
 'VAPA_1',
 'CLIP2_1',
 'CD163_1',
 'LRRFIP1_1',
 'TPM3_4',
 'ZNF185_1',
 'OSBPL6_2',
 'IKBIP_1',
 'SH3GLB1_1',
 'CLUH_1',
 'GNAS_2',
 'TPM3_1',
 'SH3KBP1_1',
 'SEPTIN9_1',
 'COL14A1_1',
 'ANXA6_1',
 'P4HA1_1',
 'ACTN2_1',
 'DBNL_1',
 'TCF12_1',
 'PABPC4_1',
 'ARHGEF4_1',
 'MAGI1_1',
 'NECTIN3_1',
 'FAM126A_1',
 'EPB41L1_2',
 'PTPRS_1',
 'LAMP2_1',
 'MYL6_1',
 'RAP1GAP_1',
 'TPM1_2',
 'AMER2_1',
 'LIMS1_1',
 'CTNNA2_1',
 'SIRT3_1',
 'EHBP1_1',
 'DLG1_4',
 'CNOT4_1',
 'PPP2R2B_1',
 'CTIF_1',
 'LTBP3_1',
 'AMDHD2_1',
 'CDKN2A_1',
 'GTF2I_1',
 'MADD_2',
 'ABI2_1',
 'KCNAB2_1',
 'PCDH9_1',
 'SLC9A3R2_1',
 'SEPTIN8_1',
 'ITGA7_2',
 'DLG2_2',
 'MAP2_2',
 'CAPZB_1',
 'ACAN_1',
 'SYNE1_2',
 'RAP1GDS1_3',
 'CPEB2_1',
 'POSTN_2',
 'ZC3H14_1',
 'MYH14_1',


Kidney 

In [9]:
# ccRCC: build the trans-effects table, tag the statistic columns with the
# cancer type, and save it for the supplemental-tables notebook.
kidney_df = make_trans_df(c).rename(
    columns={"Correlation": "Correlation_ccRCC", "P_value": "P_value_ccRCC"}
)
kidney_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_ccRCC', index=False)

Ovarian 

In [None]:
# Disabled: the ovarian dataset is not loaded in this notebook (its loader line is commented out).
# NOTE(review): the call below passes `ovar`, but the commented-out loader assigns `o` -
# reconcile the names before re-enabling this cell.
#ovarian_df = make_trans_df(ovar)

#ovarian_df = ovarian_df.rename(columns={"Correlation": "Correlation_OV","P_value": "P_value_OV" })
#ovarian_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_OV',index=False)

Colon

In [None]:
# Disabled: the colon dataset is not loaded in this notebook (its loader line is commented out).
# NOTE(review): the call below passes `colon`, but the commented-out loader assigns `col` -
# reconcile the names before re-enabling this cell.
#colon_df = make_trans_df(colon)

#colon_df = colon_df.rename(columns={"Correlation": "Correlation_CO","P_value": "P_value_CO" })
#colon_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_CO',index=False)

Brca 

In [10]:
# BRCA: build the trans-effects table, tag the statistic columns with the
# cancer type, and save it for the supplemental-tables notebook.
brca_df = make_trans_df(b).rename(
    columns={"Correlation": "Correlation_BR", "P_value": "P_value_BR"}
)
brca_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_BR', index=False)

Luad

In [11]:
# LUAD: build the trans-effects table, tag the statistic columns with the
# cancer type, and save it for the supplemental-tables notebook.
luad_df = make_trans_df(l).rename(
    columns={"Correlation": "Correlation_LUAD", "P_value": "P_value_LUAD"}
)
luad_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_LUAD', index=False)

Hnscc

In [12]:
# HNSCC: build the trans-effects table, tag the statistic columns with the
# cancer type, and save it for the supplemental-tables notebook.
hnscc_df = make_trans_df(hn).rename(
    columns={"Correlation": "Correlation_HNSCC", "P_value": "P_value_HNSCC"}
)
hnscc_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_HNSCC', index=False)

Lscc

In [13]:
# LSCC: build the trans-effects table, tag the statistic columns with the
# cancer type, and save it for the supplemental-tables notebook.
lscc_df = make_trans_df(ls).rename(
    columns={"Correlation": "Correlation_LSCC", "P_value": "P_value_LSCC"}
)
lscc_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_LSCC', index=False)

In [14]:
# Display the LSCC trans-effects table (comparison name, correlation, corrected p value)
lscc_df

Unnamed: 0,Comparison,Correlation_LSCC,P_value_LSCC
2961,EGFR,1.000000,0.000000e+00
2613,PHLDA1,0.682116,1.398013e-12
7616,PHLDA3,0.651618,5.103796e-11
5110,PHLDA2,0.650100,3.054310e-09
6461,TRIM29,0.553806,7.010705e-07
...,...,...,...
27,CEACAM7,0.000267,9.992219e-01
2660,TCF12,0.000102,9.994004e-01
10283,JAK3,-0.000077,9.995287e-01
11825,SZT2,0.000056,9.996186e-01
