# Data munging EGFR: trans effects 

This notebook records the Pearson correlation statistics for EGFR proteomics versus all other proteomics for each cancer type. It records all Benjamini–Hochberg FDR-corrected p-values. These results are then used in the Make_Supplemental_Tables notebook.

In [1]:
import os
import re
import sys
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats
import seaborn as sns
import statsmodels.stats.multitest

import cptac
import cptac.utils as u
import plot_utils as p

warnings.filterwarnings('ignore')
import cptac.pancan as pc

In [2]:
def create_prot_list(df):
    '''
    @Param df: Data frame of proteomic data (for example the frame returned by
    get_proteomics after its column multiindex has been reduced).

    Label repeated column names (multiple isoforms of the same gene) so every
    column can be told apart: the first occurrence keeps its name and the later
    ones become name_1, name_2, ... in column order.

    The new labels are written back to df.columns, so the caller's frame is
    renamed in place.

    @Return: list of all column names of df after relabeling.
    '''
    names = pd.Series(df.columns[:])
    # Names that occur more than once, taken once before any relabeling starts.
    repeated_names = names[names.duplicated()].unique()
    for gene in repeated_names:
        positions = names[names == gene].index.tolist()
        relabeled = []
        for occurrence in range(len(positions)):
            if occurrence == 0:
                # First isoform column keeps the plain gene name.
                relabeled.append(gene)
            else:
                relabeled.append(gene + '_' + str(occurrence))
        names[positions] = relabeled
    df.columns = names
    # Read the names back from the frame so the list reflects what df now holds.
    return df.columns.values.tolist()

In [3]:
def make_trans_df(cancer):
    '''
    @Param cancer: loaded cptac data for one cancer, Ex brain = pc.PancanGbm()

    Build the table of EGFR trans effects for that cancer.
    Step 1: Get the "umich" proteomics table for tumor samples only and pass it
    through reduce_multiindex (levels_to_drop = 1).
    Step 2: Run create_prot_list on the table. It labels duplicate columns
    caused by multiple isoforms (renaming them in the table itself) and returns
    every column name.
    Step 3: Call the plot_utils function wrap_pearson_corr with "EGFR" as the
    reference column and every column name as a comparison column, asking for
    'fdr_bh' correction, all rows, and the corrected p values.

    @Return: the object returned by wrap_pearson_corr. The cells below treat it
    as a data frame with Comparison, Correlation and P_value columns.
    '''
    tumor_prot = u.reduce_multiindex(
        cancer.get_proteomics(tissue_type="tumor", source="umich"),
        levels_to_drop=1,
    )

    # The returned list still contains "EGFR", so EGFR is also compared with itself.
    all_proteins = create_prot_list(tumor_prot)

    return p.wrap_pearson_corr(
        tumor_prot,
        "EGFR",
        comparison_columns=all_proteins,
        correction_method='fdr_bh',
        return_all=True,
        return_corrected_pvals=True,
    )
    

In [4]:
# Show the installed cptac package version next to the results produced below.
cptac.version()   

'1.1.0'

In [5]:
# Create one cptac.pancan dataset object per cancer type. These short names are
# the handles passed to make_trans_df in the per-cancer sections below.
g = pc.PancanGbm()  # used in the GBM section
hn = pc.PancanHnscc()  # used in the Hnscc section
l = pc.PancanLuad()  # used in the Luad section
o = pc.PancanOv()  # used in the Ovarian section
c = pc.PancanCcrcc()  # used in the Kidney section (columns are tagged ccRCC)
col = pc.PancanCoad()  # used in the Colon sections
b = pc.PancanBrca()  # used in the Brca section
ls = pc.PancanLscc()  # used in the Lscc section
en = pc.PancanUcec()  # NOTE(review): never passed to make_trans_df in the cells visible here - confirm it is still needed

                                                 

GBM

In [6]:
# Preview the GBM tumor proteomics table (umich source) after reduce_multiindex;
# make_trans_df builds its input the same way.
df = u.reduce_multiindex(
    g.get_proteomics(source="umich", tissue_type="tumor"),
    levels_to_drop=1,
)
df

Name,A1BG,A2M,AAAS,AACS,AADAT,AAGAB,AAK1,AAMDC,AAMP,AAR2,...,ZSCAN5A,ZSWIM8,ZSWIM9,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11B,ZYX,ZZEF1
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00104,0.105891,0.330922,-0.024051,0.035827,,-0.613311,-0.373164,0.055880,0.336079,0.016398,...,,0.217419,,0.074363,-0.245974,,0.081082,0.131731,-0.465064,-0.234109
C3L-00365,0.270545,0.482065,0.337431,-0.423607,0.452508,0.142582,-0.476576,0.211269,-0.027972,0.449446,...,,0.021952,,-0.119719,0.047814,,0.402015,0.040095,0.113710,-0.164766
C3L-00674,0.746517,0.756213,-0.285780,-0.178984,0.007682,0.110723,-0.223643,-0.555793,-0.406553,-0.203438,...,,0.065502,,-0.118483,0.402017,,0.965188,-0.207249,0.194245,0.031098
C3L-00677,0.170279,-0.011307,0.236229,-0.190993,-0.110567,0.192846,-0.304092,0.052535,0.046097,0.367439,...,,0.140339,,0.493884,0.577437,0.170166,-0.481224,-0.112655,-0.391506,-0.000814
C3L-01040,-0.453700,-0.773204,0.058468,-0.495386,,0.036179,0.086031,-0.363497,0.431699,0.144329,...,,0.192952,,0.089773,-0.353854,,0.567895,0.146947,0.583504,-0.136888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-03183,-0.068626,0.235189,0.566995,0.280848,0.253721,0.236217,-0.560787,0.159689,0.267884,0.124830,...,,-0.086089,,0.036136,-0.040281,,0.098477,0.084055,-0.332491,-0.068266
C3N-03184,-0.404048,-0.590294,0.555585,0.069570,-0.402289,0.040775,-0.580254,0.274442,0.138819,0.609501,...,,0.567237,0.461003,0.177797,0.497769,,,0.314336,0.524937,-0.173756
C3N-03186,0.565923,0.741530,-0.089037,0.006431,,0.130268,0.204040,-0.161883,-0.055028,-0.256099,...,,-0.103363,,-0.034115,-0.083924,,,0.181785,-0.141950,0.243426
C3N-03188,-0.150964,-0.451871,-0.012555,-0.102667,0.334487,0.100329,0.023444,-0.426100,-0.364939,-0.169027,...,,0.040449,,-0.088354,-0.127570,,-0.124730,0.541167,0.064135,-0.169849


In [7]:
# This cell and every later per-cancer cell writes its table into csv_files/.
# Create the directory first so a fresh checkout can run top to bottom;
# to_csv does not create missing parent directories.
os.makedirs('csv_files', exist_ok=True)

# EGFR vs. all-protein correlations for GBM tumors. The statistic columns get a
# _GBM suffix so tables from different cancers can be told apart later.
gbm_df = make_trans_df(g).rename(
    columns={"Correlation": "Correlation_GBM", "P_value": "P_value_GBM"}
)

# index=False: only the Comparison / Correlation_GBM / P_value_GBM columns are saved.
gbm_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_GBM', index=False)
gbm_df

Unnamed: 0,Comparison,Correlation_GBM,P_value_GBM
2926,EGFR,1.000000,0.000000e+00
7172,PHLDA1,0.789551,9.649849e-19
1650,CDH4,0.656331,4.744022e-10
4023,GRB2,-0.589008,3.310325e-07
9423,SOCS2,0.565367,2.006861e-06
...,...,...,...
10744,UBE2H,-0.000084,9.996825e-01
6755,OSBP2,0.000071,9.997027e-01
10237,TMEM181,-0.000058,9.997128e-01
10438,TPRA1,-0.000068,9.997731e-01


In [8]:
# Collect every compared name that contains an underscore. create_prot_list adds
# "_<n>" to repeated column names, so these are the relabeled isoform columns
# (plus any name that already had an underscore of its own).
gbm_genes = gbm_df["Comparison"].to_list()
matching = list(filter(lambda name: "_" in name, gbm_genes))
matching


['EPB41L3_3',
 'FLOT2_1',
 'HMGA1_1',
 'ALDH7A1_2',
 'DCLK1_1',
 'DPF3_1',
 'SRGAP3_1',
 'NFIX_2',
 'DTNA_1',
 'KCND3_1',
 'RALGAPA1_1',
 'DBI_1',
 'ABR_2',
 'ARHGAP27_1',
 'ALDH7A1_1',
 'PGM1_1',
 'ATE1_1',
 'COL6A3_1',
 'PML_4',
 'ZSCAN18_1',
 'VAPA_1',
 'CLIP2_1',
 'CD163_1',
 'LRRFIP1_1',
 'TPM3_4',
 'ZNF185_1',
 'OSBPL6_2',
 'IKBIP_1',
 'SH3GLB1_1',
 'CLUH_1',
 'GNAS_2',
 'TPM3_1',
 'SH3KBP1_1',
 'SEPTIN9_1',
 'COL14A1_1',
 'ANXA6_1',
 'P4HA1_1',
 'ACTN2_1',
 'DBNL_1',
 'TCF12_1',
 'PABPC4_1',
 'ARHGEF4_1',
 'MAGI1_1',
 'NECTIN3_1',
 'FAM126A_1',
 'EPB41L1_2',
 'PTPRS_1',
 'LAMP2_1',
 'MYL6_1',
 'RAP1GAP_1',
 'TPM1_2',
 'AMER2_1',
 'LIMS1_1',
 'CTNNA2_1',
 'SIRT3_1',
 'EHBP1_1',
 'DLG1_4',
 'CNOT4_1',
 'PPP2R2B_1',
 'CTIF_1',
 'LTBP3_1',
 'AMDHD2_1',
 'CDKN2A_1',
 'GTF2I_1',
 'MADD_2',
 'ABI2_1',
 'KCNAB2_1',
 'PCDH9_1',
 'SLC9A3R2_1',
 'SEPTIN8_1',
 'ITGA7_2',
 'DLG2_2',
 'MAP2_2',
 'CAPZB_1',
 'ACAN_1',
 'SYNE1_2',
 'CPEB2_1',
 'RAP1GDS1_3',
 'POSTN_2',
 'ZC3H14_1',
 'MYH14_1',


In [9]:
# Keep only the GBM rows whose Comparison name is in the `matching` list.
duplicates = gbm_df.loc[gbm_df["Comparison"].isin(matching)]
duplicates

Unnamed: 0,Comparison,Correlation_GBM,P_value_GBM
3095,EPB41L3_3,-0.510933,0.000036
3542,FLOT2_1,0.729748,0.000138
4292,HMGA1_1,-0.477182,0.000149
389,ALDH7A1_2,0.456890,0.000364
2450,DCLK1_1,0.435497,0.000712
...,...,...,...
2900,EEF1D_1,-0.001091,0.995709
5236,LMO7_1,-0.001372,0.996216
11135,WLS_1,-0.001295,0.996578
3084,EPB41_1,-0.000573,0.997654


Kidney 

In [10]:
# EGFR vs. all-protein correlations for the kidney dataset; the statistic
# columns are tagged _ccRCC before the table is saved without its index.
kidney_df = make_trans_df(c).rename(
    columns={"Correlation": "Correlation_ccRCC", "P_value": "P_value_ccRCC"}
)
kidney_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_ccRCC', index=False)

Ovarian 

In [20]:
# EGFR vs. all-protein correlations for the ovarian dataset; the statistic
# columns are tagged _OV before the table is saved without its index.
ovarian_df = make_trans_df(o).rename(
    columns={"Correlation": "Correlation_OV", "P_value": "P_value_OV"}
)
ovarian_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_OV', index=False)

Colon

In [21]:
# EGFR vs. all-protein correlations for the colon dataset; the statistic
# columns are tagged _CO before the table is saved without its index.
# NOTE(review): a later cell reruns make_trans_df(col), reassigns colon_df with
# _Colon column names and writes ..._fdr_corrected_Colon. Confirm which of the
# two files the downstream notebook reads and drop the other cell.
colon_df = make_trans_df(col).rename(
    columns={"Correlation": "Correlation_CO", "P_value": "P_value_CO"}
)
colon_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_CO', index=False)

Brca 

In [13]:
# EGFR vs. all-protein correlations for the breast dataset; the statistic
# columns are tagged _BR before the table is saved without its index.
brca_df = make_trans_df(b).rename(
    columns={"Correlation": "Correlation_BR", "P_value": "P_value_BR"}
)
brca_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_BR', index=False)

Luad

In [14]:
# EGFR vs. all-protein correlations for the LUAD dataset; the statistic
# columns are tagged _LUAD before the table is saved without its index.
luad_df = make_trans_df(l).rename(
    columns={"Correlation": "Correlation_LUAD", "P_value": "P_value_LUAD"}
)
luad_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_LUAD', index=False)

Hnscc

In [15]:
# EGFR vs. all-protein correlations for the HNSCC dataset; the statistic
# columns are tagged _HNSCC before the table is saved without its index.
hnscc_df = make_trans_df(hn).rename(
    columns={"Correlation": "Correlation_HNSCC", "P_value": "P_value_HNSCC"}
)
hnscc_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_HNSCC', index=False)

Lscc

In [16]:
# EGFR vs. all-protein correlations for the LSCC dataset; the statistic
# columns are tagged _LSCC before the table is saved without its index.
lscc_df = make_trans_df(ls).rename(
    columns={"Correlation": "Correlation_LSCC", "P_value": "P_value_LSCC"}
)
lscc_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_LSCC', index=False)

In [17]:
# Display the LSCC table (Comparison, Correlation_LSCC, P_value_LSCC) for a visual check.
lscc_df

Unnamed: 0,Comparison,Correlation_LSCC,P_value_LSCC
3082,EGFR,1.000000,0.000000e+00
7531,PHLDA1,0.682116,1.398013e-12
7533,PHLDA3,0.651618,5.103796e-11
7532,PHLDA2,0.650100,3.054310e-09
5371,KRT5,0.554862,7.010705e-07
...,...,...,...
6500,MYO19,0.000138,9.992219e-01
10396,TCF12,0.000102,9.994004e-01
5095,JAK3,-0.000077,9.995287e-01
10264,SZT2,0.000056,9.996186e-01


colon

In [18]:
# EGFR vs. all-protein correlations for the colon dataset; the statistic
# columns are tagged _Colon before the table is saved without its index.
# NOTE(review): the earlier Colon cell already ran make_trans_df(col) and wrote
# ..._fdr_corrected_CO with _CO column names. This cell reassigns colon_df and
# writes a second file - confirm which one is needed downstream.
colon_df = make_trans_df(col).rename(
    columns={"Correlation": "Correlation_Colon", "P_value": "P_value_Colon"}
)
colon_df.to_csv('csv_files/trans_effects_all_prot_fdr_corrected_Colon', index=False)