# Data munging for Figure 1C (Make supplemental table 2)

This notebook records the Pearson correlation statistics for EGFR proteomics vs. all proteins, together with all Benjamini-Hochberg FDR-corrected p-values. This data is then used to make EGFR Figure 1C. It is also Supplemental Table 2.

In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest


import cptac
import cptac.utils as u
import plot_utils as p
import warnings
# Suppress all Python warnings raised from this point on (no category or
# module filter is given, so every warning is ignored).
warnings.filterwarnings('ignore')

In [16]:
def create_prot_list(df):
    '''
    Make the column labels of a proteomic data frame unique and return them.

    Columns that share a label (duplicates due to multiple isoforms) are
    disambiguated: the first occurrence keeps its label and every later
    occurrence gets "_<n>" appended (n = 1, 2, ...). The new labels are
    written back to df.columns, so the data frame passed in is modified.

    @Param df: Data frame of proteomic data (e.g. from get_proteomics).
    @Return: list of all column labels after relabelling, in column order.
    '''
    labels = pd.Series(df.columns[:])
    repeated_labels = labels[labels.duplicated()].unique()

    for label in repeated_labels:
        positions = labels[labels == label].index.values.tolist()
        # Occurrence 0 keeps the bare label; occurrence n becomes label_n.
        labels[positions] = [
            label if n == 0 else label + '_' + str(n)
            for n in range(len(positions))
        ]

    # Write the relabelled columns back so the caller's frame uses them too.
    df.columns = labels
    return df.columns.values.tolist()

In [23]:
def make_trans_df(cancer):
    '''
    Build the table of EGFR correlation results against all proteins
    for one cancer.

    @Param cancer: cptac data for a cancer. Ex: brain = cptac.Gbm()

    Step 1: Get proteomic data with only tumor samples.
    Step 2: If the columns are a MultiIndex, flatten them with
    cptac.utils.reduce_multiindex (levels_to_drop = 1).
    Step 3: Use create_prot_list to label duplicate columns caused by
    multiple isoforms and to get the list of all protein columns.
    Step 4: Use plot_utils function wrap_pearson_corr to compare EGFR
    proteomics to every protein in that list, asking for all results
    (return_all = True) with 'fdr_bh' corrected p values.

    @Return: the data frame returned by wrap_pearson_corr.
    '''
    tumor_prot = cancer.get_proteomics(tissue_type="tumor")

    if isinstance(tumor_prot.columns, pd.MultiIndex):
        tumor_prot = u.reduce_multiindex(tumor_prot, levels_to_drop=1)

    # create_prot_list also relabels duplicate columns on tumor_prot itself,
    # so the frame and the list passed below stay in sync.
    protein_cols = create_prot_list(tumor_prot)

    return p.wrap_pearson_corr(
        tumor_prot,
        "EGFR",
        comparison_columns=protein_cols,
        correction_method='fdr_bh',
        return_all=True,
        return_corrected_pvals=True,
    )
    

In [18]:
# Load cptac data: one dataset object per cancer type. Each object is later
# passed to make_trans_df, which calls its get_proteomics method.
brain = cptac.Gbm()
kidney = cptac.Ccrcc()
ovar = cptac.Ovarian()
colon = cptac.Colon()
brca = cptac.Brca()
luad = cptac.Luad()
hnscc = cptac.Hnscc()
lscc = cptac.Lscc()

version 3scc v3.2.......                    
                            

GBM

In [25]:
# EGFR vs. all-protein correlation results for the GBM tumor samples
gbm_df = make_trans_df(brain)
# Example: first rows of the table returned by make_trans_df
gbm_df.head()

Unnamed: 0,Comparison,Correlation,P_value
2728,EGFR,1.0,0.0
6656,PHLDA1,0.816848,3.507071e-21
3757,GRB2,-0.610889,6.72999e-08
8734,SOCS2,0.56272,3.420388e-06
1528,CDH4,0.55918,3.420388e-06


Kidney 

In [26]:
# EGFR vs. all-protein correlation results for the kidney (cptac.Ccrcc) tumor samples
kidney_df = make_trans_df(kidney)


Ovarian 

In [34]:
# EGFR vs. all-protein correlation results for the ovarian tumor samples
ovarian_df = make_trans_df(ovar)

Colon

In [27]:
# EGFR vs. all-protein correlation results for the colon tumor samples
colon_df = make_trans_df(colon)

Brca 

In [28]:
# EGFR vs. all-protein correlation results for the BRCA tumor samples
brca_df = make_trans_df(brca)

Luad

In [29]:
# EGFR vs. all-protein correlation results for the LUAD tumor samples
luad_df = make_trans_df(luad)


Hnscc

In [30]:
# EGFR vs. all-protein correlation results for the HNSCC tumor samples
hnscc_df = make_trans_df(hnscc)


Lscc

In [31]:
# EGFR vs. all-protein correlation results for the LSCC tumor samples
lscc_df = make_trans_df(lscc)

# Merge all data frames into one wide data frame

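Did not include Colon in the final CSV file because it doesn't have a cis effect. The CSV file is used in the notebook Plot_EGFR_1C_parts. Note: as written, the merge cells below still add the Colon columns (`Correlation_Colon`, `P_value_Colon`) to `pancan` before it is saved.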

In [35]:
# Outer-join the GBM and kidney results on the protein name, then replace the
# default _x / _y merge suffixes with the cancer they belong to.
gbm_kidney_column_names = {
    "Correlation_x": "Correlation_Gbm",
    "P_value_x": "P_value_Gbm",
    "Correlation_y": "Correlation_kidney",
    "P_value_y": "P_value_kidney",
}
Gbm_kidney = (
    gbm_df
    .merge(kidney_df, on="Comparison", how="outer")
    .rename(columns=gbm_kidney_column_names)
)


In [36]:
# Add the ovarian results; its Correlation / P_value columns do not clash with
# the already-labelled ones, so they are simply renamed after the join.
pancan = (
    Gbm_kidney
    .merge(ovarian_df, on="Comparison", how="outer")
    .rename(columns={"Correlation": "Correlation_Ovar", "P_value": "P_value_Ovar"})
)

In [37]:
# Add the BRCA results and label its Correlation / P_value columns.
pancan = (
    pancan
    .merge(brca_df, on="Comparison", how="outer")
    .rename(columns={"Correlation": "Correlation_Brca", "P_value": "P_value_Brca"})
)


In [38]:
# Add the LUAD results and label its Correlation / P_value columns.
pancan = (
    pancan
    .merge(luad_df, on="Comparison", how="outer")
    .rename(columns={"Correlation": "Correlation_Luad", "P_value": "P_value_Luad"})
)

In [39]:
# Add the HNSCC results and label its Correlation / P_value columns.
pancan = (
    pancan
    .merge(hnscc_df, on="Comparison", how="outer")
    .rename(columns={"Correlation": "Correlation_Hnscc", "P_value": "P_value_Hnscc"})
)


In [40]:
# Add the colon results and label its Correlation / P_value columns.
pancan = (
    pancan
    .merge(colon_df, on="Comparison", how="outer")
    .rename(columns={"Correlation": "Correlation_Colon", "P_value": "P_value_Colon"})
)

In [41]:
# Add the LSCC results and label its Correlation / P_value columns.
pancan = (
    pancan
    .merge(lscc_df, on="Comparison", how="outer")
    .rename(columns={"Correlation": "Correlation_Lscc", "P_value": "P_value_Lscc"})
)


In [42]:
# Display the merged wide table: one row per protein ("Comparison") and one
# Correlation / P_value column pair per cancer.
pancan

Unnamed: 0,Comparison,Correlation_Gbm,P_value_Gbm,Correlation_kidney,P_value_kidney,Correlation_Ovar,P_value_Ovar,Correlation_Brca,P_value_Brca,Correlation_Luad,P_value_Luad,Correlation_Hnscc,P_value_Hnscc,Correlation_Colon,P_value_Colon,Correlation_Lscc,P_value_Lscc
0,EGFR,1.000000,0.000000e+00,1.000000,0.000000,1.00000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000e+00,1.000000,0.000000,1.000000,0.000000e+00
1,PHLDA1,0.816848,3.507071e-21,0.254436,0.060261,,,0.364797,0.002164,0.260110,0.074530,0.664271,8.888640e-12,0.386104,0.122847,0.713420,2.644826e-14
2,GRB2,-0.610889,6.729990e-08,-0.217427,0.120342,-0.19009,0.346111,-0.177379,0.142733,-0.302439,0.020631,-0.532341,3.320092e-06,0.150960,0.347409,-0.198042,2.437176e-01
3,SOCS2,0.562720,3.420388e-06,,,,,,,,,0.020297,9.557300e-01,,,0.472624,1.417921e-02
4,CDH4,0.559180,3.420388e-06,0.148407,0.513490,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14863,ZNF283,,,,,,,,,,,,,,,0.002610,9.941900e-01
14864,TRIM39,,,,,,,,,,,,,,,-0.002379,9.945224e-01
14865,ADGRA3,,,,,,,,,,,,,,,0.002228,9.958381e-01
14866,CEP57L1,,,,,,,,,,,,,,,-0.001102,9.977544e-01


In [43]:
# Write the merged table to disk. to_csv returns None when given a path, so
# the result must not be assigned back to `pancan`; doing so would replace the
# data frame with None and break any later (or re-run) cell that uses it.
pancan.to_csv("Pval_corr_table_Fig_1C_suppl_table2.csv")