# Pearson Dfs EGFR proteomics vs all proteins 

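This notebook computes the Pearson correlation between EGFR protein abundance and the abundance of every other protein, separately for each cancer type. For every comparison it records the correlation and the Benjamini-Hochberg FDR-corrected p-value.  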

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest


import cptac
import cptac.utils as u
import plot_utils as p


  import pandas.util.testing as tm


In [2]:
def create_prot_list(df):
    """Return the protein column names of ``df``, making duplicate names unique.

    Columns that share a name (multiple isoforms of one protein) keep the
    original name for the first occurrence and get ``_1``, ``_2``, ... appended
    for the later occurrences. The de-duplicated labels are written back to
    ``df.columns``, so ``df`` itself is modified.

    The columns 'EGFR_Mutation', 'EGFR_Location', 'EGFR_Mutation_Status' and
    'Sample_Status' are left out of the returned list; a ValueError is raised
    if any of them is not present.
    """
    labels = pd.Series(df.columns[:])
    # Names that occur more than once, each listed a single time.
    repeated_names = labels[labels.duplicated()].unique()
    for name in repeated_names:
        positions = labels.index[labels == name].tolist()
        # First occurrence keeps the bare name, later ones get a numeric suffix.
        renamed = [name]
        renamed.extend(name + '_' + str(k) for k in range(1, len(positions)))
        labels.loc[positions] = renamed
    df.columns = labels

    prot_genes_list = df.columns.values.tolist()
    # Drop the mutation/sample annotation columns so only omics columns remain.
    for metadata_col in ('EGFR_Mutation', 'EGFR_Location', 'EGFR_Mutation_Status', 'Sample_Status'):
        prot_genes_list.remove(metadata_col)
    return prot_genes_list

In [3]:
# Load one cptac dataset object per cancer type; each object is used by the
# matching section below (GBM, Kidney, Ovarian, Colon, Brca, LUAD, HNSCC, Lscc).
brain = cptac.Gbm()
kidney = cptac.Ccrcc()
Ovar = cptac.Ovarian()
colon = cptac.Colon()
brca = cptac.Brca()
luad = cptac.Luad()
Hnscc = cptac.Hnscc()
Lscc = cptac.Lscc()

Checking that ccrcc index is up-to-date...



Checking that hnscc index is up-to-date...  



Checking that lscc index is up-to-date... 



version 3scc v3.2.......                 
                            



# Step 1 Create Data frames 

For each cancer type,
load proteomic data and mutation data for EGFR and
filter out non-tumor samples 

# Step 2  Create list of Proteins for Comparison

For each cancer type, create list of proteins by using create_prot_list function which extracts column names and labels duplicate columns caused by multiple isoforms.

# Step 3 Run Pearson Correlation Function  

Run `p.wrap_pearson_corr` on each tumor data frame to compare EGFR proteomics against every protein in the protein list. Save the correlation and the FDR-corrected p-value for all comparisons to CSV. 

# GBM

In [4]:
# GBM: proteomics joined with the EGFR mutation columns.
df1 = brain.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
# Keep only the rows whose Sample_Status is "Tumor".
is_tumor = df1['Sample_Status'] == "Tumor"
df1_tumor = df1.loc[is_tumor]



In [5]:
# Comparison columns for GBM. create_prot_list also rewrites df1_tumor.columns
# so that duplicated (isoform) column names become unique.
prot_genes_list = create_prot_list(df1_tumor)

In [6]:
# Correlate EGFR_proteomics with every column in prot_genes_list, requesting
# all comparisons with Benjamini-Hochberg ('fdr_bh') corrected p-values.
df2 = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method='fdr_bh',
    return_all=True,
    return_corrected_pvals=True,
)

# Saved twice: first without, then with the 'Cancer Type' label column.
# NOTE: the GBM files end in "_pval.csv"; the other cancer types use "_pvals.csv".
df2.to_csv("csv_files/GBM_EGFR_all_pearson_FDR_pval.csv")
df2['Cancer Type'] = 'GBM'
df2.to_csv("csv_files/GBM_EGFR_all_pearson_FDR2_pval.csv")
df2



Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
2728,EGFR_proteomics,1.000000,0.000000e+00,GBM
6656,PHLDA1_proteomics,0.816848,3.507071e-21,GBM
3757,GRB2_proteomics,-0.610889,6.729990e-08,GBM
8734,SOCS2_proteomics,0.562720,3.420388e-06,GBM
1528,CDH4_proteomics,0.559180,3.420388e-06,GBM
...,...,...,...,...
7216,PSMB4_proteomics,-0.000033,9.998937e-01,GBM
7161,PRPF39_proteomics,0.000123,9.998937e-01,GBM
9703,TRAPPC6B_proteomics,0.000086,9.998937e-01,GBM
5351,MKI67_proteomics,-0.000098,9.998937e-01,GBM


# Kidney 

In [7]:
# Kidney: proteomics joined with the EGFR mutation columns.
df1 = kidney.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
# Remove column-index level 1 so columns can be addressed by plain name below.
df1.columns = df1.columns.droplevel(1)
# Keep only the rows whose Sample_Status is "Tumor".
is_tumor = df1['Sample_Status'] == "Tumor"
df1_tumor = df1.loc[is_tumor]
df1_tumor



Name,A1BG_proteomics,A1CF_proteomics,A2M_proteomics,A4GALT_proteomics,AAAS_proteomics,AACS_proteomics,AADAC_proteomics,AADAT_proteomics,AAED1_proteomics,AAGAB_proteomics,...,ZWINT_proteomics,ZXDC_proteomics,ZYG11B_proteomics,ZYX_proteomics,ZZEF1_proteomics,ZZZ3_proteomics,EGFR_Mutation,EGFR_Location,EGFR_Mutation_Status,Sample_Status
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00004,-0.304302,0.641447,-0.000025,,0.207831,-0.364128,,-1.203886,-0.217934,0.216894,...,,-0.021827,0.133927,0.237280,0.114409,,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
C3L-00010,1.195915,0.194620,1.360294,,0.126956,-0.572843,,-1.596546,,0.221696,...,,-0.205642,0.182434,,0.201374,-0.068340,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
C3L-00011,-0.286155,-0.780455,-0.101089,,0.292629,0.035812,,,,0.300863,...,,0.316298,-0.009772,-0.019653,-0.095339,0.008961,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
C3L-00026,0.135730,0.404286,0.261384,,0.155568,0.336311,,,0.709046,0.244198,...,,-0.120501,0.054559,-0.313236,0.062194,0.052825,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
C3L-00079,-0.123959,-0.677773,-0.362547,,0.187605,-0.320026,,-1.300148,-0.153216,0.229676,...,0.068182,,0.178869,0.266290,-0.028647,0.003682,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01646,-0.173487,-0.686012,0.816201,,0.322337,0.187351,,,,0.072120,...,,-0.063266,-0.103128,0.352912,0.211666,,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
C3N-01648,-0.350081,-0.699248,-0.303287,,0.015626,-0.161738,,,,0.192821,...,,-0.163551,-0.128527,0.482901,0.211126,0.086113,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor
C3N-01649,0.246378,-0.847288,-0.186221,,0.113546,-0.687156,,,,-0.232000,...,,0.110566,-0.035962,0.752686,0.105418,-0.127322,[Silent],[p.I213I],Single_mutation,Tumor
C3N-01651,-0.242872,0.486950,-0.057568,,0.198747,0.146118,,,-0.004491,-0.127252,...,,,0.351924,0.569947,-0.156008,-0.186567,[Wildtype_Tumor],[No_mutation],Wildtype_Tumor,Tumor


In [8]:
# Comparison columns for Kidney. create_prot_list also rewrites df1_tumor.columns
# so that duplicated (isoform) column names become unique.
prot_genes_list = create_prot_list(df1_tumor)

In [9]:
# Correlate EGFR_proteomics with every column in prot_genes_list, requesting
# all comparisons with Benjamini-Hochberg ('fdr_bh') corrected p-values.
df2 = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method='fdr_bh',
    return_all=True,
    return_corrected_pvals=True,
)
# Saved twice: first without, then with the 'Cancer Type' label column.
df2.to_csv("csv_files/Kidney_EGFR_all_pearson_FDR_pvals.csv")
df2['Cancer Type'] = 'Kidney'
df2.to_csv("csv_files/Kidney_EGFR_all_pearson_FDR2_pvals.csv")
df2

Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
2544,EGFR_proteomics,1.000000,0.000000e+00,Kidney
6513,PPM1B_proteomics,-0.584494,1.005437e-07,Kidney
741,ATP11A_proteomics,0.577079,1.368835e-07,Kidney
6296,PLCB1_proteomics,0.561952,3.668177e-07,Kidney
504,APAF1_proteomics,0.560909,3.668177e-07,Kidney
...,...,...,...,...
6592,PRDM10_proteomics,-0.000443,9.983727e-01,Kidney
2277,DHX29_proteomics,0.000211,9.985571e-01,Kidney
4317,KLHL11_proteomics,-0.000199,9.986944e-01,Kidney
4502,LMNA_proteomics,-0.000131,9.990173e-01,Kidney


# Ovarian 

In [10]:
# Ovarian: proteomics joined with the EGFR mutation columns.
df1 = Ovar.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
# Remove column-index level 1 so columns can be addressed by plain name below.
df1.columns = df1.columns.droplevel(1)
# Keep only the rows whose Sample_Status is "Tumor".
is_tumor = df1['Sample_Status'] == "Tumor"
df1_tumor = df1.loc[is_tumor]



In [11]:
# Comparison columns for Ovarian. create_prot_list also rewrites df1_tumor.columns
# so that duplicated (isoform) column names become unique.
prot_genes_list = create_prot_list(df1_tumor)

In [12]:
# Correlate EGFR_proteomics with every column in prot_genes_list, requesting
# all comparisons with Benjamini-Hochberg ('fdr_bh') corrected p-values.
df2 = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method='fdr_bh',
    return_all=True,
    return_corrected_pvals=True,
)
# Saved twice: first without, then with the 'Cancer Type' label column.
df2.to_csv("csv_files/Ovar_EGFR_all_pearson_FDR_pvals.csv")
df2['Cancer Type'] = 'Ovarian'
df2.to_csv("csv_files/Ovar_EGFR_all_pearson_FDR2_pvals.csv")


# Colon

In [13]:
# Colon: proteomics joined with the EGFR mutation columns.
df1 = colon.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
# Keep only the rows whose Sample_Status is "Tumor".
is_tumor = df1['Sample_Status'] == "Tumor"
df1_tumor = df1.loc[is_tumor]



In [14]:
# Comparison columns for Colon. create_prot_list also rewrites df1_tumor.columns
# so that duplicated (isoform) column names become unique.
prot_genes_list = create_prot_list(df1_tumor)

In [15]:
# Correlate EGFR_proteomics with every column in prot_genes_list, requesting
# all comparisons with Benjamini-Hochberg ('fdr_bh') corrected p-values.
df2 = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method='fdr_bh',
    return_all=True,
    return_corrected_pvals=True,
)
# Saved twice: first without, then with the 'Cancer Type' label column.
df2.to_csv("csv_files/Colon_EGFR_all_pearson_FDR_pvals.csv")
df2['Cancer Type'] = 'Colon'
df2.to_csv("csv_files/Colon_EGFR_all_pearson_FDR2_pvals.csv")
df2

Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
1853,EGFR_proteomics,1.000000,0.000000,Colon
5059,RAE1_proteomics,-0.498802,0.000473,Colon
6811,UTP6_proteomics,-0.510343,0.000473,Colon
2339,GAR1_proteomics,-0.489788,0.000631,Colon
5069,RALYL_proteomics,-0.485763,0.000650,Colon
...,...,...,...,...
201,AK1_proteomics,-0.000256,0.998577,Colon
3134,KRI1_proteomics,-0.000217,0.998691,Colon
3797,MUL1_proteomics,-0.000272,0.998691,Colon
830,CADPS_proteomics,0.000064,0.999774,Colon


# Brca 

In [16]:
# Brca: proteomics joined with the EGFR mutation columns.
df1 = brca.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
# Remove column-index level 1 so columns can be addressed by plain name below.
df1.columns = df1.columns.droplevel(1)
# Keep only the rows whose Sample_Status is "Tumor".
is_tumor = df1['Sample_Status'] == "Tumor"
df1_tumor = df1.loc[is_tumor]



In [17]:
# Comparison columns for Brca. create_prot_list also rewrites df1_tumor.columns
# so that duplicated (isoform) column names become unique.
prot_genes_list = create_prot_list(df1_tumor)

In [18]:
# Correlate EGFR_proteomics with every column in prot_genes_list, requesting
# all comparisons with Benjamini-Hochberg ('fdr_bh') corrected p-values.
df2 = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method='fdr_bh',
    return_all=True,
    return_corrected_pvals=True,
)
# Saved twice: first without, then with the 'Cancer Type' label column.
df2.to_csv("csv_files/Brca_EGFR_all_pearson_FDR_pvals.csv")
df2['Cancer Type'] = 'Brca'
df2.to_csv("csv_files/Brca_EGFR_all_pearson_FDR2_pvals.csv")

# LUAD

In [19]:
# LUAD: proteomics joined with the EGFR mutation columns.
df1 = luad.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
# Remove column-index level 1 so columns can be addressed by plain name below.
df1.columns = df1.columns.droplevel(1)
# Keep only the rows whose Sample_Status is "Tumor".
is_tumor = df1['Sample_Status'] == "Tumor"
df1_tumor = df1.loc[is_tumor]



In [20]:
# Comparison columns for LUAD. create_prot_list also rewrites df1_tumor.columns
# so that duplicated (isoform) column names become unique.
prot_genes_list = create_prot_list(df1_tumor)

In [21]:
# Correlate EGFR_proteomics with every column in prot_genes_list, requesting
# all comparisons with Benjamini-Hochberg ('fdr_bh') corrected p-values.
df2 = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method='fdr_bh',
    return_all=True,
    return_corrected_pvals=True,
)
# Saved twice: first without, then with the 'Cancer Type' label column.
df2.to_csv("csv_files/Luad_EGFR_all_pearson_FDR_pvals.csv")
df2['Cancer Type'] = 'Luad'
df2.to_csv("csv_files/Luad_EGFR_all_pearson_FDR2_pvals.csv")

# HNSCC

In [22]:
# HNSCC: proteomics joined with the EGFR mutation columns.
df1 = Hnscc.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
# Keep only the rows whose Sample_Status is "Tumor".
is_tumor = df1['Sample_Status'] == "Tumor"
df1_tumor = df1.loc[is_tumor]



In [23]:
# Comparison columns for HNSCC. create_prot_list also rewrites df1_tumor.columns
# so that duplicated (isoform) column names become unique.
prot_genes_list = create_prot_list(df1_tumor)

In [24]:
# Correlate EGFR_proteomics with every column in prot_genes_list, requesting
# all comparisons with Benjamini-Hochberg ('fdr_bh') corrected p-values.
df2 = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method='fdr_bh',
    return_all=True,
    return_corrected_pvals=True,
)
# Saved twice: first without, then with the 'Cancer Type' label column.
df2.to_csv("csv_files/Hnscc_EGFR_all_pearson_FDR_pvals.csv")
df2['Cancer Type'] = 'Hnscc'
df2.to_csv("csv_files/Hnscc_EGFR_all_pearson_FDR2_pvals.csv")

# Lscc

In [25]:
# Lscc: proteomics joined with the EGFR mutation columns.
df1 = Lscc.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
# Remove column-index level 1 so columns can be addressed by plain name below.
df1.columns = df1.columns.droplevel(1)
# Keep only the rows whose Sample_Status is "Tumor".
is_tumor = df1['Sample_Status'] == "Tumor"
df1_tumor = df1.loc[is_tumor]



In [26]:
# Comparison columns for Lscc. create_prot_list also rewrites df1_tumor.columns
# so that duplicated (isoform) column names become unique.
prot_genes_list = create_prot_list(df1_tumor)

In [27]:
# Correlate EGFR_proteomics with every column in prot_genes_list, requesting
# all comparisons with Benjamini-Hochberg ('fdr_bh') corrected p-values.
df2 = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method='fdr_bh',
    return_all=True,
    return_corrected_pvals=True,
)
# Saved twice: first without, then with the 'Cancer Type' label column.
df2.to_csv("csv_files/Lscc_EGFR_all_pearson_FDR_pvals.csv")
df2['Cancer Type'] = 'Lscc'
df2.to_csv("csv_files/Lscc_EGFR_all_pearson_FDR2_pvals.csv")

In [28]:
# Display the most recently computed correlation table (Lscc, from the cell above).
df2

Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
3015,EGFR_proteomics,1.000000,0.000000e+00,Lscc
7212,PHLDA1_proteomics,0.713420,2.644826e-14,Lscc
7214,PHLDA3_proteomics,0.682034,1.705262e-12,Lscc
5201,LANCL2_proteomics,0.575817,2.040789e-07,Lscc
4835,ITGB4_proteomics,0.566095,3.972505e-07,Lscc
...,...,...,...,...
333,AIMP1_proteomics_1,-0.000053,9.997474e-01,Lscc
7715,PRKD2_proteomics,-0.000082,9.997474e-01,Lscc
3787,GALK2_proteomics,-0.000065,9.997474e-01,Lscc
7666,PRDM15_proteomics,0.000049,9.997474e-01,Lscc
