# Pearson Dfs EGFR proteomics vs all proteins 

This notebook records the pearson correlation stats for EGFR proteomics vs all proteomics. It records only the FDR Benjamini/Hochberg significant comparisons. I returns both the corrected an uncorrected p values.  

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest


import cptac
import cptac.utils as u
import plot_utils as p


In [None]:
def create_prot_list(df):
    prot_genes_list = df.columns.values.tolist()
    prot_genes_list.remove('EGFR_Mutation')
    prot_genes_list.remove('EGFR_Location')
    prot_genes_list.remove('EGFR_Mutation_Status')
    prot_genes_list.remove('Sample_Status')
    return prot_genes_list

In [None]:
#load cptac data 
brain = cptac.Gbm()
kidney = cptac.Ccrcc()
Ovar = cptac.Ovarian()
colon = cptac.Colon()
brca = cptac.Brca()
luad = cptac.Luad()
Hnscc = cptac.Hnscc()
Lscc = cptac.Lscc()

# Step 1 Create Data frames 

For each cancer type,
load proteomic data and mutation data for EGFR and
filter out non-tumor samples 

# Step 2  Create list of Proteins for Comparison

For each cancer type, create list of proteins by using create_prot_list function which extracts column names and removes extra columns.

# Step 3 Run Pearson Correlation Function  

Run function on df and compare EGFR proteomics to all other proteins in protein list. Store only the significant comparisons. Save df as csv file

# GBM

In [32]:
df1 = brain.join_omics_to_mutations(omics_df_name="proteomics", mutations_genes="EGFR")
df1_tumor = df1.loc[df1['Sample_Status'] == "Tumor"]



In [33]:
prot_genes_list = create_prot_list(df1_tumor)

In [34]:
df2= p.wrap_pearson_corr(df1_tumor,"EGFR_proteomics",comparison_columns= prot_genes_list,correction_method='fdr_bh', return_all = False)
df2.to_csv("csv_files/GBM_EGFR_all_pearson_FDR.csv")
df2['Cancer Type']='GBM'
df2.to_csv("csv_files/GBM_EGFR_all_pearson_FDR2.csv")
df2



Unnamed: 0,Comparison,Correlation,P_value,fdr_bh_p_val,Cancer Type
454,EGFR_proteomics,1.000000,0.000000e+00,0.000000e+00,GBM
1043,PHLDA1_proteomics,0.816848,6.553435e-25,3.507071e-21,GBM
642,GRB2_proteomics,-0.610889,1.886384e-11,6.729990e-08,GBM
1353,SOCS2_proteomics,0.562720,1.343464e-09,3.420388e-06,GBM
274,CDH4_proteomics,0.559180,1.790048e-09,3.420388e-06,GBM
...,...,...,...,...,...
1511,USP2_proteomics,0.280545,7.399884e-03,4.993292e-02,GBM
1536,WAPL_proteomics,-0.267631,7.403862e-03,4.993292e-02,GBM
1324,SLC25A42_proteomics,0.267588,7.413549e-03,4.995405e-02,GBM
523,FANCD2_proteomics,0.278945,7.416331e-03,4.995405e-02,GBM


# Kidney 

In [35]:
df1 = kidney.join_omics_to_mutations(omics_df_name="proteomics",mutations_genes="EGFR")
df1.columns = df1.columns.droplevel(1)
df1_tumor = df1.loc[df1['Sample_Status'] == "Tumor"]



In [36]:
prot_genes_list = create_prot_list(df1_tumor)

In [37]:
df2=p.wrap_pearson_corr(df1_tumor,"EGFR_proteomics",comparison_columns= prot_genes_list,correction_method='fdr_bh',return_all = False)
df2.to_csv("csv_files/Kidney_EGFR_all_pearson_FDR.csv")
df2['Cancer Type']='Kidney'
df2.to_csv("csv_files/Kidney_EGFR_all_pearson_FDR2.csv")
df2

Unnamed: 0,Comparison,Correlation,P_value,fdr_bh_p_val,Cancer Type
278,EGFR_proteomics,1.000000,0.000000e+00,0.000000e+00,Kidney
108,ATP11A_proteomics,0.577079,4.144636e-11,2.010563e-07,Kidney
711,PLCB1_proteomics,0.561952,1.684847e-10,4.489889e-07,Kidney
76,APAF1_proteomics,0.560909,1.851119e-10,4.489889e-07,Kidney
418,HPCAL1_proteomics,0.556681,2.701940e-10,4.543590e-07,Kidney
...,...,...,...,...,...
416,HNRNPK_proteomics,-0.262782,5.545345e-03,4.960018e-02,Kidney
415,HNRNPK_proteomics,-0.262782,5.545345e-03,4.960018e-02,Kidney
705,PIP4K2A_proteomics,-0.262774,5.546917e-03,4.960018e-02,Kidney
796,RAP1GAP2_proteomics,-0.273992,5.561371e-03,4.968363e-02,Kidney


# Ovarian 

In [38]:
df1 = Ovar.join_omics_to_mutations(omics_df_name="proteomics", mutations_genes="EGFR")
df1.columns = df1.columns.droplevel(1)
df1_tumor = df1.loc[df1['Sample_Status'] == "Tumor"]



In [39]:
prot_genes_list = create_prot_list(df1_tumor)

In [40]:
df2= p.wrap_pearson_corr(df1_tumor,"EGFR_proteomics",comparison_columns= prot_genes_list,correction_method='fdr_bh', return_all = False)
df2.to_csv("csv_files/Ovar_EGFR_all_pearson_FDR.csv")
df2['Cancer Type']='Ovarian'
df2.to_csv("csv_files/Ovar_EGFR_all_pearson_FDR2.csv")


# Colon

In [41]:
df1 = colon.join_omics_to_mutations(omics_df_name="proteomics", mutations_genes="EGFR")
df1_tumor = df1.loc[df1['Sample_Status'] == "Tumor"]



In [42]:
prot_genes_list = create_prot_list(df1_tumor)

In [43]:
df2= p.wrap_pearson_corr(df1_tumor,"EGFR_proteomics",comparison_columns= prot_genes_list, correction_method='fdr_bh',return_all = False)
df2.to_csv("csv_files/Colon_EGFR_all_pearson_FDR.csv")
df2['Cancer Type']='Colon'
df2.to_csv("csv_files/Colon_EGFR_all_pearson_FDR2.csv")
df2

Unnamed: 0,Comparison,Correlation,P_value,fdr_bh_p_val,Cancer Type
145,EGFR_proteomics,1.000000,0.000000e+00,0.000000,Colon
586,UTP6_proteomics,-0.510343,1.733748e-07,0.000473,Colon
425,RAE1_proteomics,-0.498802,1.993818e-07,0.000473,Colon
178,GAR1_proteomics,-0.489788,3.550550e-07,0.000631,Colon
428,RALYL_proteomics,-0.485763,4.569947e-07,0.000650,Colon
...,...,...,...,...,...
541,TIMP1_proteomics,0.287791,4.257833e-03,0.049589,Colon
85,CCDC6_proteomics,0.287735,4.265624e-03,0.049589,Colon
259,LAMA5_proteomics,0.287734,4.265767e-03,0.049589,Colon
516,SSR2_proteomics,0.301814,4.266608e-03,0.049589,Colon


# Brca 

In [44]:
df1 = brca.join_omics_to_mutations(omics_df_name="proteomics", mutations_genes="EGFR")
df1.columns = df1.columns.droplevel(1)
df1_tumor = df1.loc[df1['Sample_Status'] == "Tumor"]



In [45]:
prot_genes_list = create_prot_list(df1_tumor)

In [46]:
df2= p.wrap_pearson_corr(df1_tumor,"EGFR_proteomics",comparison_columns= prot_genes_list, correction_method='fdr_bh',return_all = False)
df2.to_csv("csv_files/Brca_EGFR_all_pearson_FDR.csv")
df2['Cancer Type']='Brca'
df2.to_csv("csv_files/Brca_EGFR_all_pearson_FDR2.csv")

# LUAD

In [47]:
df1 = luad.join_omics_to_mutations(omics_df_name="proteomics", mutations_genes="EGFR")
df1.columns = df1.columns.droplevel(1)
df1_tumor = df1.loc[df1['Sample_Status'] == "Tumor"]



In [48]:
prot_genes_list = create_prot_list(df1_tumor)

In [49]:
df2= p.wrap_pearson_corr(df1_tumor,"EGFR_proteomics",comparison_columns= prot_genes_list,correction_method='fdr_bh', return_all = False)
df2.to_csv("csv_files/Luad_EGFR_all_pearson_FDR.csv")
df2['Cancer Type']='Luad'
df2.to_csv("csv_files/Luad_EGFR_all_pearson_FDR2.csv")

# HNSCC

In [50]:
df1 = Hnscc.join_omics_to_mutations(omics_df_name="proteomics",  mutations_genes="EGFR")
df1_tumor = df1.loc[df1['Sample_Status'] == "Tumor"]



In [51]:
prot_genes_list = create_prot_list(df1_tumor)

In [52]:
df2= p.wrap_pearson_corr(df1_tumor,"EGFR_proteomics",comparison_columns= prot_genes_list,correction_method='fdr_bh', return_all = False)
df2.to_csv("csv_files/Hnscc_EGFR_all_pearson_FDR.csv")
df2['Cancer Type']='Hnscc'
df2.to_csv("csv_files/Hnscc_EGFR_all_pearson_FDR2.csv")

# Lscc

In [53]:
df1 = Lscc.join_omics_to_mutations(omics_df_name="proteomics", mutations_genes="EGFR")
df1.columns = df1.columns.droplevel(1)
df1_tumor = df1.loc[df1['Sample_Status'] == "Tumor"]



In [54]:
prot_genes_list = create_prot_list(df1_tumor)

In [55]:
df2= p.wrap_pearson_corr(df1_tumor,"EGFR_proteomics",comparison_columns= prot_genes_list,correction_method='fdr_bh', return_all = False)
df2.to_csv("csv_files/Lscc_EGFR_all_pearson_FDR.csv")
df2['Cancer Type']='Lscc'
df2.to_csv("csv_files/Lscc_EGFR_all_pearson_FDR2.csv")