# Pearson Dfs EGFR proteomics vs all proteins 

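This notebook records the Pearson correlation stats for EGFR proteomics vs all other proteins in each cancer type. P-values are corrected with the Benjamini/Hochberg FDR method. Because the correlation function is called with `return_all = True`, every comparison is recorded (not only the significant ones), and it returns the corrected p-values.  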

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest


import cptac
import cptac.utils as u
import plot_utils as p


  import pandas.util.testing as tm


In [2]:
def create_prot_list(df):
    """Return the column labels of ``df`` with the four extra columns removed.

    The labels 'EGFR_Mutation', 'EGFR_Location', 'EGFR_Mutation_Status' and
    'Sample_Status' are each removed once, in that order. All remaining
    labels are returned as a list in their original column order; ``df``
    itself is not modified.

    Raises:
        ValueError: if any of the four labels is absent from the columns
            (propagated from ``list.remove``).
    """
    extra_columns = (
        'EGFR_Mutation',
        'EGFR_Location',
        'EGFR_Mutation_Status',
        'Sample_Status',
    )
    remaining = df.columns.values.tolist()
    for label in extra_columns:
        # list.remove drops the first match only and raises if there is none.
        remaining.remove(label)
    return remaining

In [3]:
# Load one cptac dataset object per cancer type. Each object is used in its
# own section below to join proteomics with EGFR mutation data.
brain = cptac.Gbm()       # used in the GBM section
kidney = cptac.Ccrcc()    # used in the Kidney section
Ovar = cptac.Ovarian()    # used in the Ovarian section
colon = cptac.Colon()     # used in the Colon section
brca = cptac.Brca()       # used in the Brca section
luad = cptac.Luad()       # used in the LUAD section
Hnscc = cptac.Hnscc()     # used in the HNSCC section
Lscc = cptac.Lscc()       # used in the Lscc section

Checking that ccrcc index is up-to-date...



Checking that hnscc index is up-to-date...  



Checking that lscc index is up-to-date... 



version 3scc v3.2.......                 
                            



# Step 1 Create Data frames 

For each cancer type,
load proteomic data and mutation data for EGFR and
filter out non-tumor samples 

# Step 2  Create list of Proteins for Comparison

For each cancer type, create list of proteins by using create_prot_list function which extracts column names and removes extra columns.

# Step 3 Run Pearson Correlation Function  

Run the function on the df to compare EGFR proteomics to every protein in the protein list. All comparisons are kept (`return_all = True`) together with their FDR-corrected p-values. Save the df as a csv file.

# GBM

In [4]:
# Step 1 (GBM): join proteomics with EGFR mutation data.
df1 = brain.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
# Keep only the rows whose Sample_Status is "Tumor".
df1_tumor = df1.loc[
    df1['Sample_Status'] == "Tumor"
]



In [5]:
# Step 2 (GBM): comparison columns = all columns of the tumor frame except the
# mutation / sample-status columns that create_prot_list removes.
prot_genes_list = create_prot_list(df1_tumor)

In [6]:
# Step 3 (GBM): correlate EGFR_proteomics with every column in prot_genes_list,
# requesting Benjamini/Hochberg ('fdr_bh') corrected p-values for all comparisons.
df2 = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method='fdr_bh',
    return_all=True,
    return_corrected_pvals=True,
)

# First file: the table exactly as returned.
# NOTE(review): the GBM file names end in `_pval.csv`, whereas the other cancer
# sections use `_pvals.csv` - confirm downstream readers expect this.
df2.to_csv("csv_files/GBM_EGFR_all_pearson_FDR_pval.csv")

# Second file: the same table tagged with the cancer type.
df2['Cancer Type'] = 'GBM'
df2.to_csv("csv_files/GBM_EGFR_all_pearson_FDR2_pval.csv")

df2



Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
2728,EGFR_proteomics,1.000000,0.000000e+00,GBM
6656,PHLDA1_proteomics,0.816848,3.507071e-21,GBM
3757,GRB2_proteomics,-0.610889,6.729990e-08,GBM
8734,SOCS2_proteomics,0.562720,3.420388e-06,GBM
1528,CDH4_proteomics,0.559180,3.420388e-06,GBM
...,...,...,...,...
7216,PSMB4_proteomics,-0.000033,9.998937e-01,GBM
7161,PRPF39_proteomics,0.000123,9.998937e-01,GBM
9703,TRAPPC6B_proteomics,0.000086,9.998937e-01,GBM
5351,MKI67_proteomics,-0.000098,9.998937e-01,GBM


# Kidney 

In [7]:
# Step 1 (Kidney): join proteomics with EGFR mutation data.
df1 = kidney.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
# Drop column-index level 1 so each column is addressed by a single label.
# NOTE(review): dropping a level may leave duplicate column labels - confirm
# the downstream correlation handles that.
df1.columns = df1.columns.droplevel(1)
# Keep only the rows whose Sample_Status is "Tumor".
df1_tumor = df1.loc[
    df1['Sample_Status'] == "Tumor"
]



In [8]:
# Step 2 (Kidney): comparison columns = all columns of the tumor frame except
# the mutation / sample-status columns that create_prot_list removes.
prot_genes_list = create_prot_list(df1_tumor)

In [9]:
# Step 3 (Kidney): correlate EGFR_proteomics with every column in prot_genes_list,
# requesting Benjamini/Hochberg ('fdr_bh') corrected p-values for all comparisons.
df2 = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method='fdr_bh',
    return_all=True,
    return_corrected_pvals=True,
)

# First file: the table exactly as returned.
df2.to_csv("csv_files/Kidney_EGFR_all_pearson_FDR_pvals.csv")

# Second file: the same table tagged with the cancer type.
df2['Cancer Type'] = 'Kidney'
df2.to_csv("csv_files/Kidney_EGFR_all_pearson_FDR2_pvals.csv")

df2

Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
2488,EGFR_proteomics,1.000000,0.000000e+00,Kidney
717,ATP11A_proteomics,0.577079,2.010563e-07,Kidney
488,APAF1_proteomics,0.560909,4.489889e-07,Kidney
6161,PLCB1_proteomics,0.561952,4.489889e-07,Kidney
3756,HPCAL1_proteomics,0.556681,4.543590e-07,Kidney
...,...,...,...,...
1008,C16orf86_proteomics,-0.000337,9.983813e-01,Kidney
2227,DHX29_proteomics,0.000211,9.985635e-01,Kidney
4229,KLHL11_proteomics,-0.000199,9.986987e-01,Kidney
4406,LMNA_proteomics,-0.000131,9.990194e-01,Kidney


# Ovarian 

In [10]:
# Step 1 (Ovarian): join proteomics with EGFR mutation data.
df1 = Ovar.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
# Drop column-index level 1 so each column is addressed by a single label.
# NOTE(review): dropping a level may leave duplicate column labels - confirm
# the downstream correlation handles that.
df1.columns = df1.columns.droplevel(1)
# Keep only the rows whose Sample_Status is "Tumor".
df1_tumor = df1.loc[
    df1['Sample_Status'] == "Tumor"
]



In [11]:
# Step 2 (Ovarian): comparison columns = all columns of the tumor frame except
# the mutation / sample-status columns that create_prot_list removes.
prot_genes_list = create_prot_list(df1_tumor)

In [12]:
# Step 3 (Ovarian): correlate EGFR_proteomics with every column in prot_genes_list,
# requesting Benjamini/Hochberg ('fdr_bh') corrected p-values for all comparisons.
df2 = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method='fdr_bh',
    return_all=True,
    return_corrected_pvals=True,
)

# First file: the table exactly as returned.
df2.to_csv("csv_files/Ovar_EGFR_all_pearson_FDR_pvals.csv")

# Second file: the same table tagged with the cancer type.
df2['Cancer Type'] = 'Ovarian'
df2.to_csv("csv_files/Ovar_EGFR_all_pearson_FDR2_pvals.csv")


# Colon

In [13]:
# Step 1 (Colon): join proteomics with EGFR mutation data.
df1 = colon.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
# Keep only the rows whose Sample_Status is "Tumor".
df1_tumor = df1.loc[
    df1['Sample_Status'] == "Tumor"
]



In [14]:
# Step 2 (Colon): comparison columns = all columns of the tumor frame except
# the mutation / sample-status columns that create_prot_list removes.
prot_genes_list = create_prot_list(df1_tumor)

In [15]:
# Step 3 (Colon): correlate EGFR_proteomics with every column in prot_genes_list,
# requesting Benjamini/Hochberg ('fdr_bh') corrected p-values for all comparisons.
df2 = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method='fdr_bh',
    return_all=True,
    return_corrected_pvals=True,
)

# First file: the table exactly as returned.
df2.to_csv("csv_files/Colon_EGFR_all_pearson_FDR_pvals.csv")

# Second file: the same table tagged with the cancer type.
df2['Cancer Type'] = 'Colon'
df2.to_csv("csv_files/Colon_EGFR_all_pearson_FDR2_pvals.csv")

df2

Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
1853,EGFR_proteomics,1.000000,0.000000,Colon
5059,RAE1_proteomics,-0.498802,0.000473,Colon
6811,UTP6_proteomics,-0.510343,0.000473,Colon
2339,GAR1_proteomics,-0.489788,0.000631,Colon
5069,RALYL_proteomics,-0.485763,0.000650,Colon
...,...,...,...,...
201,AK1_proteomics,-0.000256,0.998577,Colon
3134,KRI1_proteomics,-0.000217,0.998691,Colon
3797,MUL1_proteomics,-0.000272,0.998691,Colon
830,CADPS_proteomics,0.000064,0.999774,Colon


# Brca 

In [16]:
# Step 1 (Brca): join proteomics with EGFR mutation data.
df1 = brca.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
# Drop column-index level 1 so each column is addressed by a single label.
# NOTE(review): dropping a level may leave duplicate column labels - confirm
# the downstream correlation handles that.
df1.columns = df1.columns.droplevel(1)
# Keep only the rows whose Sample_Status is "Tumor".
df1_tumor = df1.loc[
    df1['Sample_Status'] == "Tumor"
]



In [17]:
# Step 2 (Brca): comparison columns = all columns of the tumor frame except
# the mutation / sample-status columns that create_prot_list removes.
prot_genes_list = create_prot_list(df1_tumor)

In [18]:
# Step 3 (Brca): correlate EGFR_proteomics with every column in prot_genes_list,
# requesting Benjamini/Hochberg ('fdr_bh') corrected p-values for all comparisons.
df2 = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method='fdr_bh',
    return_all=True,
    return_corrected_pvals=True,
)

# First file: the table exactly as returned.
df2.to_csv("csv_files/Brca_EGFR_all_pearson_FDR_pvals.csv")

# Second file: the same table tagged with the cancer type.
df2['Cancer Type'] = 'Brca'
df2.to_csv("csv_files/Brca_EGFR_all_pearson_FDR2_pvals.csv")

# LUAD

In [19]:
# Step 1 (LUAD): join proteomics with EGFR mutation data.
df1 = luad.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
# Drop column-index level 1 so each column is addressed by a single label.
# NOTE(review): dropping a level may leave duplicate column labels - confirm
# the downstream correlation handles that.
df1.columns = df1.columns.droplevel(1)
# Keep only the rows whose Sample_Status is "Tumor".
df1_tumor = df1.loc[
    df1['Sample_Status'] == "Tumor"
]



In [20]:
# Step 2 (LUAD): comparison columns = all columns of the tumor frame except
# the mutation / sample-status columns that create_prot_list removes.
prot_genes_list = create_prot_list(df1_tumor)

In [21]:
# Step 3 (LUAD): correlate EGFR_proteomics with every column in prot_genes_list,
# requesting Benjamini/Hochberg ('fdr_bh') corrected p-values for all comparisons.
df2 = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method='fdr_bh',
    return_all=True,
    return_corrected_pvals=True,
)

# First file: the table exactly as returned.
df2.to_csv("csv_files/Luad_EGFR_all_pearson_FDR_pvals.csv")

# Second file: the same table tagged with the cancer type.
df2['Cancer Type'] = 'Luad'
df2.to_csv("csv_files/Luad_EGFR_all_pearson_FDR2_pvals.csv")

# HNSCC

In [22]:
# Step 1 (HNSCC): join proteomics with EGFR mutation data.
df1 = Hnscc.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
# Keep only the rows whose Sample_Status is "Tumor".
df1_tumor = df1.loc[
    df1['Sample_Status'] == "Tumor"
]



In [23]:
# Step 2 (HNSCC): comparison columns = all columns of the tumor frame except
# the mutation / sample-status columns that create_prot_list removes.
prot_genes_list = create_prot_list(df1_tumor)

In [24]:
# Step 3 (HNSCC): correlate EGFR_proteomics with every column in prot_genes_list,
# requesting Benjamini/Hochberg ('fdr_bh') corrected p-values for all comparisons.
df2 = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method='fdr_bh',
    return_all=True,
    return_corrected_pvals=True,
)

# First file: the table exactly as returned.
df2.to_csv("csv_files/Hnscc_EGFR_all_pearson_FDR_pvals.csv")

# Second file: the same table tagged with the cancer type.
df2['Cancer Type'] = 'Hnscc'
df2.to_csv("csv_files/Hnscc_EGFR_all_pearson_FDR2_pvals.csv")

# Lscc

In [25]:
# Step 1 (Lscc): join proteomics with EGFR mutation data.
df1 = Lscc.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
# Drop column-index level 1 so each column is addressed by a single label.
# NOTE(review): dropping a level may leave duplicate column labels - confirm
# the downstream correlation handles that.
df1.columns = df1.columns.droplevel(1)
# Keep only the rows whose Sample_Status is "Tumor".
df1_tumor = df1.loc[
    df1['Sample_Status'] == "Tumor"
]



In [26]:
# Step 2 (Lscc): comparison columns = all columns of the tumor frame except
# the mutation / sample-status columns that create_prot_list removes.
prot_genes_list = create_prot_list(df1_tumor)

In [27]:
# Step 3 (Lscc): correlate EGFR_proteomics with every column in prot_genes_list,
# requesting Benjamini/Hochberg ('fdr_bh') corrected p-values for all comparisons.
df2 = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method='fdr_bh',
    return_all=True,
    return_corrected_pvals=True,
)

# First file: the table exactly as returned.
df2.to_csv("csv_files/Lscc_EGFR_all_pearson_FDR_pvals.csv")

# Second file: the same table tagged with the cancer type.
df2['Cancer Type'] = 'Lscc'
df2.to_csv("csv_files/Lscc_EGFR_all_pearson_FDR2_pvals.csv")