# Pearson Dfs EGFR proteomics vs all proteins 

This notebook records the Pearson correlation statistics for EGFR proteomics versus all other proteins. It records all FDR-corrected (Benjamini-Hochberg) p-values.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest


import cptac
import cptac.utils as u
import plot_utils as p


  import pandas.util.testing as tm


In [2]:
def create_prot_list(df):
    """Return the comparison columns of ``df`` after making its column labels unique.

    Column labels that occur more than once (multiple isoforms of one gene)
    are disambiguated: the first occurrence keeps its label and each later
    occurrence gets ``_1``, ``_2``, ... appended. The new labels are written
    back to ``df.columns``, so the caller's frame is modified in place.

    Returns:
        list: every column label of ``df`` except 'EGFR_Mutation',
        'EGFR_Location', 'EGFR_Mutation_Status' and 'Sample_Status'.

    Raises:
        ValueError: if any of those four columns is missing (``list.remove``).
    """
    labels = pd.Series(df.columns[:])
    repeated_labels = labels[labels.duplicated()].unique()
    for name in repeated_labels:
        positions = labels[labels == name].index.values.tolist()
        # Occurrence 0 keeps the bare label; occurrence k > 0 becomes "<label>_k".
        labels[positions] = [
            name if k == 0 else name + '_' + str(k)
            for k in range(len(positions))
        ]
    df.columns = labels

    # Mutation / sample-status columns are not proteins, so drop them from the list.
    metadata_columns = (
        'EGFR_Mutation',
        'EGFR_Location',
        'EGFR_Mutation_Status',
        'Sample_Status',
    )
    prot_genes_list = df.columns.values.tolist()
    for column in metadata_columns:
        prot_genes_list.remove(column)
    return prot_genes_list

In [None]:
# Load one cptac dataset object per cancer type; each section below calls
# join_omics_to_mutations on the matching object.
# NOTE(review): `kidney`, `colon`, `brca` and `luad` are later rebound to the
# correlation results, so this cell has to be re-run before repeating those sections.
brain = cptac.Gbm()
kidney = cptac.Ccrcc()
Ovar = cptac.Ovarian()
colon = cptac.Colon()
brca = cptac.Brca()
luad = cptac.Luad()
Hnscc = cptac.Hnscc()
Lscc = cptac.Lscc()

Checking that ccrcc index is up-to-date...



Formatting dataframes...                    

# Step 1 Create Data frames 

For each cancer type,
load proteomic data and mutation data for EGFR and
filter out non-tumor samples 

# Step 2  Create list of Proteins for Comparison

For each cancer type, create a list of proteins using the create_prot_list function, which extracts the column names and labels duplicate columns caused by multiple isoforms.

# Step 3 Run Pearson Correlation Function  

Run the Pearson correlation function (wrap_pearson_corr) on the tumor data frame to compare EGFR proteomics with every protein in the protein list. Record all FDR-corrected p-values.

# GBM

In [None]:
# GBM: join proteomics with EGFR mutation data, then keep only rows whose
# Sample_Status is "Tumor".
df1 = brain.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
df1_tumor = df1.loc[df1["Sample_Status"] == "Tumor"]

In [None]:
# Build the comparison list; create_prot_list also renames duplicated
# columns of df1_tumor in place.
prot_genes_list = create_prot_list(df1_tumor)

In [None]:
# Correlate EGFR_proteomics against every column in prot_genes_list, requesting
# all comparisons with 'fdr_bh'-corrected p-values.
# Presumably yields Comparison / Correlation / P_value columns (the merge cells rely on them).
gbm = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method="fdr_bh",
    return_all=True,
    return_corrected_pvals=True,
)




# Kidney 

In [None]:
# Kidney (ccRCC): join proteomics with EGFR mutation data.
df1 = kidney.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
# Drop level 1 of the column index (presumably leaving plain gene-name labels — confirm).
df1.columns = df1.columns.droplevel(1)
# Keep only rows whose Sample_Status is "Tumor".
df1_tumor = df1.loc[df1["Sample_Status"] == "Tumor"]


In [None]:
# Build the comparison list; create_prot_list also renames duplicated
# columns of df1_tumor in place.
prot_genes_list = create_prot_list(df1_tumor)

In [None]:
# Correlate EGFR_proteomics against every column in prot_genes_list, requesting
# all comparisons with 'fdr_bh'-corrected p-values.
# NOTE(review): this rebinds `kidney`, which held the cptac.Ccrcc() dataset until
# now; the join cell above cannot be re-run without reloading the dataset.
kidney = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method="fdr_bh",
    return_all=True,
    return_corrected_pvals=True,
)

# Ovarian 

In [None]:
# Ovarian: join proteomics with EGFR mutation data.
df1 = Ovar.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
# Drop level 1 of the column index (presumably leaving plain gene-name labels — confirm).
df1.columns = df1.columns.droplevel(1)
# Keep only rows whose Sample_Status is "Tumor".
df1_tumor = df1.loc[df1["Sample_Status"] == "Tumor"]

In [None]:
# Build the comparison list; create_prot_list also renames duplicated
# columns of df1_tumor in place.
prot_genes_list = create_prot_list(df1_tumor)

In [None]:
# Correlate EGFR_proteomics against every column in prot_genes_list, requesting
# all comparisons with 'fdr_bh'-corrected p-values.
ovarian = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method="fdr_bh",
    return_all=True,
    return_corrected_pvals=True,
)



# Colon

In [None]:
# Colon: join proteomics with EGFR mutation data, then keep only rows whose
# Sample_Status is "Tumor".
df1 = colon.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
df1_tumor = df1.loc[df1["Sample_Status"] == "Tumor"]

In [None]:
# Build the comparison list; create_prot_list also renames duplicated
# columns of df1_tumor in place.
prot_genes_list = create_prot_list(df1_tumor)

In [None]:
# Correlate EGFR_proteomics against every column in prot_genes_list, requesting
# all comparisons with 'fdr_bh'-corrected p-values.
# NOTE(review): this rebinds `colon`, which held the cptac.Colon() dataset until
# now; the join cell above cannot be re-run without reloading the dataset.
colon = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method="fdr_bh",
    return_all=True,
    return_corrected_pvals=True,
)


# Brca 

In [None]:
# BRCA: join proteomics with EGFR mutation data.
df1 = brca.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
# Drop level 1 of the column index (presumably leaving plain gene-name labels — confirm).
df1.columns = df1.columns.droplevel(1)
# Keep only rows whose Sample_Status is "Tumor".
df1_tumor = df1.loc[df1["Sample_Status"] == "Tumor"]

In [None]:
# Build the comparison list; create_prot_list also renames duplicated
# columns of df1_tumor in place.
prot_genes_list = create_prot_list(df1_tumor)

In [None]:
# Correlate EGFR_proteomics against every column in prot_genes_list, requesting
# all comparisons with 'fdr_bh'-corrected p-values.
# NOTE(review): this rebinds `brca`, which held the cptac.Brca() dataset until
# now; the join cell above cannot be re-run without reloading the dataset.
brca = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method="fdr_bh",
    return_all=True,
    return_corrected_pvals=True,
)


# LUAD

In [None]:
# LUAD: join proteomics with EGFR mutation data.
df1 = luad.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
# Drop level 1 of the column index (presumably leaving plain gene-name labels — confirm).
df1.columns = df1.columns.droplevel(1)
# Keep only rows whose Sample_Status is "Tumor".
df1_tumor = df1.loc[df1["Sample_Status"] == "Tumor"]

In [None]:
# Build the comparison list; create_prot_list also renames duplicated
# columns of df1_tumor in place.
prot_genes_list = create_prot_list(df1_tumor)

In [None]:
# Correlate EGFR_proteomics against every column in prot_genes_list, requesting
# all comparisons with 'fdr_bh'-corrected p-values.
# NOTE(review): this rebinds `luad`, which held the cptac.Luad() dataset until
# now; the join cell above cannot be re-run without reloading the dataset.
luad = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method="fdr_bh",
    return_all=True,
    return_corrected_pvals=True,
)


# HNSCC

In [None]:
# HNSCC: join proteomics with EGFR mutation data, then keep only rows whose
# Sample_Status is "Tumor".
df1 = Hnscc.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
df1_tumor = df1.loc[df1["Sample_Status"] == "Tumor"]

In [None]:
# Build the comparison list; create_prot_list also renames duplicated
# columns of df1_tumor in place.
prot_genes_list = create_prot_list(df1_tumor)

In [None]:
# Correlate EGFR_proteomics against every column in prot_genes_list, requesting
# all comparisons with 'fdr_bh'-corrected p-values.
hnscc = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method="fdr_bh",
    return_all=True,
    return_corrected_pvals=True,
)


# Lscc

In [None]:
# LSCC: join proteomics with EGFR mutation data.
df1 = Lscc.join_omics_to_mutations(
    omics_df_name="proteomics",
    mutations_genes="EGFR",
)
# Drop level 1 of the column index (presumably leaving plain gene-name labels — confirm).
df1.columns = df1.columns.droplevel(1)
# Keep only rows whose Sample_Status is "Tumor".
df1_tumor = df1.loc[df1["Sample_Status"] == "Tumor"]

In [None]:
# Build the comparison list; create_prot_list also renames duplicated
# columns of df1_tumor in place.
prot_genes_list = create_prot_list(df1_tumor)

In [None]:
# Correlate EGFR_proteomics against every column in prot_genes_list, requesting
# all comparisons with 'fdr_bh'-corrected p-values.
lscc = p.wrap_pearson_corr(
    df1_tumor,
    "EGFR_proteomics",
    comparison_columns=prot_genes_list,
    correction_method="fdr_bh",
    return_all=True,
    return_corrected_pvals=True,
)


# Merge all data frames into one wide data frame

Did not include Colon in the final CSV file because it doesn't have a cis effect. The CSV file is used in the notebook Plot_EGFR_1C_parts.

In [None]:
# Outer-join the GBM and kidney results on "Comparison". Both frames share the
# Correlation / P_value column names, so pandas suffixes them with _x (left, GBM)
# and _y (right, kidney); relabel those with the cancer type.
Gbm_kidney = pd.merge(gbm, kidney, on="Comparison", how="outer").rename(
    columns={
        "Correlation_x": "Correlation_Gbm",
        "P_value_x": "P_value_Gbm",
        "Correlation_y": "Correlation_kidney",
        "P_value_y": "P_value_kidney",
    }
)


In [None]:
# Start the pan-cancer table: add the ovarian results and label their columns.
pancan = pd.merge(Gbm_kidney, ovarian, on="Comparison", how="outer").rename(
    columns={
        "Correlation": "Correlation_Ovar",
        "P_value": "P_value_Ovar",
    }
)

In [None]:
# Add the BRCA results and label their columns.
pancan = pd.merge(pancan, brca, on="Comparison", how="outer").rename(
    columns={
        "Correlation": "Correlation_Brca",
        "P_value": "P_value_Brca",
    }
)


In [None]:
# Add the LUAD results and label their columns.
pancan = pd.merge(pancan, luad, on="Comparison", how="outer").rename(
    columns={
        "Correlation": "Correlation_Luad",
        "P_value": "P_value_Luad",
    }
)

In [None]:
# Add the HNSCC results and label their columns.
pancan = pd.merge(pancan, hnscc, on="Comparison", how="outer").rename(
    columns={
        "Correlation": "Correlation_Hnscc",
        "P_value": "P_value_Hnscc",
    }
)


In [None]:
# Add the colon results and label their columns.
# NOTE(review): the markdown for this section says Colon is not included in the
# final CSV, yet this merge adds its columns to `pancan` — confirm which is intended.
pancan = pd.merge(pancan, colon, on="Comparison", how="outer").rename(
    columns={
        "Correlation": "Correlation_Colon",
        "P_value": "P_value_Colon",
    }
)

In [None]:
# Add the LSCC results as the final pair of columns.
pancan = pd.merge(pancan, lscc, on="Comparison", how = "outer")
pancan = pancan.rename(columns={ "Correlation": "Correlation_Lscc","P_value": "P_value_Lscc" })
# Write the table to disk. DataFrame.to_csv returns None when given a path, so its
# result must not be assigned back to `pancan` — doing so would replace the merged
# table with None and leave nothing to display afterwards.
pancan.to_csv("Pval_corr_table_Fig_1C_return_all.csv")

In [None]:
# Cell output: display the object currently bound to `pancan`.
pancan