# Pearson Dfs EGFR proteomics vs all proteins 

This notebook records the Pearson correlation statistics for EGFR proteomics versus every protein in the proteomics data. All comparisons are recorded, not only the significant ones.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest


import cptac
import cptac.utils as u
import plot_utils as p


  import pandas.util.testing as tm


In [2]:
def wrap_pearson_corr(df, label_column, alpha=.05, comparison_columns=None, correction_method='bonferroni', return_all=True):
    """Pearson-correlate one column against many and correct for multiple testing.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding ``label_column`` and every comparison column.
    label_column : str
        Column that every other column is correlated against.
    alpha : float, default .05
        Error rate passed to ``statsmodels.stats.multitest.multipletests``.
    comparison_columns : sequence of str, optional
        Columns to test. If None or empty, all columns of ``df`` except
        ``label_column`` are used.
    correction_method : str, default 'bonferroni'
        ``method`` argument for ``multipletests`` (e.g. 'fdr_bh').
    return_all : bool, default True
        If True, return every tested comparison plus a ``Significant``
        column (1.0 / 0.0). If False, return only the comparisons rejected
        by the multiple-testing correction, without that column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Comparison``, ``Correlation``, ``P_value`` (and
        ``Significant`` when ``return_all`` is True), sorted by ascending
        raw p-value. A DataFrame is always returned; it is empty when no
        column had enough data to be tested.

    Notes
    -----
    A column is only tested when more than 30 rows have non-NaN values in
    both it and ``label_column``; other columns are silently skipped.
    """
    result_columns = ['Comparison', 'Correlation', 'P_value']

    # Explicit None/empty test: `not comparison_columns` raises for a pandas
    # Index or numpy array because their truth value is ambiguous.
    if comparison_columns is None or len(comparison_columns) == 0:
        comparison_columns = list(df.columns)
        comparison_columns.remove(label_column)

    # Parallel lists: one entry per comparison that was actually tested.
    comparisons = []
    pvals = []
    correlation = []

    for gene in comparison_columns:
        # Subset to the two columns first; dropping NaN rows on the whole
        # frame would discard almost every sample.
        df_subset = df[[label_column, gene]].dropna(axis=0, how="any")
        if df_subset.shape[0] > 30:
            # [[col]].values[:, 0] picks the first column even when the
            # selection yields several columns with the same name.
            x1 = df_subset[[label_column]].values[:, 0]
            y1 = df_subset[[gene]].values[:, 0]
            corr, pval = scipy.stats.pearsonr(x1, y1)

            comparisons.append(gene)
            pvals.append(pval)
            correlation.append(corr)

    # Nothing was testable: multipletests divides by the number of tests,
    # so return an empty, correctly-shaped frame instead of crashing.
    if not pvals:
        if return_all:
            return pd.DataFrame(columns=result_columns + ['Significant'])
        return pd.DataFrame(columns=result_columns)

    # Correct for multiple testing; element 0 is the per-test reject flag.
    results = statsmodels.stats.multitest.multipletests(pvals=pvals, alpha=alpha, method=correction_method)
    reject = results[0]

    # Build the frame in one step (DataFrame.append was removed in pandas 2.0
    # and row-wise appends are quadratic).
    newdf = pd.DataFrame({
        'Comparison': comparisons,
        'Correlation': correlation,
        'P_value': pvals,
    })

    if return_all:
        # Stored as 1.0 / 0.0 floats, matching the saved notebook outputs, so
        # existing `== 1.0` filters and CSV consumers keep working.
        newdf['Significant'] = np.asarray(reject, dtype=float)
    else:
        # Keep only the significant comparisons.
        newdf = newdf[np.asarray(reject, dtype=bool)].reset_index(drop=True)

    # Sort by ascending raw p-value.
    return newdf.sort_values(by='P_value', ascending=True)



       

In [3]:
def create_prot_list(df, mutation_gene="EGFR"):
    """Return the unique column names of ``df`` minus the mutation/sample metadata columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns include ``<mutation_gene>_Mutation``,
        ``<mutation_gene>_Location``, ``<mutation_gene>_Mutation_Status``
        and ``Sample_Status``.
    mutation_gene : str, default "EGFR"
        Gene prefix of the mutation columns to exclude.

    Returns
    -------
    list
        Column names in their original order, first occurrence only for
        duplicated names, with the four metadata columns removed.

    Raises
    ------
    ValueError
        If any of the four metadata columns is missing from ``df``.
    """
    # Keep only the first occurrence of each duplicated column name.
    deduplicated = df.loc[:, ~df.columns.duplicated()]
    prot_genes_list = deduplicated.columns.values.tolist()

    metadata_columns = [
        mutation_gene + '_Mutation',
        mutation_gene + '_Location',
        mutation_gene + '_Mutation_Status',
        'Sample_Status',
    ]
    for column_name in metadata_columns:
        # list.remove raises ValueError when the column is absent.
        prot_genes_list.remove(column_name)
    return prot_genes_list

In [4]:
#load cptac data 
brain = cptac.Gbm()
kidney = cptac.Ccrcc()
Ovar = cptac.Ovarian()
colon = cptac.Colon()
brca = cptac.Brca()
luad = cptac.Luad()
Hnscc = cptac.Hnscc()
Lscc = cptac.Lscc()

Checking that ccrcc index is up-to-date...



Checking that hnscc index is up-to-date...  



Checking that lscc index is up-to-date... 



                                         



# Step 1 Create Data frames 

For each cancer type,
load proteomic data and mutation data for EGFR and
filter out non-tumor samples 

# Step 2  Create list of Proteins for Comparison

For each cancer type, create list of proteins by using create_prot_list function which extracts column names and removes extra columns.

# Step 3 Run Pearson Correlation Function  

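Run the function on the data frame to compare EGFR proteomics against every protein in the protein list. All comparisons are kept (`return_all=True`), with a `Significant` column marking those that pass the FDR correction. Save the data frame as a CSV file, then display only the significant comparisons.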

# GBM

In [5]:
# Join GBM proteomics with the EGFR mutation columns.
df1 = brain.join_omics_to_mutations(omics_df_name="proteomics", mutations_genes="EGFR")
# Keep only the rows whose Sample_Status is "Tumor".
df1_tumor = df1.loc[df1['Sample_Status'] == "Tumor"]



In [6]:
# Columns to correlate against EGFR: every unique column except the mutation / sample-status metadata.
# EGFR_proteomics itself stays in the list, so its self-correlation (r = 1) appears in the results.
prot_genes_list = create_prot_list(df1_tumor)

In [8]:
# Correlate EGFR_proteomics with every listed column, using Benjamini-Hochberg FDR correction ('fdr_bh').
# return_all=True keeps the non-significant comparisons as well.
df2= wrap_pearson_corr(df1_tumor,"EGFR_proteomics",comparison_columns= prot_genes_list,correction_method='fdr_bh', return_all = True)
# First CSV: results without the cancer-type label.
df2.to_csv("csv_files/GBM_EGFR_all_pearson_return_all3.csv")
# Tag every row with the cancer type, then save again under a second file name.
df2['Cancer Type']='GBM'
df2.to_csv("csv_files/GBM_EGFR_all_pearson_return_all4.csv")

In [9]:
# Display only the comparisons flagged significant after multiple-testing correction.
newdf = df2[df2['Significant'] == 1.0]
newdf

Unnamed: 0,Comparison,Correlation,P_value,Significant,Cancer Type
2728,EGFR_proteomics,1.000000,0.000000e+00,1.0,GBM
6656,PHLDA1_proteomics,0.816848,6.553435e-25,1.0,GBM
3757,GRB2_proteomics,-0.610889,1.886384e-11,1.0,GBM
8734,SOCS2_proteomics,0.562720,1.343464e-09,1.0,GBM
1528,CDH4_proteomics,0.559180,1.790048e-09,1.0,GBM
...,...,...,...,...,...
10103,USP2_proteomics,0.280545,7.399884e-03,1.0,GBM
10260,WAPL_proteomics,-0.267631,7.403862e-03,1.0,GBM
8491,SLC25A42_proteomics,0.267588,7.413549e-03,1.0,GBM
3150,FANCD2_proteomics,0.278945,7.416331e-03,1.0,GBM


# Kidney 

In [10]:
# Join kidney (ccRCC) proteomics with the EGFR mutation columns.
df1 = kidney.join_omics_to_mutations(omics_df_name="proteomics",mutations_genes="EGFR")
# Drop level 1 of the column index so columns are addressed by a single name
# (presumably level 0 holds the gene-based names - confirm).
df1.columns = df1.columns.droplevel(1)
# Keep only the rows whose Sample_Status is "Tumor".
df1_tumor = df1.loc[df1['Sample_Status'] == "Tumor"]



In [11]:
# Columns to correlate against EGFR: every unique column except the mutation / sample-status metadata.
prot_genes_list = create_prot_list(df1_tumor)

In [12]:
# Correlate EGFR_proteomics with every listed column, using Benjamini-Hochberg FDR correction ('fdr_bh').
# return_all=True keeps the non-significant comparisons as well.
df2=wrap_pearson_corr(df1_tumor,"EGFR_proteomics",comparison_columns= prot_genes_list,correction_method='fdr_bh',return_all = True)
# First CSV: results without the cancer-type label.
df2.to_csv("csv_files/Kidney_EGFR_all_pearson_return_all3.csv")
# Tag every row with the cancer type, then save again under a second file name.
df2['Cancer Type']='Kidney'
df2.to_csv("csv_files/Kidney_EGFR_all_pearson_return_all4.csv")

In [13]:
# Display only the comparisons flagged significant after multiple-testing correction.
newdf = df2[df2['Significant'] == 1.0]
newdf

Unnamed: 0,Comparison,Correlation,P_value,Significant,Cancer Type
2427,EGFR_proteomics,1.000000,0.000000e+00,1.0,Kidney
693,ATP11A_proteomics,0.577079,4.144636e-11,1.0,Kidney
5992,PLCB1_proteomics,0.561952,1.684847e-10,1.0,Kidney
466,APAF1_proteomics,0.560909,1.851119e-10,1.0,Kidney
3651,HPCAL1_proteomics,0.556681,2.701940e-10,1.0,Kidney
...,...,...,...,...,...
6655,RAP1GAP2_proteomics,-0.273992,5.561371e-03,1.0,Kidney
2714,FAF2_proteomics,0.262631,5.573690e-03,1.0,Kidney
104,ACP5_proteomics,-0.262417,5.613804e-03,1.0,Kidney
7360,SFRP1_proteomics,-0.262301,5.635779e-03,1.0,Kidney


# Ovarian 

In [14]:
# Join ovarian proteomics with the EGFR mutation columns.
df1 = Ovar.join_omics_to_mutations(omics_df_name="proteomics", mutations_genes="EGFR")
# Drop level 1 of the column index so columns are addressed by a single name
# (presumably level 0 holds the gene-based names - confirm).
df1.columns = df1.columns.droplevel(1)
# Keep only the rows whose Sample_Status is "Tumor".
df1_tumor = df1.loc[df1['Sample_Status'] == "Tumor"]



In [15]:
# Columns to correlate against EGFR: every unique column except the mutation / sample-status metadata.
prot_genes_list = create_prot_list(df1_tumor)

In [16]:
# Correlate EGFR_proteomics with every listed column, using Benjamini-Hochberg FDR correction ('fdr_bh').
# return_all=True keeps the non-significant comparisons as well.
df2= wrap_pearson_corr(df1_tumor,"EGFR_proteomics",comparison_columns= prot_genes_list,correction_method='fdr_bh', return_all = True)
# First CSV: results without the cancer-type label.
df2.to_csv("csv_files/Ovar_EGFR_all_pearson_return_all3.csv")
# Tag every row with the cancer type, then save again under a second file name.
df2['Cancer Type']='Ovarian'
df2.to_csv("csv_files/Ovar_EGFR_all_pearson_return_all4.csv")


In [17]:
# Display only the comparisons flagged significant after multiple-testing correction.
newdf = df2[df2['Significant'] == 1.0]
newdf

Unnamed: 0,Comparison,Correlation,P_value,Significant,Cancer Type
2417,EGFR_proteomics,1.000000,0.000000e+00,1.0,Ovarian
2623,ERO1A_proteomics,0.512158,7.452351e-07,1.0,Ovarian
1454,CGGBP1_proteomics,-0.492993,2.192339e-06,1.0,Ovarian
6395,PROSER2_proteomics,0.524593,2.244019e-06,1.0,Ovarian
1763,CP_proteomics,0.489328,2.675011e-06,1.0,Ovarian
...,...,...,...,...,...
375,ANKRD1_proteomics,-0.422420,8.600168e-04,1.0,Ovarian
4767,MID1_proteomics,0.358764,8.667758e-04,1.0,Ovarian
2777,FAM57A_proteomics,0.523408,8.866811e-04,1.0,Ovarian
5170,NAP1L1_proteomics,-0.357673,9.011982e-04,1.0,Ovarian


# Colon

In [18]:
# Join colon proteomics with the EGFR mutation columns.
df1 = colon.join_omics_to_mutations(omics_df_name="proteomics", mutations_genes="EGFR")
# Keep only the rows whose Sample_Status is "Tumor".
df1_tumor = df1.loc[df1['Sample_Status'] == "Tumor"]



In [19]:
# Columns to correlate against EGFR: every unique column except the mutation / sample-status metadata.
prot_genes_list = create_prot_list(df1_tumor)

In [20]:
# Correlate EGFR_proteomics with every listed column, using Benjamini-Hochberg FDR correction ('fdr_bh').
# return_all=True keeps the non-significant comparisons as well.
df2= wrap_pearson_corr(df1_tumor,"EGFR_proteomics",comparison_columns= prot_genes_list, correction_method='fdr_bh',return_all = True)
# First CSV: results without the cancer-type label.
df2.to_csv("csv_files/Colon_EGFR_all_pearson_return_all3.csv")
# Tag every row with the cancer type, then save again under a second file name.
df2['Cancer Type']='Colon'
df2.to_csv("csv_files/Colon_EGFR_all_pearson_return_all4.csv")

In [21]:
# Display only the comparisons flagged significant after multiple-testing correction.
newdf = df2[df2['Significant'] == 1.0]
newdf

Unnamed: 0,Comparison,Correlation,P_value,Significant,Cancer Type
1853,EGFR_proteomics,1.000000,0.000000e+00,1.0,Colon
6811,UTP6_proteomics,-0.510343,1.733748e-07,1.0,Colon
5059,RAE1_proteomics,-0.498802,1.993818e-07,1.0,Colon
2339,GAR1_proteomics,-0.489788,3.550550e-07,1.0,Colon
5069,RALYL_proteomics,-0.485763,4.569947e-07,1.0,Colon
...,...,...,...,...,...
6302,TIMP1_proteomics,0.287791,4.257833e-03,1.0,Colon
924,CCDC6_proteomics,0.287735,4.265624e-03,1.0,Colon
3170,LAMA5_proteomics,0.287734,4.265767e-03,1.0,Colon
5996,SSR2_proteomics,0.301814,4.266608e-03,1.0,Colon


# Brca 

In [22]:
# Join breast cancer (BRCA) proteomics with the EGFR mutation columns.
df1 = brca.join_omics_to_mutations(omics_df_name="proteomics", mutations_genes="EGFR")
# Drop level 1 of the column index so columns are addressed by a single name
# (presumably level 0 holds the gene-based names - confirm).
df1.columns = df1.columns.droplevel(1)
# Keep only the rows whose Sample_Status is "Tumor".
df1_tumor = df1.loc[df1['Sample_Status'] == "Tumor"]



In [23]:
# Columns to correlate against EGFR: every unique column except the mutation / sample-status metadata.
prot_genes_list = create_prot_list(df1_tumor)

In [24]:
# Correlate EGFR_proteomics with every listed column, using Benjamini-Hochberg FDR correction ('fdr_bh').
# return_all=True keeps the non-significant comparisons as well.
df2= wrap_pearson_corr(df1_tumor,"EGFR_proteomics",comparison_columns= prot_genes_list, correction_method='fdr_bh',return_all = True)
# First CSV: results without the cancer-type label.
df2.to_csv("csv_files/Brca_EGFR_all_pearson_return_all3.csv")
# Tag every row with the cancer type, then save again under a second file name.
df2['Cancer Type']='Brca'
df2.to_csv("csv_files/Brca_EGFR_all_pearson_return_all4.csv")

In [25]:
# Display only the comparisons flagged significant after multiple-testing correction.
newdf = df2[df2['Significant'] == 1.0]
newdf

Unnamed: 0,Comparison,Correlation,P_value,Significant,Cancer Type
2559,EGFR_proteomics,1.000000,0.000000e+00,1.0,Brca
5068,MPP6_proteomics,0.648202,4.834438e-15,1.0,Brca
1925,CPNE8_proteomics,0.616453,2.229263e-13,1.0,Brca
1982,CRYBG3_proteomics,0.609997,4.612176e-13,1.0,Brca
6633,PSAT1_proteomics,0.609319,4.973595e-13,1.0,Brca
...,...,...,...,...,...
3312,GGH_proteomics,0.229847,1.347031e-02,1.0,Brca
4284,KIF15_proteomics,0.229837,1.347467e-02,1.0,Brca
5029,MNS1_proteomics,0.270088,1.353485e-02,1.0,Brca
7324,RRP1_proteomics,0.229647,1.355376e-02,1.0,Brca


# LUAD

In [26]:
# Join LUAD proteomics with the EGFR mutation columns.
df1 = luad.join_omics_to_mutations(omics_df_name="proteomics", mutations_genes="EGFR")
# Drop level 1 of the column index so columns are addressed by a single name
# (presumably level 0 holds the gene-based names - confirm).
df1.columns = df1.columns.droplevel(1)
# Keep only the rows whose Sample_Status is "Tumor".
df1_tumor = df1.loc[df1['Sample_Status'] == "Tumor"]



In [27]:
# Columns to correlate against EGFR: every unique column except the mutation / sample-status metadata.
prot_genes_list = create_prot_list(df1_tumor)

In [28]:
# Correlate EGFR_proteomics with every listed column, using Benjamini-Hochberg FDR correction ('fdr_bh').
# return_all=True keeps the non-significant comparisons as well.
df2= wrap_pearson_corr(df1_tumor,"EGFR_proteomics",comparison_columns= prot_genes_list,correction_method='fdr_bh', return_all = True)
# First CSV: results without the cancer-type label.
df2.to_csv("csv_files/Luad_EGFR_all_pearson_return_all3.csv")
# Tag every row with the cancer type, then save again under a second file name.
df2['Cancer Type']='Luad'
df2.to_csv("csv_files/Luad_EGFR_all_pearson_return_all4.csv")

In [29]:
# Display only the comparisons flagged significant after multiple-testing correction.
newdf = df2[df2['Significant'] == 1.0]
newdf

Unnamed: 0,Comparison,Correlation,P_value,Significant,Cancer Type
2698,EGFR_proteomics,1.000000,0.000000e+00,1.0,Luad
3478,GGCT_proteomics,0.609006,1.676601e-12,1.0,Luad
4652,LANCL2_proteomics,0.588808,1.328507e-11,1.0,Luad
6017,NUDCD3_proteomics,0.568383,9.362813e-11,1.0,Luad
8821,TAX1BP1_proteomics,0.561554,1.746498e-10,1.0,Luad
...,...,...,...,...,...
9792,VGLL4_proteomics,0.261583,5.773032e-03,1.0,Luad
8589,ST5_proteomics,0.261481,5.792846e-03,1.0,Luad
2112,CSTF1_proteomics,0.261472,5.794461e-03,1.0,Luad
8501,SPRYD4_proteomics,-0.261441,5.800620e-03,1.0,Luad


# HNSCC

In [30]:
# Join HNSCC proteomics with the EGFR mutation columns.
df1 = Hnscc.join_omics_to_mutations(omics_df_name="proteomics",  mutations_genes="EGFR")
# Keep only the rows whose Sample_Status is "Tumor".
df1_tumor = df1.loc[df1['Sample_Status'] == "Tumor"]



In [31]:
# Columns to correlate against EGFR: every unique column except the mutation / sample-status metadata.
prot_genes_list = create_prot_list(df1_tumor)

In [32]:
# Correlate EGFR_proteomics with every listed column, using Benjamini-Hochberg FDR correction ('fdr_bh').
# return_all=True keeps the non-significant comparisons as well.
df2= wrap_pearson_corr(df1_tumor,"EGFR_proteomics",comparison_columns= prot_genes_list,correction_method='fdr_bh', return_all = True)
# First CSV: results without the cancer-type label.
df2.to_csv("csv_files/Hnscc_EGFR_all_pearson_return_all3.csv")
# Tag every row with the cancer type, then save again under a second file name.
df2['Cancer Type']='Hnscc'
df2.to_csv("csv_files/Hnscc_EGFR_all_pearson_return_all4.csv")

In [33]:
# Display only the comparisons flagged significant after multiple-testing correction.
newdf = df2[df2['Significant'] == 1.0]
newdf

Unnamed: 0,Comparison,Correlation,P_value,Significant,Cancer Type
2670,EGFR_proteomics,1.000000,0.000000e+00,1.0,Hnscc
4657,LANCL2_proteomics,0.796728,3.798194e-25,1.0,Hnscc
6502,PHLDA3_proteomics,0.763784,4.542521e-22,1.0,Hnscc
6500,PHLDA1_proteomics,0.664271,3.426615e-15,1.0,Hnscc
6501,PHLDA2_proteomics,0.649034,2.302129e-14,1.0,Hnscc
...,...,...,...,...,...
3627,GPD1L_proteomics,-0.266473,5.097373e-03,1.0,Hnscc
8005,SELL_proteomics,-0.266360,5.116897e-03,1.0,Hnscc
5045,MBD1_proteomics,-0.266098,5.162417e-03,1.0,Hnscc
7775,RRP12_proteomics,0.266066,5.167833e-03,1.0,Hnscc


# Lscc

In [34]:
# Join LSCC proteomics with the EGFR mutation columns.
df1 = Lscc.join_omics_to_mutations(omics_df_name="proteomics", mutations_genes="EGFR")
# Drop level 1 of the column index so columns are addressed by a single name
# (presumably level 0 holds the gene-based names - confirm).
df1.columns = df1.columns.droplevel(1)
# Keep only the rows whose Sample_Status is "Tumor".
df1_tumor = df1.loc[df1['Sample_Status'] == "Tumor"]



In [35]:
# Columns to correlate against EGFR: every unique column except the mutation / sample-status metadata.
prot_genes_list = create_prot_list(df1_tumor)

In [36]:
# Correlate EGFR_proteomics with every listed column, using Benjamini-Hochberg FDR correction ('fdr_bh').
# return_all=True keeps the non-significant comparisons as well.
df2= wrap_pearson_corr(df1_tumor,"EGFR_proteomics",comparison_columns= prot_genes_list,correction_method='fdr_bh', return_all = True)
# First CSV: results without the cancer-type label.
df2.to_csv("csv_files/Lscc_EGFR_all_pearson_return_all3.csv")
# Tag every row with the cancer type, then save again under a second file name.
df2['Cancer Type']='Lscc'
df2.to_csv("csv_files/Lscc_EGFR_all_pearson_return_all4.csv")

In [37]:
# Display only the comparisons flagged significant after multiple-testing correction.
newdf = df2[df2['Significant'] == 1.0]
newdf

Unnamed: 0,Comparison,Correlation,P_value,Significant,Cancer Type
2832,EGFR_proteomics,1.000000,0.000000e+00,1.0,Lscc
6790,PHLDA1_proteomics,0.709812,3.919195e-18,1.0,Lscc
6792,PHLDA3_proteomics,0.685644,1.412309e-16,1.0,Lscc
4884,LANCL2_proteomics,0.572261,6.528910e-11,1.0,Lscc
4532,ITGB4_proteomics,0.567380,1.026981e-10,1.0,Lscc
...,...,...,...,...,...
862,ATXN10_proteomics,0.290219,2.101111e-03,1.0,Lscc
4961,LIMA1_proteomics,0.290115,2.109252e-03,1.0,Lscc
166,ADA2_proteomics,-0.289977,2.120119e-03,1.0,Lscc
7778,RFXAP_proteomics,-0.289937,2.123227e-03,1.0,Lscc
