# Pearson Dfs EGFR phospho sites vs all phospho sites

This notebooks makes df for each cancer type that compares an EGFR phospho site against all phospho sites. (returns all)

In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest


import cptac
import cptac.utils as u
import plot_utils as p


In [2]:

'''
@Param df: Dataframe. Contains numeric values (such as proteomics) for linear regression
@Param label_column: String. Name of column that will be your x axis and will be compared to all values in df unless otherwise specified. 
@Param alpha: significant level
@Param comparison_columns: columns that will be looped through and used as y axis for linear regression. 
All other columns beside label column unless specified here. 
@Param correction_method: String. Specifies method of adjustment for multiple testing. See -
https://www.statsmodels.org/stable/generated/statsmodels.stats.multitest.multipletests.html
    - for documentation and available methods.

This function will return a data frame will all significant linear regressions. The data frame includes the comparison, slope, R-squared, and P-value. 
'''
def wrap_lin_regression(df,label_column, alpha=.05,comparison_columns=None,correction_method='bonferroni',return_all = True):
    

    
    '''If no comparison columns specified, use all columns except the specified labed column'''
    if not comparison_columns:
        comparison_columns = list(df.columns)
        comparison_columns.remove(label_column)
    '''Store comparisons,p-values, correlation in their own array'''
    comparisons = []
    pvals = []
    correlation=[]
    
    
    '''Format results in a pandas dataframe'''
    newdf = pd.DataFrame(columns=['Comparison','Correlation','P_value'])
    for inter_gene in comparison_columns:
        #create subset df with interacting gene/ gene (otherwise drop NaN drops everything)
        df_subset = df[[label_column,inter_gene]]
        #do a linear regression to see if it's a meaningful association
        #dropna will remove rows with nan
        df_subset = df_subset.dropna(axis=0, how="any")
        count_row = df_subset.shape[0]
        if count_row > 20:
            x1 = df_subset[[label_column]].values
            y1 = df_subset[[inter_gene]].values
            x1 = x1[:,0]
            y1 = y1[:,0]

            #slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(x1,y1)
            corr, pval = scipy.stats.pearsonr(x1,y1)

            comparisons.append(inter_gene)
            pvals.append(pval)
            correlation.append(corr)
            #slope_val.append(slope)
        
    '''Correct for multiple testing to determine if each comparison meets the new cutoff'''
    results = statsmodels.stats.multitest.multipletests(pvals=pvals, alpha=alpha, method=correction_method)
    reject = results[0]
        
    if return_all:
        for i in range(0,len(comparisons)):
            newdf = newdf.append({'Comparison': comparisons[i],"Correlation": correlation[i],'P_value': pvals[i]}, ignore_index=True)
        
    '''Else only add significant comparisons'''
    if (return_all == False):
            for i in range(0, len(reject)):
                if reject[i]:
                    newdf = newdf.append({'Comparison': comparisons[i],"Slope": slope_val[i], 'R_squared': r_squared[i], 'P_value': pvals[i]}, ignore_index=True)
                    
    '''Sort dataframe by ascending p-value'''
    newdf = newdf.sort_values(by='P_value', ascending=True)
    '''If results df is not empty, return it, else return None'''
    return newdf



       
       

In [2]:
def get_phospho_sites(df):
    phospho_sites = df.columns.values.tolist()
    phospho_sites.remove('EGFR_Mutation_')
    phospho_sites.remove('EGFR_Location_')
    phospho_sites.remove('EGFR_Mutation_Status_')
    phospho_sites.remove('Sample_Status_')
    return phospho_sites 


In [3]:
#load cptac data 
brain = cptac.Gbm()
kidney = cptac.Ccrcc()
Ovar = cptac.Ovarian()
colon = cptac.Colon()
brca = cptac.Brca()
luad = cptac.Luad()
Hnscc = cptac.Hnscc()
Lscc = cptac.Lscc()

Checking that ccrcc index is up-to-date...



Checking that hnscc index is up-to-date...  



                                          

# Step 1 Make data frame 

For each cancer, get phosphoproteomic and mutation data. Drop extra multi-index levels. Merge levels to have column names include gene and phospho site 

# Step 2 Get Phospho Sites

For each cancer, get phospho sites by using get_phospho_sites function. Function extracts column names and deletes unnecessary columns. It returns list of phospho sites. 

# Step 3 Run Pearson Correlation Function

Run correlation function and compare EGFR phospho site to all phospho sites. Save df as csv file. 

# GBM

In [4]:
df1 = brain.join_omics_to_mutations(omics_df_name="phosphoproteomics", mutations_genes="EGFR")
df1.columns = df1.columns.droplevel(2)
df1.columns = df1.columns.droplevel(2)
df1.columns =df1.columns.map('_'.join)
phosphoprot = df1.loc[df1['Sample_Status_'] == "Tumor"]



In [5]:
phospho_sites = get_phospho_sites(phosphoprot)

In [8]:
df =(p.wrap_pearson_corr(phosphoprot,"EGFR_phosphoproteomics_Y1197",comparison_columns = phospho_sites, return_all = True))
df['Cancer Type']='GBM'
df.to_csv("csv_files/GBM_EGFR_Y1197_phospho.csv")
df

AttributeError: module 'plot_utils' has no attribute 'wrap_pearson_corr'

# Kidney

In [8]:
df1 = kidney.join_omics_to_mutations(omics_df_name="phosphoproteomics", mutations_genes="EGFR")
df1.columns = df1.columns.droplevel(2)
df1.columns = df1.columns.droplevel(2)
df1.columns =df1.columns.map('_'.join)
phosphoprot = df1.loc[df1['Sample_Status_'] == "Tumor"]



In [9]:
phospho_sites = get_phospho_sites(phosphoprot)

In [10]:
df =(wrap_lin_regression(df3,"EGFR_phosphoproteomics_Y1144",comparison_columns = phospho_sites, return_all = True))
df['Cancer Type']='Kidney'
df.to_csv("csv_files/Kidney_EGFR_Y1144_phospho.csv")

NameError: name 'df3' is not defined

# Ovarian 

In [None]:
df1 = Ovar.join_omics_to_mutations(omics_df_name="phosphoproteomics", mutations_genes="EGFR")
df1.columns = df1.columns.droplevel(2)
df1.columns = df1.columns.droplevel(2)
df1.columns =df1.columns.map('_'.join)
phosphoprot = df1.loc[df1['Sample_Status_'] == "Tumor"]

In [None]:
phospho_sites = get_phospho_sites(phosphoprot)

In [None]:
df =(wrap_lin_regression(df3,"EGFR_phosphoproteomics_Y1172",comparison_columns = phospho_sites, return_all = True))
df['Cancer Type']='Ovar'
df.to_csv("csv_files/Ovar_EGFR_Y1172_phospho.csv")


# Colon

In [None]:
df1 = colon.join_omics_to_mutations(omics_df_name="phosphoproteomics", mutations_genes="EGFR")
df1.columns = df1.columns.droplevel(2)
df1.columns =df1.columns.map('_'.join)
phosphoprot = df1.loc[df1['Sample_Status_'] == "Tumor"]

In [None]:
phospho_sites = get_phospho_sites(phosphoprot)


In [None]:
df =(wrap_lin_regression(phosphoprot,"EGFR_phosphoproteomics_Y1092",comparison_columns = phospho_sites, return_all = True))
df['Cancer Type']='Colon'
df.to_csv("csv_files/colon_EGFR_Y1092_phospho.csv")



# Brca 

In [None]:
df1 = brca.join_omics_to_mutations(omics_df_name="phosphoproteomics", mutations_genes="EGFR")
df1.columns = df1.columns.droplevel(2)
df1.columns = df1.columns.droplevel(2)
df1.columns =df1.columns.map('_'.join)
phosphoprot= df1.loc[df1['Sample_Status_'] == "Tumor"]

In [None]:
phospho_sites = get_phospho_sites(phosphoprot)


In [None]:
phosphoprot.filter(like='EGFR_phosphoproteomics', axis=1)
#no EGFR phospho Y data 

# LUAD

In [None]:
df1 = luad.join_omics_to_mutations(omics_df_name="phosphoproteomics", mutations_genes="EGFR")
df1.columns = df1.columns.droplevel(2)
df1.columns = df1.columns.droplevel(2)
df1.columns =df1.columns.map('_'.join)
phosphoprot = df1.loc[df1['Sample_Status_'] == "Tumor"]

In [None]:
phospho_sites = get_phospho_sites(phosphoprot)

In [None]:
df =(wrap_lin_regression(phosphoprot,"EGFR_phosphoproteomics_Y1197",comparison_columns = phospho_sites, return_all = True))
df['Cancer Type']='Luad'
df.to_csv("csv_files/Luad_EGFR_Y1197_phospho.csv")



# HNSCC

In [None]:
df1 = Hnscc.join_omics_to_mutations(omics_df_name="phosphoproteomics", mutations_genes="EGFR")
df1.columns = df1.columns.droplevel(2)
df1.columns = df1.columns.droplevel(2)
df1.columns =df1.columns.map('_'.join)
phosphoprot = df1.loc[df1['Sample_Status_'] == "Tumor"]

In [None]:
phospho_sites = get_phospho_sites(phosphoprot)

In [None]:
df =(wrap_lin_regression(df3,"EGFR_phosphoproteomics_Y1197",comparison_columns = phospho_sites, return_all = True))
df['Cancer Type']='Hnscc'
df.to_csv("csv_files/Hnscc_EGFR_Y1197_phospho.csv")

# Lscc

In [None]:
df1 = Lscc.join_omics_to_mutations(omics_df_name="phosphoproteomics", mutations_genes="EGFR")
df1.columns = df1.columns.droplevel(2)
df1.columns = df1.columns.droplevel(2)
df1.columns =df1.columns.map('_'.join)
phosphoprot = df1.loc[df1['Sample_Status_'] == "Tumor"]


In [None]:
phospho_sites = get_phospho_sites(phosphoprot)

In [None]:
df =(wrap_lin_regression(df3,"EGFR_phosphoproteomics_Y1197",comparison_columns = phospho_sites, return_all = True))
df['Cancer Type']='Lscc'
df.to_csv("csv_files/Lscc_EGFR_Y1197_phospho.csv")

