# Gene vs phospho correlation

This notebook finds the correlation of phosphorylation between an individual site and all phosphylation for the corresponding gene. It thens find the percent of genes that have atleast 1 site with a corelation greater than 0.9 (or 0.8,

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest
import math

import cptac
import cptac.utils as u
import plot_utils as p
import warnings
warnings.filterwarnings('ignore')

  import pandas.util.testing as tm


In [7]:
'''
@Param df: Data frame. Takes a data frame of proteomic data frame.
This function takes the data frame from get_proteomics and labels any duplicate columns due to mutliple isoforms.
Then it returns the list of all proteins
'''

def create_prot_list(df):
    #Add _(number) to keep track of duplicates due to multiple isoforms 
    cols = pd.Series(df.columns[:])
    for dup in cols[cols.duplicated()].unique(): 
        cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]
    df.columns=cols
    #get list of proteins
    prot_genes_list = df.columns.values.tolist()
    return prot_genes_list

In [3]:
def wrap_pearson_corr(df,label_column, alpha=.05,comparison_columns=None,correction_method='bonferroni',return_all = True, return_corrected_pvals = False):


    #df = df.dropna(axis=1, how="all")

    '''If no comparison columns specified, use all columns except the specified labed column'''
    if not comparison_columns:
        comparison_columns = list(df.columns)
        comparison_columns.remove(label_column)
    '''Store comparisons,p-values, correlation in their own array'''
    comparisons = []
    pvals = []
    correlation=[]


    '''Format results in a pandas dataframe'''
    
    newdf = pd.DataFrame()
    for gene in comparison_columns:
        #create subset df with interacting gene/ gene (otherwise drop NaN drops everything)
        df_subset = df[[label_column,gene]]
        #do a linear regression to see if it's a meaningful association
        #dropna will remove rows with nan
        df_subset = df_subset.dropna(axis=0, how="any")
        count_row = df_subset.shape[0]
        if count_row > 30:
            x1 = df_subset[[label_column]].values
            y1 = df_subset[[gene]].values
            x1 = x1[:,0]
            y1 = y1[:,0]
            corr, pval = scipy.stats.pearsonr(x1,y1)

            comparisons.append(gene)
            pvals.append(pval)
            correlation.append(corr)
            
    if correction_method != "none":

        '''Correct for multiple testing to determine if each comparison meets the new cutoff'''
        results = statsmodels.stats.multitest.multipletests(pvals=pvals, alpha=alpha, method=correction_method)
        reject = results[0]
        corrected_pval = results[1]

        if return_all:
            if return_corrected_pvals:
                for i in range(0,len(comparisons)):
                    newdf = newdf.append({'Comparison': comparisons[i],"Correlation": correlation[i], "P_value": corrected_pval[i]}, ignore_index=True)

            if return_corrected_pvals == False:
                for i in range(0,len(comparisons)):
                    newdf = newdf.append({'Comparison': comparisons[i],"Correlation": correlation[i],'P_value': pvals[i]}, ignore_index=True)

        '''Else only add significant comparisons'''
        if (return_all == False):

            if return_corrected_pvals:
                for i in range(0, len(reject)):
                    if reject[i]:
                        newdf = newdf.append({'Comparison': comparisons[i],"Correlation": correlation[i],"P_value": corrected_pval[i]}, ignore_index=True)
            if return_corrected_pvals == False:
                    for i in range(0, len(reject)):
                        if reject[i]:
                            newdf = newdf.append({'Comparison': comparisons[i],"Correlation": correlation[i],"P_value": pvals[i]}, ignore_index=True)

    if correction_method == "none":
        for i in range(0, len(comparisons)):
            newdf = newdf.append({'Comparison': comparisons[i],"Correlation": correlation[i],"P_value": pvals[i]}, ignore_index=True)

   # newdf = newdf.sort_values(by= 'P_value', ascending=True)
    '''If results df is not empty, return it, else return None'''
    return newdf

In [4]:
#cptac version
cptac.version()   

'0.8.6'

In [2]:
#load cptac data 
brain = cptac.Gbm()
kidney = cptac.Ccrcc()
ovar = cptac.Ovarian()
colon = cptac.Colon()
brca = cptac.Brca()
luad = cptac.Luad()
hnscc = cptac.Hnscc()
lscc = cptac.Lscc()
endo = cptac.Endometrial()

                                                

In [5]:
def phospho_gene_vs_site(cancer):
    #get phosphoproteomic data 
    phospho_site = cancer.get_phosphoproteomics("tumor")
    #drop database_id and peptide 
    if (cancer != colon) & (cancer != endo):
        phospho_site = u.reduce_multiindex(phospho_site, levels_to_drop = 2)
        phospho_site = u.reduce_multiindex(phospho_site, levels_to_drop = 2)
        phospho_gene = u.reduce_multiindex(phospho_site, levels_to_drop = 1)
    if (cancer == colon): 
        phospho_site = u.reduce_multiindex(phospho_site, levels_to_drop = 2)
        phospho_gene = u.reduce_multiindex(phospho_site, levels_to_drop = 1)
    if cancer == endo:
         phospho_gene = u.reduce_multiindex(phospho_site, levels_to_drop = 1)
    genes = phospho_gene.columns.to_list() 
    genes = list(set(genes))#get unique list of all genes

    phospho_site.columns =phospho_site.columns.map('_'.join) #join gene and site
    phospho_site = phospho_site.loc[:,~phospho_site.T.duplicated(keep='first')] # drop columns where values are idendical

    phospho_site = phospho_site.dropna(thresh=30, axis=1) #drop columns with less than 30 values 
    
    df = pd.DataFrame() 
    for gene in genes:
        phospho_gene = phospho_site.filter(regex=gene +"_",axis=1)
        prot_genes_list = create_prot_list(phospho_gene)

        if len(prot_genes_list) > 1: # only find correlation if there is more than one site 
            phospho_gene[gene] = phospho_gene.mean(numeric_only=True, axis=1) # average phospho across all sites for gene
            #find correlations for all sites within that gene 
            corr_df = wrap_pearson_corr(phospho_gene,gene,comparison_columns= prot_genes_list,correction_method='none', return_all = True, return_corrected_pvals = False)
            corr_df["Gene"] = gene 
            df = df.append(corr_df)
    return df

In [25]:
def phospho_gene_vs_site(cancer):
    #get phosphoproteomic data 
    phospho_site = cancer.get_phosphoproteomics("tumor")
    #drop database_id and peptide 
    if (cancer != colon) & (cancer != endo):
        phospho_site = u.reduce_multiindex(phospho_site, levels_to_drop = 2)
        phospho_site = u.reduce_multiindex(phospho_site, levels_to_drop = 2)
        phospho_gene = u.reduce_multiindex(phospho_site, levels_to_drop = 1)
    if (cancer == colon): 
        phospho_site = u.reduce_multiindex(phospho_site, levels_to_drop = 2)
        phospho_gene = u.reduce_multiindex(phospho_site, levels_to_drop = 1)
    if cancer == endo:
         phospho_gene = u.reduce_multiindex(phospho_site, levels_to_drop = 1)
    genes = phospho_gene.columns.to_list() 
    genes = list(set(genes))#get unique list of all genes

    phospho_site.columns =phospho_site.columns.map('_'.join) #join gene and site
    phospho_site = phospho_site.loc[:,~phospho_site.T.duplicated(keep='first')] # drop columns where values are idendical

    df = pd.DataFrame() 
    for gene in genes:
        phospho_gene = phospho_site.filter(regex=gene +"_",axis=1)
        prot_genes_list = create_prot_list(phospho_gene)
        if len(prot_genes_list) > 1: # only find correlation if there is more than one site 
        
            site_corrs = phospho_gene.corr(method = "pearson", min_periods = 20) #pairwise correclations
            site_corrs = site_corrs.values.tolist()#array to list
            from pandas.core.common import flatten
            site_corrs = list(flatten(site_corrs))
            site_corrs = [x for x in site_corrs if (math.isnan(x) != True)] #remove na 
       
            #site_corrs.remove(1.0)# remove comparison with same gene
            site_corrs = list(set(site_corrs))# remove duplicate pairwise correlations
      
          
            abs_val_corrs = [abs(x) for x in site_corrs]
            cutoffs = [0.9,0.8,0.7,0.6,0.1]
            for corr in cutoffs:

                result = all(x >= corr for x in abs_val_corrs)
                if not abs_val_corrs:
                    result = False
                df = df.append({'Gene': gene,"Correlation": corr,"Pass_cutoff": result}, ignore_index=True)

        
    return df

In [26]:
gbm_df = phospho_gene_vs_site(brain)

gbm_df

Unnamed: 0,Correlation,Gene,Pass_cutoff
0,0.9,HECW2,0.0
1,0.8,HECW2,0.0
2,0.7,HECW2,0.0
3,0.6,HECW2,0.0
4,0.1,HECW2,0.0
...,...,...,...
33510,0.9,TMUB1,1.0
33511,0.8,TMUB1,1.0
33512,0.7,TMUB1,1.0
33513,0.6,TMUB1,1.0


In [151]:
gbm_df_9 = gbm_df.loc[gbm_df['Correlation'] == 0.9]
gbm_df_9


Unnamed: 0,Correlation,Gene,Pass_cutoff
0,0.9,ZNF276,0.0
5,0.9,WBP1L,1.0
10,0.9,STX4,0.0
15,0.9,PGLS,1.0
20,0.9,RSPH1,1.0
...,...,...,...
33490,0.9,EBF1,1.0
33495,0.9,OPLAH,0.0
33500,0.9,AHDC1,0.0
33505,0.9,RALY,0.0


In [154]:
gbm_df_9.Pass_cutoff.sum()/ len(gbm_df_9)

0.21139788154557662

In [27]:
#gbm_df = phospho_gene_vs_site(brain)
ovar_df = phospho_gene_vs_site(ovar)
brca_df = phospho_gene_vs_site(brca)
luad_df = phospho_gene_vs_site(luad)
hnscc_df = phospho_gene_vs_site(hnscc)
lscc_df = phospho_gene_vs_site(lscc)
kidney_df = phospho_gene_vs_site(kidney)
colon_df = phospho_gene_vs_site(colon)
endo_df = phospho_gene_vs_site(endo)


In [28]:
endo_df

Unnamed: 0,Correlation,Gene,Pass_cutoff
0,0.9,HECW2,0.0
1,0.8,HECW2,0.0
2,0.7,HECW2,0.0
3,0.6,HECW2,0.0
4,0.1,HECW2,1.0
...,...,...,...
31670,0.9,TMUB1,1.0
31671,0.8,TMUB1,1.0
31672,0.7,TMUB1,1.0
31673,0.6,TMUB1,1.0


In [None]:
cancer_dfs = {'GBM':gbm_df, 'HNSCC':hnscc_df, 'LUAD':luad_df, 'BR':brca_df, 'CO':colon_df, 'OV':ovar_df, "ccRCC":kidney_df,"EC": endo_df}

newdf = pd.DataFrame()
for cancer in cancer_dfs:
    df = cancer_dfs[cancer]
    total = df.Gene.to_list()
    total = len(set(total))
    test_nums = [0.9,0.8,0.7,0.6]
    all_percents = []
    for num in test_nums:
        
        df_num = df[df.Correlation > num]
        list1 = df_num.Gene.to_list()
        num_corr = len(set(list1))
        percent = num_corr/total
        
        all_percents.append(percent)
        
    newdf = newdf.append({'Cancer': cancer ,"percent_0.9 corr": all_percents[0], "percent_0.8 corr":all_percents[1], "percent_0.7 corr":all_percents[2],"percent_0.6 corr": all_percents[3] } ,ignore_index=True )

    
    

   

In [None]:
newdf

In [14]:
phospho_site = brain.get_phosphoproteomics("tumor")
    #drop database_id and peptide 

phospho_site = u.reduce_multiindex(phospho_site, levels_to_drop = 2)
phospho_site = u.reduce_multiindex(phospho_site, levels_to_drop = 2)
phospho_gene = u.reduce_multiindex(phospho_site, levels_to_drop = 1)
genes = phospho_gene.columns.to_list() 

genes = list(set(genes))#get unique list of all genes

phospho_site.columns =phospho_site.columns.map('_'.join) #join gene and site
phospho_site = phospho_site.loc[:,~phospho_site.T.duplicated(keep='first')] # drop columns where values are idendical

phospho_site = phospho_site.dropna(thresh=20, axis=1) #drop columns with less than 30 values 
    
phospho_site

Unnamed: 0_level_0,AAAS_S495,AAAS_S495,AAAS_S525,AAAS_S541,AAED1_S12,AAGAB_S196S199,AAGAB_S215,AAGAB_S311,AAGAB_T216S218,AAK1_S14,...,ZZEF1_T1512S1518,ZZEF1_T1521,ZZEF1_T1521T1523,ZZEF1_T2074,ZZEF1_T66,ZZZ3_S113,ZZZ3_S113,ZZZ3_S314,ZZZ3_S391,ZZZ3_S397
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00104,0.053418,,0.118623,,,,-0.749345,,,0.095324,...,,,,-0.012659,,-0.162407,,,,
C3L-00365,-0.093105,-0.095450,0.069499,,-1.221096,,0.245832,-0.427945,,-0.661599,...,0.368710,-0.235903,,-0.883682,,0.520211,,,,
C3L-00674,-1.084975,,,1.368630,,,,0.497238,,0.317659,...,1.001173,,,0.818227,,0.114993,0.171042,,,-1.037834
C3L-00677,-0.260149,0.390607,0.336969,0.411596,,0.118422,,,,,...,-0.442111,0.512767,0.515239,-0.538142,-0.132746,0.013019,0.287170,,,
C3L-01040,-0.344549,,-0.082962,,,,-0.017669,,,0.548392,...,,,,0.128901,,-0.151054,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-03183,-0.046331,0.223123,,,0.129488,,-0.291690,-0.257608,,,...,0.096117,-0.163778,-0.458433,-0.442097,,0.326221,,0.424204,0.485515,0.200915
C3N-03184,0.439788,0.799441,,-0.359969,,-0.669859,0.211198,,,,...,-0.085896,,,-0.179171,,0.147963,,-0.394168,0.393012,
C3N-03186,-0.284134,,,0.530625,,,,0.645441,0.292919,,...,0.022576,,0.769847,0.391365,0.419967,-0.384171,,0.054770,,
C3N-03188,-1.037135,0.371373,,,-0.086821,,-0.275246,-0.064109,,,...,0.600983,0.305315,0.601319,0.435356,,-0.873856,,-0.247936,-0.459617,-0.328697


In [16]:
gene = "PGLS"
phospho_gene_test = phospho_site.filter(regex=gene +"_",axis=1)
phospho_gene_test

Unnamed: 0_level_0,PGLS_S49
Patient_ID,Unnamed: 1_level_1
C3L-00104,-0.008787
C3L-00365,-0.186648
C3L-00674,0.391807
C3L-00677,0.291408
C3L-01040,0.250515
...,...
C3N-03183,0.516006
C3N-03184,-0.995225
C3N-03186,0.221631
C3N-03188,-0.438092


In [4]:
test_df = phospho_site

#result = test_df.corr(method = "pearson", min_periods = 20)


In [92]:
values = result.values.tolist()
from pandas.core.common import flatten
values = list(flatten(values))
values = list(set(values))

values.remove(1.0)
cleanedList = [x for x in values if (math.isnan(x) != True)]
abs(cleanedList)


TypeError: bad operand type for abs(): 'list'

In [144]:
abs_val_corrs = [1.0,0.95]
result = all(x >= 0.9 for x in abs_val_corrs)

result

True

In [21]:
def phospho_gene_vs_site_test(test_df):
    phospho_site = test_df
    df = pd.DataFrame() 
    for gene in genes:
        phospho_gene = phospho_site.filter(regex="PGLS" +"_",axis=1)
        prot_genes_list = create_prot_list(phospho_gene)
        test = False
        if len(prot_genes_list) > 1: # only find correlation if there is more than one site 
        
            site_corrs = phospho_gene.corr(method = "pearson", min_periods = 20) #pairwise correclations
            site_corrs = site_corrs.values.tolist()#array to list
            from pandas.core.common import flatten
            site_corrs = list(flatten(site_corrs))
            site_corrs = [x for x in site_corrs if (math.isnan(x) != True)] #remove na 
       
            #site_corrs.remove(1.0)# remove comparison with same gene
            site_corrs = list(set(site_corrs))# remove duplicate pairwise correlations
      
          
            abs_val_corrs = [abs(x) for x in site_corrs]
            cutoffs = [0.9,0.8,0.7,0.6,0.1]
            test = True
            #for corr in cutoffs:

             #   result = all(x >= corr for x in abs_val_corrs)
              #  if not abs_val_corrs:
               #     result = False
                #df = df.append({'Gene': gene,"Correlation": corr,"Pass_cutoff": result}, ignore_index=True)

        
    return test

In [22]:
gbm_df = phospho_gene_vs_site_test(test_df)

gbm_df

KeyboardInterrupt: 

In [24]:
phospho_site = test_df
df = pd.DataFrame() 

phospho_gene = phospho_site.filter(regex="PGLS" +"_",axis=1)
prot_genes_list = create_prot_list(phospho_gene)
test = False
if len(prot_genes_list) > 1: # only find correlation if there is more than one site 
    test = True
print(test)

False
