# Binary_correlation_plots

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp
import re

import cptac
import cptac.utils as u

import plot_utils as p

  import pandas.util.testing as tm


# Select two cancers for the plot

In [3]:
g = cptac.Gbm()
brca = cptac.Brca()

                                        



                                         

In [4]:
def create_prot_list(df):
    df = df.loc[:,~df.columns.duplicated()]
    prot_genes_list = df.columns.values.tolist()
    prot_genes_list.remove('EGFR_proteomics')
    prot_genes_list.remove('EGFR_Mutation')
    prot_genes_list.remove('EGFR_Location')
    prot_genes_list.remove('EGFR_Mutation_Status')
    prot_genes_list.remove('Sample_Status')
    return prot_genes_list

# Calculate FDR-bh (lowest sig p-value = cutoff)

Gbm Pearson Correlation

In [5]:
gene = 'EGFR'
df1 = g.join_omics_to_mutations(omics_df_name="proteomics", mutations_genes= gene)
df1_tumor = df1.loc[df1['Sample_Status'] == "Tumor"]
prot_genes_list = create_prot_list(df1_tumor)




In [6]:
# Get only sig genes
g_sig= p.wrap_pearson_corr(df1_tumor,"EGFR_proteomics",comparison_columns= prot_genes_list,correction_method='fdr_bh', return_all = False)
g_sig
# get_smallest p-value
cutoff_gbm = g_sig.P_value[-1:]
cutoff_gbm

1571    0.007423
Name: P_value, dtype: float64

Brca Pearson Correlation

In [20]:
gene = 'EGFR'

df1 = brca.join_omics_to_mutations(omics_df_name="proteomics", mutations_genes= gene)
df1.columns = df1.columns.droplevel(1)
df1_tumor = df1.loc[df1['Sample_Status'] == "Tumor"]
prot_genes_list = create_prot_list(df1_tumor)




In [21]:


# Get only sig sites
brca_sig= p.wrap_pearson_corr(df1_tumor,"EGFR_proteomics",comparison_columns= prot_genes_list,correction_method='fdr_bh', return_all = False)
brca_sig
# get_smallest p-value
cutoff_brca = brca_sig.P_value[-1:]
cutoff_brca

2057    0.013554
Name: P_value, dtype: float64

Read in data frame that has all pearson correlations/ pvalues for all egfr trans genes. Subset down for just brca/gbm 

In [22]:
prot_df = pd.read_csv("../Step3.2_combining_pearson_dfs/csv_files/pancan_EGFR_all_return_all_wide.csv")
prot_df = prot_df[["Comparison","Correlation_Gbm","P_value_Gbm","Correlation_Brca","P_value_Brca"]]

prot_df

Unnamed: 0,Comparison,Correlation_Gbm,P_value_Gbm,Correlation_Brca,P_value_Brca
0,EGFR_proteomics,1.000000,0.000000e+00,1.000000,0.000000
1,PHLDA1_proteomics,0.816848,6.553435e-25,0.364797,0.000190
2,GRB2_proteomics,-0.610889,1.886384e-11,-0.177379,0.057899
3,SOCS2_proteomics,0.562720,1.343464e-09,,
4,CDH4_proteomics,0.559180,1.790048e-09,,
...,...,...,...,...,...
14047,TFR2_proteomics,,,,
14048,MYBL2_proteomics,,,,
14049,ZSCAN12_proteomics,,,,
14050,ZNF836_proteomics,,,,


# Create df with p-values for all genes in a pathway

In [23]:
# get a list of genes in a complete Pathway 
#This one is from Reactomes Hemostasis pathway
hemostasis_genes = pd.read_csv("~/Downloads/reactome_hemostasis.csv")
heme_all_genes = []
for gene in hemostasis_genes:
    gene += "_proteomics"
    heme_all_genes.append(gene)



# Create the color column based on significance in both cancers 

In [14]:
def Color(row):
    color = 'Not_sig'

    c1_cutoff = 0.007264# lowest p-value for cancer 1 (GBM)
    c2_cutoff = 0.013554 #Brca

    for item in row:
        if pd.isnull(item):
            continue
        if row['P_value_Brca'] <= c1_cutoff and row['P_value_Gbm'] <= c2_cutoff:
            color = 'Both_sig'
        elif row['P_value_Gbm'] <= c1_cutoff:
            color = 'Gbm_sig'
        elif row['P_value_Brca'] <= c2_cutoff:
            color = 'Brca_sig'
           
    return color

In [15]:
print('lowest p-value for Gbm:',cutoff_gbm, '\n')
print('lowest p-value for Brca:',cutoff_brca)

lowest p-value for Gbm: 1571    0.007423
Name: P_value, dtype: float64 

lowest p-value for Brca: 2057    0.013554
Name: P_value, dtype: float64


In [16]:
prot_df['Color'] = prot_df.apply(Color, axis=1)
color = prot_df[['Color']]
prot_df

Unnamed: 0,Comparison,Correlation_Gbm,P_value_Gbm,Correlation_Brca,P_value_Brca,Color
0,EGFR_proteomics,1.000000,0.000000e+00,1.000000,0.000000,Both_sig
1,PHLDA1_proteomics,0.816848,6.553435e-25,0.364797,0.000190,Both_sig
2,GRB2_proteomics,-0.610889,1.886384e-11,-0.177379,0.057899,Gbm_sig
3,SOCS2_proteomics,0.562720,1.343464e-09,,,Gbm_sig
4,CDH4_proteomics,0.559180,1.790048e-09,,,Gbm_sig
...,...,...,...,...,...,...
14047,TFR2_proteomics,,,,,Not_sig
14048,MYBL2_proteomics,,,,,Not_sig
14049,ZSCAN12_proteomics,,,,,Not_sig
14050,ZNF836_proteomics,,,,,Not_sig


In [17]:
corr_df = prot_df[["Comparison","Correlation_Gbm","Correlation_Brca","Color"]]
corr_df

Unnamed: 0,Comparison,Correlation_Gbm,Correlation_Brca,Color
0,EGFR_proteomics,1.000000,1.000000,Both_sig
1,PHLDA1_proteomics,0.816848,0.364797,Both_sig
2,GRB2_proteomics,-0.610889,-0.177379,Gbm_sig
3,SOCS2_proteomics,0.562720,,Gbm_sig
4,CDH4_proteomics,0.559180,,Gbm_sig
...,...,...,...,...
14047,TFR2_proteomics,,,Not_sig
14048,MYBL2_proteomics,,,Not_sig
14049,ZSCAN12_proteomics,,,Not_sig
14050,ZNF836_proteomics,,,Not_sig


In [18]:
# Create pathway columns
grp = plot_df['Comparison'].isin(innate_all_genes)
grp2 = plot_df["Comparison"].isin(heme_all_genes)

grp = group.to_frame().rename(columns={'Comparison':'Innate_Immune'})
grp2 = group2.to_frame().rename(columns={'Comparison':'Hemostasis'})

# MERGE
pathways = corr_df.join(grp)
pathways = pathways.join(grp2)
pathways = pathways.drop(pathways.index[0])
pathways

NameError: name 'plot_df' is not defined

In [None]:
# Create dictionary to map colors to values in the color column
hue_dict = {'Both_sig':'green',
            'Brca_sig':'yellow',
            'Gbm_sig': 'blue',
            'Not_sig': 'grey'}

# Plot

In [None]:
p.binary_val_plot(pathways, "narrow_corr", 'Correlation_Gbm', 'Correlation_Brca','Correlations for the Hemostasis Pathway',['Innate_Immune','Hemostasis'], hue_col = "Color", color_dict = hue_dict)