# Metabolism Circle Heatmap

This notebooks takes the genes that are a hit from the Brca NCI-Nature_2016 (PDGFR-beta pathway) and maps them on a large circle heat map. 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import re
import sys 
import statsmodels.stats.multitest

import gseapy as gp
from gseapy.plot import barplot, dotplot

import cptac
import cptac.utils as u
import plot_utils as p 

  import pandas.util.testing as tm


Load df with all of the genes that are FDR significant. Then get list of just the gene names and use them to run a GSEA. 

In [2]:
prot_FDR = pd.read_csv("../Step3.1_Pearson_dfs_by_cancer/csv_files/Brca_EGFR_all_pearson_FDR.csv")
df_FDR= prot_FDR.drop(['Unnamed: 0'], axis=1)
df_FDR = df_FDR.set_index("Comparison")
df1_transposed = df_FDR.T 
df1_transposed


Comparison,EGFR_proteomics,MPP6_proteomics,CPNE8_proteomics,CRYBG3_proteomics,PSAT1_proteomics,PHC3_proteomics,GATA3_proteomics,KRT16_proteomics,KRT5_proteomics,NXN_proteomics,...,TSC1_proteomics,MCM5_proteomics,HNRNPA3_proteomics,GC_proteomics,LASP1_proteomics,GGH_proteomics,KIF15_proteomics,MNS1_proteomics,RRP1_proteomics,ZDHHC20_proteomics
Correlation,1.0,0.6482017,0.6164527,0.6099971,0.6093187,-0.6032044,-0.5989113,0.5953255,0.5947704,0.5928643,...,-0.229952,0.229894,-0.229883,0.229882,-0.229848,0.229847,0.229837,0.270088,0.229647,0.229381
P_value,0.0,4.834438e-15,2.229263e-13,4.612176e-13,4.973595e-13,9.738761e-13,1.547846e-12,2.267369e-12,2.404384e-12,2.938573e-12,...,0.013427,0.013451,0.013455,0.013456,0.01347,0.01347,0.013475,0.013535,0.013554,0.013665


In [3]:
brca_prot = df1_transposed.columns.values.tolist()
brca_genes = []
for gene in brca_prot :
    brca_genes.append((re.sub("_proteomics", "", gene)))
len(brca_genes)

2669

Run GSEA using reactome 2016 set

In [4]:
brca_enr = gp.enrichr(gene_list = brca_genes, description='Tumor_partition', gene_sets='NCI-Nature_2016', 
                       outdir='test/enrichr_kegg')
brca_enr.res2d.head(2)

Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes
0,NCI-Nature_2016,Urokinase-type plasminogen activator (uPA) and...,25/42,3.110276e-12,6.500478e-10,0,0,4.460383,118.183685,ITGB1;ITGAM;LRP1;SRC;ITGB3;ITGB2;SERPINE1;PLG;...
1,NCI-Nature_2016,PDGFR-beta signaling pathway Homo sapiens c901...,47/128,2.028208e-11,2.119477e-09,0,0,2.751499,67.745429,USP6NL;DOCK4;TAGLN;LRP1;SRC;ARPC1B;ITGB3;BRK1;...


In [8]:
#get just the pdgfr genes
brca_df = brca_enr.res2d
pdgfr = brca_df.iloc[1,1]
#pdgfr = pdgfr.split(';')
#len(pdgfr)
pdgfr

'PDGFR-beta signaling pathway Homo sapiens c901a3e4-6194-11e5-8ac5-06603eb7f303'

In [9]:
#Get append version of the df with all cancer type, fdr sig trans results
df_FDR_append = pd.read_csv("../Step3.2_combining_pearson_dfs/csv_files/pancan_EGFR_pearson_sig_all_prot_append_FDR.csv")
df_FDR_append = df_FDR_append.drop(['Unnamed: 0'], axis=1)


In [11]:
#filter down df with just pdgfr genes 
pdgfr_column_names = []
for gene in pdgfr:
    gene += "_proteomics"
    pdgfr_column_names.append(gene)

df_FDR_pdgfr = df_FDR_append[df_FDR_append.Comparison.isin(pdgfr_column_names)]
df_FDR_pdgfr

Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
175,ARPC1B_proteomics,-0.431473,8.240555e-06,GBM
194,HCK_proteomics,-0.426001,1.101720e-05,GBM
206,ARPC3_proteomics,-0.423330,1.267190e-05,GBM
299,ARPC2_proteomics,-0.406327,3.005095e-05,GBM
416,YES1_proteomics,0.383762,8.813311e-05,GBM
...,...,...,...,...
8384,SRC_proteomics,0.466158,2.860941e-07,Lscc
8477,BAIAP2_proteomics,0.385796,3.149701e-05,Lscc
8665,ITGAV_proteomics,0.317536,7.242515e-04,Lscc
8701,JUN_proteomics,0.311282,9.326442e-04,Lscc


In [13]:
#Make plot using plot utils
p.plotCircleHeatMap(df_FDR_pdgfr, "P_value","Correlation","Comparison","Cancer Type",plot_width= 1200, plot_height = 650)

In [33]:
#get just the upa genes
brca_df = brca_enr.res2d
upa = brca_df.iloc[0,9]
upa = upa.split(';')
upa.remove("EGFR")
len(upa)
upa


['ITGB1',
 'ITGAM',
 'LRP1',
 'SRC',
 'ITGB3',
 'ITGB2',
 'SERPINE1',
 'PLG',
 'VLDLR',
 'VTN',
 'PLAU',
 'ITGAV',
 'ELANE',
 'FGB',
 'FGA',
 'TGFB1',
 'FGG',
 'MMP3',
 'FN1',
 'PLAUR',
 'MMP9',
 'MMP12',
 'ITGA5',
 'GPLD1']

In [31]:
#filter down df with just upa genes 
upa_column_names = []
for gene in upa:
    gene += "_proteomics"
    upa_column_names.append(gene)

df_FDR_upa = df_FDR_append[df_FDR_append.Comparison.isin(upa_column_names)]
df_FDR_upa

Unnamed: 0,Comparison,Correlation,P_value,Cancer Type
196,PLAUR_proteomics,-0.425639,1.122876e-05,GBM
271,ITGB1_proteomics,-0.412364,2.223482e-05,GBM
305,FGB_proteomics,-0.404936,3.218521e-05,GBM
350,FGG_proteomics,-0.396275,4.899957e-05,GBM
445,FGA_proteomics,-0.380889,0.0001005188,GBM
645,ITGB2_proteomics,-0.352658,0.0003434888,GBM
817,ITGB3_proteomics,-0.332311,0.0007780124,GBM
838,ITGAM_proteomics,-0.330556,0.0008327428,GBM
1097,SERPINE1_proteomics,-0.306814,0.002009326,GBM
1285,LRP1_proteomics,0.290306,0.00355837,GBM


In [32]:
#Make plot using plot utils
p.plotCircleHeatMap(df_FDR_upa, "P_value","Correlation","Comparison","Cancer Type",plot_width= 1200, plot_height = 650)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["size2"] = df[circle_var].apply(lambda x: -1*(np.log(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['size'] = (df["size2"])*3
