# Create Heat Maps for Significant rRNA Modification Genes

This notebook looks at genes that are significant in more than one cancer and belong to the Reactome "rRNA modification in the nucleus" pathway. Pan-cancer heat maps are created, with circle size showing significance (p-value) and color showing the difference in medians.

In [1]:
import os
import re
import sys

import cptac
import cptac.utils as u
import gseapy as gp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns

import plot_utils as p

# Genes significant in multiple cancers

In [16]:
# Load the table of genes that have a significant p-value in more than one cancer.
# NOTE(review): `root` is not defined in any visible cell; it must be set by an
# earlier cell to the directory that holds the CSV files.
# os.path.join replaces the hard-coded backslash so the path also resolves on
# non-Windows systems.
mult_sig_df = pd.read_csv(os.path.join(root, 'mult_sig_pval_heatmap.csv'))

mult_sig_list = list(mult_sig_df.Proteomics.unique()) # list of genes with a sig pval in > 1 cancer

# Run an Enrichr query on those genes against the Reactome_2016 gene-set library.
# NOTE(review): outdir='/Enrichr' is an absolute path at the filesystem/drive
# root; confirm that this is the intended output location.
enr2 = gp.enrichr(gene_list = mult_sig_list, description='Tumor_partition', gene_sets='Reactome_2016', 
                       outdir='/Enrichr')

In [45]:
# Display the Enrichr result at row position 5 as a one-row DataFrame.
enr2.res2d.iloc[[5], :]

Unnamed: 0,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,Gene_set
5,rRNA modification in the nucleus Homo sapiens ...,13/58,4.0777e-12,1.039814e-09,0,0,14.414015,378.014573,UTP6;IMP3;WDR3;HEATR1;NAT10;WDR75;IMP4;PWP2;WD...,Reactome_2016


In [19]:
# Pick one enriched term by its index label and unpack its gene list.
index2 = 5
trans2 = enr2.res2d['Genes'][index2]  # semicolon-delimited gene symbols for the term
genes_mult = trans2.split(';')

# Report which term was chosen and how many genes it contributed.
print(enr2.res2d['Term'][index2])
print('total genes:', len(genes_mult))

rRNA modification in the nucleus Homo sapiens R-HSA-6790901
total genes: 13


In [38]:
# Restrict the multi-cancer significance table to the genes of the chosen term,
# then list the distinct genes that remain.
bool_df = mult_sig_df['Proteomics'].isin(genes_mult)
plot_df2 = mult_sig_df.loc[bool_df]
plot_df2['Proteomics'].unique()

array(['PWP2', 'WDR75', 'TBL3', 'HEATR1', 'NAT10', 'DCAF13', 'WDR46',
       'IMP4', 'WDR3', 'IMP3', 'UTP18', 'MPHOSPH10', 'UTP6'], dtype=object)

In [39]:
# Keep only rows whose p-value is at or below the cutoff `a`.
a = 0.05
plot_df2 = plot_df2[plot_df2['P_Value'].le(a)]

In [40]:
# Circle heat map of the filtered genes: genes on the x-axis, cancers on the
# y-axis, circles driven by P_Value and color driven by Medians.
p.plotCircleHeatMap(
    plot_df2,
    circle_var='P_Value',
    color_var='Medians',
    x_axis='Proteomics',
    y_axis='Cancer',
    plot_height=300,
    plot_width=600,
)

In [41]:
# Select the rows of pos_neg_df whose gene is in the chosen term's gene list,
# then list the distinct genes that remain.
# NOTE(review): pos_neg_df is not defined in the visible cells; presumably it
# holds the genes that show both positive and negative effects - confirm.
get = pos_neg_df['Proteomics'].isin(genes_mult)
genes_pn_mult = pos_neg_df.loc[get]
genes_pn_mult['Proteomics'].unique()

array(['WDR75', 'HEATR1', 'NAT10', 'DCAF13', 'WDR46', 'IMP3', 'MPHOSPH10',
       'UTP6'], dtype=object)

In [42]:
# Keep only rows whose p-value is at or below the cutoff `a` (set in an earlier cell).
genes_pn_mult = genes_pn_mult[genes_pn_mult['P_Value'].le(a)]

In [43]:
# Circle heat map of the positive/negative subset: genes on the x-axis, cancers
# on the y-axis, circles driven by P_Value and color driven by Medians.
p.plotCircleHeatMap(
    genes_pn_mult,
    circle_var='P_Value',
    color_var='Medians',
    x_axis='Proteomics',
    y_axis='Cancer',
    plot_height=300,
    plot_width=400,
)

# Check which other pathways these genes (positive and negative, significant in multiple cancers) are part of

In [30]:
# Collect the distinct genes left after filtering and show them.
gnm = genes_pn_mult['Proteomics'].unique().tolist()
print(gnm)

# Query Enrichr for these genes against the KEGG_2016 gene-set library.
enr3 = gp.enrichr(
    gene_list=gnm,
    description='Tumor_partition',
    gene_sets='KEGG_2016',
    outdir='/Enrichr',
)

['WDR75', 'HEATR1', 'NAT10', 'DCAF13', 'WDR46', 'IMP3', 'UTP6', 'MPHOSPH10']


In [31]:
# Preview the first five rows of the KEGG enrichment results.
enr3.res2d.head(n=5)

Unnamed: 0,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,Gene_set
0,Ribosome biogenesis in eukaryotes Homo sapiens...,6/89,1.818101e-13,5.327036e-11,0,0,168.539326,4944.238268,UTP6;IMP3;HEATR1;NAT10;MPHOSPH10;WDR75,KEGG_2016


In [23]:
# Look at the p-values (and other columns) recorded for a single gene across cancers.
certain_gene = 'PRPF6'
# NOTE(review): `root` is not defined in any visible cell; it must be set by an
# earlier cell to the directory that holds the CSV files.
# os.path.join replaces the hard-coded backslash so the path also resolves on
# non-Windows systems.
all_df = pd.read_csv(os.path.join(root, 'all_heatmap.csv'))
gene_df = all_df.loc[all_df['Proteomics'] == certain_gene]
gene_df

Unnamed: 0,Proteomics,P_Value,Medians,Cancer
212,PRPF6,5e-05,0.39187,Gbm
12022,PRPF6,0.006819,0.14016,Hnscc
27645,PRPF6,0.359535,0.14475,Luad
37501,PRPF6,0.393073,0.1077,Lscc
47320,PRPF6,0.161801,0.06885,Brca
58451,PRPF6,0.338623,-0.006257,Ov
65845,PRPF6,0.000959,-0.1936,En
82570,PRPF6,0.803451,-0.063,Colon
