# Make Figure 6: Mitotic Cell Cycle

This notebook takes all the trans genes that are significant in multiple cancers and runs a GSEA using Reactome. It then takes a subset of genes from the Mitotic Cell Cycle pathway and creates a heatmap. 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp

import cptac
import cptac.utils as u
import plot_utils as p



In [2]:
# Record the installed cptac version next to the results.
cptac_version = cptac.version()
print('cptac version:', cptac_version)

cptac version: 0.9.0


# Step 1: Run GSEA

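First, read all_heatmap_pancan.csv and mult_sig_pval_heatmap_pancan.csv into DataFrames. The first file is filtered by p-value later on for plotting; the second contains only genes with a significant p-value in multiple cancers and supplies the gene list for the GSEA. 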

In [3]:
# all_df is filtered on P_Value by later cells; mult_sig_df supplies the
# gene list for the enrichment analysis.
all_heatmap_csv = 'csv/all_heatmap_pancan.csv'
mult_sig_csv = 'csv/mult_sig_pval_heatmap_pancan.csv'

all_df = pd.read_csv(all_heatmap_csv)
mult_sig_df = pd.read_csv(mult_sig_csv)

In [5]:
# Unique genes with a sig pval in mult cancers.
prot_list = mult_sig_df['Umich_Proteomics'].unique().tolist()

# Enrichment of that gene list against the Reactome_2016 gene-set library.
prot_enr = gp.enrichr(
    gene_list=prot_list,
    gene_sets='Reactome_2016',
    description='Tumor_partition',
)

In [6]:
# Display the first five rows of the enrichment results table.
top_terms = prot_enr.res2d.head(5)
top_terms

Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes
0,Reactome_2016,Gene Expression Homo sapiens R-HSA-74160,56/1631,1.44654e-14,7.464144e-12,0,0,3.875343,123.49561,DDX47;ARID4B;CHD4;USP39;SMG8;RRP9;SART1;MED14;...
1,Reactome_2016,rRNA modification in the nucleus Homo sapiens ...,10/58,7.933964e-10,2.046963e-07,0,0,19.296753,404.357628,EMG1;UTP6;DDX47;HEATR1;MPHOSPH10;IMP4;DDX52;UT...
2,Reactome_2016,Processing of Capped Intron-Containing Pre-mRN...,13/193,2.716771e-07,4.672847e-05,0,0,6.739709,101.895311,SF3B3;CPSF1;NUP133;DHX9;SRRT;DDX23;USP39;NUP16...
3,Reactome_2016,SUMOylation of DNA replication proteins Homo s...,7/44,5.261923e-07,6.787881e-05,0,0,17.28979,249.96885,TOP2A;PCNA;NUP133;BIRC5;RAE1;NUP160;AURKB
4,Reactome_2016,"Cell Cycle, Mitotic Homo sapiens R-HSA-69278",18/462,4.676241e-06,0.000482588,0,0,3.82327,46.923049,TOP2A;PCNA;NUP133;RFC2;DCTN3;NUP160;AURKB;TPX2...


# Step 2: Get the list of significant genes 

In [8]:
# Locate the "Cell Cycle, Mitotic" pathway by its Reactome ID rather than by
# a hard-coded row position, so that a change in the ranking of the
# enrichment results cannot silently select a different pathway.
pathway_id = 'R-HSA-69278'
is_pathway = prot_enr.res2d['Term'].str.contains(pathway_id, regex=False, na=False)
if not is_pathway.any():
    raise ValueError('Reactome term ' + pathway_id + ' not found in the enrichment results')
index = prot_enr.res2d.index[is_pathway][0]  # row label of the pathway (4 in the recorded run)

# The Genes field is a ';'-separated string of the overlapping gene symbols.
trans = prot_enr.res2d.loc[index, 'Genes']
genes = trans.split(';')
print(prot_enr.res2d.loc[index, 'Term'])
print('Adjusted p-value:', prot_enr.res2d.loc[index, 'Adjusted P-value'])
print('total genes:', len(genes))
genes

Cell Cycle, Mitotic Homo sapiens R-HSA-69278
Adjusted p-value: 0.0004825880412053945
total genes: 18


['TOP2A',
 'PCNA',
 'NUP133',
 'RFC2',
 'DCTN3',
 'NUP160',
 'AURKB',
 'TPX2',
 'ORC4',
 'ACTR1A',
 'XPO1',
 'CDK4',
 'CDK2',
 'BIRC5',
 'KIF20A',
 'B9D2',
 'RAE1',
 'CEP78']

In [20]:
# Keep the rows of all_df whose gene is in the selected pathway.
# NOTE(review): the tables printed further down in this notebook show all_df
# with a `Proteomics` column rather than `Umich_Proteomics` - confirm which
# column name the current csv uses.
in_pathway = all_df['Umich_Proteomics'].isin(genes)
ndf = all_df.loc[in_pathway]
plot_df = ndf[ndf['P_Value'].le(0.05)]  # Only plot sig genes

In [25]:
# Heatmap options for the GSEA pathway genes.
# NOTE(review): a later cell saves its figure under the same file name
# ('Fig_6_Cell_Cycle.png') - confirm which figure is meant to be kept.
gsea_heatmap_options = dict(
    circle_var='P_Value', color_var='Medians',
    x_axis='Umich_Proteomics', y_axis='Cancer',
    plot_height=350, plot_width=800, x_axis_lab='', font_size=12,
    legend_max=0.05, legend_med=0.001, legend_min=0.00001,
    save_png='Fig_6_Cell_Cycle.png',
)
p.plotCircleHeatMap(plot_df, **gsea_heatmap_options)

In [8]:
# Categorize genes

# Every gene considered for the narrowed cell-cycle figure. Each symbol is
# listed once: the duplicate 'CDK11B' and the stray empty-string entry that
# used to end this list have been removed.
narrowed_cell_cycle = ['DCTN1', 'ACTR1A', 'MAD2L1', 'MCM4', 'PCNA', 'MCM5', 'MCM2', 'PCM1',
                       'MCM7', 'TPR', 'MCM3', 'MCM6', 'NUP153', 'RFC3', 'CDK11B', 'XPO1', 'PRKCB', 'RFC2', 'NUF2',
                       'RAB8A', 'RFC4', 'RANBP2', 'CENPF', 'GINS4', 'NDC80', 'OPTN', 'PPP2R2A', 'PPP2R2D',
                       'TPX2', 'GINS2', 'BUB1B', 'TOP2A', 'TOPBP1', 'NUP210', 'MSH2', 'MSH6']

# NOTE(review): 'MCM6' is in narrowed_cell_cycle but not in dna_synth, unlike
# MCM2-5 and MCM7 - confirm whether that is intentional.
dna_synth = ['MCM4', 'PCNA', 'MCM5', 'MCM2', 'PCM1', 'MCM7', 'MCM3', 'RFC3',
             'RFC2', 'RFC4', 'GINS4', 'GINS2', 'TOP2A']

dynactin = ['DCTN1', 'ACTR1A']
spindle = ['BUB1B', 'CENPF', 'MAD2L1', 'NDC80', 'NUF2', 'PCM1', 'TPX2', 'DCTN1', 'ACTR1A']

checkpoints = ['BUB1B', 'PPP2R2A', 'PPP2R2D', 'TOPBP1', 'MAD2L1']

nuclear_pore_transport = ['TPR', 'NUP153', 'NUP210', 'XPO1', 'RANBP2']
other_transport = ['RAB8A']
other = ['PRKCB', 'OPTN', 'CDK11B']

# Category name -> gene list. other_transport and other are not included.
path_list = {'DNA_Synth': dna_synth, 'Spindle': spindle, 'Dynactin': dynactin, 'Checkpoints': checkpoints,
             'Nuclear_Pore_Transport': nuclear_pore_transport}


In [23]:
# Restrict all_df to the hand-picked cell-cycle genes, then keep the
# significant rows for plotting.
bool_df = all_df['Proteomics'].isin(narrowed_cell_cycle)
df = all_df.loc[bool_df]
plot_df = df[df['P_Value'].le(0.05)]  # Only plot sig genes
plot_df

Unnamed: 0,Proteomics,P_Value,Medians,Cancer
6,DCTN1,0.000015,-0.382988,GBM
9,ACTR1A,0.000038,-0.491126,GBM
106,MCM6,0.001303,1.085407,GBM
107,MCM4,0.001303,1.236484,GBM
114,MAD2L1,0.001303,0.546914,GBM
...,...,...,...,...
66220,BUB1B,0.026111,-0.559000,EC
66269,NUF2,0.034789,-0.612500,EC
66277,CENPF,0.037588,-0.439000,EC
66315,TPR,0.042634,-0.184300,EC


In [24]:
# Left-to-right protein order for the heatmap, grouped by gene family.
ordered_prot_list = (
    ['MSH2', 'MSH6']
    + ['GINS2', 'GINS4']
    + ['MCM2', 'MCM3', 'MCM4', 'MCM5', 'MCM6', 'MCM7']
    + ['RFC2', 'RFC3', 'RFC4', 'RFC5']
    + ['PCNA', 'TOP2A', 'CDK11B']
    + ['PPP2R2A', 'PPP2R2D', 'TOPBP1']
    + ['MAD2L1', 'BUB1B', 'CENPF', 'NDC80', 'NUF2', 'TPX2', 'PCM1']
    + ['DCTN1', 'ACTR1A']
)

In [25]:
# Reorder index to reorder the protein order in heatmap.
# plot_df is a filtered slice of all_df, so the "<gene>_<cancer>" key is added
# with assign(), which returns a new frame, instead of writing a column into
# the slice (the in-place write raised pandas' SettingWithCopyWarning).
plot_df = plot_df.assign(
    Index=plot_df["Proteomics"] + "_" + plot_df["Cancer"]
).set_index("Index")

# Every gene/cancer key in the desired display order: genes in
# ordered_prot_list order, cancers in cancer_list order within each gene.
cancer_list = ['BR', 'CO', 'EC', 'GBM', 'HNSCC', 'LSCC', 'LUAD', 'OV']
ordered_list = [gene + '_' + cancer
                for gene in ordered_prot_list
                for cancer in cancer_list]

# reindex() creates an all-NaN row for each key that is absent from plot_df;
# NaN < 0.05 is False, so the filter below removes those placeholder rows.
# NOTE(review): the filter that builds plot_df uses <= 0.05 while this one
# uses < 0.05 - confirm which threshold is intended.
ordered_df = plot_df.reindex(ordered_list)
ordered_df = ordered_df.loc[ordered_df['P_Value'] < 0.05] # Keep sig 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [26]:
# Smallest p-value among the plotted rows.
min_p_value = ordered_df['P_Value'].min()
print('min p-value:', min_p_value)

min p-value: 1.5008898009485894e-05


In [27]:
# Heatmap options for the ordered, narrowed gene set.
# NOTE(review): an earlier cell also saves to 'Fig_6_Cell_Cycle.png', so this
# call overwrites that file - confirm this is the figure meant to be kept.
ordered_heatmap_options = dict(
    circle_var='P_Value', color_var='Medians',
    x_axis='Proteomics', y_axis='Cancer',
    plot_height=350, plot_width=800, x_axis_lab='', font_size=12,
    legend_max=0.05, legend_med=0.001, legend_min=0.00001,
    save_png='Fig_6_Cell_Cycle.png',
)
p.plotCircleHeatMap(ordered_df, **ordered_heatmap_options)

The manuscript mentions the average of the FDR p-values for MCM complex proteins in EC. Here is where we calculate the average for these p-values.

In [40]:
# average FDR p-val of MCM2-7 in EC
# The rows are taken from all_df, so p-values above 0.05 are included.
mcm = ['MCM2', 'MCM3', 'MCM4', 'MCM5', 'MCM6', 'MCM7']
bool_df = all_df['Proteomics'].isin(mcm)
in_ec = all_df['Cancer'] == 'EC'
df = all_df[bool_df & in_ec]
mcm_mean_p_value = df['P_Value'].mean()
print('Average of FDR p-values for MCM proteins in EC:', mcm_mean_p_value)

Average of FDR p-values for MCM proteins in EC: 0.11949892063850759


The manuscript also mentions the strong FDR p-values for DCTN1 and ACTR1A in GBM as shown below.

In [41]:
# P-values for DCTN1 and ACTR1A in GBM
axon_health = ['DCTN1', 'ACTR1A']
bool_df = all_df['Proteomics'].isin(axon_health)
in_gbm = all_df['Cancer'] == 'GBM'
df = all_df[bool_df & in_gbm]
print('FDR p-values of DCTN1 nd ACTR1A in GBM:\n')
df

FDR p-values of DCTN1 nd ACTR1A in GBM:



Unnamed: 0,Proteomics,P_Value,Medians,Cancer
6,DCTN1,1.5e-05,-0.382988,GBM
9,ACTR1A,3.8e-05,-0.491126,GBM
