# Make Figure 6: Mitotic Cell Cycle

This notebooks takes all the trans genes that are significant in multiple cancers and runs a GSEA using Reactome. It then takes a subset of genes from the Mitotic Cell Cycle pathway and creates a heatmap. 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import gseapy as gp

import cptac
import cptac.utils as u
import plot_utils as p

Creating directory C:\Users\mtstu\AppData\Local\bioservices\bioservices 


In [2]:
print('cptac version:', cptac.version())

cptac version: 1.0.0


# Step 1: Run GSEA

First read in sig_pval_heatmap.csv into a df. This csv file contains only genes with a significant p-value in at least one cancer. 

In [3]:
all_df = pd.read_csv('csv/all_heatmap_pancan.csv')
mult_sig_df = pd.read_csv('csv/mult_sig_pval_heatmap_pancan.csv') 

In [4]:
prot_list = list(mult_sig_df.Umich_Proteomics.unique()) # list of genes with a sig pval in mult cancers
prot_enr = gp.enrichr(gene_list = prot_list, description='Tumor_partition', gene_sets='Reactome_2016')



In [5]:
prot_enr.res2d.head(5)

Unnamed: 0,Gene_set,Term,Overlap,P-value,Adjusted P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes
0,Reactome_2016,Gene Expression Homo sapiens R-HSA-74160,60/1631,2.060955e-15,1.176805e-12,0,0,3.859325,130.505427,DDX47;ARID4B;CHD4;USP39;WDR46;SMG8;RRP9;SART1;...
1,Reactome_2016,Processing of Capped Intron-Containing Pre-mRN...,16/193,1.593612e-09,4.549761e-07,0,0,7.902744,160.087966,SF3B3;CPSF1;NCBP1;NUP133;DHX9;SRRT;DDX23;THOC3...
2,Reactome_2016,rRNA modification in the nucleus Homo sapiens ...,9/58,2.835299e-08,5.39652e-06,0,0,15.672674,272.36809,EMG1;UTP6;DDX47;MPHOSPH10;IMP4;WDR46;UTP18;DCA...
3,Reactome_2016,"Cell Cycle, Mitotic Homo sapiens R-HSA-69278",21/462,1.996236e-07,2.849627e-05,0,0,4.200696,64.80343,TOP2A;RFC5;RFC3;PCNA;NUP133;RFC2;DCTN1;DCTN3;N...
4,Reactome_2016,Cell Cycle Homo sapiens R-HSA-1640170,23/566,3.747998e-07,3.602662e-05,0,0,3.751059,55.503943,TOP2A;RFC5;RFC3;PCNA;NUP133;RFC2;DCTN1;DCTN3;S...


# Step 2: Get the list of significant genes 

In [6]:
index = 4
trans = prot_enr.res2d.Genes[index]
genes = trans.split(';')
print(prot_enr.res2d.Term[index])
print('Adjusted p-value:', prot_enr.res2d['Adjusted P-value'][index])
print('total genes:',len(genes))
genes

Cell Cycle Homo sapiens R-HSA-1640170
Adjusted p-value: 3.602662452454257e-05
total genes: 23


['TOP2A',
 'RFC5',
 'RFC3',
 'PCNA',
 'NUP133',
 'RFC2',
 'DCTN1',
 'DCTN3',
 'SMARCA5',
 'NUP160',
 'AURKB',
 'TPX2',
 'ORC4',
 'ACTR1A',
 'XPO1',
 'CDK4',
 'NUF2',
 'CDK2',
 'KIF20A',
 'TOPBP1',
 'RAE1',
 'CEP78',
 'SPC25']

In [7]:
ndf = all_df[all_df.Umich_Proteomics.isin(genes)]
plot_df = ndf.loc[ndf['P_Value'] <= 0.05] # Only plot sig genes

In [8]:
p.plotCircleHeatMap(plot_df, circle_var='P_Value', color_var='Medians', x_axis='Umich_Proteomics', y_axis='Cancer',
                    plot_height=350, plot_width=800, x_axis_lab='', font_size=12, legend_max=0.05, 
                    legend_med = 0.001, legend_min=0.00001, save_png='Fig_6_Cell_Cycle.png')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["size2"] = df[circle_var].apply(lambda x: -1*(np.log(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['size'] = (df["size2"])*3


In [9]:
# Catagorize genes
narrowed_cell_cycle = ['DCTN1', 'ACTR1A', 'MAD2L1', 'MCM4', 'PCNA', 'MCM5', 'MCM2','PCM1',
              'MCM7', 'TPR', 'MCM3', 'MCM6', 'NUP153', 'RFC3', 'CDK11B', 'XPO1','PRKCB', 'RFC2', 'NUF2', 
              'RAB8A', 'RFC4', 'RANBP2', 'CENPF', 'GINS4', 'NDC80', 'OPTN', 'PPP2R2A', 'PPP2R2D',
              'TPX2', 'GINS2', 'BUB1B', 'TOP2A', 'TOPBP1', 'NUP210', 'MSH2', 'MSH6', 'CDK11B', '']

dna_synth = ['MCM4', 'PCNA', 'MCM5', 'MCM2','PCM1', 'MCM7', 'MCM3', 'RFC3', 
             'RFC2', 'RFC4', 'GINS4', 'GINS2', 'TOP2A']

dynactin = ['DCTN1','ACTR1A']
spindle = ['BUB1B', 'CENPF', 'MAD2L1', 'NDC80', 'NUF2', 'PCM1', 'TPX2','DCTN1','ACTR1A']

checkpoints = ['BUB1B','PPP2R2A', 'PPP2R2D', 'TOPBP1','MAD2L1']

nuclear_pore_transport = ['TPR','NUP153','NUP210', 'XPO1','RANBP2']
other_transport = ['RAB8A']
other = ['PRKCB','OPTN','CDK11B']

path_list = {'DNA_Synth':dna_synth, 'Spindle':spindle, 'Dynactin': dynactin, 'Checkpoints':checkpoints,
            'Nuclear_Pore_Transport':nuclear_pore_transport}


In [13]:
bool_df = all_df.Umich_Proteomics.isin(narrowed_cell_cycle)
df = all_df[bool_df]
plot_df = df.loc[df['P_Value'] <= 0.05] # Only plot sig genes
plot_df

Unnamed: 0,Umich_Proteomics,P_Value,Medians,Cancer,EC_P_Value,EC_Median
8,ACTR1A,3e-06,-0.399173,GBM,,
11,DCTN1,6e-06,-0.273766,GBM,,
199,PPP2R2D,0.008199,-0.36584,GBM,,
550,PCNA,0.044768,0.479052,GBM,,
13021,GINS4,0.0313,0.486945,HNSCC,,
13027,DCTN1,0.037933,-0.159913,HNSCC,,
38535,CENPF,0.003334,0.411147,LSCC,,
38540,MSH6,0.003334,0.653973,LSCC,,
38542,MSH2,0.003334,0.464981,LSCC,,
38576,RFC3,0.006008,0.31646,LSCC,,


In [14]:
ordered_prot_list = ['MSH2', 'MSH6', 'GINS2', 'GINS4', 'MCM2', 'MCM3', 'MCM4', 'MCM5', 'MCM6', 'MCM7',
                     'RFC2', 'RFC3', 'RFC4', 'RFC5',  'PCNA', 'TOP2A', 'CDK11B', 'PPP2R2A', 'PPP2R2D', 
                     'TOPBP1', 'MAD2L1', 'BUB1B', 'CENPF', 'NDC80', 'NUF2', 'TPX2', 'PCM1', 'DCTN1','ACTR1A']

In [16]:
# Reorder index to reorder the protein order in heatmap
plot_df["Index"] = plot_df["Umich_Proteomics"] + "_" + plot_df["Cancer"]
plot_df = plot_df.set_index("Index")

ordered_list = []
cancer_list = ['BR', 'CO', 'EC', 'GBM', 'HNSCC', 'LSCC', 'LUAD', 'OV']
for gene in ordered_prot_list:
    for cancer in cancer_list:
        ordered_list.append(gene+'_'+cancer)

ordered_df = plot_df.reindex(ordered_list)
ordered_df = ordered_df.loc[ordered_df['P_Value'] < 0.05] # Keep sig 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  plot_df["Index"] = plot_df["Umich_Proteomics"] + "_" + plot_df["Cancer"]


In [17]:
print('min p-value:', ordered_df.P_Value.min())

min p-value: 3.341044588048425e-06


In [19]:
p.plotCircleHeatMap(ordered_df, circle_var='P_Value', color_var='Medians', x_axis='Umich_Proteomics', y_axis='Cancer',
                    plot_height=350, plot_width=800, x_axis_lab='', font_size=12, legend_max=0.05, 
                    legend_med = 0.001, legend_min=0.00001, save_png='Fig_6_Cell_Cycle.png')

The manuscript mentions the average of the FDR p-values for MCM complex proteins in EC. Here is where we calculate the average for these p-values.

In [21]:
# average FDR p-val of MCM2-7 in EC
mcm = ['MCM2','MCM3','MCM4','MCM5','MCM6','MCM7']
bool_df = all_df.Umich_Proteomics.isin(mcm)
df = all_df[bool_df]
df = df.loc[df['Cancer'] == 'EC'] 
print('Average of FDR p-values for MCM proteins in EC:', df.P_Value.mean())

Average of FDR p-values for MCM proteins in EC: 0.1462391099450344


The manuscript also mentions the strong FDR p-values for DCTN1 and ACTR1A in GBM as shown below.

In [22]:
# P-values for DCTN1 and ACTR1A in GBM
axon_health = ['DCTN1','ACTR1A']
bool_df = all_df.Umich_Proteomics.isin(axon_health)
df = all_df[bool_df]
df = df.loc[df['Cancer'] == 'GBM'] 
print('FDR p-values of DCTN1 nd ACTR1A in GBM:\n')
df

FDR p-values of DCTN1 nd ACTR1A in GBM:



Unnamed: 0,Umich_Proteomics,P_Value,Medians,Cancer,EC_P_Value,EC_Median
8,ACTR1A,3e-06,-0.399173,GBM,,
11,DCTN1,6e-06,-0.273766,GBM,,
