# 1 ReactomeSuperPathwaysDrivers - List of Objects

In [1]:
import glob
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd

### Reactome FI Network

In [3]:
# Load the Reactome functional-interaction (FI) edge list and build a graph whose
# edges connect the gene symbols found in the Gene1 / Gene2 columns.
# NOTE(review): nx.info is deprecated in recent networkx releases - confirm the pinned version.
df = pd.read_csv(
    "1 input/Reactome2019_FIsInGene_020720_with_annotations.txt",
    sep='\t',
)
ReactomeFI = nx.from_pandas_edgelist(df, source='Gene1', target='Gene2')
ReactomeFI.name = 'Reactome'
print(nx.info(ReactomeFI), end='\n\n')

Name: Reactome
Type: Graph
Number of nodes: 14071
Number of edges: 268857
Average degree:  38.2143



In [4]:
# Keep only the largest connected component (LCC) of the FI network.
# nx.subgraph returns a view that shares its graph-attribute dict with ReactomeFI,
# so naming the view would silently rename ReactomeFI as well. Taking .copy()
# gives the LCC its own independent attributes (and its own node/edge storage).
CC = list(nx.connected_components(ReactomeFI))
ReactomeFI_LCC = nx.subgraph(ReactomeFI, max(CC, key=len)).copy()
ReactomeFI_LCC.name = 'ReactomeFI_LCC'
print(nx.info(ReactomeFI_LCC), end='\n\n')

Name: ReactomeFI_LCC
Type: Graph
Number of nodes: 13877
Number of edges: 268733
Average degree:  38.7307



### 711 Known Cancer Genes and 250 False Positives

In [5]:
# Known cancer driver genes from two public catalogues, merged into one set of gene symbols.

# NCG6 known cancer genes: http://ncg.kcl.ac.uk/statistics.php
knowDriversDF = pd.read_csv('1 input/NCG6_711 Known cancer genes - DRIVERS.tsv', sep='\t', usecols=['symbol'])
knowDriversSetNcg = set(knowDriversDF['symbol'])

# IntOGen compendium of cancer genes: https://www.intogen.org/search
# (knowDriversDF is rebound here and refers to the IntOGen table from now on)
knowDriversDF = pd.read_csv('1 input/Compendium_Cancer_Genes.tsv', sep='\t')
knowDriversSetIntogen = set(knowDriversDF['SYMBOL'])

print('knowDriversIntogen: ', len(knowDriversSetIntogen))
print('knowDriversNCG: ', len(knowDriversSetNcg))
# Compute the union once and keep it; set.union returns a new set and leaves both inputs untouched.
knowDriversUnion = knowDriversSetNcg | knowDriversSetIntogen
print('knowDriversUnion: ', len(knowDriversUnion))

knowDriversIntogen:  568
knowDriversNCG:  711
knowDriversUnion:  866


In [6]:
# Persist the merged driver set to disk.
# The context manager closes the file even if pickling raises.
with open("1 output/knowDriversUnion.pkl", "wb") as a_file:
    pickle.dump(knowDriversUnion, a_file)

### Pathway Gene Sets

In [8]:
# Parse the Reactome GMT file. Each non-empty line is tab-separated:
#   <pathway name> <pathway id> <gene> <gene> ...
pathwaysGeneSets = {}        # pathway id   -> [pathway name, set of genes]
pathwaysGeneSetsByName = {}  # pathway name -> [pathway id,   set of genes]

# The context manager guarantees the file is closed even if a line fails to parse.
with open("1 input/ReactomePathways.gmt", "r") as f:
    for x in f:
        line = x.strip().split('\t')
        if line == ['']:
            # Blank line (e.g. a trailing newline at the end of the file): nothing to parse.
            continue
        name = line[0]
        pathId = line[1]
        geneSet = set(line[2:])
        # Both dictionaries reference the same set object for a given pathway.
        pathwaysGeneSets[pathId] = [name, geneSet]
        pathwaysGeneSetsByName[name] = [pathId, geneSet]

print('Total Pathways: ', len(pathwaysGeneSets))

Total Pathways:  2484


### SuperPathways
https://academic.oup.com/nar/article/48/D1/D498/5613674?login=true
<br>
"[...] These reactions are grouped into 1803 pathways grouped into 26 <b>superpathways</b> (e.g. immune system, metabolism and autophagy) that describe normal cellular functions. 
<br><br>
An additional ‘disease’ superpathway groups 484 annotations of disease counterparts of these normal cellular processes."

In [9]:
class SuperPathwayClass:
    """Bundle of a pathway's gene set, its induced networks and its driver genes.

    Attributes:
        name: Pathway name.
        idPathway: Pathway identifier.
        geneSet: Collection of genes annotated to the pathway.
        networkCCs: Graph-like object (exposes ``.nodes``) holding every
            connected component of the pathway's network.
        networkLCC: Graph-like object (exposes ``.nodes``) holding only the
            largest connected component.
        drivers: Collection of driver genes associated with the pathway.
        topologicalFeaturesLCC, layout_LCC_kamada_kawai, layout_CCs_spring:
            Initialised to ``None`` here; presumably filled in by later
            analysis steps (not visible in this class).
    """

    def __init__(self, name, idPathway, geneSet, networkWithCCs, networkLCC, drivers):
        """Store the pathway data and leave the derived fields unset."""
        # Identity of the pathway.
        self.name = name
        self.idPathway = idPathway

        # Gene set and the two networks built from it.
        self.geneSet = geneSet
        self.networkCCs = networkWithCCs
        self.networkLCC = networkLCC
        self.drivers = drivers

        # Placeholders for results that are not computed at construction time.
        self.topologicalFeaturesLCC = None
        self.layout_LCC_kamada_kawai = None
        self.layout_CCs_spring = None

    def info(self):
        """Return a summary of the pathway as a dict.

        Keys, in order: ``Name``, ``idPathway``, ``lenSet`` (size of the gene
        set), ``lenCCs`` / ``lenLCC`` (node count of each network followed by
        its rounded percentage of the gene set, e.g. ``'3 (75%)'``),
        ``driversLCC`` (number of drivers) and ``drivers %`` (drivers as a
        rounded percentage of the LCC node count).

        Raises:
            ZeroDivisionError: If the gene set or the LCC is empty.
        """
        n_genes = len(self.geneSet)
        n_cc_nodes = len(self.networkCCs.nodes)
        n_lcc_nodes = len(self.networkLCC.nodes)
        n_drivers = len(self.drivers)

        return {
            'Name': self.name,
            'idPathway': self.idPathway,
            'lenSet': n_genes,
            'lenCCs': '{} ({}%)'.format(n_cc_nodes, round(n_cc_nodes / n_genes * 100)),
            'lenLCC': '{} ({}%)'.format(n_lcc_nodes, round(n_lcc_nodes / n_genes * 100)),
            'driversLCC': n_drivers,
            'drivers %': round(n_drivers / n_lcc_nodes * 100),
        }

In [10]:
# Reactome's "Complete list of pathways" file maps each Reactome stable identifier
# (ST_ID) to a pathway name and the species it belongs to. Only the human rows are kept.
allPathways = pd.read_csv(
    '1 input/Complete List of Pathways.txt',
    sep='\t',
    names=['ST_ID', 'pwName', 'species'],
)
print('allPathways: ', allPathways.shape)

isHuman = allPathways['species'].eq('Homo sapiens')
humansPathways = allPathways[isHuman]
print('humansPathways: ', humansPathways.shape)
humansPathways.head(3)

allPathways:  (20567, 3)
humansPathways:  (2516, 3)


Unnamed: 0,ST_ID,pwName,species
9940,R-HSA-164843,2-LTR circle formation,Homo sapiens
9941,R-HSA-73843,5-Phosphoribose 1-diphosphate biosynthesis,Homo sapiens
9942,R-HSA-1971475,A tetrasaccharide linker sequence is required ...,Homo sapiens


In [11]:
# Reactome's "Pathway hierarchy relationship" file has two columns of stable
# identifiers (ST_ID): the first is the parent pathway, the second its child.
# Rows are kept when the parent is one of the human pathways.
hierarchy = pd.read_csv(
    '1 input/Pathways hierarchy relationship.txt',
    sep='\t',
    names=['ST_ID_Father', 'ST_ID_Child'],
)
print('hierarchy', hierarchy.shape)

fatherIsHuman = hierarchy['ST_ID_Father'].isin(humansPathways['ST_ID'].tolist())
humansHierarchy = hierarchy[fatherIsHuman]
print('humansHierarchy', humansHierarchy.shape)
humansHierarchy.head(3)

hierarchy (20668, 2)
humansHierarchy (2535, 2)


Unnamed: 0,ST_ID_Father,ST_ID_Child
9999,R-HSA-109581,R-HSA-109606
10000,R-HSA-109581,R-HSA-169911
10001,R-HSA-109581,R-HSA-5357769


In [12]:
# The super-pathway trees were created in another notebook and saved as
# '<pathway id>_<pathway name>.gml'. The GML content is not read here: only the
# file name is used to recover the id and name of each super-pathway, and the
# networks are induced on the Reactome FI graph from the pathway's gene set.
superPathwaysObjs = {}
# sorted() makes the insertion order independent of the file system's listing order.
for file in sorted(glob.glob('1 input/superPathwaysAsTrees/*')):
    # Path(...).stem drops the directory and the final extension on every OS;
    # splitting on a literal backslash only worked with Windows-style paths.
    fileName = Path(file).stem
    if fileName == 'allPathwaysHierarchyGraph':
        # Presumably the combined hierarchy graph rather than a single super-pathway.
        continue

    # Split on the first underscore only, so a pathway name containing '_' is kept whole.
    idPathway, name = fileName.split('_', 1)

    geneSet = pathwaysGeneSets[idPathway][1]
    subG = nx.subgraph(ReactomeFI, geneSet)
    CC = list(nx.connected_components(subG))
    LCC = nx.subgraph(subG, max(CC, key=len))

    obj = SuperPathwayClass(
        name,
        idPathway,
        geneSet,
        subG,
        LCC,
        set(LCC.nodes) & knowDriversUnion,  # known drivers present in the LCC
    )

    superPathwaysObjs[name] = obj

### Save Reactome as an Object

###### Get all pathways gene sets together

In [13]:
# Pool the genes of every entry in pathwaysGeneSets: once as a set (each gene
# counted once) and once as a list (a gene is repeated for every pathway it is in).
# Note: the loop covers all parsed pathways, although the printed labels say "Superpathways".
allGenesSets = set()
allGenesList = []
for gSet in pathwaysGeneSets:
    genes = pathwaysGeneSets[gSet][1]
    allGenesList.extend(genes)
    # set.update is the set counterpart of list.extend.
    allGenesSets.update(genes)

print('Total in All Superpathways Gene Sets whitout repetition: ',len(allGenesSets))
print('Total in All Superpathways Gene Sets whith repetition: ',len(allGenesList))

Total in All Superpathways Gene Sets whitout repetition:  11375
Total in All Superpathways Gene Sets whith repetition:  120605


In [14]:
# Build one additional object from the pooled gene set (allGenesSets),
# induced on the largest connected component of the FI network.
name = 'ReactomeFI_LCC_AllSuperPathways'
idPathway = ''  # the pooled set has no pathway id of its own
geneSet = allGenesSets

# Same steps as for the individual super-pathways: induced subgraph, then its LCC.
subG = ReactomeFI_LCC.subgraph(geneSet)
CC = list(nx.connected_components(subG))
LCC = subG.subgraph(max(CC, key=len))

obj = SuperPathwayClass(
    name=name,
    idPathway=idPathway,
    geneSet=geneSet,
    networkWithCCs=subG,
    networkLCC=LCC,
    drivers=knowDriversUnion.intersection(LCC.nodes),  # known drivers present in the LCC
)

superPathwaysObjs[name] = obj

In [15]:
# Persist the pooled object and the dictionary of all super-pathway objects.
# Context managers close each file even if pickling raises part-way through.
with open("1 output/ReactomeFI_LCC_AllSuperPathwaysObj.pkl", "wb") as file:
    pickle.dump(obj, file)

with open("1 output/superPathwaysObjs.pkl", "wb") as a_file:
    pickle.dump(superPathwaysObjs, a_file)

### Super Pathways Overview

In [16]:
# One summary row per super-pathway object, built from SuperPathwayClass.info().
# Keys are visited in sorted order so the table rows are ordered by dictionary key.
overViewRows = {k: superPathwaysObjs[k].info() for k in sorted(superPathwaysObjs)}

# The first column produced by info() ('Name') becomes the index. set_index returns
# a new frame, so re-running this cell never mutates a previously displayed table.
overView = pd.DataFrame.from_dict(overViewRows, orient='index')
overView = overView.set_index(overView.columns[0])

# The context manager closes the file even if pickling raises.
with open("1 output/overView.pkl", "wb") as a_file:
    pickle.dump(overView, a_file)

In [17]:
# Show the overview without the 'Disease' row and the id column, ordered by
# driver percentage and then by gene-set size (both descending).
# Keyword arguments replace the positional axis (`drop('idPathway', 1)`),
# which newer pandas releases no longer accept.
(
    overView
    .drop(index=['Disease'])
    .sort_values(by=['drivers %', 'lenSet'], ascending=False)
    .drop(columns='idPathway')
)

Unnamed: 0_level_0,lenSet,lenCCs,lenLCC,driversLCC,drivers %
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Chromatin organization,240,218 (91%),206 (86%),45,22
Circadian Clock,70,69 (99%),64 (91%),12,19
DNA Repair,312,290 (93%),284 (91%),48,17
Reproduction,114,95 (83%),81 (71%),14,17
Gene expression (Transcription),1536,1392 (91%),1367 (89%),194,14
Developmental Biology,1097,972 (89%),962 (88%),137,14
Programmed Cell Death,216,208 (96%),201 (93%),27,13
Cell-Cell communication,122,117 (96%),115 (94%),15,13
Signal Transduction,2542,2399 (94%),2363 (93%),285,12
Cellular responses to external stimuli,665,619 (93%),604 (91%),73,12
