In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
import pickle
import gudhi as gd  

In [2]:
class PathwayClass:
    """Record describing a single pathway.

    The constructor stores its six arguments and initialises the remaining
    attributes to empty placeholders that are filled in elsewhere.
    """

    def __init__(self, name,idPathway,geneSet,networkWithCCs,networkLCC,drivers):
        # Identity and gene content, exactly as supplied by the caller.
        self.name, self.id = name, idPathway
        self.geneSet = geneSet
        # Two network representations of the pathway.
        # NOTE(review): presumably "CCs" = all connected components and
        # "LCC" = largest connected component -- confirm with the producer.
        self.networkCCs, self.networkLCC = networkWithCCs, networkLCC
        self.drivers = drivers
        # Placeholders: nothing is computed here.
        self.topologicalFeaturesLCC = None
        self.layout_LCC_kamada_kawai = None
        self.layout_CCs_spring = None
        # Hierarchy links (presumably parent/child pathways -- verify against caller).
        self.father = None
        self.children = {}

class PathwaysWrapper:
    """Bundle of pathway lookup structures.

    Attributes:
        pathwaysIds: value of ``pathwaysObjsIds`` (presumably pathway id -> object; confirm).
        pathwaysNames: value of ``pathwaysObjsNames``; the notebook indexes it by
            pathway name and reads ``.networkLCC`` from the result.
        pathwaysNamesForest: value of ``pathwaysNamesForest``.
        listOfRoots: value of ``listOfObjects``.
    """

    def __init__(self, pathwaysObjsIds,pathwaysObjsNames,pathwaysNamesForest,listOfObjects):
        self.pathwaysIds=pathwaysObjsIds
        self.pathwaysNames=pathwaysObjsNames
        # The previous body read the undefined names `superPathwaysNamesForest`
        # and `listOfRoots`, so constructing the class raised NameError (it went
        # unnoticed because unpickling does not call __init__). Use the actual
        # parameters instead.
        self.pathwaysNamesForest=pathwaysNamesForest
        self.listOfRoots=listOfObjects
        
# Load the pre-computed pathway hierarchy.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you trust.
# The context manager closes the handle, which the previous bare open() never did.
with open("../../Super Pathways - Hierarquia/1 - Output/SuperPathwaysAndPathways.pkl", "rb") as file:
    pathwaysWrapper = pickle.load(file)

# LookUp Tables

In [3]:
# Short super-pathway code -> pathway name used as key in pathwaysWrapper.pathwaysNames.
names = {
    "DNA": "DNA Repair",
    "CHR": "Chromatin organization",
    "PCD": "Programmed Cell Death",
}

# Short super-pathway code -> CSV file name (with leading slash) that is
# appended to a per-cancer input folder.
paths = {
    "DNA": "/matriz_reactome_missense_repair.csv",
    "CHR": "/matriz_reactome_missense_chromatin.csv",
    "PCD": "/matriz_reactome_missense_death.csv",
}

# Input folder name -> English cancer label used in the report tables.
# Insertion order matters: it defines the column order of the summary tables.
cancerType = {
    "Cabeça": "Head and Neck",
    "Estomago": "Stomach",
    "Mama": "Breast",
    "Pele": "Skin",
    "Pulmao": "Lung",
    "Urothelial": "Bladder",
}

# Networks Size

In [4]:
# Count nodes and edges of each super-pathway network and of its six
# cancer-specific sub-networks.
networks, resumeNodes, resumeEdges = {}, {}, {}
for name in ('CHR','DNA','PCD'):
    lcc = pathwaysWrapper.pathwaysNames[names[name]].networkLCC
    networks[name] = lcc
    resumeNodes[name] = {'Original': lcc.number_of_nodes()}
    resumeEdges[name] = {'Original': lcc.number_of_edges()}

    for folder, ct in cancerType.items():
        # The index of the per-cancer CSV selects which nodes are kept
        # (presumably gene symbols -- verify against the input files).
        table = pd.read_csv("1 input/"+folder+paths[name],index_col=[0])
        geneSet = set(table.index)
        G = lcc.copy()
        G.name = ct
        # NOTE: keyed by cancer label only, so this entry is overwritten for
        # every super pathway; only the counts below are kept per pathway.
        networks[ct] = nx.subgraph(G,geneSet)
        resumeNodes[name][ct] = networks[ct].number_of_nodes()
        resumeEdges[name][ct] = networks[ct].number_of_edges()

In [5]:
# Format every cancer-network node count as "count (xx%)" relative to the
# original super-pathway network.
# The formatted strings go into a separate dict: the previous version wrote
# them back into `resumeNodes`, so running the cell a second time failed when
# dividing a string by an int.
percentuals={}
resumeNodesFormatted={}
for sp, counts in resumeNodes.items():
    percentuals[sp]=[]
    resumeNodesFormatted[sp]={}
    for net, count in counts.items():
        if net == 'Original':
            resumeNodesFormatted[sp][net]=count
            continue
        # int() truncates toward zero (e.g. 66.97 -> 66); kept as in the reported tables.
        perc = int(100*count/counts['Original'])
        resumeNodesFormatted[sp][net]=str(count)+' ('+str(perc)+'%)'
        percentuals[sp].append(perc)
display(pd.DataFrame(resumeNodesFormatted).T)
print('Média Percentual')
[(name, round(np.mean(values))) for name,values in percentuals.items()]

Unnamed: 0,Original,Head and Neck,Stomach,Breast,Skin,Lung,Bladder
CHR,221,160 (72%),148 (66%),151 (68%),175 (79%),166 (75%),165 (74%)
DNA,300,209 (69%),191 (63%),210 (70%),251 (83%),238 (79%),242 (80%)
PCD,206,152 (73%),137 (66%),132 (64%),186 (90%),169 (82%),174 (84%)


Média Percentual


[('CHR', 72), ('DNA', 74), ('PCD', 76)]

In [6]:
# Format every cancer-network edge count as "count (xx%)" relative to the
# original super-pathway network.
# The formatted strings go into a separate dict: the previous version wrote
# them back into `resumeEdges`, so running the cell a second time failed when
# dividing a string by an int.
percentuals={}
resumeEdgesFormatted={}
for sp, counts in resumeEdges.items():
    percentuals[sp]=[]
    resumeEdgesFormatted[sp]={}
    for net, count in counts.items():
        if net == 'Original':
            resumeEdgesFormatted[sp][net]=count
            continue
        # int() truncates toward zero; kept as in the reported tables.
        perc = int(100*count/counts['Original'])
        resumeEdgesFormatted[sp][net]=str(count)+' ('+str(perc)+'%)'
        percentuals[sp].append(perc)
display(pd.DataFrame(resumeEdgesFormatted).T)
print('Média Percentual')
[(name,round(np.mean(values))) for name,values in percentuals.items()]

Unnamed: 0,Original,Head and Neck,Stomach,Breast,Skin,Lung,Bladder
CHR,2301,1279 (55%),1097 (47%),1099 (47%),1472 (63%),1267 (55%),1338 (58%)
DNA,6959,3712 (53%),3029 (43%),3827 (54%),5326 (76%),4875 (70%),4954 (71%)
PCD,2635,1550 (58%),1209 (45%),1133 (42%),2256 (85%),1754 (66%),2009 (76%)


Média Percentual


[('CHR', 54), ('DNA', 61), ('PCD', 62)]

## Drivers per Network

In [7]:
# First driver list: tab-separated file, values taken from its SYMBOL column.
intogenTable = pd.read_csv('99 input/intogen.tsv',sep='\t')
intoGen = set(intogenTable['SYMBOL'])

# Second driver list: header-less file, values taken from its first column.
ncgTable = pd.read_csv('99 input/ncg_canonical.txt',header=None)
ncg_canonical = set(ncgTable[0])

# A gene counts as a driver when it appears in either list.
driverUnion = intoGen | ncg_canonical
tuple(len(geneList) for geneList in (intoGen, ncg_canonical, driverUnion))

(568, 591, 787)

In [8]:
# Count how many driver genes (driverUnion) are present in each super-pathway
# network and in each of its cancer-specific sub-networks.
networks, resumeNodes = {}, {}
for name in ('CHR','DNA','PCD'):
    lcc = pathwaysWrapper.pathwaysNames[names[name]].networkLCC
    networks[name] = lcc
    resumeNodes[name] = {'Original': len(driverUnion.intersection(lcc.nodes()))}

    for folder, ct in cancerType.items():
        table = pd.read_csv("1 input/"+folder+paths[name],index_col=[0])
        geneSet = set(table.index)
        G = lcc.copy()
        G.name = ct
        # Keyed by cancer label only: overwritten for every super pathway.
        networks[ct] = nx.subgraph(G,geneSet)
        resumeNodes[name][ct] = len(driverUnion.intersection(networks[ct].nodes()))


# Turn each cancer count into "count (xx%)" of the original network's drivers.
# The counts in `resumeNodes` are replaced by these strings.
percentuals = {}
for sp, counts in resumeNodes.items():
    percentuals[sp] = []
    for net in counts:
        if net == 'Original':
            continue
        perc = int(100*counts[net]/counts['Original'])
        counts[net] = str(counts[net])+' ('+str(perc)+'%)'
        percentuals[sp].append(perc)
display(pd.DataFrame(resumeNodes).T)
print('Média Percentual')
[(name, round(np.mean(values))) for name,values in percentuals.items()]

Unnamed: 0,Original,Head and Neck,Stomach,Breast,Skin,Lung,Bladder
CHR,45,42 (93%),40 (88%),43 (95%),43 (95%),42 (93%),43 (95%)
DNA,46,41 (89%),42 (91%),43 (93%),44 (95%),44 (95%),44 (95%)
PCD,26,24 (92%),24 (92%),22 (84%),26 (100%),25 (96%),24 (92%)


Média Percentual


[('CHR', 93), ('DNA', 93), ('PCD', 93)]

## REPORT

In [9]:
def resume(dimension,SP):
    """Summarise the number of barcode entries of one dimension per network.

    Reads three pickles from ``2 output/<SP>/`` (original networks, networks
    without drivers, networks with random removals) and counts, for every
    network, the barcode entries whose dimension equals ``dimension``.

    Args:
        dimension: dimension to count (the notebook uses 0, 1 and 2).
        SP: super-pathway code; used both as folder name and as the key of the
            super-pathway network inside the pickles.

    Returns:
        DataFrame with one row per network (cancer folder names renamed through
        the module-level ``cancerType``) and the columns Original, Driver,
        Random Mean, Random Median, Random Std, Random Min, Random Max.
    """
    def _load(fileName):
        # NOTE(review): pickle.load can execute arbitrary code -- trusted files only.
        with open('2 output/'+SP+'/'+fileName,mode='rb') as f:
            return pickle.load(f)

    def _count(barCode):
        # Each barcode entry unpacks as (dimension, lifetime); only the dimension is used.
        return sum(1 for dim,_ in barCode if dim==dimension)

    barCodesOriginal=_load('barCodes_original.pkl')
    barCodesWithOutDrivers=_load('barCodes_WithOutDrivers.pkl')
    barCodesWithRandom=_load('barCodes_WithRandom.pkl')

    # 7 networks: the super pathway itself plus the 6 cancer networks.
    networkNames=[SP,'Urothelial','Cabeça','Mama','Pulmao','Pele','Estomago']

    # Random barcodes are keyed '<network> - <i>' with i starting at 1, so the
    # number of random removals per network is the entry count divided by the
    # number of networks (assumes every network has the same number -- TODO confirm).
    # Computed once here; it used to be recomputed per network with a hard-coded 7.
    randomRemovals=len(barCodesWithRandom)//len(networkNames)

    summary={}
    for name in networkNames:
        randomCounts=[_count(barCodesWithRandom[name+' - '+str(i)])
                      for i in range(1,randomRemovals+1)]
        summary[name]={
            'Original':      _count(barCodesOriginal[name]),
            'Driver':        _count(barCodesWithOutDrivers[name]),
            'Random Mean':   round(np.mean(randomCounts),2),
            'Random Median': round(np.median(randomCounts),2),
            'Random Std':    round(np.std(randomCounts),2),
            'Random Min':    round(np.min(randomCounts),2),
            'Random Max':    round(np.max(randomCounts),2),
        }

    # The int cast truncates a fractional median (e.g. x.5), as before.
    return pd.DataFrame(summary).T.rename(index=cancerType).astype({"Original": int, "Driver": int, 'Random Median': int, "Random Min": int, "Random Max": int})

#### Dimensions 0, 1 and 2

In [10]:
# Build and show one summary table per super pathway and dimension.
resumeTables={}
for sp in ['CHR','DNA','PCD']:
    resumeTables[sp]={}
    print('\n\n--------',sp,'\n')
    for dim in [0,1,2]:
        resumeTables[sp][dim]=resume(dim,sp)
        print('\nDimension',dim)
        # Show the stored table; calling resume() a second time re-read all
        # three pickles just to display an identical frame.
        display(resumeTables[sp][dim])



-------- CHR 


Dimension 0


Unnamed: 0,Original,Driver,Random Mean,Random Median,Random Std,Random Min,Random Max
CHR,221,171,173.73,174,2.38,164,176
Bladder,165,111,116.13,117,2.7,109,122
Head and Neck,160,105,113.27,113,2.08,108,118
Breast,151,99,103.03,104,2.58,98,107
Lung,166,112,118.63,119,2.33,113,124
Skin,175,120,127.3,128,2.18,122,132
Stomach,148,95,102.7,102,2.21,96,107



Dimension 1


Unnamed: 0,Original,Driver,Random Mean,Random Median,Random Std,Random Min,Random Max
CHR,18,13,15.57,16,2.68,11,21
Bladder,17,17,11.6,11,2.91,7,17
Head and Neck,15,17,9.77,10,2.8,5,16
Breast,19,15,13.43,13,3.04,8,21
Lung,19,16,14.23,14,3.89,8,28
Skin,19,19,13.23,13,2.89,9,21
Stomach,12,17,9.0,10,2.14,4,12



Dimension 2


Unnamed: 0,Original,Driver,Random Mean,Random Median,Random Std,Random Min,Random Max
CHR,4,1,2.27,2,1.18,0,4
Bladder,3,0,1.1,1,0.79,0,3
Head and Neck,2,0,1.1,1,0.87,0,3
Breast,1,0,0.47,0,0.62,0,2
Lung,2,0,0.6,0,0.66,0,2
Skin,3,0,1.17,1,0.93,0,3
Stomach,1,0,0.37,0,0.48,0,1




-------- DNA 


Dimension 0


Unnamed: 0,Original,Driver,Random Mean,Random Median,Random Std,Random Min,Random Max
DNA,300,254,252.2,252,1.6,250,254
Bladder,242,197,197.5,198,0.67,196,198
Head and Neck,209,167,167.1,167,0.94,165,168
Breast,210,167,166.7,167,0.64,165,167
Lung,238,193,193.2,193,0.4,193,194
Skin,251,207,206.5,206,0.5,206,207
Stomach,191,146,147.9,148,0.83,146,149



Dimension 1


Unnamed: 0,Original,Driver,Random Mean,Random Median,Random Std,Random Min,Random Max
DNA,19,9,15.8,16,2.36,13,20
Bladder,16,8,13.5,13,1.8,10,16
Head and Neck,13,9,9.1,9,1.76,7,13
Breast,16,9,11.5,12,2.06,8,14
Lung,14,8,10.7,11,1.68,8,14
Skin,16,8,12.2,12,0.98,11,14
Stomach,9,8,6.9,7,1.3,4,9



Dimension 2


Unnamed: 0,Original,Driver,Random Mean,Random Median,Random Std,Random Min,Random Max
DNA,7,3,5.2,5,1.47,2,7
Bladder,8,4,5.6,5,1.36,4,8
Head and Neck,4,1,3.7,3,1.0,2,5
Breast,2,0,2.0,2,0.77,1,3
Lung,6,2,4.5,5,1.12,3,6
Skin,6,1,4.2,4,0.87,3,5
Stomach,4,1,3.1,3,1.04,1,5




-------- PCD 


Dimension 0


Unnamed: 0,Original,Driver,Random Mean,Random Median,Random Std,Random Min,Random Max
PCD,206,165,179.8,180,0.48,178,180
Bladder,174,138,149.87,150,0.34,149,150
Head and Neck,152,116,127.6,128,0.55,126,128
Breast,132,98,109.9,110,0.3,109,110
Lung,169,129,143.73,144,0.51,142,144
Skin,186,147,159.67,160,0.54,158,160
Stomach,137,102,110.13,111,2.0,106,112



Dimension 1


Unnamed: 0,Original,Driver,Random Mean,Random Median,Random Std,Random Min,Random Max
PCD,27,41,23.57,23,2.29,18,28
Bladder,24,34,19.43,19,2.56,13,23
Head and Neck,20,28,15.27,15,2.02,10,19
Breast,15,21,11.67,12,1.85,8,15
Lung,22,33,17.17,17,2.54,12,23
Skin,24,34,20.03,20,2.5,15,24
Stomach,28,24,20.0,20,3.27,12,27



Dimension 2


Unnamed: 0,Original,Driver,Random Mean,Random Median,Random Std,Random Min,Random Max
PCD,11,0,9.83,10,2.02,6,15
Bladder,11,0,9.53,9,2.19,4,15
Head and Neck,13,0,10.23,10,1.87,7,14
Breast,8,0,6.33,6,2.04,3,11
Lung,8,0,6.67,7,1.6,4,10
Skin,10,0,9.37,9,2.02,5,13
Stomach,7,0,5.27,5,1.57,1,8


## The Mean of Averages

In [11]:
# Mean of the 'Original' column for PCD, dimension 2 (all 7 rows, super pathway included).
# Indices are explicit: the previous version relied on `sp` and `dim` still
# holding the last values of an earlier loop.
resumeTables['PCD'][2]['Original'].mean()

9.714285714285714

In [12]:
# Dimension 1: average over the six cancer networks (the super-pathway row is
# dropped) of the Original / Random Mean / Driver counts, with Random and
# Driver also expressed as a percentage of Original.
means={}
for sp in ['CHR','DNA','PCD']:
    cancerRows=resumeTables[sp][1].drop(sp)
    original=round(cancerRows['Original'].mean(),2)
    random=round(cancerRows['Random Mean'].mean(),2)
    driver=round(cancerRows['Driver'].mean(),2)

    # Percentages are derived from the already-rounded means.
    pRandom = round(random/original*100)
    pDriver = round(driver/original*100)
    means[sp]={
        'Original': str(original),
        'Random': str(random)+' ('+str(pRandom)+'%)',
        'Driver': str(driver)+' ('+str(pDriver)+'%)',
    }

pd.DataFrame(means).T

Unnamed: 0,Original,Random,Driver
CHR,16.83,11.88 (71%),16.83 (100%)
DNA,14.0,10.65 (76%),8.33 (60%)
PCD,22.17,17.26 (78%),29.0 (131%)


In [13]:
# Dimension 2: average over the six cancer networks (the super-pathway row is
# dropped) of the Original / Random Mean / Driver counts, with Random and
# Driver also expressed as a percentage of Original.
means={}
for sp in ['CHR','DNA','PCD']:
    cancerRows=resumeTables[sp][2].drop(sp)
    original=round(cancerRows['Original'].mean(),2)
    random=round(cancerRows['Random Mean'].mean(),2)
    driver=round(cancerRows['Driver'].mean(),2)

    # Percentages are derived from the already-rounded means.
    pRandom = round(random/original*100)
    pDriver = round(driver/original*100)
    means[sp]={
        'Original': str(original),
        'Random': str(random)+' ('+str(pRandom)+'%)',
        'Driver': str(driver)+' ('+str(pDriver)+'%)',
    }

pd.DataFrame(means).T

Unnamed: 0,Original,Random,Driver
CHR,2.0,0.8 (40%),0.0 (0%)
DNA,5.0,3.85 (77%),1.5 (30%)
PCD,9.5,7.9 (83%),0.0 (0%)


## Number of Cycles and Tetrahedra - Cancer vs Random Networks
To compute the persistent homology (PH) of random networks, we build random sub-networks of each super pathway whose number of nodes equals the average number of nodes of the cancer networks of that same super pathway.

In [14]:
def randomWalkUniqueSetOfNodes(startNode,setLen,graph=None):
    """Collect distinct nodes by a random walk until ``setLen`` nodes were visited.

    Args:
        startNode: node where the walk starts; always part of the result.
        setLen: number of distinct nodes to collect.
        graph: graph to walk on (needs ``neighbors(node)`` and
            ``number_of_nodes()``). When omitted, the module-level ``G`` is
            used, which is what the function always did before.

    Returns:
        Set of visited nodes. Nodes other than ``startNode`` are the values
        returned by ``np.random.choice``.

    Raises:
        ValueError: if ``setLen`` is larger than the number of nodes in the
            graph (the walk could never finish). ``np.random.choice`` also
            raises ValueError when the walk reaches a node without neighbours.
    """
    if graph is None:
        # Backward-compatible fallback to the notebook-global graph.
        graph=G

    if setLen > graph.number_of_nodes():
        raise ValueError('setLen ('+str(setLen)+') exceeds the number of nodes in the graph')
    # NOTE(review): the walk can still run forever if fewer than setLen nodes
    # are reachable from startNode (e.g. a disconnected graph).

    uniqueSet={startNode}
    actualNode=startNode
    while len(uniqueSet) <setLen:
        neighbours=list(graph.neighbors(actualNode))
        actualNode=np.random.choice(neighbours)
        uniqueSet.add(actualNode)

    return uniqueSet

def _countPersistenceIntervals(G, homologyDim):
    """Count the persistence intervals of dimension ``homologyDim`` of ``G``.

    The all-pairs shortest-path lengths of ``G`` are used as distance matrix of
    a Rips complex, which is expanded up to dimension ``homologyDim + 1``.
    """
    # Sort rows and columns the same way so the matrix is consistently ordered.
    # NOTE(review): pairs without a path are missing from shortest_path_length
    # and would appear as NaN here -- presumably G is always connected; confirm.
    dm=pd.DataFrame(dict(nx.shortest_path_length(G))).sort_index()
    dm=dm[sorted(dm.columns)]
    skeleton = gd.RipsComplex(distance_matrix = dm.values)

    Rips_simplex_tree = skeleton.create_simplex_tree(max_dimension = homologyDim + 1)
    BarCode = Rips_simplex_tree.persistence()

    # Every barcode entry starts with its dimension.
    return sum(1 for bd in BarCode if bd[0]==homologyDim)

def getNumberOfCircles(G):
    """Return the number of dimension-1 persistence intervals (Rips complex up to dimension 2)."""
    return _countPersistenceIntervals(G, 1)

def getNumberOfTetrahedron(G):
    """Return the number of dimension-2 persistence intervals (Rips complex up to dimension 3)."""
    return _countPersistenceIntervals(G, 2)

In [15]:
%%time
networks={}
networksRandom={}
resumeCancer={}
resumeRandom={}

# Seed NumPy's global RNG (used by the random walk through np.random.choice)
# so the 30 random networks per super pathway are reproducible on re-run.
np.random.seed(42)

for sp in ['CHR','DNA','PCD']:
    resumeCancer[sp]={}
    resumeRandom[sp]={}

    #Original SP
    # `G` must stay a module-level name: randomWalkUniqueSetOfNodes(hub,avgNodes)
    # below walks the global `G`.
    G = pathwaysWrapper.pathwaysNames[names[sp]].networkLCC
    print(sp,'nodes',len(G.nodes))

    #######DATA FROM CANCER NETWORKS########
    # 'Original' column of the six cancer rows (the super-pathway row is dropped).
    cancerNodes=resumeTables[sp][0].drop(sp)['Original']
    cancerCircles=resumeTables[sp][1].drop(sp)['Original']
    cancerTetra=resumeTables[sp][2].drop(sp)['Original']

    totalCircle=cancerCircles.sum()
    totalTetra=cancerTetra.sum()
    #The average number of nodes found in the 6 cancer networks of the current SP
    avgNodes=int(round(cancerNodes.mean(),0) )
    print('avgNodes totalCircle totalTetra:',avgNodes,totalCircle,totalTetra)

    # NOTE(review): pandas .std() defaults to ddof=1 while np.std (used for the
    # random networks below) defaults to ddof=0, so the two STD columns are not
    # computed the same way -- confirm which convention is intended.
    resumeCancer[sp]['Total Circles']=totalCircle
    resumeCancer[sp]['Circles Mean']=round(cancerCircles.mean(),2)
    resumeCancer[sp]['Circles STD']=round(cancerCircles.std(),2)
    resumeCancer[sp]['Total Tetra']=totalTetra
    resumeCancer[sp]['Tetra Mean']=round(cancerTetra.mean(),2)
    resumeCancer[sp]['Tetra STD']=round(cancerTetra.std(),2)

    #######DATA FROM RANDOM NETWORKS########
    #get the hub: the node with the highest degree (first one on ties)
    hub=(max(dict(G.degree).items(),key=lambda x:x[1]))[0]
    # Integer keys 1..30 hold the random sub-networks; the two string keys hold
    # the per-network counts, in the same order.
    networksRandom[sp]={}
    networksRandom[sp]['Circles']=[]
    networksRandom[sp]['Tetra']=[]
    print('Random Networks:')
    for i in range(30):
        randSet=randomWalkUniqueSetOfNodes(hub,avgNodes)
        G_rand = nx.subgraph(G,randSet)
        # nx.info() was removed in NetworkX 3.0; print the same summary line directly.
        print(f"Graph named {G_rand.name!r} with {G_rand.number_of_nodes()} nodes and {G_rand.number_of_edges()} edges")
        networksRandom[sp][i+1]=G_rand
        networksRandom[sp]['Circles'].append(getNumberOfCircles(G_rand))
        networksRandom[sp]['Tetra'].append(getNumberOfTetrahedron(G_rand))

    randomCircles=networksRandom[sp]['Circles']
    randomTetra=networksRandom[sp]['Tetra']
    print('Circles:',randomCircles)
    print('Circles Total:',sum(randomCircles))
    print('Tetra:',randomTetra)
    print('Tetra Total:',sum(randomTetra))

    resumeRandom[sp]['Total Circles']=sum(randomCircles)
    resumeRandom[sp]['Circles Mean']=round(np.mean(randomCircles),2)
    resumeRandom[sp]['Circles STD']=round(np.std(randomCircles),2)

    resumeRandom[sp]['Total Tetra']=sum(randomTetra)
    resumeRandom[sp]['Tetra Mean']=round(np.mean(randomTetra),2)
    resumeRandom[sp]['Tetra STD']=round(np.std(randomTetra),2)

    print('\n')

CHR nodes 221
avgNodes totalCircle totalTetra: 161 101 12
Random Networks:
Graph named 'Pathway Network' with 161 nodes and 1813 edges
Graph named 'Pathway Network' with 161 nodes and 1796 edges
Graph named 'Pathway Network' with 161 nodes and 1939 edges
Graph named 'Pathway Network' with 161 nodes and 1901 edges
Graph named 'Pathway Network' with 161 nodes and 1936 edges
Graph named 'Pathway Network' with 161 nodes and 1822 edges
Graph named 'Pathway Network' with 161 nodes and 1888 edges
Graph named 'Pathway Network' with 161 nodes and 1842 edges
Graph named 'Pathway Network' with 161 nodes and 1863 edges
Graph named 'Pathway Network' with 161 nodes and 1891 edges
Graph named 'Pathway Network' with 161 nodes and 1896 edges
Graph named 'Pathway Network' with 161 nodes and 1922 edges
Graph named 'Pathway Network' with 161 nodes and 1885 edges
Graph named 'Pathway Network' with 161 nodes and 1853 edges
Graph named 'Pathway Network' with 161 nodes and 1897 edges
Graph named 'Pathway Netw

In [16]:
# One row per super pathway: cycle / tetrahedron statistics of the random networks.
pd.DataFrame(resumeRandom).transpose()

Unnamed: 0,Total Circles,Circles Mean,Circles STD,Total Tetra,Tetra Mean,Tetra STD
CHR,207.0,6.9,2.18,105.0,3.5,0.56
DNA,132.0,4.4,1.47,120.0,4.0,1.1
PCD,462.0,15.4,2.27,272.0,9.07,1.59


In [17]:
# One row per super pathway: cycle / tetrahedron statistics of the cancer networks.
pd.DataFrame(resumeCancer).transpose()

Unnamed: 0,Total Circles,Circles Mean,Circles STD,Total Tetra,Tetra Mean,Tetra STD
CHR,101.0,16.83,2.86,12.0,2.0,0.89
DNA,84.0,14.0,2.76,30.0,5.0,2.1
PCD,133.0,22.17,4.4,57.0,9.5,2.26


In [22]:
# Store the random-walk networks and their counts (networksRandom) on disk.
# The context manager closes the file even if pickling fails, which the
# previous manual open()/close() pair did not guarantee.
with open('randomWalkNetworks.pickle', 'wb') as file:
    pickle.dump(networksRandom, file)