In [1]:
import pandas as pd
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
import networkx as nx
import matplotlib.pyplot as plt
import statistics as st
# Python Lib POWER LAW: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0085777 
import powerlaw
from IPython.core.display import display, HTML
import datetime

# Scale Free Characterization - Part2
###### Barabasi show at Table 4.1 some measure to characterize the networks (showing only the measures for undirect graphs): 
N: Number of nodes <br>	L: Number of Edges<br>	〈k〉: Average degree (first moment of the distribution) <br>	〈k^2〉:  Second moment of the distribution (st.variance(degree) + st.mean(degree)**2)<br>γ: Degree Exponent (Gamma)
###### Here we use the Power Law Package to extract the Gamma and the PowerLawRatio that compare the target distribution to a pure power-law and a pure exponent distribution. This ratio has a pValue. To extract the Gamma, the package defines a kMinCut. We also add in the analysis the Kmax, which is the biggest hub.
###### More information on the Power Law Package and a visual representation of the distributions can be found in the Part1

# Functions

In [37]:
def scaleFreeMeasures(G):    
    degree=sorted(dict(G.degree()).values(),reverse=True)
    
    #Fitting with powerlaw lib #We let the lib choose the xmin
    fit = powerlaw.Fit(degree,verbose=False)
    # Comparing the distribution with a power law, and a exponential # R is the Ratio and p is the p-value
    R, p = fit.distribution_compare('power_law', 'exponential', normalized_ratio=True)    
    
    #General information
    info={}
    info['N']=len(G.nodes) # Number of Nodes
    info['L']=len(G.edges) # Number of Edges
    info['<k>']= round(st.mean(degree),2) #Average degree
    info['<k2>']= round(st.variance(degree) + st.mean(degree)**2,2) #The 2 moment of the degree
    info['Kmax']= max(degree) #Biggest hub
    
    kmin=fit.power_law.xmin #Nodes with degree < than kmin are discarded from the distribution in order the find the gamma
    gamma=fit.power_law.alpha # Degree Exponent (after xmin)  
    info['kMinCut'] = kmin    
    info['PowerLawRatio'] = round(R,3)
    info['pValue'] = round(p,4)
    info['Gamma'] = round(gamma,3)
    
    #The average_shortest_path_length is very timing consuming, so as a feed back, we print the time when a networks finish
    #We commented the code to run the Collaboration networks 
    
    #Get the biggest connect component
#     LCC = nx.subgraph(G,max(nx.connected_components(G),key=len))   
#     start = datetime.datetime.now()
#     info['<d>'] = nx.average_shortest_path_length(LCC)
#     end = datetime.datetime.now()    
#     print(G.name,' <d> time(H:M:S.MS): ',end-start)
    return info  

def createTableFromListOfString(lista,numCol):
    numCol=3
    col=0
    htmlTable="<table class='table table-striped'>"
    htmlTable+="<tr>"
    for item in lista:
        if(col==numCol):
            col=0
            htmlTable+="</tr>"

        htmlTable+="<td style='text-align: left; padding:10px; border:1px solid black'>"+item+"</td>"

        if(col==numCol):        
            htmlTable+="<tr>"
        else:
            col+=1        

    htmlTable+="</table>"
    htmlTable=htmlTable.replace("\n","<br>")
    display(HTML(htmlTable))

# Networks
###### To create a table like the 4.1 from the Network Science Book, we use 9 networks. 4 from the book (which is useful to compare the results), 2 from the biological domain, and 3 synthetic models ( 2 scale-free and 1 random) with around 10k nodes 

In [3]:
networks={}
#Networks from the book Network Science
networks['Collaboration']=nx.read_edgelist('datasets/NetworkScienceBook_collaboration.edgelist.txt')
networks['Metabolic']=nx.read_edgelist('datasets/NetworkScienceBook_metabolic.edgelist.txt')
networks['Powergrid']=nx.read_edgelist('datasets/NetworkScienceBook_powergrid.edgelist.txt')
networks['Protein']=nx.read_edgelist('datasets/NetworkScienceBook_protein.edgelist.txt')

#Other Real Networks
#Reactome FI: https://reactome.org/download-data
networks['Reactome']=nx.from_pandas_edgelist(pd.read_csv('datasets/ProteinProteinInteractionNetwork_reactomeFI2019.tsv',sep='\t'),source='Gene1',target='Gene2')
#This Bacteria is a "KlebsiellaPneumoniae" downloaded from STRING: https://string-db.org/ 
networks['Bacteria']=nx.read_gml('datasets/ProteinProteinInteractionNetwork_KlebsiellaPneumoniae.gml')

#Synthetic Models
networks['ScaleFree1']=nx.barabasi_albert_graph(10000,3)
networks['ScaleFree2']=nx.dorogovtsev_goltsev_mendes_graph(9)
networks['Random']=nx.erdos_renyi_graph(10000,0.0006)

#Set the NX name
for net in networks:
    networks[net].name=net

#Show as a HTML Table
listOfInfo=[nx.info(networks[k]) for k in networks]
createTableFromListOfString(listOfInfo,3)

0,1,2
Name: Collaboration Type: Graph Number of nodes: 23133 Number of edges: 93439 Average degree: 8.0784,Name: Metabolic Type: Graph Number of nodes: 1039 Number of edges: 4741 Average degree: 9.1261,Name: Powergrid Type: Graph Number of nodes: 4941 Number of edges: 6594 Average degree: 2.6691


# Table
###### Here we present a table (data frame) similar to Table 4.1 from the book. The Kmax, kMinCut, PowerLawRatio, and pValue are added as explained at the beginning of this notebook

In [39]:
measures={}
for net in ['Metabolic','Protein','Bacteria','Powergrid','ScaleFree1','ScaleFree2','Random','Reactome']:
    measures[net] = scaleFreeMeasures(networks[net])

Metabolic  <d> time(H:M:S.MS):  0:00:30.895111
Protein  <d> time(H:M:S.MS):  0:00:39.025497
Bacteria  <d> time(H:M:S.MS):  0:02:16.279022
Powergrid  <d> time(H:M:S.MS):  0:05:31.137341
ScaleFree1  <d> time(H:M:S.MS):  0:35:35.532381
ScaleFree2  <d> time(H:M:S.MS):  0:26:12.007610
Random  <d> time(H:M:S.MS):  0:35:03.106936
Reactome  <d> time(H:M:S.MS):  5:17:53.760130


<B> We had to run the Collaboration Network without the < d > because it takes to much time<b>

In [10]:
measures['Collaboration'] = scaleFreeMeasures(networks['Collaboration'])

In [41]:
table = pd.DataFrame.from_dict(measures,orient='index')
table

Unnamed: 0,N,L,<k>,<k2>,Kmax,kMinCut,PowerLawRatio,pValue,Gamma,<d>
Metabolic,1039,4741,9.13,949.24,638,7.0,4.599,0.0,2.993,2.4771
Protein,2018,2930,2.9,32.28,91,7.0,3.754,0.0002,3.181,5.611747
Bacteria,3546,12529,7.07,163.35,109,11.0,10.645,0.0,2.943,6.228673
Powergrid,4941,6594,2.67,10.33,19,10.0,2.53,0.0114,10.436,18.989185
ScaleFree1,10000,29991,6.0,113.48,305,10.0,6.692,0.0,3.003,4.272592
ScaleFree2,9843,19683,4.0,151.81,512,64.0,4.443,0.0,3.525,4.776976
Random,10000,30064,6.01,42.2,19,16.0,3.27,0.0011,47.552,5.335289
Reactome,14071,268857,38.21,6013.48,1201,160.0,4.112,0.0,3.74,3.417706
Collaboration,23133,93439,8.08,178.2,279,30.0,2.811,0.0049,3.584,


# Top 10 Hubs 
###### Considering that:
"In summary, to understand the properties of real networks, it is often sufficient to remember that in scale-free networks a few highly connected hubs coexist with a large number of small nodes. The presence of these hubs plays an important role in the system’s behavior."
###### We present a table with the degree of the top 10 hubs, as well the number of the (N) and the standard deviation between these 10 hubs

In [42]:
hubs={}
for net in networks:
    degree=sorted(dict(networks[net].degree()).values(),reverse=True)
    hubs[net]={}
    cont=1
    hubs[net]['N'] = len(degree)
    for hub in degree[:10]:
        hubs[net]['Hub'+str(cont)]=hub
        cont+=1
    #Standard Deviation
    hubs[net]['StDev'] = round(st.stdev(degree[:10]),2)
topHubs = pd.DataFrame.from_dict(hubs,orient='index')
topHubs.sort_values(by='N')

Unnamed: 0,N,Hub1,Hub2,Hub3,Hub4,Hub5,Hub6,Hub7,Hub8,Hub9,Hub10,StDev
Metabolic,1039,638,460,299,252,243,139,112,105,99,84,182.38
Protein,2018,91,82,81,52,46,42,37,32,30,29,23.65
Bacteria,3546,109,83,81,81,81,80,79,79,79,78,9.25
Powergrid,4941,19,18,14,14,14,13,13,13,13,13,2.22
ScaleFree2,9843,512,512,512,256,256,256,128,128,128,128,168.52
ScaleFree1,10000,305,267,213,174,173,169,153,141,132,122,59.78
Random,10000,19,16,16,16,16,16,16,16,15,15,1.1
Reactome,14071,1201,1017,979,929,915,913,810,800,795,755,133.59
Collaboration,23133,279,252,201,190,182,165,158,148,142,138,47.34
