In [None]:
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter

# Problem 1

Analyse the part of the web graph given in `edges.txt`. In particular, find:
1. the size of the giant weakly connected component
2. the size of the giant strongly connected component
3. the sizes of the giant In- and Out-components
4. the size distribution of the weakly connected components
5. the (in-, out-, total-) degree distributions

In [None]:
# Read the edge list into a directed multigraph; every later cell works on G.
filename = 'edges.txt'
G = nx.read_edgelist(
    filename,
    create_using=nx.MultiDiGraph,
)

In [None]:
# Basic size statistics of the loaded graph (n is reused by later cells).
n = G.number_of_nodes()
n_edges = G.number_of_edges()
print(f'number of nodes = {n}')
print(f'number of edges = {n_edges}')

In [None]:
# Weakly connected components as node sets, ordered from largest to smallest.
wcc_list = sorted(nx.weakly_connected_components(G), key=len, reverse=True)

# Sizes are reported as fractions of all nodes in G.
total_nodes = len(G.nodes)
print(f'number of weakly connected componets = {len(wcc_list)}')
print(f'size of GWCC = {len(wcc_list[0])/total_nodes}')
print(f'size of second WCC = {len(wcc_list[1])/total_nodes}')

In [None]:
# Strongly connected components as node sets, ordered from largest to smallest.
scc_list = sorted(nx.strongly_connected_components(G), key=len, reverse=True)

# Sizes are reported as fractions of all nodes in G.
total_nodes = len(G.nodes)
print(f'number of strongly connected componets = {len(scc_list)}')
print(f'size of GSCC = {len(scc_list[0])/total_nodes}')
print(f'size of second SCC = {len(scc_list[1])/total_nodes}')

In [None]:
# Bow-tie decomposition of the giant WCC around the giant SCC:
#   IN  = nodes outside the GSCC that have a directed path into it,
#   OUT = nodes outside the GSCC that are reachable from it.
nodes_in_gwcc = list(wcc_list[0])
nodes_in_gscc = list(scc_list[0])

# Sets give O(1) membership tests (the lists above are kept for the prints below).
gwcc_members = set(nodes_in_gwcc)
gscc_members = set(nodes_in_gscc)

# All GSCC nodes reach, and are reached by, the same outside nodes, so a single
# representative is enough to probe reachability.
gscc_root = nodes_in_gscc[0]

# One backward and one forward traversal from the representative replace a
# separate has_path search per node. A node outside the GSCC cannot be both an
# ancestor and a descendant (it would then belong to the GSCC), so the two
# sets are disjoint. Intersecting with the GWCC keeps the result restricted to
# the nodes the analysis is about.
in_nodes = list((nx.ancestors(G, gscc_root) - gscc_members) & gwcc_members)
out_nodes = list((nx.descendants(G, gscc_root) - gscc_members) & gwcc_members)

print(f'In component size = {len(in_nodes)/n}')
print(f'Out component size = {len(out_nodes)/n}')
print(f'In + Out + SCC size to WCC = {(len(out_nodes) + len(in_nodes) + len(nodes_in_gscc))/len(nodes_in_gwcc)}')

In [None]:
# Size of every weakly connected component, then how often each size occurs.
wcc_sizes = list(map(len, wcc_list))
wcc_sizes_counts = Counter(wcc_sizes)

In [None]:
# Log-log scatter: number of weakly connected components of each size.
wcc_size_axis = list(wcc_sizes_counts)
wcc_count_axis = [wcc_sizes_counts[size] for size in wcc_size_axis]
plt.loglog(wcc_size_axis, wcc_count_axis, linestyle='None', marker='.')
plt.xlabel('WCC size')
plt.ylabel('Counts')
plt.title('WCC sizes distribution')

In [None]:
# node -> in-degree, then how many nodes share each in-degree value.
in_degrees = {node: degree for node, degree in G.in_degree}
in_degrees_counts = Counter(in_degrees.values())

In [None]:
# Log-log scatter: number of nodes having each in-degree.
in_degree_axis = list(in_degrees_counts)
in_count_axis = [in_degrees_counts[degree] for degree in in_degree_axis]
plt.loglog(in_degree_axis, in_count_axis, linestyle='None', marker='.')
plt.xlabel('In-degree')
plt.ylabel('Counts')
plt.title('In-degree distribution')

In [None]:
# node -> out-degree, then how many nodes share each out-degree value.
out_degrees = {node: degree for node, degree in G.out_degree}
out_degrees_counts = Counter(out_degrees.values())

In [None]:
# Log-log scatter: number of nodes having each out-degree.
out_degree_axis = list(out_degrees_counts)
out_count_axis = [out_degrees_counts[degree] for degree in out_degree_axis]
plt.loglog(out_degree_axis, out_count_axis, linestyle='None', marker='.')
plt.xlabel('Out-degree')
plt.ylabel('Counts')
plt.title('Out-degree distribution')

In [None]:
# node -> total degree, then how many nodes share each total-degree value.
degrees = {node: degree for node, degree in G.degree}
degrees_counts = Counter(degrees.values())

In [None]:
# Log-log scatter: number of nodes having each total degree.
degree_axis = list(degrees_counts)
count_axis = [degrees_counts[degree] for degree in degree_axis]
plt.loglog(degree_axis, count_axis, linestyle='None', marker='.')
plt.xlabel('Total-degree')
plt.ylabel('Counts')
plt.title('Total-degree distribution')

In [None]:
from scipy.optimize import curve_fit
import numpy as np

def power_law(d, c, gamma):
    """Evaluate the power-law model c / d**gamma.

    Parameters
    ----------
    d : scalar or array-like supporting ``**``
        Value(s) at which to evaluate the model (the degree, in this notebook).
    c : float
        Multiplicative prefactor.
    gamma : float
        Exponent applied to ``d`` in the denominator.

    Returns
    -------
    Same kind as ``d``: the model value(s) ``c / d**gamma``.
    """
    denominator = d ** gamma
    return c / denominator

# Degree values and their node counts, in matching order.
degrees_list = list(degrees_counts)
counts_list = [degrees_counts[degree] for degree in degrees_list]

# Only degrees inside [h1, h2] (inclusive) take part in the fit.
h1, h2 = 8, 110

degree_arr = np.array(degrees_list)
count_arr = np.array(counts_list)
in_window = (degree_arr >= h1) & (degree_arr <= h2)
xdata = degree_arr[in_window]
ydata = count_arr[in_window]

# Least-squares fit of counts ~ c / degree**gamma on the selected window.
popt, pcov = curve_fit(power_law, xdata, ydata)
c, gamma = popt

In [None]:
# Display the fitted prefactor of the power-law model.
c

In [None]:
# Display the fitted exponent of the power-law model.
gamma

In [None]:
# Empirical total-degree distribution (dots) with the fitted power law on top.
plt.loglog(list(degrees_counts.keys()),
           list(degrees_counts.values()), ls='None', marker='.')

# degrees_list is in first-seen order, not numeric order. Drawing the dashed
# line through unsorted x-values makes the path double back over itself and
# overdraw its own dashes, so evaluate the fit on sorted degrees instead.
fit_degrees = sorted(degrees_list)
estimated = [power_law(x, c, gamma) for x in fit_degrees]
plt.loglog(fit_degrees, estimated, ls='--', color='r')
plt.xlabel('Total-degree')
plt.ylabel('Counts')
plt.title('Total-degree distribution')

# Problem 2

Analyze stability and vulnerability of the giant (weakly) connected component in the part of the web-graph given in edges.txt.

In [None]:
import random

# Probability with which each node is deleted in the random-removal cell below.
p = 0.5

# nx.Graph(G) already builds a new, independent undirected simple graph from G,
# so an extra G.copy() beforehand only costs time and memory.
H = nx.Graph(G)

# Connected components of the undirected projection, largest first.
comps = sorted(nx.connected_components(H), key=len, reverse=True)
n = len(H.nodes)
print(f'n = {n} || giant component size = {len(comps[0])/n}')

In [None]:
# Random failure: delete every node independently with probability p, then
# measure the giant component relative to the surviving nodes.

# Fixed seed so the reported numbers are reproducible across runs.
RANDOM_REMOVAL_SEED = 42
random.seed(RANDOM_REMOVAL_SEED)

# Rebuild the undirected projection here so that re-running this cell does not
# keep deleting nodes from an already-thinned graph.
H = nx.Graph(G)

nodes = list(H.nodes)
H.remove_nodes_from([node for node in nodes if random.random() < p])

comps = sorted(nx.connected_components(H), key=len, reverse=True)
n = len(H.nodes)
# Guard against the (unlikely) case where every node was removed.
giant_fraction = len(comps[0]) / n if comps else 0.0
print(f'n = {n} || giant component size = {giant_fraction}')

In [None]:
# Targeted attack: remove nodes in decreasing order of their degree in the
# intact undirected graph (degrees are measured once, not recomputed), and
# record the giant component fraction at regular checkpoints.

# nx.Graph(G) already yields an independent graph; no G.copy() needed first.
H = nx.Graph(G)

# (node, degree) pairs, highest degree first.
degrees = sorted(H.degree, key=lambda item: item[1], reverse=True)
n = len(G.nodes)

ATTACK_SIZE = 6000      # number of top-degree nodes to remove
CHECKPOINT_EVERY = 100  # measure the giant component every this many removals

c_list = []     # fraction of the original nodes removed at each checkpoint
comp_size = []  # giant component size / remaining nodes at each checkpoint
for i, (node, _) in enumerate(degrees[:ATTACK_SIZE], start=1):
    H.remove_node(node)
    if i % CHECKPOINT_EVERY == 0:
        remaining = H.number_of_nodes()
        if remaining == 0:
            # Graph exhausted: there is no component left to measure.
            break
        # Only the largest component is needed, so take the max directly
        # instead of sorting every component.
        giant = max(nx.connected_components(H), key=len)
        c_list.append(i/n)
        comp_size.append(len(giant)/remaining)
    

In [None]:
# Giant component fraction as the highest-degree nodes of the web graph are removed.
plt.plot(c_list, comp_size)
plt.xlabel('Fraction of nodes removed')
plt.ylabel('Giant component size (fraction of remaining nodes)')
plt.title('Web graph: removal of highest-degree nodes')

# Problem 3

Analyze the degree distribution of a random graph in the Erdős–Rényi model with the same node count as the part of the web graph analyzed above.

In [None]:
# Erdős–Rényi G(n, p) reference graph and its degree histogram.
# NOTE(review): n is hard-coded; confirm it equals the node count of G, which
# is what the problem statement asks for.
n = 10000
p = 0.003

# Fixed seed so the random graph, and everything derived from it, is reproducible.
ER_SEED = 42
ER = nx.erdos_renyi_graph(n, p, seed=ER_SEED)

# node -> degree, then how many nodes share each degree value.
er_degrees = dict(ER.degree)
er_degrees_counts = Counter(er_degrees.values())

In [None]:
# Recompute the web graph's total-degree histogram; `degrees` was rebound to a
# sorted list of pairs by the targeted-attack cell above.
degrees = {node: degree for node, degree in G.degree}
degrees_counts = Counter(degrees.values())

In [None]:
# Total-degree distributions of the web graph and the Erdős–Rényi graph on one
# log-log plot; the series are labelled so the figure can be read on its own.
plt.loglog(list(degrees_counts.keys()),
           list(degrees_counts.values()), ls='None', marker='.',
           label='Web graph')
plt.loglog(list(er_degrees_counts.keys()),
           list(er_degrees_counts.values()), ls='None', marker='.', color='r',
           label='Erdős–Rényi graph')
plt.xlabel('Total-degree')
plt.ylabel('Counts')
plt.title('Total-degree distribution')
plt.legend()

# Problem 4

Analyze stability and vulnerability of the giant connected component in the Erdős–Rényi random graph

In [None]:
# Work on a copy so the original ER graph stays intact for later experiments.
ER_2 = ER.copy()

# Connected components of the intact random graph, largest first.
comps = list(nx.connected_components(ER_2))
comps.sort(key=len, reverse=True)
print(f'Giant component size = {len(comps[0])/n}')

In [None]:
# How many connected components of the ER graph have each size.
comps_sizes_counts = Counter(map(len, comps))

cc_size_axis = list(comps_sizes_counts)
cc_count_axis = [comps_sizes_counts[size] for size in cc_size_axis]
plt.loglog(cc_size_axis, cc_count_axis, linestyle='None', marker='.')
plt.xlabel('CC size')
plt.ylabel('Counts')
plt.title('CC sizes distribution')

In [None]:
# Random failure on the ER graph: delete every node independently with
# probability p_remove, then measure the giant component relative to the
# surviving nodes.

# Fixed seed so the reported numbers are reproducible across runs.
ER_REMOVAL_SEED = 42
random.seed(ER_REMOVAL_SEED)

# Start from a fresh copy so that re-running this cell does not keep deleting
# nodes from an already-thinned graph.
ER_2 = ER.copy()

nodes = list(ER_2.nodes)
p_remove = 0.5
ER_2.remove_nodes_from([node for node in nodes if random.random() < p_remove])

comps = sorted(nx.connected_components(ER_2), key=len, reverse=True)
n = len(ER_2.nodes)
# Guard against the (unlikely) case where every node was removed.
giant_fraction = len(comps[0]) / n if comps else 0.0
print(f'n = {n} || giant component size = {giant_fraction}')

In [None]:
# Targeted attack on the ER graph: remove the 2000 highest-degree nodes (degrees
# taken once from the intact graph) and sample the giant component every 100
# removals.
ER_2 = ER.copy()

# (node, degree) pairs, highest degree first.
degrees = sorted(ER_2.degree, key=lambda pair: pair[1], reverse=True)
n = ER_2.number_of_nodes()

c_list = []     # fraction of the original nodes removed at each sample
comp_size = []  # giant component size / remaining nodes at each sample
i = 0
for node, _ in degrees[:2000]:
    ER_2.remove_node(node)
    i += 1
    if i % 100 != 0:
        continue
    comps = sorted(nx.connected_components(ER_2), key=len, reverse=True)
    c_list.append(i/n)
    comp_size.append(len(comps[0])/len(ER_2.nodes))

In [None]:
# Giant component fraction as the highest-degree nodes of the ER graph are removed.
plt.plot(c_list, comp_size)
plt.xlabel('Fraction of nodes removed')
plt.ylabel('Giant component size (fraction of remaining nodes)')
plt.title('Erdős–Rényi graph: removal of highest-degree nodes')