**Imports**

In [1]:
import warnings
# Silence every warning for the rest of the session: the 'ignore' action is
# installed with no category/module filter, so it matches all warnings.
warnings.filterwarnings('ignore')

In [2]:
# (Re)load IPython's autoreload extension; safe to run repeatedly.
%reload_ext autoreload
# Mode 2: re-import modules before each cell runs, so edits to local modules
# are picked up without restarting the kernel.
%autoreload 2

In [20]:
import json
import networkx as nx
import numpy as np
import pandas as pd
import scipy
from sklearn.neighbors import kneighbors_graph
from tqdm import tqdm_notebook, tqdm
from collections import Counter, defaultdict
import pickle
import community
import multiprocessing as mp
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly import tools
from plotly.offline import iplot, init_notebook_mode, plot
init_notebook_mode(connected=True)
from utils import *
import numpy as np
import scipy.sparse
import math

**Loading graphs and vectors datasets**

### Cora dataset

In [93]:
# Load the Cora graph and build `vect`: node id (string) -> float feature array.
G = nx.read_edgelist('./Datasets/Cora/cora_edges.txt')

vect = {}
# `with` closes the file even when a malformed line raises inside the loop.
with open("./Datasets/Cora/cora_features", "r") as sim_file:
    for line in tqdm_notebook(sim_file, total=len(G.nodes()), leave=False):
        tokens = line.split()
        # The first token is the node id; int() validates it before it is
        # turned back into the string key used in `vect`.
        node_id = int(tokens[0])
        # The last token is skipped (presumably a class label - confirm
        # against the feature file format).
        features = np.asarray(list(map(float, tokens[1:-1])))
        vect[str(node_id)] = features

HBox(children=(FloatProgress(value=0.0, max=2708.0), HTML(value='')))

### Sinanet dataset

In [18]:
# Load the Sinanet graph and build `vect`: node id (string) -> float feature array.
G = nx.read_edgelist('./Datasets/Sinanet/Sinanet-master/edge.txt')

vect = {}
# `with` closes the file even when a malformed line raises inside the loop.
with open("./Datasets/Sinanet/Sinanet-master/content.txt", "r") as sim_file:
    progress = tqdm_notebook(sim_file, total=len(G.nodes()), leave=False)
    # The content file carries no explicit ids: the 1-based line number is
    # used as the node id.
    for line_number, line in enumerate(progress, start=1):
        vect[str(line_number)] = np.asarray(list(map(float, line.split())))

HBox(children=(FloatProgress(value=0.0, max=3467.0), HTML(value='')))

### Political Blogs dataset

In [5]:
# Load the Political Blogs graph as an undirected graph.
G = nx.read_gml('./Datasets/Political Blogs/polblogs.gml').to_undirected()

# Each node gets a two-component vector built from its 'value' attribute:
# [value, 1] when value == 0, otherwise [value, 0].
polit_vect = {
    node: np.asarray([int(attrs['value']), 1 if int(attrs['value']) == 0 else 0])
    for node, attrs in G.nodes(data=True)
}

vect = polit_vect

### WebKB Wisc dataset

In [103]:
# Both WebKB files are resolved from the same directory. The original cell
# read the edge list from './Datasets' but the features from '../Datasets',
# unlike every other loader in this notebook.
webkb_dir = './Datasets/WebKB'
G = nx.read_edgelist(webkb_dir + '/Wisc.cites.txt').to_undirected()

vect = {}
# `with` closes the file even when a malformed line raises inside the loop.
with open(webkb_dir + '/Wisc.content.txt', "r") as sim_file:
    for line in tqdm_notebook(sim_file, total=len(G.nodes()), leave=False):
        tokens = line.split()
        node = tokens[0]
        # NOTE(review): [1:-2] drops the last TWO tokens, whereas the Cora
        # loader drops only one - confirm against the content file format.
        features = np.asarray(list(map(int, tokens[1:-2])))
        vect[str(node)] = features

HBox(children=(IntProgress(value=0, max=265), HTML(value='')))

### Graph info

In [6]:
# Report the size of the currently loaded graph.
node_count = G.number_of_nodes()
edge_count = G.number_of_edges()
print('Number of nodes - {0}, Number of edges - {1}'.format(node_count, edge_count))

Number of nodes - 1490, Number of edges - 16718


### Synthetic networks

### ER-BA networks

In [10]:
# Erdos-Renyi graph paired with a Barabasi-Albert graph on the same 500 nodes;
# both generators are seeded.
G_first = nx.gnp_random_graph(n=500, p=0.05, seed=3)
G_second = nx.barabasi_albert_graph(n=500, m=26, seed=3)

### ER-Star graph

In [15]:
# Erdos-Renyi graph paired with a star graph; star_graph(499) has one hub
# plus 499 leaves, i.e. 500 nodes like G_first.
G_first = nx.gnp_random_graph(n=500, p=0.05, seed=3)
G_second = nx.star_graph(n=499)

### Edge swap

In [17]:
# Erdos-Renyi graph paired with a rewired copy of itself.
G_first = nx.gnp_random_graph(500, 0.05, seed=3)
# double_edge_swap mutates and returns the graph it is given, so it runs on a
# copy. It is seeded like the other generators so that re-running the
# notebook yields the same rewired graph.
G_second = nx.double_edge_swap(G_first.copy(), nswap=800, max_tries=24000, seed=3)

**Removing nodes without features**

In [6]:
# Drop every node that has no entry in `vect`. The candidates are collected
# first so the graph is not mutated while it is being iterated.
nodes = [candidate for candidate in G.nodes if candidate not in vect]
for featureless_node in nodes:
    G.remove_node(featureless_node)

node_count = G.number_of_nodes()
edge_count = G.number_of_edges()
print('Number of nodes - {0}, Number of edges - {1}'.format(node_count, edge_count))

Number of nodes - 1490, Number of edges - 16718


**Removing nodes without edges**

In [7]:
# Drop every node whose degree is zero. Removing such a node cannot change
# another node's degree, so collecting them up front is equivalent to
# checking one at a time.
nodes = [candidate for candidate in G.nodes if G.degree(candidate) == 0]
for isolated_node in nodes:
    G.remove_node(isolated_node)

node_count = G.number_of_nodes()
edge_count = G.number_of_edges()
print('Number of nodes - {0}, Number of edges - {1}'.format(node_count, edge_count))

Number of nodes - 1224, Number of edges - 16718


**Calculating significance normalization & common interests statistics**

In [19]:
# significance_normalization comes from `utils` (star import); it returns the
# normalized vectors and `omega`, which is later passed to
# not_fixed_topology_graph. Its exact semantics are not visible here.
significance_normalized_vect, omega = significance_normalization(vect)

# Only the 2nd and 3rd return values are kept. NOTE: binding `_` here shadows
# IPython's "last output" variable for the rest of the session.
_, mu, sigma = calc_common_interests_stats(G, vect)
# Last expression of the cell: displayed as the cell output.
mu, sigma

(0.9057901662878335, 0.2921204219908161)

#### Weighted graphs for synthetic data

In [53]:
# Give every edge of both synthetic graphs a unit 'weight' attribute and
# confirm that networkx now treats each graph as weighted.
for synthetic_graph in (G_first, G_second):
    nx.set_edge_attributes(synthetic_graph, values=1, name='weight')
    print('Graph is weighted - {0}'.format(nx.is_weighted(synthetic_graph)))

Graph is weighted - True
Graph is weighted - True


**Creating weighted graph**

In [28]:
# Work on a copy so the original graph G keeps its unweighted edges.
modified_G = G.copy()

# Every existing (structural) edge gets unit weight.
nx.set_edge_attributes(modified_G, values=1, name='weight')

print('Graph is weighted - {0}'.format(nx.is_weighted(modified_G)))

Graph is weighted - True


**If the topology is not fixed and the similarity matrix file doesn't exist, calculate the similarity matrix and save it to a file**

In [13]:
# Manual switches: set both to False to (re)compute the similarity matrix.
fixed_topology = True
similarity_file_exists = True

similarity_matrix_file_name = 'similarity_matrix_polit_cos.txt'
# The matrix is computed only when the topology is not fixed AND no file is
# available yet.
if not (fixed_topology or similarity_file_exists):
    not_fixed_topology_graph(G, vect, omega, cosine_sim, similarity_matrix_file_name)

#### Constructing graph from the vector data

In [37]:
# Empty undirected graph; the next cell fills it with similarity-weighted edges.
attr_G = nx.Graph()

In [39]:
from random import shuffle

# Build the attribute graph: one weighted edge per (node, neighbour) pair
# listed in the similarity file, using the stored similarity as the weight.
# At most len(G) - 1 neighbours are read per row; clamped at 0 so an empty
# graph does not turn the slice bound into -1.
max_neighbors = max(len(G.nodes()) - 1, 0)

# `with` closes the file even if a row fails to parse.
with open(similarity_matrix_file_name, "r") as sim_file:
    for line in tqdm_notebook(sim_file, total=len(G.nodes()), leave=False):
        # NOTE(security): eval() executes whatever the file contains; only use
        # it on similarity files produced by this project.
        sim_tuple = eval(line)
        node = sim_tuple[0]
        # The first entry of the neighbour list is dropped (presumably the
        # node itself - confirm against not_fixed_topology_graph).
        res_tuple = sim_tuple[1][1:]
        # Slicing instead of indexing up to len(G) - 1 avoids an IndexError
        # when a row holds fewer neighbours than the graph has nodes.
        for entry in res_tuple[:max_neighbors]:
            neighbor, weight = entry[0], entry[1]
            # Skip self-pairs and keep the weight of the first occurrence.
            if node != neighbor and not attr_G.has_edge(node, neighbor):
                attr_G.add_edge(node, neighbor, weight=weight)

HBox(children=(FloatProgress(value=0.0, max=1224.0), HTML(value='')))

In [40]:
# Report the size of the attribute (similarity) graph.
node_count = attr_G.number_of_nodes()
edge_count = attr_G.number_of_edges()
print('Number of nodes - {0}, Number of edges - {1}'.format(node_count, edge_count))

Number of nodes - 1224, Number of edges - 748476


In [41]:
# Total edge weight of the attribute graph (sum of the 'weight' attribute).
attr_G.size(weight='weight')

374508.0

In [35]:
# NOTE(review): same expression as the preceding cell, but with an older
# execution count and a different displayed value - the output shown for this
# cell is probably stale.
attr_G.size(weight='weight')

748476.0

**If the topology is not fixed, connect each node to its top K most similar neighbors with a zero-weight edge wherever no edge existed before**

In [73]:
# Number of most-similar neighbours each node gets connected to.
top_k_neighbors = 499
from random import shuffle

if not fixed_topology:
    # `with` closes the file even if a row fails to parse.
    with open(similarity_matrix_file_name, "r") as sim_file:
        for line in tqdm_notebook(sim_file, total=len(G.nodes()), leave=False):
            # NOTE(security): eval() executes whatever the file contains; only
            # use it on similarity files produced by this project.
            sim_tuple = eval(line)
            node = sim_tuple[0]
            res_tuple = sim_tuple[1][1:]
            # Shuffle before the stable sort so neighbours with equal
            # similarity are ordered randomly rather than by file order.
            shuffle(res_tuple)
            res_tuple.sort(key=lambda x: x[1], reverse=True)
            # Slicing tolerates rows with fewer than top_k_neighbors entries
            # (indexing by range(top_k_neighbors) raised IndexError).
            for entry in res_tuple[:top_k_neighbors]:
                neighbor = entry[0]
                # New edges get weight 0; existing edges are left untouched.
                if node != neighbor and not modified_G.has_edge(node, neighbor):
                    modified_G.add_edge(node, neighbor, weight=0)

In [61]:
# Report the size of the composite (structure + attribute) graph.
node_count = modified_G.number_of_nodes()
edge_count = modified_G.number_of_edges()
print('Number of nodes - {0}, Number of edges - {1}'.format(node_count, edge_count))

Number of nodes - 500, Number of edges - 124750


**Detecting communities and calculating metrics for the given input parameters**

In [24]:
# Mixing coefficients swept by the experiment: 11 evenly spaced values
# 0.0, 0.1, ..., 1.0 (both endpoints included).
alphas = np.linspace(0, 1, 11)

In [54]:
# Sweep over similarity metric x alpha x gaussian flag x algorithm and collect
# one metrics report per run.
similarity_metrics = [cosine_sim]
# One display name per entry of similarity_metrics, in the same order.
# (The original line had a stray trailing quote, which is a SyntaxError.)
metric_names = ['cosine_sim']
gaussian_weighting = [False]
algorithm = ['louvain']
metrics_report = []
clusters = 0
partition = []
# alpha -> partition returned for that alpha (later runs overwrite earlier ones).
clusts = {}
for n, similarity in enumerate(tqdm_notebook(similarity_metrics)):
    for alpha in tqdm_notebook(alphas, leave=False):
        for k, gaussian in enumerate(tqdm_notebook(gaussian_weighting, leave=False)):
            for algo in algorithm:
                # Run label; built for reference but not used further in this cell.
                fname = 'algorithm-{}||metric_name-{}||alpha-{}||gaussian_weighting-{}'.format(algo, metric_names[n], alpha, gaussian_weighting[k])

                # NOTE(review): this rebinds the notebook-level `G` to the graph
                # returned by calculate_metrics_graphs, replacing the dataset
                # graph loaded earlier.
                report, clusters, G, partition = calculate_metrics_graphs(G_first, G_second, alpha, algo=algo, viz=False)

                # Tag the report so results can be matched to their settings later.
                report['alpha'] = alpha
                report['algorithm'] = algo
                metrics_report.append(report)
                clusts[alpha] = partition


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4524.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2431.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4524.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2431.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4524.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2431.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4524.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2431.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4524.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2431.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4524.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2431.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4524.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2431.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4524.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2431.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4524.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2431.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4524.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2431.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4524.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2431.0), HTML(value='')))




### Save Results

In [30]:
# The result file name is assembled from the experiment settings.
topology = 'f' #f - fixed, nf - not fixed
sim = 'cos' #cos - cosine similarity, hamm - hamming similarity
top_k = '' #kpercent - top k percent (0, 0.25, 0.5, 0.75)
dataset = 'final_consen_graphs_cornell' #dataset name
result_file_name = dataset+'_'+sim+'_'+topology+top_k+'.txt'
# The whole report list is written as a single Python-literal string; `with`
# flushes and closes the file even if the write fails.
with open(result_file_name, 'w+') as hs:
    hs.write(str(metrics_report))

### Load Results

In [36]:
# Read the saved report back; `with` closes the file, which the original cell
# never did.
with open(result_file_name, 'r') as hs:
    for line in hs:
        # NOTE(security): eval() executes whatever the file contains; only
        # load result files written by this notebook. If the file has several
        # lines, the last one wins.
        metrics_report = eval(line)

In [49]:
# Turn the list of per-run reports into one series per metric, in run order.
mods_mean = [report['modularity_mean'] for report in metrics_report]
mods_std = [report['modularity_std'] for report in metrics_report]
# The entropy mean is stored as its complement (1 - value).
ents_mean = [1 - report['graph_entropy_mean'] for report in metrics_report]
ents_std = [report['graph_entropy_std'] for report in metrics_report]
attr_mods_mean = [report['attr_modularity_mean'] for report in metrics_report]
attr_mods_std = [report['attr_modularity_std'] for report in metrics_report]
modified_mods_mean = [report['mod_modularity_mean'] for report in metrics_report]
modified_mods_std = [report['mod_modularity_std'] for report in metrics_report]

### Compute Modularities

In [51]:
# Use the alpha recorded with each report so x-values always match the runs
# that produced the metrics. The hard-coded grid (step 0.05) is only a
# fallback for reports without an 'alpha' entry; it does not match the sweep
# cell, which uses step 0.1.
if metrics_report and all('alpha' in report for report in metrics_report):
    alphas = np.asarray([report['alpha'] for report in metrics_report], dtype=float)
else:
    alphas = np.arange(0, 1.05, 0.05)

mods_sum = []
mods_alpha = []
mods_alpha_std = []
attr_mods_alpha = []
attr_mods_alpha_std = []
mods_mix = []
differential_mean = []
differential_std = []
for i in range(len(mods_mean)):
    alpha = alphas[i]
    # Alpha-weighted structural and attributive modularities.
    mods_alpha.append(mods_mean[i] * alpha)
    mods_alpha_std.append(mods_std[i])
    attr_mods_alpha.append(attr_mods_mean[i] * (1 - alpha))
    attr_mods_alpha_std.append(attr_mods_std[i])
    mods_sum.append(mods_alpha[i] + attr_mods_alpha[i])
    # Tolerant comparison: alpha is a float and may miss 0 or 1 by rounding.
    if np.isclose(alpha, 0) or np.isclose(alpha, 1):
        # At the endpoints the differential is defined as 0. The original
        # appended to undefined names (`modss`, `modss_std`), which raised
        # NameError and left these series shorter than `alphas`.
        differential_mean.append(0)
        differential_std.append(0)
        mods_mix.append(0)
    else:
        al = 2 / (alpha * (1 - alpha))
        # Composite modularity minus the alpha-weighted sum of its parts.
        diff_mean = modified_mods_mean[i] - (alpha * mods_mean[i] + (1 - alpha) * attr_mods_mean[i])
        differential_mean.append(diff_mean)
        # NOTE(review): the stds are combined unweighted; strict error
        # propagation would scale them by alpha and (1 - alpha).
        diff_std = math.sqrt(modified_mods_std[i]**2 + (mods_alpha_std[i]**2 + attr_mods_alpha_std[i]**2))
        differential_std.append(diff_std)
        # `mods` was undefined in the original; the differential computed just
        # above is the value being rescaled.
        mix = al * diff_mean
        mods_mix.append(mix)

### Results plot

In [53]:
import plotly

import plotly.graph_objs as go

# One row per curve: (y values, y error bars, legend name, line style).
trace_specs = [
    (modified_mods_mean, modified_mods_std, 'Composite',
     dict(color='dodgerblue', width=2, dash='dash')),
    (mods_alpha, mods_alpha_std, 'Structural',
     dict(color='crimson', width=2)),
    (attr_mods_alpha, attr_mods_alpha_std, 'Attributive',
     dict(color='salmon', width=2)),
    (differential_mean, differential_std, 'Differential',
     dict(color='darkblue', width=2, dash='dash')),
]

# Every curve is drawn against alpha, with markers and data-driven error bars.
data = [
    go.Scatter(
        x=alphas,
        y=series,
        mode='lines+markers',
        error_y=dict(type='data', array=series_std),
        name=label,
        line=line_style,
    )
    for series, series_std, label, line_style in trace_specs
]
# Keep the individual trace names available, as in the original cell.
trace0, trace1, trace2, trace3 = data

layout = go.Layout(
    yaxis={'title': 'Modularity'},
    xaxis={'title': r'$\alpha$'},
)

fig = go.Figure(data=data, layout=layout)
# The legend is hidden even though every trace is named.
fig.update_layout(showlegend=False)
fig.show()
