In [28]:
import networkx as nx
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.cluster import SpectralClustering, AgglomerativeClustering, KMeans
from sklearn.model_selection import train_test_split
from sklearn import metrics
from cdlib import algorithms
import matplotlib.pyplot as plt

Note: to be able to use all crisp methods, you need to install some additional packages:  {'wurlitzer', 'graph_tool'}
Note: to be able to use all overlapping methods, you need to install some additional packages:  {'ASLPAw'}
Note: to be able to use all bipartite methods, you need to install some additional packages:  {'wurlitzer'}


In [21]:
# Load the edge list as a directed graph; node labels are parsed as ints.
# NOTE(review): no `delimiter` is passed, so networkx's default separator is used
# even though the file has a .csv extension — confirm this matches the file layout.
G = nx.read_edgelist("./EDGES_FILE.csv", create_using=nx.DiGraph, nodetype=int)

In [22]:
# Remove every node whose total degree is exactly 2 and that carries a self-loop,
# remembering the removed ids in `nodes`.
# In a DiGraph a self-loop counts once as in-edge and once as out-edge, so a
# degree of 2 here means the loop is the node's only edge.
nodes = []
# Snapshot the self-loop edges first: the graph is mutated inside the loop.
for looped_node, _ in list(nx.selfloop_edges(G)):
    if G.degree(looped_node) != 2:
        continue
    G.remove_node(looped_node)
    nodes.append(looped_node)

In [23]:
# Number of nodes currently in G.
G.number_of_nodes()

778

In [24]:
# Number of edges currently in G.
G.number_of_edges()

50492

In [34]:
# Node ids of G in ascending order.
nodes = sorted(G.nodes)

# Community detection with cdlib's Leiden algorithm, then a lookup from each
# node to the communities it was assigned to.
comms = algorithms.leiden(G)
comms_dict = comms.to_node_community_map()

In [42]:
### Reading communities
# One reference label per node, in ascending node-id order: the first community
# listed for that node in `comms_dict`.
data = [comms_dict[node_id][0] for node_id in sorted(G.nodes())]

---
# Spectral clustering

## AVPRA pred

In [44]:
# Load the pickled AVPRA results.
# NOTE(review): unpickling can execute arbitrary code — only load this file if
# it comes from a trusted source.
obj = pd.read_pickle("./AVPRA_pred.pickled")

In [45]:
# Spectral clustering on each AVPRA result, scored by NMI against the reference
# community labels in `data`.

# NOTE(review): `clf` is not used anywhere in this cell; kept so the name stays defined.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants, computed once: a set gives O(1) membership tests instead of
# scanning a freshly built node list for every row, and the cluster count is the
# number of distinct reference communities.
kept_nodes = set(G.nodes())
n_clusters = len(pd.Series(data).unique())

scores = []
for res in obj:
    features = res[1]
    # Keep only the rows whose node is still present in G.
    # assumes row k belongs to node id k + 1 — TODO confirm
    X_data = [features[row_idx] for row_idx in range(len(features)) if (row_idx + 1) in kept_nodes]

    y_data = SpectralClustering(
        n_clusters=n_clusters, assign_labels='discretize', random_state=0
    ).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [46]:
# Summary statistics (count, mean, std, quartiles, extremes) of the values in `scores`.
pd.Series(scores).describe()

count    21.000000
mean      0.673303
std       0.299259
min       0.011711
25%       0.582413
50%       0.792400
75%       0.881873
max       0.885927
dtype: float64

## Node2Vec

In [51]:
# Number of recorded runs: one per line of the info file that contains "Test".
with open("./n2v/n2v_info.txt", "r") as info_file:
    tests = sum(1 for entry in info_file if "Test" in entry)

In [52]:
# Spectral clustering on each Node2Vec embedding file, scored by NMI against the
# reference community labels in `data`.

# NOTE(review): `clf` is not used anywhere in this cell; kept so the name stays defined.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants, computed once: a set gives O(1) membership tests instead of
# scanning a freshly built node list for every row, and the cluster count is the
# number of distinct reference communities.
kept_nodes = set(G.nodes())
n_clusters = len(pd.Series(data).unique())

scores = []
for i in range(tests):
    rows = pd.read_csv("./n2v/n2v_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Keep only the rows whose node is still present in G.
    # assumes row k belongs to node id k + 1 — TODO confirm
    X_data = [vec for row_idx, vec in enumerate(rows) if (row_idx + 1) in kept_nodes]

    y_data = SpectralClustering(
        n_clusters=n_clusters, assign_labels='discretize', random_state=0
    ).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [53]:
# Summary statistics (count, mean, std, quartiles, extremes) of the values in `scores`.
pd.Series(scores).describe()

count    24.000000
mean      0.923093
std       0.011652
min       0.905277
25%       0.915186
50%       0.923428
75%       0.929155
max       0.947721
dtype: float64

## DW

In [54]:
# Number of recorded runs: one per line of the info file that contains "Test".
with open("./dw/dw_info.txt", "r") as info_file:
    tests = sum(1 for entry in info_file if "Test" in entry)

In [55]:
# Spectral clustering on each DW embedding file, scored by NMI against the
# reference community labels in `data`.

# NOTE(review): `clf` is not used anywhere in this cell; kept so the name stays defined.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants, computed once: a set gives O(1) membership tests instead of
# scanning a freshly built node list for every row, and the cluster count is the
# number of distinct reference communities.
kept_nodes = set(G.nodes())
n_clusters = len(pd.Series(data).unique())

scores = []
for i in range(tests):
    rows = pd.read_csv("./dw/dw_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Keep only the rows whose node is still present in G.
    # assumes row k belongs to node id k + 1 — TODO confirm
    X_data = [vec for row_idx, vec in enumerate(rows) if (row_idx + 1) in kept_nodes]

    y_data = SpectralClustering(
        n_clusters=n_clusters, assign_labels='discretize', random_state=0
    ).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [56]:
# Summary statistics (count, mean, std, quartiles, extremes) of the values in `scores`.
pd.Series(scores).describe()

count    15.000000
mean      0.560066
std       0.245529
min       0.010802
25%       0.552977
50%       0.577412
75%       0.743288
max       0.778407
dtype: float64

## MNMF

In [57]:
# Number of recorded runs: one per line of the info file that contains "Test".
with open("./mnmf/mnmf_info.txt", "r") as info_file:
    tests = sum(1 for entry in info_file if "Test" in entry)

In [58]:
# Spectral clustering on each MNMF embedding file, scored by NMI against the
# reference community labels in `data`.

# NOTE(review): `clf` is not used anywhere in this cell; kept so the name stays defined.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants, computed once: a set gives O(1) membership tests instead of
# scanning a freshly built node list for every row, and the cluster count is the
# number of distinct reference communities.
kept_nodes = set(G.nodes())
n_clusters = len(pd.Series(data).unique())

scores = []
for i in range(tests):
    rows = pd.read_csv("./mnmf/mnmf_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Keep only the rows whose node is still present in G.
    # assumes row k belongs to node id k + 1 — TODO confirm
    X_data = [vec for row_idx, vec in enumerate(rows) if (row_idx + 1) in kept_nodes]

    y_data = SpectralClustering(
        n_clusters=n_clusters, assign_labels='discretize', random_state=0
    ).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [59]:
# Summary statistics (count, mean, std, quartiles, extremes) of the values in `scores`.
pd.Series(scores).describe()

count    10.000000
mean      0.837727
std       0.051359
min       0.704776
25%       0.828794
50%       0.848170
75%       0.864736
max       0.885120
dtype: float64

---
# KMeans

## AVPRA pred

In [60]:
# Load the pickled AVPRA results.
# NOTE(review): unpickling can execute arbitrary code — only load this file if
# it comes from a trusted source.
obj = pd.read_pickle("./AVPRA_pred.pickled")

In [61]:
# K-means on each AVPRA result, scored by NMI against the reference community
# labels in `data`.

# NOTE(review): `clf` is not used anywhere in this cell; kept so the name stays defined.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants, computed once: a set gives O(1) membership tests instead of
# scanning a freshly built node list for every row, and the cluster count is the
# number of distinct reference communities.
kept_nodes = set(G.nodes())
n_clusters = len(pd.Series(data).unique())

scores = []
for res in obj:
    features = res[1]
    # Keep only the rows whose node is still present in G.
    # assumes row k belongs to node id k + 1 — TODO confirm
    X_data = [features[row_idx] for row_idx in range(len(features)) if (row_idx + 1) in kept_nodes]

    # K-means initialisation is random; a fixed seed makes the scores repeatable
    # across kernel restarts.
    y_data = KMeans(n_clusters=n_clusters, random_state=0).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [62]:
# Summary statistics (count, mean, std, quartiles, extremes) of the values in `scores`.
pd.Series(scores).describe()

count    21.000000
mean      0.648563
std       0.294960
min       0.011692
25%       0.576427
50%       0.779850
75%       0.862650
max       0.878494
dtype: float64

## Node2Vec

In [63]:
# Number of recorded runs: one per line of the info file that contains "Test".
with open("./n2v/n2v_info.txt", "r") as info_file:
    tests = sum(1 for entry in info_file if "Test" in entry)

In [64]:
# K-means on each Node2Vec embedding file, scored by NMI against the reference
# community labels in `data`.

# NOTE(review): `clf` is not used anywhere in this cell; kept so the name stays defined.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants, computed once: a set gives O(1) membership tests instead of
# scanning a freshly built node list for every row, and the cluster count is the
# number of distinct reference communities.
kept_nodes = set(G.nodes())
n_clusters = len(pd.Series(data).unique())

scores = []
for i in range(tests):
    rows = pd.read_csv("./n2v/n2v_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Keep only the rows whose node is still present in G.
    # assumes row k belongs to node id k + 1 — TODO confirm
    X_data = [vec for row_idx, vec in enumerate(rows) if (row_idx + 1) in kept_nodes]

    # K-means initialisation is random; a fixed seed makes the scores repeatable
    # across kernel restarts.
    y_data = KMeans(n_clusters=n_clusters, random_state=0).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [65]:
# Summary statistics (count, mean, std, quartiles, extremes) of the values in `scores`.
pd.Series(scores).describe()

count    24.000000
mean      0.922875
std       0.012164
min       0.900015
25%       0.912948
50%       0.924181
75%       0.933499
max       0.942530
dtype: float64

## DW

In [66]:
# Number of recorded runs: one per line of the info file that contains "Test".
with open("./dw/dw_info.txt", "r") as info_file:
    tests = sum(1 for entry in info_file if "Test" in entry)

In [67]:
# K-means on each DW embedding file, scored by NMI against the reference
# community labels in `data`.

# NOTE(review): `clf` is not used anywhere in this cell; kept so the name stays defined.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants, computed once: a set gives O(1) membership tests instead of
# scanning a freshly built node list for every row, and the cluster count is the
# number of distinct reference communities.
kept_nodes = set(G.nodes())
n_clusters = len(pd.Series(data).unique())

scores = []
for i in range(tests):
    rows = pd.read_csv("./dw/dw_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Keep only the rows whose node is still present in G.
    # assumes row k belongs to node id k + 1 — TODO confirm
    X_data = [vec for row_idx, vec in enumerate(rows) if (row_idx + 1) in kept_nodes]

    # K-means initialisation is random; a fixed seed makes the scores repeatable
    # across kernel restarts.
    y_data = KMeans(n_clusters=n_clusters, random_state=0).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [68]:
# Summary statistics (count, mean, std, quartiles, extremes) of the values in `scores`.
pd.Series(scores).describe()

count    15.000000
mean      0.857627
std       0.096292
min       0.578449
25%       0.798926
50%       0.895320
75%       0.923232
max       0.931262
dtype: float64

## MNMF

In [69]:
# Number of recorded runs: one per line of the info file that contains "Test".
with open("./mnmf/mnmf_info.txt", "r") as info_file:
    tests = sum(1 for entry in info_file if "Test" in entry)

In [70]:
# K-means on each MNMF embedding file, scored by NMI against the reference
# community labels in `data`.

# NOTE(review): `clf` is not used anywhere in this cell; kept so the name stays defined.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants, computed once: a set gives O(1) membership tests instead of
# scanning a freshly built node list for every row, and the cluster count is the
# number of distinct reference communities.
kept_nodes = set(G.nodes())
n_clusters = len(pd.Series(data).unique())

scores = []
for i in range(tests):
    rows = pd.read_csv("./mnmf/mnmf_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Keep only the rows whose node is still present in G.
    # assumes row k belongs to node id k + 1 — TODO confirm
    X_data = [vec for row_idx, vec in enumerate(rows) if (row_idx + 1) in kept_nodes]

    # K-means initialisation is random; a fixed seed makes the scores repeatable
    # across kernel restarts.
    y_data = KMeans(n_clusters=n_clusters, random_state=0).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [71]:
# Summary statistics (count, mean, std, quartiles, extremes) of the values in `scores`.
pd.Series(scores).describe()

count    10.000000
mean      0.836896
std       0.045449
min       0.725168
25%       0.823842
50%       0.847486
75%       0.856651
max       0.891506
dtype: float64

---
# Agglomerative clustering

## AVPRA pred

In [73]:
# Load the pickled AVPRA results.
# NOTE(review): unpickling can execute arbitrary code — only load this file if
# it comes from a trusted source.
obj = pd.read_pickle("./AVPRA_pred.pickled")

In [74]:
# Agglomerative clustering on each AVPRA result, scored by NMI against the
# reference community labels in `data`.

# NOTE(review): `clf` is not used anywhere in this cell; kept so the name stays defined.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants, computed once: a set gives O(1) membership tests instead of
# scanning a freshly built node list for every row, and the cluster count is the
# number of distinct reference communities.
kept_nodes = set(G.nodes())
n_clusters = len(pd.Series(data).unique())

scores = []
for res in obj:
    features = res[1]
    # Keep only the rows whose node is still present in G.
    # assumes row k belongs to node id k + 1 — TODO confirm
    X_data = [features[row_idx] for row_idx in range(len(features)) if (row_idx + 1) in kept_nodes]

    y_data = AgglomerativeClustering(n_clusters=n_clusters).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [75]:
# Summary statistics (count, mean, std, quartiles, extremes) of the values in `scores`.
pd.Series(scores).describe()

count    21.000000
mean      0.620134
std       0.326224
min       0.011692
25%       0.563714
50%       0.777862
75%       0.865559
max       0.876934
dtype: float64

## Node2Vec

In [76]:
# Number of recorded runs: one per line of the info file that contains "Test".
with open("./n2v/n2v_info.txt", "r") as info_file:
    tests = sum(1 for entry in info_file if "Test" in entry)

In [77]:
# Agglomerative clustering on each Node2Vec embedding file, scored by NMI
# against the reference community labels in `data`.

# NOTE(review): `clf` is not used anywhere in this cell; kept so the name stays defined.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants, computed once: a set gives O(1) membership tests instead of
# scanning a freshly built node list for every row, and the cluster count is the
# number of distinct reference communities.
kept_nodes = set(G.nodes())
n_clusters = len(pd.Series(data).unique())

scores = []
for i in range(tests):
    rows = pd.read_csv("./n2v/n2v_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Keep only the rows whose node is still present in G.
    # assumes row k belongs to node id k + 1 — TODO confirm
    X_data = [vec for row_idx, vec in enumerate(rows) if (row_idx + 1) in kept_nodes]

    y_data = AgglomerativeClustering(n_clusters=n_clusters).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [78]:
# Summary statistics (count, mean, std, quartiles, extremes) of the values in `scores`.
pd.Series(scores).describe()

count    24.000000
mean      0.849865
std       0.013639
min       0.813657
25%       0.843968
50%       0.850843
75%       0.857602
max       0.871736
dtype: float64

## DW

In [79]:
# Number of recorded runs: one per line of the info file that contains "Test".
with open("./dw/dw_info.txt", "r") as info_file:
    tests = sum(1 for entry in info_file if "Test" in entry)

In [80]:
# Agglomerative clustering on each DW embedding file, scored by NMI against the
# reference community labels in `data`.

# NOTE(review): `clf` is not used anywhere in this cell; kept so the name stays defined.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants, computed once: a set gives O(1) membership tests instead of
# scanning a freshly built node list for every row, and the cluster count is the
# number of distinct reference communities.
kept_nodes = set(G.nodes())
n_clusters = len(pd.Series(data).unique())

scores = []
for i in range(tests):
    rows = pd.read_csv("./dw/dw_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Keep only the rows whose node is still present in G.
    # assumes row k belongs to node id k + 1 — TODO confirm
    X_data = [vec for row_idx, vec in enumerate(rows) if (row_idx + 1) in kept_nodes]

    y_data = AgglomerativeClustering(n_clusters=n_clusters).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [81]:
# Summary statistics (count, mean, std, quartiles, extremes) of the values in `scores`.
pd.Series(scores).describe()

count    15.000000
mean      0.805443
std       0.066135
min       0.694520
25%       0.739168
50%       0.837289
75%       0.855612
max       0.878090
dtype: float64

## MNMF

In [82]:
# Number of recorded runs: one per line of the info file that contains "Test".
with open("./mnmf/mnmf_info.txt", "r") as info_file:
    tests = sum(1 for entry in info_file if "Test" in entry)

In [83]:
# Agglomerative clustering on each MNMF embedding file, scored by NMI against
# the reference community labels in `data`.

# NOTE(review): `clf` is not used anywhere in this cell; kept so the name stays defined.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants, computed once: a set gives O(1) membership tests instead of
# scanning a freshly built node list for every row, and the cluster count is the
# number of distinct reference communities.
kept_nodes = set(G.nodes())
n_clusters = len(pd.Series(data).unique())

scores = []
for i in range(tests):
    rows = pd.read_csv("./mnmf/mnmf_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Keep only the rows whose node is still present in G.
    # assumes row k belongs to node id k + 1 — TODO confirm
    X_data = [vec for row_idx, vec in enumerate(rows) if (row_idx + 1) in kept_nodes]

    y_data = AgglomerativeClustering(n_clusters=n_clusters).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [84]:
# Summary statistics (count, mean, std, quartiles, extremes) of the values in `scores`.
pd.Series(scores).describe()

count    10.000000
mean      0.824245
std       0.047157
min       0.700570
25%       0.823323
50%       0.836082
75%       0.850426
max       0.865114
dtype: float64