In [1]:
import networkx as nx
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.cluster import SpectralClustering, AgglomerativeClustering, KMeans
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt

In [2]:
# Load the directed graph from the edge list; node ids are parsed as ints.
G = nx.read_edgelist("./EDGES_FILE.csv", create_using=nx.DiGraph, nodetype=int)
# Independent working copy of G, made in memory instead of parsing the same
# file a second time. The next cell scans G while deleting nodes from H.
H = G.copy()

In [3]:
# Drop every node that has a self-loop. The nodes are looked up in G and
# deleted from H (loaded from the same edge list), so G is never mutated
# while it is being scanned. H then becomes the graph used from here on.
self_loop_nodes = list(nx.nodes_with_selfloops(G))
H.remove_nodes_from(self_loop_nodes)  # ids not present in H are skipped silently
G = H

In [4]:
# Number of nodes in the current graph G.
G.number_of_nodes()

765

In [5]:
# Number of directed edges in the current graph G.
G.number_of_edges()

50294

In [6]:
### Reading communities
# Header-less, ';'-separated file. Rows are ordered by column 0 and column 1
# is kept as a NumPy array (presumably column 0 = node id and column 1 =
# community label -- TODO confirm against the file).
comms_table = pd.read_csv("./PI_comms.csv", header=None, sep=";")
comms_sorted = comms_table.sort_values(by=0)
data = comms_sorted[1].values

---
# Spectral clustering

## AVPRA pred

In [8]:
# Load the pickled AVPRA results; the next cell iterates over `obj` and reads
# element [1] of each entry as the feature rows.
# SECURITY: unpickling can execute arbitrary code -- only load this file if it
# comes from a trusted source.
obj = pd.read_pickle("../AVPRA_pred.pickled")

In [9]:
# Spectral clustering of every AVPRA result, scored against the reference
# communities in `data` with normalized mutual information (NMI).
# NOTE(review): `clf` is never used by this cell; kept so the notebook
# namespace is unchanged.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants: computed once instead of once per row / once per run.
graph_nodes = set(G.nodes())
n_communities = len(pd.Series(data).unique())

scores = []
for res in obj:
    X_data = res[1]
    # Keep row idx only when node id idx + 1 is still in G
    # (assumes rows are ordered by 1-based node id -- TODO confirm).
    X_data = [X_data[idx] for idx in range(len(X_data)) if (idx + 1) in graph_nodes]

    y_data = SpectralClustering(n_clusters=n_communities, assign_labels='discretize', random_state=0).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) of the current
# `scores` list, which holds one NMI value per clustered AVPRA result.
pd.Series(scores).describe()

count    21.000000
mean      0.737572
std       0.207223
min       0.011947
25%       0.593828
50%       0.791030
75%       0.888262
max       0.892796
dtype: float64

## Node2Vec

In [18]:
# Count the lines of the Node2Vec info file that contain "Test"; the next cell
# uses `tests` as the number of embedding files to read.
# The path points at ./n2v/, the directory used by the embedding CSVs and by
# the other Node2Vec info-file reads in this notebook (only this cell used
# ../n2v/).
with open("./n2v/n2v_info.txt", "r") as f:
    lines = f.readlines()
tests = sum(1 for line in lines if "Test" in line)

In [17]:
# Spectral clustering of every Node2Vec embedding file, scored against the
# reference communities in `data` with normalized mutual information (NMI).
# NOTE(review): `clf` is never used by this cell; kept so the notebook
# namespace is unchanged.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants: computed once instead of once per row / once per file.
graph_nodes = set(G.nodes())
n_communities = len(pd.Series(data).unique())

scores = []
for i in range(tests):
    X_data = pd.read_csv("./n2v/n2v_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Keep row idx only when node id idx + 1 is still in G
    # (assumes rows are ordered by 1-based node id -- TODO confirm).
    # The comprehension index no longer reuses the outer loop name `i`.
    X_data = [row for idx, row in enumerate(X_data) if (idx + 1) in graph_nodes]

    y_data = SpectralClustering(n_clusters=n_communities,
                  assign_labels='discretize', random_state=0).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [16]:
# Summary statistics of the current `scores` list (one NMI value per
# Node2Vec embedding file clustered with spectral clustering).
pd.Series(scores).describe()

count    24.000000
mean      0.926852
std       0.009191
min       0.909761
25%       0.919890
50%       0.926979
75%       0.931694
max       0.946199
dtype: float64

## DW

In [17]:
# Count the lines of the DW info file that contain "Test"; the next cell uses
# `tests` as the number of embedding files to read.
with open("./dw/dw_info.txt", "r") as f:
    lines = f.readlines()
tests = sum(1 for line in lines if "Test" in line)

In [18]:
# Spectral clustering of every DW embedding file, scored against the
# reference communities in `data` with normalized mutual information (NMI).
# NOTE(review): `clf` is never used by this cell; kept so the notebook
# namespace is unchanged.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants: computed once instead of once per row / once per file.
graph_nodes = set(G.nodes())
n_communities = len(pd.Series(data).unique())

scores = []
for i in range(tests):
    X_data = pd.read_csv("./dw/dw_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Keep row idx only when node id idx + 1 is still in G
    # (assumes rows are ordered by 1-based node id -- TODO confirm).
    # The comprehension index no longer reuses the outer loop name `i`.
    X_data = [row for idx, row in enumerate(X_data) if (idx + 1) in graph_nodes]

    y_data = SpectralClustering(n_clusters=n_communities,
                  assign_labels='discretize', random_state=0).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [19]:
# Summary statistics of the current `scores` list (one NMI value per
# DW embedding file clustered with spectral clustering).
pd.Series(scores).describe()

count    15.000000
mean      0.903192
std       0.043959
min       0.782500
25%       0.905339
50%       0.921003
75%       0.927275
max       0.932670
dtype: float64

## MNMF

In [20]:
# Count the lines of the MNMF info file that contain "Test"; the next cell
# uses `tests` as the number of embedding files to read.
with open("./mnmf/mnmf_info.txt", "r") as f:
    lines = f.readlines()
tests = sum(1 for line in lines if "Test" in line)

In [21]:
# Spectral clustering of every MNMF embedding file, scored against the
# reference communities in `data` with normalized mutual information (NMI).
# NOTE(review): `clf` is never used by this cell; kept so the notebook
# namespace is unchanged.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants: computed once instead of once per row / once per file.
graph_nodes = set(G.nodes())
n_communities = len(pd.Series(data).unique())

scores = []
for i in range(tests):
    X_data = pd.read_csv("./mnmf/mnmf_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Keep row idx only when node id idx + 1 is still in G
    # (assumes rows are ordered by 1-based node id -- TODO confirm).
    # The comprehension index no longer reuses the outer loop name `i`.
    X_data = [row for idx, row in enumerate(X_data) if (idx + 1) in graph_nodes]

    y_data = SpectralClustering(n_clusters=n_communities,
                  assign_labels='discretize', random_state=0).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [22]:
# Summary statistics of the current `scores` list (one NMI value per
# MNMF embedding file clustered with spectral clustering).
pd.Series(scores).describe()

count    10.000000
mean      0.864333
std       0.055778
min       0.714297
25%       0.863564
50%       0.874481
75%       0.889222
max       0.909307
dtype: float64

---
# KMeans

## AVPRA pred

In [19]:
# Reload the pickled AVPRA results for the KMeans section (the same path is
# already loaded earlier in this notebook).
# SECURITY: unpickling can execute arbitrary code -- only load this file if it
# comes from a trusted source.
obj = pd.read_pickle("../AVPRA_pred.pickled")

In [20]:
# KMeans clustering of every AVPRA result, scored against the reference
# communities in `data` with normalized mutual information (NMI).
# NOTE(review): `clf` is never used by this cell; kept so the notebook
# namespace is unchanged.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants: computed once instead of once per row / once per run.
graph_nodes = set(G.nodes())
n_communities = len(pd.Series(data).unique())

scores = []
for res in obj:
    X_data = res[1]
    # Keep row idx only when node id idx + 1 is still in G
    # (assumes rows are ordered by 1-based node id -- TODO confirm).
    X_data = [X_data[idx] for idx in range(len(X_data)) if (idx + 1) in graph_nodes]

    # KMeans initialisation is random; the seed is fixed (same value the
    # spectral-clustering cells use) so the scores are reproducible.
    y_data = KMeans(n_clusters=n_communities, random_state=0).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [22]:
# Summary statistics of the current `scores` list (one NMI value per
# AVPRA result clustered with KMeans).
pd.Series(scores).describe()

count    21.000000
mean      0.650411
std       0.296213
min       0.011957
25%       0.576826
50%       0.776372
75%       0.866030
max       0.883197
dtype: float64

## Node2Vec

In [31]:
# Count the lines of the Node2Vec info file that contain "Test"; the next cell
# uses `tests` as the number of embedding files to read.
with open("./n2v/n2v_info.txt", "r") as f:
    lines = f.readlines()
tests = sum(1 for line in lines if "Test" in line)

In [32]:
# KMeans clustering of every Node2Vec embedding file, scored against the
# reference communities in `data` with normalized mutual information (NMI).
# NOTE(review): `clf` is never used by this cell; kept so the notebook
# namespace is unchanged.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants: computed once instead of once per row / once per file.
graph_nodes = set(G.nodes())
n_communities = len(pd.Series(data).unique())

scores = []
for i in range(tests):
    X_data = pd.read_csv("./n2v/n2v_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Keep row idx only when node id idx + 1 is still in G
    # (assumes rows are ordered by 1-based node id -- TODO confirm).
    # The comprehension index no longer reuses the outer loop name `i`.
    X_data = [row for idx, row in enumerate(X_data) if (idx + 1) in graph_nodes]

    # KMeans initialisation is random; the seed is fixed (same value the
    # spectral-clustering cells use) so the scores are reproducible.
    y_data = KMeans(n_clusters=n_communities, random_state=0).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [33]:
# Summary statistics of the current `scores` list (one NMI value per
# Node2Vec embedding file clustered with KMeans).
pd.Series(scores).describe()

count    24.000000
mean      0.932014
std       0.011377
min       0.910573
25%       0.923063
50%       0.933629
75%       0.940319
max       0.949023
dtype: float64

## DW

In [34]:
# Count the lines of the DW info file that contain "Test"; the next cell uses
# `tests` as the number of embedding files to read.
with open("./dw/dw_info.txt", "r") as f:
    lines = f.readlines()
tests = sum(1 for line in lines if "Test" in line)

In [35]:
# KMeans clustering of every DW embedding file, scored against the reference
# communities in `data` with normalized mutual information (NMI).
# NOTE(review): `clf` is never used by this cell; kept so the notebook
# namespace is unchanged.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants: computed once instead of once per row / once per file.
graph_nodes = set(G.nodes())
n_communities = len(pd.Series(data).unique())

scores = []
for i in range(tests):
    X_data = pd.read_csv("./dw/dw_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Keep row idx only when node id idx + 1 is still in G
    # (assumes rows are ordered by 1-based node id -- TODO confirm).
    # The comprehension index no longer reuses the outer loop name `i`.
    X_data = [row for idx, row in enumerate(X_data) if (idx + 1) in graph_nodes]

    # KMeans initialisation is random; the seed is fixed (same value the
    # spectral-clustering cells use) so the scores are reproducible.
    y_data = KMeans(n_clusters=n_communities, random_state=0).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [36]:
# Summary statistics of the current `scores` list (one NMI value per
# DW embedding file clustered with KMeans).
pd.Series(scores).describe()

count    15.000000
mean      0.926282
std       0.033712
min       0.818668
25%       0.918562
50%       0.934121
75%       0.939680
max       0.966026
dtype: float64

## MNMF

In [37]:
# Count the lines of the MNMF info file that contain "Test"; the next cell
# uses `tests` as the number of embedding files to read.
with open("./mnmf/mnmf_info.txt", "r") as f:
    lines = f.readlines()
tests = sum(1 for line in lines if "Test" in line)

In [38]:
# KMeans clustering of every MNMF embedding file, scored against the
# reference communities in `data` with normalized mutual information (NMI).
# NOTE(review): `clf` is never used by this cell; kept so the notebook
# namespace is unchanged.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants: computed once instead of once per row / once per file.
graph_nodes = set(G.nodes())
n_communities = len(pd.Series(data).unique())

scores = []
for i in range(tests):
    X_data = pd.read_csv("./mnmf/mnmf_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Keep row idx only when node id idx + 1 is still in G
    # (assumes rows are ordered by 1-based node id -- TODO confirm).
    # The comprehension index no longer reuses the outer loop name `i`.
    X_data = [row for idx, row in enumerate(X_data) if (idx + 1) in graph_nodes]

    # KMeans initialisation is random; the seed is fixed (same value the
    # spectral-clustering cells use) so the scores are reproducible.
    y_data = KMeans(n_clusters=n_communities, random_state=0).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [39]:
# Summary statistics of the current `scores` list (one NMI value per
# MNMF embedding file clustered with KMeans).
pd.Series(scores).describe()

count    10.000000
mean      0.867234
std       0.048764
min       0.738645
25%       0.861293
50%       0.878172
75%       0.889627
max       0.918142
dtype: float64

---
# Agglomerative clustering

## AVPRA pred

In [27]:
# Reload the pickled AVPRA results for the agglomerative-clustering section
# (the same path is already loaded earlier in this notebook).
# SECURITY: unpickling can execute arbitrary code -- only load this file if it
# comes from a trusted source.
obj = pd.read_pickle("../AVPRA_pred.pickled")

In [28]:
# Agglomerative clustering of every AVPRA result, scored against the
# reference communities in `data` with normalized mutual information (NMI).
# NOTE(review): `clf` is never used by this cell; kept so the notebook
# namespace is unchanged.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants: computed once instead of once per row / once per run.
graph_nodes = set(G.nodes())
n_communities = len(pd.Series(data).unique())

scores = []
for res in obj:
    X_data = res[1]
    # Keep row idx only when node id idx + 1 is still in G
    # (assumes rows are ordered by 1-based node id -- TODO confirm).
    X_data = [X_data[idx] for idx in range(len(X_data)) if (idx + 1) in graph_nodes]

    y_data = AgglomerativeClustering(n_clusters=n_communities).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [30]:
# Summary statistics of the current `scores` list (one NMI value per
# AVPRA result clustered with agglomerative clustering).
pd.Series(scores).describe()

count    21.000000
mean      0.624302
std       0.327552
min       0.011957
25%       0.576826
50%       0.777980
75%       0.874037
max       0.882575
dtype: float64

## Node2Vec

In [48]:
# Count the lines of the Node2Vec info file that contain "Test"; the next cell
# uses `tests` as the number of embedding files to read.
with open("./n2v/n2v_info.txt", "r") as f:
    lines = f.readlines()
tests = sum(1 for line in lines if "Test" in line)

In [49]:
# Agglomerative clustering of every Node2Vec embedding file, scored against
# the reference communities in `data` with normalized mutual information (NMI).
# NOTE(review): `clf` is never used by this cell; kept so the notebook
# namespace is unchanged.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants: computed once instead of once per row / once per file.
graph_nodes = set(G.nodes())
n_communities = len(pd.Series(data).unique())

scores = []
for i in range(tests):
    X_data = pd.read_csv("./n2v/n2v_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Keep row idx only when node id idx + 1 is still in G
    # (assumes rows are ordered by 1-based node id -- TODO confirm).
    # The comprehension index no longer reuses the outer loop name `i`.
    X_data = [row for idx, row in enumerate(X_data) if (idx + 1) in graph_nodes]

    y_data = AgglomerativeClustering(n_clusters=n_communities).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [50]:
# Summary statistics of the current `scores` list (one NMI value per
# Node2Vec embedding file clustered with agglomerative clustering).
pd.Series(scores).describe()

count    24.000000
mean      0.872850
std       0.012678
min       0.837588
25%       0.866319
50%       0.871789
75%       0.882491
max       0.894487
dtype: float64

## DW

In [51]:
# Count the lines of the DW info file that contain "Test"; the next cell uses
# `tests` as the number of embedding files to read.
with open("./dw/dw_info.txt", "r") as f:
    lines = f.readlines()
tests = sum(1 for line in lines if "Test" in line)

In [52]:
# Agglomerative clustering of every DW embedding file, scored against the
# reference communities in `data` with normalized mutual information (NMI).
# NOTE(review): `clf` is never used by this cell; kept so the notebook
# namespace is unchanged.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants: computed once instead of once per row / once per file.
graph_nodes = set(G.nodes())
n_communities = len(pd.Series(data).unique())

scores = []
for i in range(tests):
    X_data = pd.read_csv("./dw/dw_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Keep row idx only when node id idx + 1 is still in G
    # (assumes rows are ordered by 1-based node id -- TODO confirm).
    # The comprehension index no longer reuses the outer loop name `i`.
    X_data = [row for idx, row in enumerate(X_data) if (idx + 1) in graph_nodes]

    y_data = AgglomerativeClustering(n_clusters=n_communities).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [53]:
# Summary statistics of the current `scores` list (one NMI value per
# DW embedding file clustered with agglomerative clustering).
pd.Series(scores).describe()

count    15.000000
mean      0.867466
std       0.028090
min       0.796700
25%       0.855911
50%       0.871539
75%       0.882024
max       0.907870
dtype: float64

## MNMF

In [54]:
# Count the lines of the MNMF info file that contain "Test"; the next cell
# uses `tests` as the number of embedding files to read.
with open("./mnmf/mnmf_info.txt", "r") as f:
    lines = f.readlines()
tests = sum(1 for line in lines if "Test" in line)

In [55]:
# Agglomerative clustering of every MNMF embedding file, scored against the
# reference communities in `data` with normalized mutual information (NMI).
# NOTE(review): `clf` is never used by this cell; kept so the notebook
# namespace is unchanged.
clf = RandomForestClassifier(n_estimators=70)

# Loop invariants: computed once instead of once per row / once per file.
graph_nodes = set(G.nodes())
n_communities = len(pd.Series(data).unique())

scores = []
for i in range(tests):
    X_data = pd.read_csv("./mnmf/mnmf_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
    # Keep row idx only when node id idx + 1 is still in G
    # (assumes rows are ordered by 1-based node id -- TODO confirm).
    # The comprehension index no longer reuses the outer loop name `i`.
    X_data = [row for idx, row in enumerate(X_data) if (idx + 1) in graph_nodes]

    y_data = AgglomerativeClustering(n_clusters=n_communities).fit(X_data).labels_

    scores.append(normalized_mutual_info_score(data, y_data))

In [56]:
# Summary statistics of the current `scores` list (one NMI value per
# MNMF embedding file clustered with agglomerative clustering).
pd.Series(scores).describe()

count    10.000000
mean      0.845866
std       0.044883
min       0.729674
25%       0.845983
50%       0.864453
75%       0.869252
max       0.878041
dtype: float64