In [1]:
import pandas as pd
import networkx as nx
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.cluster import KMeans
import time

In [2]:
network = nx.read_edgelist("./HR_edges_norm.csv")
nodes = list(network.nodes())
len(nodes)

54573

# Louvain communities (resolution = 0.5)

In [3]:
comms = nx.algorithms.community.louvain_communities(network, resolution=0.5)
comms_dict = {}
for i in range(len(comms)):
    for k in comms[i]:
        comms_dict[k] = i
comms_l = [comms_dict[str(i)] for i in range(1, len(nodes) + 1)]

### Node2Vec

In [4]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./HR_n2v/HR_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [5]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./HR_n2v/HR_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 7.331599473953247
Iteration completed in 7.7656614780426025
Iteration completed in 7.692887306213379
Iteration completed in 8.207183599472046
Iteration completed in 7.267174243927002
Iteration completed in 7.914538145065308
Iteration completed in 7.7392308712005615
Iteration completed in 7.62525749206543
Iteration completed in 7.806779623031616
Iteration completed in 8.404552459716797
Iteration completed in 8.116411447525024
Iteration completed in 7.536559104919434
Iteration completed in 7.004827499389648
Iteration completed in 7.985484600067139
Iteration completed in 7.535995006561279
Iteration completed in 7.53463339805603
Iteration completed in 7.027822971343994
Iteration completed in 7.978514909744263
Iteration completed in 7.679731130599976
Iteration completed in 8.172043800354004
Iteration completed in 7.488661050796509
Iteration completed in 7.393095016479492
Iteration completed in 7.737670183181763
Iteration completed in 7.453378200531006


In [6]:
pd.Series(scores).describe()

count    24.000000
mean      0.693513
std       0.018151
min       0.646814
25%       0.686709
50%       0.698186
75%       0.705034
max       0.716896
dtype: float64

In [7]:
pd.DataFrame(list(zip(scores, exec_time, p, q, walk_num, walk_len)),
               columns =['NMI', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,p,q,walk_num,walk_len
9,0.646814,336.869814,0.5,0.5,80.0,5.0
1,0.659768,289.630887,1.0,1.0,80.0,5.0
3,0.669257,575.860253,1.0,1.0,80.0,10.0
4,0.671615,227.120782,0.5,1.0,40.0,5.0
19,0.67383,695.624031,1.0,2.0,80.0,10.0
18,0.680226,383.037323,1.0,2.0,40.0,10.0
6,0.68887,397.617808,0.5,1.0,40.0,10.0
21,0.689581,349.057279,2.0,1.0,80.0,5.0
16,0.692657,234.924095,1.0,2.0,40.0,5.0
7,0.693912,687.228601,0.5,1.0,80.0,10.0


### DeepWalk

In [8]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./HR_dw/HR_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [9]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./HR_dw/HR_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 6.879989862442017
Iteration completed in 6.695404052734375
Iteration completed in 5.786787033081055
Iteration completed in 5.9230241775512695
Iteration completed in 5.149111986160278
Iteration completed in 6.211912393569946
Iteration completed in 5.427591323852539
Iteration completed in 5.725321292877197
Iteration completed in 5.2873146533966064
Iteration completed in 5.594644069671631
Iteration completed in 5.745347738265991
Iteration completed in 5.780768394470215
Iteration completed in 5.908958911895752
Iteration completed in 5.734328746795654
Iteration completed in 6.013226270675659


In [10]:
pd.Series(scores).describe()

count    15.000000
mean      0.692844
std       0.029981
min       0.613713
25%       0.680944
50%       0.691713
75%       0.716735
max       0.727466
dtype: float64

In [11]:
pd.DataFrame(list(zip(scores, exec_time, walk_num, walk_len)),
               columns =['NMI', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,walk_num,walk_len
0,0.613713,38.241227,10.0,5.0
11,0.660093,1482.992684,80.0,20.0
9,0.67502,379.942767,20.0,20.0
3,0.675683,288.967294,80.0,5.0
1,0.686205,72.052979,20.0,5.0
6,0.689487,341.508353,40.0,10.0
7,0.690149,670.22348,80.0,10.0
10,0.691713,734.545967,40.0,20.0
2,0.692148,142.735676,40.0,5.0
12,0.709823,310.089163,10.0,30.0


### M-NMF

In [12]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./HR_mnmf/HR_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [13]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./HR_mnmf/HR_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 4.116183519363403
Iteration completed in 4.099889516830444
Iteration completed in 3.402350664138794
Iteration completed in 3.3349621295928955
Iteration completed in 3.7601921558380127
Iteration completed in 4.108396768569946
Iteration completed in 5.0509560108184814


In [14]:
pd.Series(scores).describe()

count    7.000000
mean     0.584646
std      0.054394
min      0.503195
25%      0.560721
50%      0.579595
75%      0.611228
max      0.665838
dtype: float64

In [15]:
pd.DataFrame(list(zip(scores, exec_time, dim, it)),
               columns =['NMI', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,Dimensions,Iterations
6,0.503195,18979.924733,64.0,100.0
5,0.560224,11782.491975,32.0,200.0
4,0.561218,6102.296241,32.0,100.0
0,0.579595,423.98246,8.0,100.0
1,0.580068,1231.952486,8.0,200.0
3,0.642387,3089.716942,16.0,200.0
2,0.665838,1873.740525,16.0,100.0


### DANMF

In [16]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./HR_danmf/HR_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [17]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./HR_danmf/HR_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 3.8698770999908447
Iteration completed in 3.432105302810669
Iteration completed in 3.6697311401367188
Iteration completed in 3.8753888607025146
Iteration completed in 3.7169392108917236
Iteration completed in 3.917273759841919
Iteration completed in 3.5640556812286377
Iteration completed in 3.744117498397827
Iteration completed in 4.081553936004639
Iteration completed in 4.548980236053467
Iteration completed in 4.313671588897705
Iteration completed in 4.114750623703003
Iteration completed in 3.5640857219696045
Iteration completed in 3.488253116607666
Iteration completed in 4.5316548347473145


In [18]:
pd.Series(scores).describe()

count    15.000000
mean      0.178366
std       0.020713
min       0.138327
25%       0.170892
50%       0.181866
75%       0.193840
max       0.201114
dtype: float64

In [19]:
pd.DataFrame(list(zip(scores, exec_time, lay, pre_it, it)),
               columns =['NMI', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,Layers,Pre-terations,Iterations
14,0.138327,2270.463403,"[128, 32]",50.0,100.0
11,0.144,1242.205607,"[128, 32]",100.0,100.0
10,0.145215,1088.522675,"[128, 32]",50.0,100.0
9,0.169514,775.608052,"[128, 32]",100.0,50.0
5,0.172271,382.492228,"[64, 16]",100.0,50.0
8,0.179421,615.958076,"[128, 32]",50.0,50.0
2,0.181451,268.152193,"[32, 8]",50.0,100.0
13,0.181866,1321.520216,"[64, 16]",100.0,50.0
0,0.188884,151.062392,"[32, 8]",50.0,50.0
7,0.191573,653.606651,"[64, 16]",100.0,100.0


### AVPRA

In [20]:
obj = pd.read_pickle("HR.pickled")

In [21]:
scores = []
for res in obj:
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(res[1])
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 4.5422093868255615
Iteration completed in 10.288382053375244
Iteration completed in 10.794194221496582
Iteration completed in 10.142043113708496
Iteration completed in 8.301552057266235
Iteration completed in 13.014867305755615
Iteration completed in 8.536734580993652
Iteration completed in 9.018270254135132
Iteration completed in 7.813661575317383
Iteration completed in 9.172207117080688
Iteration completed in 8.290274381637573
Iteration completed in 8.393621921539307
Iteration completed in 8.873093366622925
Iteration completed in 8.038248300552368
Iteration completed in 7.657997131347656
Iteration completed in 7.560784339904785
Iteration completed in 6.796103477478027
Iteration completed in 6.638578176498413
Iteration completed in 5.932850122451782
Iteration completed in 6.83058762550354
Iteration completed in 5.884646654129028


In [22]:
pd.Series(scores).describe()

count    21.000000
mean      0.271793
std       0.141890
min       0.004307
25%       0.177403
50%       0.291835
75%       0.403649
max       0.443808
dtype: float64

In [23]:
max(scores), (list(range(0, 10)) + list(range(10, 32, 2)))[scores.index(max(scores))]

(0.4438082813786204, 28)

# Leiden communities

In [24]:
### Identifying Leiden communities
from cdlib import algorithms
comms = algorithms.leiden(network)

Note: to be able to use all crisp methods, you need to install some additional packages:  {'graph_tool', 'wurlitzer'}
Note: to be able to use all overlapping methods, you need to install some additional packages:  {'ASLPAw'}
Note: to be able to use all bipartite methods, you need to install some additional packages:  {'wurlitzer'}


In [25]:
comms_dict = comms.to_node_community_map()
comms_dict_ok = {}
for node in nodes:
    comms_dict_ok[node] = comms_dict[node][0]
comms_dict = comms_dict_ok
comms_l = [comms_dict[str(i)] for i in range(1, len(nodes) + 1)]
comms = list(pd.Series(comms_l).unique())

### Node2Vec

In [26]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./HR_n2v/HR_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [27]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./HR_n2v/HR_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 10.842753648757935
Iteration completed in 9.393211603164673
Iteration completed in 9.450714826583862
Iteration completed in 15.647930383682251
Iteration completed in 13.868258953094482
Iteration completed in 10.558783292770386
Iteration completed in 11.820250988006592
Iteration completed in 11.773685932159424
Iteration completed in 9.442829608917236
Iteration completed in 10.349641561508179
Iteration completed in 10.021692037582397
Iteration completed in 12.367534399032593
Iteration completed in 9.77647852897644
Iteration completed in 10.783809661865234
Iteration completed in 9.563246965408325
Iteration completed in 11.334004402160645
Iteration completed in 11.112586259841919
Iteration completed in 11.670174837112427
Iteration completed in 10.6818208694458
Iteration completed in 11.392046928405762
Iteration completed in 9.39042592048645
Iteration completed in 9.700617551803589
Iteration completed in 9.870014190673828
Iteration completed in 9.688677072525024


In [28]:
pd.Series(scores).describe()

count    24.000000
mean      0.743911
std       0.011745
min       0.726088
25%       0.734651
50%       0.743062
75%       0.754545
max       0.763007
dtype: float64

In [29]:
pd.DataFrame(list(zip(scores, exec_time, p, q, walk_num, walk_len)),
               columns =['NMI', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,p,q,walk_num,walk_len
2,0.726088,331.992402,1.0,1.0,40.0,10.0
5,0.726393,349.895019,0.5,1.0,80.0,5.0
3,0.730027,575.860253,1.0,1.0,80.0,10.0
1,0.732179,289.630887,1.0,1.0,80.0,5.0
22,0.733378,400.727971,2.0,1.0,40.0,10.0
14,0.734508,397.794394,1.0,0.5,40.0,10.0
19,0.734699,695.624031,1.0,2.0,80.0,10.0
21,0.735235,349.057279,2.0,1.0,80.0,5.0
17,0.735693,340.624793,1.0,2.0,80.0,5.0
9,0.736265,336.869814,0.5,0.5,80.0,5.0


### DeepWalk

In [30]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./HR_dw/HR_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [31]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./HR_dw/HR_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 9.447956085205078
Iteration completed in 7.680588006973267
Iteration completed in 8.184749603271484
Iteration completed in 7.3932812213897705
Iteration completed in 7.0993921756744385
Iteration completed in 6.734792470932007
Iteration completed in 7.201589822769165
Iteration completed in 7.298688650131226
Iteration completed in 7.130801200866699
Iteration completed in 8.313819646835327
Iteration completed in 7.527082920074463
Iteration completed in 7.510753631591797
Iteration completed in 6.990624904632568
Iteration completed in 7.300150632858276
Iteration completed in 7.432892322540283


In [32]:
pd.Series(scores).describe()

count    15.000000
mean      0.739018
std       0.030852
min       0.634941
25%       0.741707
50%       0.746017
75%       0.754416
max       0.763608
dtype: float64

In [33]:
pd.DataFrame(list(zip(scores, exec_time, walk_num, walk_len)),
               columns =['NMI', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,walk_num,walk_len
0,0.634941,38.241227,10.0,5.0
1,0.720453,72.052979,20.0,5.0
13,0.729,699.778587,20.0,30.0
10,0.741226,734.545967,40.0,20.0
9,0.742187,379.942767,20.0,20.0
8,0.744241,188.168018,10.0,20.0
6,0.744477,341.508353,40.0,10.0
3,0.746017,288.967294,80.0,5.0
11,0.746409,1482.992684,80.0,20.0
2,0.748329,142.735676,40.0,5.0


### M-NMF

In [34]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./HR_mnmf/HR_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [35]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./HR_mnmf/HR_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 6.510114669799805
Iteration completed in 6.850919246673584
Iteration completed in 5.6729772090911865
Iteration completed in 5.4232096672058105
Iteration completed in 5.209472179412842
Iteration completed in 4.796982049942017
Iteration completed in 6.191199541091919


In [36]:
pd.Series(scores).describe()

count    7.000000
mean     0.622413
std      0.047168
min      0.565423
25%      0.577216
50%      0.643318
75%      0.658274
max      0.677170
dtype: float64

In [37]:
pd.DataFrame(list(zip(scores, exec_time, dim, it)),
               columns =['NMI', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,Dimensions,Iterations
6,0.565423,18979.924733,64.0,100.0
1,0.576034,1231.952486,8.0,200.0
0,0.578398,423.98246,8.0,100.0
5,0.643318,11782.491975,32.0,200.0
2,0.656621,1873.740525,16.0,100.0
3,0.659927,3089.716942,16.0,200.0
4,0.67717,6102.296241,32.0,100.0


### DANMF

In [38]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./HR_danmf/HR_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [39]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./HR_danmf/HR_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 5.165361166000366
Iteration completed in 5.801207780838013
Iteration completed in 4.73697304725647
Iteration completed in 4.635841608047485
Iteration completed in 4.585819482803345
Iteration completed in 4.813719987869263
Iteration completed in 4.689239501953125
Iteration completed in 4.880083322525024
Iteration completed in 5.233222961425781
Iteration completed in 6.515018701553345
Iteration completed in 5.459798574447632
Iteration completed in 5.426512002944946
Iteration completed in 4.518737316131592
Iteration completed in 4.7049877643585205
Iteration completed in 5.697383880615234


In [40]:
pd.Series(scores).describe()

count    15.000000
mean      0.229996
std       0.014230
min       0.207794
25%       0.221607
50%       0.226242
75%       0.242982
max       0.253431
dtype: float64

In [41]:
pd.DataFrame(list(zip(scores, exec_time, lay, pre_it, it)),
               columns =['NMI', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,Layers,Pre-terations,Iterations
2,0.207794,268.152193,"[32, 8]",50.0,100.0
0,0.212062,151.062392,"[32, 8]",50.0,50.0
14,0.214423,2270.463403,"[128, 32]",50.0,100.0
8,0.220817,615.958076,"[128, 32]",50.0,50.0
10,0.222397,1088.522675,"[128, 32]",50.0,100.0
11,0.222578,1242.205607,"[128, 32]",100.0,100.0
1,0.225419,164.564006,"[32, 8]",100.0,50.0
9,0.226242,775.608052,"[128, 32]",100.0,50.0
12,0.23022,560.116846,"[32, 8]",50.0,50.0
3,0.232795,278.286608,"[32, 8]",100.0,100.0


### AVPRA

In [42]:
obj = pd.read_pickle("HR.pickled")

In [43]:
scores = []
for res in obj:
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(res[1])
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 5.91072940826416
Iteration completed in 17.722856283187866
Iteration completed in 14.562046766281128
Iteration completed in 13.523773431777954
Iteration completed in 13.171441316604614
Iteration completed in 12.517785549163818
Iteration completed in 11.143616199493408
Iteration completed in 11.796430587768555
Iteration completed in 12.180866241455078
Iteration completed in 11.074919700622559
Iteration completed in 12.153450012207031
Iteration completed in 10.714354991912842
Iteration completed in 9.595331192016602
Iteration completed in 9.568687677383423
Iteration completed in 8.844791889190674
Iteration completed in 8.337555170059204
Iteration completed in 9.8772132396698
Iteration completed in 11.351671934127808
Iteration completed in 8.968091487884521
Iteration completed in 9.197301626205444
Iteration completed in 8.531840562820435


In [44]:
pd.Series(scores).describe()

count    21.000000
mean      0.315302
std       0.157648
min       0.006362
25%       0.204387
50%       0.362907
75%       0.454024
max       0.472775
dtype: float64

In [45]:
max(scores), (list(range(0, 10)) + list(range(10, 32, 2)))[scores.index(max(scores))]

(0.47277496103941996, 30)

# Standard Louvain

In [46]:
comms = nx.algorithms.community.louvain_communities(network)
comms_dict = {}
for i in range(len(comms)):
    for k in comms[i]:
        comms_dict[k] = i
comms_l = [comms_dict[str(i)] for i in range(1, len(nodes) + 1)]

### Node2Vec

In [47]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./HR_n2v/HR_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [48]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./HR_n2v/HR_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 7.723774433135986
Iteration completed in 8.43609070777893
Iteration completed in 8.008605480194092
Iteration completed in 10.673197507858276
Iteration completed in 8.19767689704895
Iteration completed in 8.877493143081665
Iteration completed in 8.634859085083008
Iteration completed in 9.051877498626709
Iteration completed in 7.612519264221191
Iteration completed in 8.614978075027466
Iteration completed in 9.703436374664307
Iteration completed in 8.304780960083008
Iteration completed in 7.675602436065674
Iteration completed in 8.56263542175293
Iteration completed in 8.373178958892822
Iteration completed in 8.440598249435425
Iteration completed in 8.859447717666626
Iteration completed in 10.155167818069458
Iteration completed in 8.544877290725708
Iteration completed in 9.33073115348816
Iteration completed in 9.048550128936768
Iteration completed in 8.942191123962402
Iteration completed in 8.754749059677124
Iteration completed in 9.309175252914429


In [49]:
pd.Series(scores).describe()

count    24.000000
mean      0.713009
std       0.016205
min       0.664110
25%       0.709347
50%       0.712898
75%       0.723076
max       0.737054
dtype: float64

In [50]:
pd.DataFrame(list(zip(scores, exec_time, p, q, walk_num, walk_len)),
               columns =['NMI', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,p,q,walk_num,walk_len
5,0.66411,349.895019,0.5,1.0,80.0,5.0
22,0.688027,400.727971,2.0,1.0,40.0,10.0
1,0.694371,289.630887,1.0,1.0,80.0,5.0
17,0.697139,340.624793,1.0,2.0,80.0,5.0
19,0.702277,695.624031,1.0,2.0,80.0,10.0
6,0.708936,397.617808,0.5,1.0,40.0,10.0
0,0.709484,181.323636,1.0,1.0,40.0,5.0
2,0.71033,331.992402,1.0,1.0,40.0,10.0
4,0.710796,227.120782,0.5,1.0,40.0,5.0
9,0.711072,336.869814,0.5,0.5,80.0,5.0


### DeepWalk

In [51]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./HR_dw/HR_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [52]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./HR_dw/HR_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 10.1090829372406
Iteration completed in 6.982439041137695
Iteration completed in 6.0959153175354
Iteration completed in 6.407828092575073
Iteration completed in 6.106272459030151
Iteration completed in 6.261347532272339
Iteration completed in 5.993591070175171
Iteration completed in 5.83030891418457
Iteration completed in 7.431742429733276
Iteration completed in 6.10495400428772
Iteration completed in 5.982682228088379
Iteration completed in 6.62027645111084
Iteration completed in 6.705248117446899
Iteration completed in 6.540389060974121
Iteration completed in 6.378283262252808


In [53]:
pd.Series(scores).describe()

count    15.000000
mean      0.712887
std       0.031083
min       0.619383
25%       0.703060
50%       0.722224
75%       0.729070
max       0.750982
dtype: float64

In [54]:
pd.DataFrame(list(zip(scores, exec_time, walk_num, walk_len)),
               columns =['NMI', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,walk_num,walk_len
0,0.619383,38.241227,10.0,5.0
1,0.685029,72.052979,20.0,5.0
13,0.695593,699.778587,20.0,30.0
6,0.696022,341.508353,40.0,10.0
2,0.710099,142.735676,40.0,5.0
3,0.7146,288.967294,80.0,5.0
14,0.721718,1403.005183,40.0,30.0
12,0.722224,310.089163,10.0,30.0
9,0.724239,379.942767,20.0,20.0
7,0.725745,670.22348,80.0,10.0


### M-NMF

In [55]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./HR_mnmf/HR_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [56]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./HR_mnmf/HR_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 6.420207977294922
Iteration completed in 5.195116281509399
Iteration completed in 4.257173299789429
Iteration completed in 4.4861226081848145
Iteration completed in 4.165724039077759
Iteration completed in 4.23779559135437
Iteration completed in 5.66154408454895


In [57]:
pd.Series(scores).describe()

count    7.000000
mean     0.604183
std      0.045864
min      0.542218
25%      0.562656
50%      0.635734
75%      0.640076
max      0.645866
dtype: float64

In [58]:
pd.DataFrame(list(zip(scores, exec_time, dim, it)),
               columns =['NMI', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,Dimensions,Iterations
6,0.542218,18979.924733,64.0,100.0
1,0.560877,1231.952486,8.0,200.0
0,0.564434,423.98246,8.0,100.0
4,0.635734,6102.296241,32.0,100.0
5,0.637231,11782.491975,32.0,200.0
3,0.64292,3089.716942,16.0,200.0
2,0.645866,1873.740525,16.0,100.0


### DANMF

In [59]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./HR_danmf/HR_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [60]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./HR_danmf/HR_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 5.346450567245483
Iteration completed in 4.098658561706543
Iteration completed in 3.8401408195495605
Iteration completed in 4.1244940757751465
Iteration completed in 4.304577589035034
Iteration completed in 4.415486574172974
Iteration completed in 4.215148687362671
Iteration completed in 5.341899394989014
Iteration completed in 4.6489996910095215
Iteration completed in 4.782965660095215
Iteration completed in 4.603335857391357
Iteration completed in 4.693725824356079
Iteration completed in 4.023404359817505
Iteration completed in 4.163527011871338
Iteration completed in 5.866884708404541


In [61]:
pd.Series(scores).describe()

count    15.000000
mean      0.225286
std       0.018272
min       0.189156
25%       0.214202
50%       0.224071
75%       0.241991
max       0.253102
dtype: float64

In [62]:
pd.DataFrame(list(zip(scores, exec_time, lay, pre_it, it)),
               columns =['NMI', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,Layers,Pre-terations,Iterations
9,0.189156,775.608052,"[128, 32]",100.0,50.0
2,0.208011,268.152193,"[32, 8]",50.0,100.0
0,0.210246,151.062392,"[32, 8]",50.0,50.0
14,0.213698,2270.463403,"[128, 32]",50.0,100.0
10,0.214705,1088.522675,"[128, 32]",50.0,100.0
8,0.214718,615.958076,"[128, 32]",50.0,50.0
11,0.214886,1242.205607,"[128, 32]",100.0,100.0
12,0.224071,560.116846,"[32, 8]",50.0,50.0
1,0.227309,164.564006,"[32, 8]",100.0,50.0
3,0.229707,278.286608,"[32, 8]",100.0,100.0


### AVPRA

In [63]:
obj = pd.read_pickle("HR.pickled")

In [64]:
scores = []
for res in obj:
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(res[1])
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 5.91971230506897
Iteration completed in 14.413098335266113
Iteration completed in 12.814177513122559
Iteration completed in 13.218384027481079
Iteration completed in 14.002095460891724
Iteration completed in 10.241180419921875
Iteration completed in 12.63731074333191
Iteration completed in 10.438766241073608
Iteration completed in 12.012093544006348
Iteration completed in 12.016782999038696
Iteration completed in 9.613791227340698
Iteration completed in 9.45266079902649
Iteration completed in 8.725764989852905
Iteration completed in 8.725199699401855
Iteration completed in 10.98530650138855
Iteration completed in 9.91573429107666
Iteration completed in 8.122362852096558
Iteration completed in 8.714717388153076
Iteration completed in 9.112902641296387
Iteration completed in 10.616956949234009
Iteration completed in 8.107795476913452


In [65]:
pd.Series(scores).describe()

count    21.000000
mean      0.303908
std       0.152447
min       0.006207
25%       0.208002
50%       0.348698
75%       0.442792
max       0.476735
dtype: float64

In [66]:
max(scores), (list(range(0, 10)) + list(range(10, 32, 2)))[scores.index(max(scores))]

(0.4767346337451851, 30)