In [1]:
import pandas as pd
import networkx as nx
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.cluster import KMeans
import time

In [2]:
# Load the graph from its edge list; read_edgelist keeps node labels as strings by default.
network = nx.read_edgelist("./HU_edges_norm.csv")
# Materialise the node labels once; their count is reused when label vectors are built.
nodes = [*network.nodes()]
# Number of nodes in the graph.
len(nodes)

47538

# Louvain communities (resolution = 0.5)

In [3]:
# Reference partition: Louvain communities at resolution 0.5.
# Louvain is randomised; a fixed seed keeps the partition (and every NMI score
# computed against it) reproducible across kernel restarts.
comms = nx.algorithms.community.louvain_communities(network, resolution=0.5, seed=42)
# Map each node label to the index of the community that contains it.
comms_dict = {node: idx for idx, members in enumerate(comms) for node in members}
# One community index per node, ordered by node id "1".."N" (labels are strings,
# hence str(i)). A KeyError here means the node labels are not exactly 1..N.
# NOTE(review): presumably the embedding rows follow the same node order — confirm.
comms_l = [comms_dict[str(i)] for i in range(1, len(nodes) + 1)]

### Node2Vec

In [4]:
### Parse the Node2Vec run log: number of runs, per-run time and hyper-parameters
with open("./HU_n2v/HU_n2v_info.txt", "r") as f:
    lines = f.readlines()

# Every line containing "Test" counts as one run.
tests_num = sum("Test" in line for line in lines)
# Values are cut out at fixed character offsets, so the slices depend on the
# exact layout of the log lines.
exec_time = [float(line[31:-2]) for line in lines if "Embedding" in line]
p = [float(line[3:]) for line in lines if "p:" in line]
q = [float(line[3:]) for line in lines if "q:" in line]
walk_len = [float(line[13:-1]) for line in lines if "walk_length:" in line]
walk_num = [float(line[9:-1]) for line in lines if "num_walk:" in line]

In [5]:
# Cluster each Node2Vec embedding with k-means (k = number of reference
# communities) and score the clustering against the reference labels with NMI.
scores = []
for i in range(tests_num):
    start_time = time.time()
    # One ';'-separated, header-less CSV of embedding vectors per run.
    embedding = pd.read_csv(f"./HU_n2v/HU_emb_vectors{i}.csv", header=None, delimiter=";")
    # k-means initialisation is random: a fixed random_state makes the NMI
    # reproducible. The array is passed directly instead of being converted
    # to nested Python lists first.
    kmeans = KMeans(n_clusters=len(comms), random_state=42).fit(embedding.to_numpy())
    clusters = kmeans.labels_
    # NOTE(review): assumes row r of the CSV belongs to node r + 1, matching comms_l — confirm.
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 12.028431415557861
Iteration completed in 9.056847333908081
Iteration completed in 6.97290301322937
Iteration completed in 7.858198404312134
Iteration completed in 6.8143393993377686
Iteration completed in 7.303172588348389
Iteration completed in 7.8601603507995605
Iteration completed in 7.327114582061768
Iteration completed in 7.4711151123046875
Iteration completed in 6.921138048171997
Iteration completed in 6.735317945480347
Iteration completed in 6.787053108215332
Iteration completed in 7.327625751495361
Iteration completed in 6.949277639389038
Iteration completed in 7.4474382400512695
Iteration completed in 7.383962154388428
Iteration completed in 6.961954116821289
Iteration completed in 6.729768514633179
Iteration completed in 7.218799114227295
Iteration completed in 6.879249811172485
Iteration completed in 6.595710277557373
Iteration completed in 7.079077959060669
Iteration completed in 7.52295446395874
Iteration completed in 7.086114883422852


In [6]:
# Summary statistics of the NMI scores over all Node2Vec runs.
nmi_scores = pd.Series(scores)
nmi_scores.describe()

count    24.000000
mean      0.564445
std       0.018576
min       0.526067
25%       0.547891
50%       0.567265
75%       0.580527
max       0.593120
dtype: float64

In [7]:
# One row per Node2Vec run: NMI next to run time and hyper-parameters, lowest NMI first.
# zip() stops at the shortest list, so rows only line up if every list has one entry per run.
rows = list(zip(scores, exec_time, p, q, walk_num, walk_len))
column_names = ['NMI', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']
pd.DataFrame(rows, columns=column_names).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,p,q,walk_num,walk_len
5,0.526067,219.425555,0.5,1.0,80.0,5.0
7,0.537533,485.59117,0.5,1.0,80.0,10.0
16,0.539821,124.044357,1.0,2.0,40.0,5.0
10,0.543001,254.258841,0.5,0.5,40.0,10.0
22,0.546588,248.845797,2.0,1.0,40.0,10.0
9,0.546816,214.085608,0.5,0.5,80.0,5.0
21,0.548249,217.007686,2.0,1.0,80.0,5.0
17,0.5529,208.078459,1.0,2.0,80.0,5.0
1,0.556306,192.255094,1.0,1.0,80.0,5.0
4,0.56094,125.554072,0.5,1.0,40.0,5.0


### DeepWalk

In [8]:
### Parse the DeepWalk run log: number of runs, per-run time and walk settings
with open("./HU_dw/HU_dw_info.txt", "r") as f:
    lines = f.readlines()

# Every line containing "Test" counts as one run.
tests_num = sum("Test" in line for line in lines)
# Values are cut out at fixed character offsets, so the slices depend on the
# exact layout of the log lines.
exec_time = [float(line[31:-2]) for line in lines if "Embedding" in line]
walk_num = [float(line[9:-1]) for line in lines if "num_walk:" in line]
walk_len = [float(line[13:-1]) for line in lines if "walk_length:" in line]

In [9]:
# Cluster each DeepWalk embedding with k-means (k = number of reference
# communities) and score the clustering against the reference labels with NMI.
scores = []
for i in range(tests_num):
    start_time = time.time()
    # One ';'-separated, header-less CSV of embedding vectors per run.
    embedding = pd.read_csv(f"./HU_dw/HU_emb_vectors{i}.csv", header=None, delimiter=";")
    # k-means initialisation is random: a fixed random_state makes the NMI
    # reproducible. The array is passed directly instead of being converted
    # to nested Python lists first.
    kmeans = KMeans(n_clusters=len(comms), random_state=42).fit(embedding.to_numpy())
    clusters = kmeans.labels_
    # NOTE(review): assumes row r of the CSV belongs to node r + 1, matching comms_l — confirm.
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 7.156540870666504
Iteration completed in 5.332842826843262
Iteration completed in 5.432244777679443
Iteration completed in 4.8408544063568115
Iteration completed in 5.13166618347168
Iteration completed in 4.865458726882935
Iteration completed in 4.9569432735443115
Iteration completed in 4.903080701828003
Iteration completed in 4.868448495864868
Iteration completed in 4.842025518417358
Iteration completed in 5.087102651596069
Iteration completed in 4.714668035507202
Iteration completed in 5.218735933303833
Iteration completed in 4.624604940414429
Iteration completed in 4.933584928512573


In [10]:
# Summary statistics of the NMI scores over all DeepWalk runs.
nmi_scores = pd.Series(scores)
nmi_scores.describe()

count    15.000000
mean      0.554751
std       0.069009
min       0.319676
25%       0.561377
50%       0.571788
75%       0.584464
max       0.604623
dtype: float64

In [11]:
# One row per DeepWalk run: NMI next to run time and walk settings, lowest NMI first.
# zip() stops at the shortest list, so rows only line up if every list has one entry per run.
rows = list(zip(scores, exec_time, walk_num, walk_len))
column_names = ['NMI', 'Exec time', 'walk_num', 'walk_len']
pd.DataFrame(rows, columns=column_names).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,walk_num,walk_len
0,0.319676,36.506493,10.0,5.0
1,0.504856,68.210582,20.0,5.0
3,0.554651,266.070136,80.0,5.0
2,0.561104,133.75116,40.0,5.0
4,0.56165,87.543224,10.0,10.0
7,0.565262,681.768971,80.0,10.0
11,0.56692,1184.07436,80.0,20.0
10,0.571788,623.870464,40.0,20.0
13,0.574594,476.520527,20.0,30.0
14,0.580729,972.867248,40.0,30.0


### M-NMF

In [12]:
### Parse the M-NMF run log: number of runs, per-run time, dimensions and iterations
with open("./HU_mnmf/HU_mnmf_info.txt", "r") as f:
    lines = f.readlines()

# Every line containing "Test" counts as one run.
tests_num = sum("Test" in line for line in lines)
# Values are cut out at fixed character offsets, so the slices depend on the
# exact layout of the log lines.
exec_time = [float(line[31:-2]) for line in lines if "Embedding" in line]
dim = [float(line[12:]) for line in lines if "dimensions" in line]
it = [float(line[12:]) for line in lines if "iterations:" in line]

In [13]:
# Cluster each M-NMF embedding with k-means (k = number of reference
# communities) and score the clustering against the reference labels with NMI.
scores = []
for i in range(tests_num):
    start_time = time.time()
    # One ';'-separated, header-less CSV of embedding vectors per run.
    embedding = pd.read_csv(f"./HU_mnmf/HU_emb_vectors{i}.csv", header=None, delimiter=";")
    # k-means initialisation is random: a fixed random_state makes the NMI
    # reproducible. The array is passed directly instead of being converted
    # to nested Python lists first.
    kmeans = KMeans(n_clusters=len(comms), random_state=42).fit(embedding.to_numpy())
    clusters = kmeans.labels_
    # NOTE(review): assumes row r of the CSV belongs to node r + 1, matching comms_l — confirm.
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 2.7319841384887695
Iteration completed in 2.883859157562256
Iteration completed in 2.7270898818969727
Iteration completed in 2.669318437576294
Iteration completed in 3.2785675525665283
Iteration completed in 3.431370258331299
Iteration completed in 4.074173212051392
Iteration completed in 4.103356599807739


In [14]:
# Summary statistics of the NMI scores over all M-NMF runs.
nmi_scores = pd.Series(scores)
nmi_scores.describe()

count    8.000000
mean     0.399678
std      0.079331
min      0.275918
25%      0.347306
50%      0.425289
75%      0.452991
max      0.494852
dtype: float64

In [15]:
# One row per M-NMF run: NMI next to run time, dimensions and iterations, lowest NMI first.
# zip() stops at the shortest list, so rows only line up if every list has one entry per run.
rows = list(zip(scores, exec_time, dim, it))
column_names = ['NMI', 'Exec time', 'Dimensions', 'Iterations']
pd.DataFrame(rows, columns=column_names).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,Dimensions,Iterations
6,0.275918,3373.030116,64.0,100.0
7,0.299121,5875.845202,64.0,200.0
5,0.363367,2139.761206,32.0,200.0
4,0.413088,1211.796356,32.0,100.0
0,0.43749,95.28386,8.0,100.0
1,0.449189,285.491338,8.0,200.0
3,0.464397,701.013073,16.0,200.0
2,0.494852,421.509037,16.0,100.0


### DANMF

In [16]:
### Parse the DANMF run log: number of runs, per-run time, layers and iteration counts
with open("./HU_danmf/HU_danmf_info.txt", "r") as f:
    lines = f.readlines()

# Every line containing "Test" counts as one run.
tests_num = sum("Test" in line for line in lines)
# Values are cut out at fixed character offsets, so the slices depend on the
# exact layout of the log lines.
exec_time = [float(line[31:-2]) for line in lines if "Embedding" in line]
# Layers are kept as the raw text after the key (not parsed into numbers).
lay = [line[8:-1] for line in lines if "layers:" in line]
pre_it = [float(line[16:-1]) for line in lines if "pre_iterations:" in line]
# "iterations:" is a substring of "pre_iterations:", so those lines are excluded here.
it = [
    float(line[11:-1])
    for line in lines
    if "iterations:" in line and "pre_iterations" not in line
]

In [17]:
# Cluster each DANMF embedding with k-means (k = number of reference
# communities) and score the clustering against the reference labels with NMI.
scores = []
for i in range(tests_num):
    start_time = time.time()
    # One ';'-separated, header-less CSV of embedding vectors per run.
    embedding = pd.read_csv(f"./HU_danmf/HU_emb_vectors{i}.csv", header=None, delimiter=";")
    # k-means initialisation is random: a fixed random_state makes the NMI
    # reproducible. The array is passed directly instead of being converted
    # to nested Python lists first.
    kmeans = KMeans(n_clusters=len(comms), random_state=42).fit(embedding.to_numpy())
    clusters = kmeans.labels_
    # NOTE(review): assumes row r of the CSV belongs to node r + 1, matching comms_l — confirm.
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 2.653794765472412
Iteration completed in 2.827075719833374
Iteration completed in 2.564420461654663
Iteration completed in 2.7988288402557373
Iteration completed in 2.9529922008514404
Iteration completed in 3.2909326553344727
Iteration completed in 3.1967074871063232
Iteration completed in 3.185584306716919
Iteration completed in 3.782071828842163
Iteration completed in 4.49941086769104
Iteration completed in 3.9415977001190186
Iteration completed in 3.7725577354431152
Iteration completed in 3.253748655319214
Iteration completed in 3.420621156692505
Iteration completed in 4.189456224441528


In [18]:
# Summary statistics of the NMI scores over all DANMF runs.
nmi_scores = pd.Series(scores)
nmi_scores.describe()

count    15.000000
mean      0.054202
std       0.011787
min       0.033869
25%       0.045134
50%       0.058743
75%       0.062478
max       0.072087
dtype: float64

In [19]:
# One row per DANMF run: NMI next to run time, layers and iteration counts, lowest NMI first.
# zip() stops at the shortest list, so rows only line up if every list has one entry per run.
# Column label corrected from the misspelt 'Pre-terations'.
rows = list(zip(scores, exec_time, lay, pre_it, it))
column_names = ['NMI', 'Exec time', 'Layers', 'Pre-iterations', 'Iterations']
pd.DataFrame(rows, columns=column_names).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,Layers,Pre-terations,Iterations
9,0.033869,438.769644,"[128, 32]",100.0,50.0
10,0.037903,453.801099,"[128, 32]",50.0,100.0
2,0.03841,108.151318,"[32, 8]",50.0,100.0
0,0.041557,59.408877,"[32, 8]",50.0,50.0
8,0.048711,295.975242,"[128, 32]",50.0,50.0
3,0.051341,115.569006,"[32, 8]",100.0,100.0
5,0.054563,174.576857,"[64, 16]",100.0,50.0
11,0.058743,591.802661,"[128, 32]",100.0,100.0
7,0.059059,254.67847,"[64, 16]",100.0,100.0
6,0.060119,220.051627,"[64, 16]",50.0,100.0


### AVPRA

In [20]:
# Load the pickled results used in the AVPRA section.
# NOTE(security): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
# NOTE(review): presumably a sequence of per-run results whose element [1] is
# the per-node feature matrix (that is what the next cell feeds to KMeans) — confirm.
obj = pd.read_pickle("HU.pickled")

In [21]:
# Cluster each AVPRA result with k-means (k = number of reference communities)
# and score the clustering against the reference labels with NMI.
scores = []
for res in obj:
    start_time = time.time()
    # res[1] is the matrix handed to k-means.
    # NOTE(review): presumably one row per node in the same order as comms_l — confirm.
    # k-means initialisation is random: a fixed random_state makes the NMI reproducible.
    kmeans = KMeans(n_clusters=len(comms), random_state=42).fit(res[1])
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 3.8694090843200684
Iteration completed in 6.477968215942383
Iteration completed in 10.752213716506958
Iteration completed in 6.856592416763306
Iteration completed in 7.4364423751831055
Iteration completed in 6.49326753616333
Iteration completed in 7.081757068634033
Iteration completed in 6.529452323913574
Iteration completed in 6.27727198600769
Iteration completed in 6.799944639205933
Iteration completed in 5.7529473304748535
Iteration completed in 4.89371919631958
Iteration completed in 5.027863502502441
Iteration completed in 6.778052568435669
Iteration completed in 5.713987588882446
Iteration completed in 5.471696615219116
Iteration completed in 5.529600620269775
Iteration completed in 5.04375147819519
Iteration completed in 4.877014636993408
Iteration completed in 4.458091735839844
Iteration completed in 4.6913673877716064


In [22]:
# Summary statistics of the NMI scores over all AVPRA results.
nmi_scores = pd.Series(scores)
nmi_scores.describe()

count    21.000000
mean      0.094791
std       0.072985
min       0.001710
25%       0.032083
50%       0.071565
75%       0.167964
max       0.202565
dtype: float64

In [23]:
# Best NMI and the grid value at the same position as that score.
# Grid: 0..9 in steps of 1, then 10..30 in steps of 2.
# NOTE(review): presumably the parameter value used for each AVPRA result — confirm.
best_score = max(scores)
param_grid = [*range(0, 10), *range(10, 32, 2)]
best_score, param_grid[scores.index(best_score)]

(0.20256519393760458, 26)

# Leiden communities

In [24]:
### Identifying Leiden communities
# cdlib is imported here rather than in the top import cell; the import prints
# notes about optional packages that are not installed.
from cdlib import algorithms
# Rebinds `comms` (previously the Louvain partition) to the Leiden result for `network`.
comms = algorithms.leiden(network)

Note: to be able to use all crisp methods, you need to install some additional packages:  {'graph_tool', 'wurlitzer'}
Note: to be able to use all overlapping methods, you need to install some additional packages:  {'ASLPAw'}
Note: to be able to use all bipartite methods, you need to install some additional packages:  {'wurlitzer'}


In [25]:
# Flatten the Leiden result into one community id per node.
# Each entry of the node -> communities map is indexed with [0], i.e. only the
# first community listed for a node is kept.
node_to_comms = comms.to_node_community_map()
comms_dict_ok = {node: node_to_comms[node][0] for node in nodes}
comms_dict = comms_dict_ok
# Labels ordered by node id "1".."N" (node labels are strings, hence str()).
comms_l = [comms_dict[str(node_id)] for node_id in range(1, len(nodes) + 1)]
# From here on `comms` is the list of distinct community ids, so len(comms) is
# the number of communities. Because `comms` is rebound, this cell cannot be
# re-run without first re-running the Leiden cell.
comms = list(pd.Series(comms_l).unique())

### Node2Vec

In [26]:
### Parse the Node2Vec run log: number of runs, per-run time and hyper-parameters
with open("./HU_n2v/HU_n2v_info.txt", "r") as f:
    lines = f.readlines()

# Every line containing "Test" counts as one run.
tests_num = sum("Test" in line for line in lines)
# Values are cut out at fixed character offsets, so the slices depend on the
# exact layout of the log lines.
exec_time = [float(line[31:-2]) for line in lines if "Embedding" in line]
p = [float(line[3:]) for line in lines if "p:" in line]
q = [float(line[3:]) for line in lines if "q:" in line]
walk_len = [float(line[13:-1]) for line in lines if "walk_length:" in line]
walk_num = [float(line[9:-1]) for line in lines if "num_walk:" in line]

In [27]:
# Cluster each Node2Vec embedding with k-means (k = number of reference
# communities) and score the clustering against the reference labels with NMI.
scores = []
for i in range(tests_num):
    start_time = time.time()
    # One ';'-separated, header-less CSV of embedding vectors per run.
    embedding = pd.read_csv(f"./HU_n2v/HU_emb_vectors{i}.csv", header=None, delimiter=";")
    # k-means initialisation is random: a fixed random_state makes the NMI
    # reproducible. The array is passed directly instead of being converted
    # to nested Python lists first.
    kmeans = KMeans(n_clusters=len(comms), random_state=42).fit(embedding.to_numpy())
    clusters = kmeans.labels_
    # NOTE(review): assumes row r of the CSV belongs to node r + 1, matching comms_l — confirm.
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 10.238068342208862
Iteration completed in 10.056778907775879
Iteration completed in 10.534879922866821
Iteration completed in 10.391219139099121
Iteration completed in 10.231378316879272
Iteration completed in 12.408665180206299
Iteration completed in 12.2688729763031
Iteration completed in 13.00015902519226
Iteration completed in 11.247504472732544
Iteration completed in 9.978865385055542
Iteration completed in 10.53772234916687
Iteration completed in 11.534544944763184
Iteration completed in 11.865022420883179
Iteration completed in 10.5569908618927
Iteration completed in 11.916696786880493
Iteration completed in 10.696899175643921
Iteration completed in 11.817073345184326
Iteration completed in 10.580912113189697
Iteration completed in 10.426180839538574
Iteration completed in 12.358215093612671
Iteration completed in 10.31209397315979
Iteration completed in 11.21254014968872
Iteration completed in 12.098194122314453
Iteration completed in 11.228184461593628


In [28]:
# Summary statistics of the NMI scores over all Node2Vec runs.
nmi_scores = pd.Series(scores)
nmi_scores.describe()

count    24.000000
mean      0.648476
std       0.012891
min       0.628041
25%       0.638541
50%       0.646665
75%       0.657219
max       0.679190
dtype: float64

In [29]:
# One row per Node2Vec run: NMI next to run time and hyper-parameters, lowest NMI first.
# zip() stops at the shortest list, so rows only line up if every list has one entry per run.
rows = list(zip(scores, exec_time, p, q, walk_num, walk_len))
column_names = ['NMI', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']
pd.DataFrame(rows, columns=column_names).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,p,q,walk_num,walk_len
18,0.628041,248.378643,1.0,2.0,40.0,10.0
5,0.630813,219.425555,0.5,1.0,80.0,5.0
14,0.631256,257.42323,1.0,0.5,40.0,10.0
21,0.635697,217.007686,2.0,1.0,80.0,5.0
19,0.637213,461.019866,1.0,2.0,80.0,10.0
2,0.637914,235.38871,1.0,1.0,40.0,10.0
10,0.63875,254.258841,0.5,0.5,40.0,10.0
4,0.639974,125.554072,0.5,1.0,40.0,5.0
1,0.64114,192.255094,1.0,1.0,80.0,5.0
17,0.645039,208.078459,1.0,2.0,80.0,5.0


### DeepWalk

In [30]:
### Parse the DeepWalk run log: number of runs, per-run time and walk settings
with open("./HU_dw/HU_dw_info.txt", "r") as f:
    lines = f.readlines()

# Every line containing "Test" counts as one run.
tests_num = sum("Test" in line for line in lines)
# Values are cut out at fixed character offsets, so the slices depend on the
# exact layout of the log lines.
exec_time = [float(line[31:-2]) for line in lines if "Embedding" in line]
walk_num = [float(line[9:-1]) for line in lines if "num_walk:" in line]
walk_len = [float(line[13:-1]) for line in lines if "walk_length:" in line]

In [31]:
# Cluster each DeepWalk embedding with k-means (k = number of reference
# communities) and score the clustering against the reference labels with NMI.
scores = []
for i in range(tests_num):
    start_time = time.time()
    # One ';'-separated, header-less CSV of embedding vectors per run.
    embedding = pd.read_csv(f"./HU_dw/HU_emb_vectors{i}.csv", header=None, delimiter=";")
    # k-means initialisation is random: a fixed random_state makes the NMI
    # reproducible. The array is passed directly instead of being converted
    # to nested Python lists first.
    kmeans = KMeans(n_clusters=len(comms), random_state=42).fit(embedding.to_numpy())
    clusters = kmeans.labels_
    # NOTE(review): assumes row r of the CSV belongs to node r + 1, matching comms_l — confirm.
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 11.483800649642944
Iteration completed in 8.117223978042603
Iteration completed in 8.0434889793396
Iteration completed in 8.334552526473999
Iteration completed in 7.813381195068359
Iteration completed in 7.482179164886475
Iteration completed in 9.01228666305542
Iteration completed in 7.148815393447876
Iteration completed in 7.239252328872681
Iteration completed in 7.259500741958618
Iteration completed in 7.939733505249023
Iteration completed in 8.373581886291504
Iteration completed in 7.563658714294434
Iteration completed in 8.860290050506592
Iteration completed in 7.58933687210083


In [32]:
# Summary statistics of the NMI scores over all DeepWalk runs.
nmi_scores = pd.Series(scores)
nmi_scores.describe()

count    15.000000
mean      0.628482
std       0.063369
min       0.421994
25%       0.627409
50%       0.648384
75%       0.660413
max       0.683439
dtype: float64

In [33]:
# One row per DeepWalk run: NMI next to run time and walk settings, lowest NMI first.
# zip() stops at the shortest list, so rows only line up if every list has one entry per run.
rows = list(zip(scores, exec_time, walk_num, walk_len))
column_names = ['NMI', 'Exec time', 'walk_num', 'walk_len']
pd.DataFrame(rows, columns=column_names).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,walk_num,walk_len
0,0.421994,36.506493,10.0,5.0
1,0.571351,68.210582,20.0,5.0
2,0.606254,133.75116,40.0,5.0
3,0.618345,266.070136,80.0,5.0
4,0.636472,87.543224,10.0,10.0
5,0.639122,170.240475,20.0,10.0
8,0.64804,185.26939,10.0,20.0
7,0.648384,681.768971,80.0,10.0
11,0.649761,1184.07436,80.0,20.0
10,0.654942,623.870464,40.0,20.0


### M-NMF

In [34]:
### Parse the M-NMF run log: number of runs, per-run time, dimensions and iterations
with open("./HU_mnmf/HU_mnmf_info.txt", "r") as f:
    lines = f.readlines()

# Every line containing "Test" counts as one run.
tests_num = sum("Test" in line for line in lines)
# Values are cut out at fixed character offsets, so the slices depend on the
# exact layout of the log lines.
exec_time = [float(line[31:-2]) for line in lines if "Embedding" in line]
dim = [float(line[12:]) for line in lines if "dimensions" in line]
it = [float(line[12:]) for line in lines if "iterations:" in line]

In [35]:
# Cluster each M-NMF embedding with k-means (k = number of reference
# communities) and score the clustering against the reference labels with NMI.
scores = []
for i in range(tests_num):
    start_time = time.time()
    # One ';'-separated, header-less CSV of embedding vectors per run.
    embedding = pd.read_csv(f"./HU_mnmf/HU_emb_vectors{i}.csv", header=None, delimiter=";")
    # k-means initialisation is random: a fixed random_state makes the NMI
    # reproducible. The array is passed directly instead of being converted
    # to nested Python lists first.
    kmeans = KMeans(n_clusters=len(comms), random_state=42).fit(embedding.to_numpy())
    clusters = kmeans.labels_
    # NOTE(review): assumes row r of the CSV belongs to node r + 1, matching comms_l — confirm.
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 5.769985914230347
Iteration completed in 5.050445318222046
Iteration completed in 4.631993293762207
Iteration completed in 4.666110515594482
Iteration completed in 4.890218734741211
Iteration completed in 5.467757940292358
Iteration completed in 5.70485258102417
Iteration completed in 5.867469072341919


In [36]:
# Summary statistics of the NMI scores over all M-NMF runs.
nmi_scores = pd.Series(scores)
nmi_scores.describe()

count    8.000000
mean     0.538887
std      0.059121
min      0.466761
25%      0.490495
50%      0.534351
75%      0.588744
max      0.606991
dtype: float64

In [37]:
# One row per M-NMF run: NMI next to run time, dimensions and iterations, lowest NMI first.
# zip() stops at the shortest list, so rows only line up if every list has one entry per run.
rows = list(zip(scores, exec_time, dim, it))
column_names = ['NMI', 'Exec time', 'Dimensions', 'Iterations']
pd.DataFrame(rows, columns=column_names).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,Dimensions,Iterations
0,0.466761,95.28386,8.0,100.0
1,0.490175,285.491338,8.0,200.0
7,0.490601,5875.845202,64.0,200.0
6,0.491889,3373.030116,64.0,100.0
2,0.576813,421.509037,16.0,100.0
3,0.583557,701.013073,16.0,200.0
5,0.604307,2139.761206,32.0,200.0
4,0.606991,1211.796356,32.0,100.0


### DANMF

In [38]:
### Parse the DANMF run log: number of runs, per-run time, layers and iteration counts
with open("./HU_danmf/HU_danmf_info.txt", "r") as f:
    lines = f.readlines()

# Every line containing "Test" counts as one run.
tests_num = sum("Test" in line for line in lines)
# Values are cut out at fixed character offsets, so the slices depend on the
# exact layout of the log lines.
exec_time = [float(line[31:-2]) for line in lines if "Embedding" in line]
# Layers are kept as the raw text after the key (not parsed into numbers).
lay = [line[8:-1] for line in lines if "layers:" in line]
pre_it = [float(line[16:-1]) for line in lines if "pre_iterations:" in line]
# "iterations:" is a substring of "pre_iterations:", so those lines are excluded here.
it = [
    float(line[11:-1])
    for line in lines
    if "iterations:" in line and "pre_iterations" not in line
]

In [39]:
# Cluster each DANMF embedding with k-means (k = number of reference
# communities) and score the clustering against the reference labels with NMI.
scores = []
for i in range(tests_num):
    start_time = time.time()
    # One ';'-separated, header-less CSV of embedding vectors per run.
    embedding = pd.read_csv(f"./HU_danmf/HU_emb_vectors{i}.csv", header=None, delimiter=";")
    # k-means initialisation is random: a fixed random_state makes the NMI
    # reproducible. The array is passed directly instead of being converted
    # to nested Python lists first.
    kmeans = KMeans(n_clusters=len(comms), random_state=42).fit(embedding.to_numpy())
    clusters = kmeans.labels_
    # NOTE(review): assumes row r of the CSV belongs to node r + 1, matching comms_l — confirm.
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 4.2203733921051025
Iteration completed in 4.66333794593811
Iteration completed in 4.1436543464660645
Iteration completed in 4.351579189300537
Iteration completed in 4.59461236000061
Iteration completed in 5.4830710887908936
Iteration completed in 4.5650694370269775
Iteration completed in 4.5558764934539795
Iteration completed in 5.013767957687378
Iteration completed in 4.846024513244629
Iteration completed in 4.824814081192017
Iteration completed in 4.9342145919799805
Iteration completed in 4.03500509262085
Iteration completed in 5.543548107147217
Iteration completed in 4.6848978996276855


In [40]:
# Summary statistics of the NMI scores over all DANMF runs.
nmi_scores = pd.Series(scores)
nmi_scores.describe()

count    15.000000
mean      0.134939
std       0.021864
min       0.092852
25%       0.128829
50%       0.136761
75%       0.143701
max       0.168628
dtype: float64

In [41]:
# One row per DANMF run: NMI next to run time, layers and iteration counts, lowest NMI first.
# zip() stops at the shortest list, so rows only line up if every list has one entry per run.
# Column label corrected from the misspelt 'Pre-terations'.
rows = list(zip(scores, exec_time, lay, pre_it, it))
column_names = ['NMI', 'Exec time', 'Layers', 'Pre-iterations', 'Iterations']
pd.DataFrame(rows, columns=column_names).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,Layers,Pre-terations,Iterations
2,0.092852,108.151318,"[32, 8]",50.0,100.0
0,0.099066,59.408877,"[32, 8]",50.0,50.0
3,0.106873,115.569006,"[32, 8]",100.0,100.0
1,0.124199,71.049077,"[32, 8]",100.0,50.0
10,0.13346,453.801099,"[128, 32]",50.0,100.0
12,0.135323,216.932015,"[32, 8]",50.0,50.0
5,0.136604,174.576857,"[64, 16]",100.0,50.0
13,0.136761,516.619871,"[64, 16]",100.0,50.0
8,0.142028,295.975242,"[128, 32]",50.0,50.0
11,0.14207,591.802661,"[128, 32]",100.0,100.0


### AVPRA

In [42]:
# Reload the pickled results used in the AVPRA section (same file as loaded
# earlier in the notebook).
# NOTE(security): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
# NOTE(review): presumably a sequence of per-run results whose element [1] is
# the per-node feature matrix (that is what the next cell feeds to KMeans) — confirm.
obj = pd.read_pickle("HU.pickled")

In [43]:
# Cluster each AVPRA result with k-means (k = number of reference communities)
# and score the clustering against the reference labels with NMI.
scores = []
for res in obj:
    start_time = time.time()
    # res[1] is the matrix handed to k-means.
    # NOTE(review): presumably one row per node in the same order as comms_l — confirm.
    # k-means initialisation is random: a fixed random_state makes the NMI reproducible.
    kmeans = KMeans(n_clusters=len(comms), random_state=42).fit(res[1])
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 5.931546211242676
Iteration completed in 13.769572019577026
Iteration completed in 16.151962995529175
Iteration completed in 14.329388856887817
Iteration completed in 15.666730642318726
Iteration completed in 15.771321296691895
Iteration completed in 14.649524927139282
Iteration completed in 12.014907836914062
Iteration completed in 14.026641130447388
Iteration completed in 11.689974784851074
Iteration completed in 11.289877891540527
Iteration completed in 10.052744626998901
Iteration completed in 10.540457010269165
Iteration completed in 9.207524538040161
Iteration completed in 9.919158935546875
Iteration completed in 8.346552610397339
Iteration completed in 8.637201309204102
Iteration completed in 8.798230409622192
Iteration completed in 8.68148136138916
Iteration completed in 7.635501146316528
Iteration completed in 7.44167947769165


In [44]:
# Summary statistics of the NMI scores over all AVPRA results.
nmi_scores = pd.Series(scores)
nmi_scores.describe()

count    21.000000
mean      0.146480
std       0.093534
min       0.005819
25%       0.072440
50%       0.129157
75%       0.231064
max       0.300771
dtype: float64

In [45]:
# Best NMI and the grid value at the same position as that score.
# Grid: 0..9 in steps of 1, then 10..30 in steps of 2.
# NOTE(review): presumably the parameter value used for each AVPRA result — confirm.
best_score = max(scores)
param_grid = [*range(0, 10), *range(10, 32, 2)]
best_score, param_grid[scores.index(best_score)]

(0.3007713670575483, 30)

# Standard Louvain

In [46]:
# Reference partition: Louvain communities with the default resolution.
# Louvain is randomised; a fixed seed keeps the partition (and every NMI score
# computed against it) reproducible across kernel restarts.
comms = nx.algorithms.community.louvain_communities(network, seed=42)
# Map each node label to the index of the community that contains it.
comms_dict = {node: idx for idx, members in enumerate(comms) for node in members}
# One community index per node, ordered by node id "1".."N" (labels are strings,
# hence str(i)). A KeyError here means the node labels are not exactly 1..N.
# NOTE(review): presumably the embedding rows follow the same node order — confirm.
comms_l = [comms_dict[str(i)] for i in range(1, len(nodes) + 1)]

### Node2Vec

In [47]:
### Parse the Node2Vec run log: number of runs, per-run time and hyper-parameters
with open("./HU_n2v/HU_n2v_info.txt", "r") as f:
    lines = f.readlines()

# Every line containing "Test" counts as one run.
tests_num = sum("Test" in line for line in lines)
# Values are cut out at fixed character offsets, so the slices depend on the
# exact layout of the log lines.
exec_time = [float(line[31:-2]) for line in lines if "Embedding" in line]
p = [float(line[3:]) for line in lines if "p:" in line]
q = [float(line[3:]) for line in lines if "q:" in line]
walk_len = [float(line[13:-1]) for line in lines if "walk_length:" in line]
walk_num = [float(line[9:-1]) for line in lines if "num_walk:" in line]

In [48]:
# Cluster each Node2Vec embedding with k-means (k = number of reference
# communities) and score the clustering against the reference labels with NMI.
scores = []
for i in range(tests_num):
    start_time = time.time()
    # One ';'-separated, header-less CSV of embedding vectors per run.
    embedding = pd.read_csv(f"./HU_n2v/HU_emb_vectors{i}.csv", header=None, delimiter=";")
    # k-means initialisation is random: a fixed random_state makes the NMI
    # reproducible. The array is passed directly instead of being converted
    # to nested Python lists first.
    kmeans = KMeans(n_clusters=len(comms), random_state=42).fit(embedding.to_numpy())
    clusters = kmeans.labels_
    # NOTE(review): assumes row r of the CSV belongs to node r + 1, matching comms_l — confirm.
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 11.007846593856812
Iteration completed in 10.196662425994873
Iteration completed in 13.03965711593628
Iteration completed in 11.87326979637146
Iteration completed in 10.913231372833252
Iteration completed in 11.31676721572876
Iteration completed in 10.597161054611206
Iteration completed in 11.940989255905151
Iteration completed in 10.352142095565796
Iteration completed in 10.567330598831177
Iteration completed in 11.932131052017212
Iteration completed in 11.216755867004395
Iteration completed in 10.315903902053833
Iteration completed in 12.105885028839111
Iteration completed in 12.001434326171875
Iteration completed in 9.925824880599976
Iteration completed in 11.711879968643188
Iteration completed in 10.872157096862793
Iteration completed in 10.609456777572632
Iteration completed in 11.697978496551514
Iteration completed in 9.380534172058105
Iteration completed in 12.098865509033203
Iteration completed in 10.66870641708374
Iteration completed in 11.56436562538147

In [49]:
# Summary statistics of the NMI scores over all Node2Vec runs.
nmi_scores = pd.Series(scores)
nmi_scores.describe()

count    24.000000
mean      0.601676
std       0.010657
min       0.584326
25%       0.594251
50%       0.601215
75%       0.609278
max       0.626622
dtype: float64

In [50]:
# One row per Node2Vec run: NMI next to run time and hyper-parameters, lowest NMI first.
# zip() stops at the shortest list, so rows only line up if every list has one entry per run.
rows = list(zip(scores, exec_time, p, q, walk_num, walk_len))
column_names = ['NMI', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']
pd.DataFrame(rows, columns=column_names).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,p,q,walk_num,walk_len
9,0.584326,214.085608,0.5,0.5,80.0,5.0
17,0.586529,208.078459,1.0,2.0,80.0,5.0
13,0.587402,214.618617,1.0,0.5,80.0,5.0
19,0.59113,461.019866,1.0,2.0,80.0,10.0
21,0.591533,217.007686,2.0,1.0,80.0,5.0
8,0.59362,120.73153,0.5,0.5,40.0,5.0
16,0.594462,124.044357,1.0,2.0,40.0,5.0
15,0.594752,499.899741,1.0,0.5,80.0,10.0
3,0.595125,436.778427,1.0,1.0,80.0,10.0
5,0.595811,219.425555,0.5,1.0,80.0,5.0


### DeepWalk

In [51]:
### Parse the DeepWalk run log: number of runs, per-run time and walk settings
with open("./HU_dw/HU_dw_info.txt", "r") as f:
    lines = f.readlines()

# Every line containing "Test" counts as one run.
tests_num = sum("Test" in line for line in lines)
# Values are cut out at fixed character offsets, so the slices depend on the
# exact layout of the log lines.
exec_time = [float(line[31:-2]) for line in lines if "Embedding" in line]
walk_num = [float(line[9:-1]) for line in lines if "num_walk:" in line]
walk_len = [float(line[13:-1]) for line in lines if "walk_length:" in line]

In [52]:
# Cluster each DeepWalk embedding with k-means (k = number of reference
# communities) and score the clustering against the reference labels with NMI.
scores = []
for i in range(tests_num):
    start_time = time.time()
    # One ';'-separated, header-less CSV of embedding vectors per run.
    embedding = pd.read_csv(f"./HU_dw/HU_emb_vectors{i}.csv", header=None, delimiter=";")
    # k-means initialisation is random: a fixed random_state makes the NMI
    # reproducible. The array is passed directly instead of being converted
    # to nested Python lists first.
    kmeans = KMeans(n_clusters=len(comms), random_state=42).fit(embedding.to_numpy())
    clusters = kmeans.labels_
    # NOTE(review): assumes row r of the CSV belongs to node r + 1, matching comms_l — confirm.
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 11.477624416351318
Iteration completed in 8.8224618434906
Iteration completed in 9.112504959106445
Iteration completed in 7.831369876861572
Iteration completed in 8.887439966201782
Iteration completed in 6.841326951980591
Iteration completed in 7.100483655929565
Iteration completed in 7.300417900085449
Iteration completed in 7.064269781112671
Iteration completed in 7.981332063674927
Iteration completed in 7.542966604232788
Iteration completed in 9.163360595703125
Iteration completed in 7.463568210601807
Iteration completed in 6.932739496231079
Iteration completed in 7.220555782318115


In [53]:
# Summary statistics of the NMI scores over all DeepWalk runs.
nmi_scores = pd.Series(scores)
nmi_scores.describe()

count    15.000000
mean      0.585410
std       0.056299
min       0.400262
25%       0.591415
50%       0.605200
75%       0.613030
max       0.619329
dtype: float64

In [54]:
# One row per DeepWalk run: NMI next to run time and walk settings, lowest NMI first.
# zip() stops at the shortest list, so rows only line up if every list has one entry per run.
rows = list(zip(scores, exec_time, walk_num, walk_len))
column_names = ['NMI', 'Exec time', 'walk_num', 'walk_len']
pd.DataFrame(rows, columns=column_names).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,walk_num,walk_len
0,0.400262,36.506493,10.0,5.0
1,0.525365,68.210582,20.0,5.0
2,0.575279,133.75116,40.0,5.0
3,0.589316,266.070136,80.0,5.0
5,0.593514,170.240475,20.0,10.0
10,0.597965,623.870464,40.0,20.0
13,0.60193,476.520527,20.0,30.0
7,0.6052,681.768971,80.0,10.0
11,0.606994,1184.07436,80.0,20.0
6,0.611693,334.773742,40.0,10.0


### M-NMF

In [55]:
### Parse the M-NMF run log: number of runs, per-run time, dimensions and iterations
with open("./HU_mnmf/HU_mnmf_info.txt", "r") as f:
    lines = f.readlines()

# Every line containing "Test" counts as one run.
tests_num = sum("Test" in line for line in lines)
# Values are cut out at fixed character offsets, so the slices depend on the
# exact layout of the log lines.
exec_time = [float(line[31:-2]) for line in lines if "Embedding" in line]
dim = [float(line[12:]) for line in lines if "dimensions" in line]
it = [float(line[12:]) for line in lines if "iterations:" in line]

In [56]:
# Cluster each M-NMF embedding with k-means (k = number of reference
# communities) and score the clustering against the reference labels with NMI.
scores = []
for i in range(tests_num):
    start_time = time.time()
    # One ';'-separated, header-less CSV of embedding vectors per run.
    embedding = pd.read_csv(f"./HU_mnmf/HU_emb_vectors{i}.csv", header=None, delimiter=";")
    # k-means initialisation is random: a fixed random_state makes the NMI
    # reproducible. The array is passed directly instead of being converted
    # to nested Python lists first.
    kmeans = KMeans(n_clusters=len(comms), random_state=42).fit(embedding.to_numpy())
    clusters = kmeans.labels_
    # NOTE(review): assumes row r of the CSV belongs to node r + 1, matching comms_l — confirm.
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 5.130364656448364
Iteration completed in 5.462433576583862
Iteration completed in 5.023857593536377
Iteration completed in 4.775134563446045
Iteration completed in 5.940056800842285
Iteration completed in 4.649792909622192
Iteration completed in 5.7215070724487305
Iteration completed in 5.685013771057129


In [57]:
# Summary statistics of the NMI scores over all M-NMF runs.
nmi_scores = pd.Series(scores)
nmi_scores.describe()

count    8.000000
mean     0.505487
std      0.055111
min      0.437006
25%      0.454691
50%      0.509484
75%      0.549239
max      0.572641
dtype: float64

In [58]:
# One row per M-NMF run: NMI next to run time, dimensions and iterations, lowest NMI first.
# zip() stops at the shortest list, so rows only line up if every list has one entry per run.
rows = list(zip(scores, exec_time, dim, it))
column_names = ['NMI', 'Exec time', 'Dimensions', 'Iterations']
pd.DataFrame(rows, columns=column_names).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,Dimensions,Iterations
0,0.437006,95.28386,8.0,100.0
7,0.453415,5875.845202,64.0,200.0
1,0.455117,285.491338,8.0,200.0
6,0.477413,3373.030116,64.0,100.0
2,0.541554,421.509037,16.0,100.0
3,0.545103,701.013073,16.0,200.0
4,0.561647,1211.796356,32.0,100.0
5,0.572641,2139.761206,32.0,200.0


### DANMF

In [59]:
### Parse the DANMF run log: number of runs, per-run time, layers and iteration counts
with open("./HU_danmf/HU_danmf_info.txt", "r") as f:
    lines = f.readlines()

# Every line containing "Test" counts as one run.
tests_num = sum("Test" in line for line in lines)
# Values are cut out at fixed character offsets, so the slices depend on the
# exact layout of the log lines.
exec_time = [float(line[31:-2]) for line in lines if "Embedding" in line]
# Layers are kept as the raw text after the key (not parsed into numbers).
lay = [line[8:-1] for line in lines if "layers:" in line]
pre_it = [float(line[16:-1]) for line in lines if "pre_iterations:" in line]
# "iterations:" is a substring of "pre_iterations:", so those lines are excluded here.
it = [
    float(line[11:-1])
    for line in lines
    if "iterations:" in line and "pre_iterations" not in line
]

In [60]:
# Score every test's embedding stored under ./HU_danmf: cluster it with k-means
# (one cluster per Louvain community) and compare the resulting clusters with
# the Louvain labels through normalized mutual information (NMI).
scores = []
for i in range(tests_num):
    start_time = time.perf_counter()  # monotonic clock, intended for elapsed-time measurements
    # The embedding file is ';'-separated without a header row.
    # NOTE(review): assumes row k holds the vector of node k+1, i.e. the same
    # order as comms_l — confirm against the code that wrote the CSV.
    # random_state is fixed because k-means initialisation is random: without a
    # seed the NMI values below change on every run and cannot be reproduced.
    kmeans = KMeans(n_clusters=len(comms), random_state=0).fit(
        pd.read_csv(f"./HU_danmf/HU_emb_vectors{i}.csv", header=None, delimiter=";").to_numpy()
    )
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.perf_counter() - start_time}")

Iteration completed in 4.264808893203735
Iteration completed in 4.372205972671509
Iteration completed in 4.525172472000122
Iteration completed in 5.783617734909058
Iteration completed in 5.068538188934326
Iteration completed in 4.750735521316528
Iteration completed in 4.656635284423828
Iteration completed in 4.450993299484253
Iteration completed in 4.902626991271973
Iteration completed in 4.920568466186523
Iteration completed in 5.139606952667236
Iteration completed in 6.028838634490967
Iteration completed in 4.279291391372681
Iteration completed in 4.2906334400177
Iteration completed in 4.917752504348755


In [61]:
# Summary statistics (count, mean, std, quartiles) of the NMI scores collected
# by the loop above for the embeddings read from ./HU_danmf.
pd.Series(scores).describe()

count    15.000000
mean      0.132686
std       0.025269
min       0.085450
25%       0.121527
50%       0.137895
75%       0.147189
max       0.174517
dtype: float64

In [62]:
# One row per DANMF test: its NMI beside the execution time, layer sizes,
# pre-iterations and iterations parsed from the run log, ordered by ascending NMI.
# NOTE: zip() stops at the shortest list, so a log entry that is missing one of
# the fields silently shortens the table instead of raising.
pd.DataFrame(
    list(zip(scores, exec_time, lay, pre_it, it)),
    columns=["NMI", "Exec time", "Layers", "Pre-iterations", "Iterations"],
).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,Layers,Pre-terations,Iterations
2,0.08545,108.151318,"[32, 8]",50.0,100.0
0,0.093209,59.408877,"[32, 8]",50.0,50.0
3,0.099155,115.569006,"[32, 8]",100.0,100.0
10,0.120946,453.801099,"[128, 32]",50.0,100.0
1,0.122109,71.049077,"[32, 8]",100.0,50.0
8,0.130286,295.975242,"[128, 32]",50.0,50.0
12,0.135151,216.932015,"[32, 8]",50.0,50.0
11,0.137895,591.802661,"[128, 32]",100.0,100.0
5,0.13851,174.576857,"[64, 16]",100.0,50.0
13,0.142827,516.619871,"[64, 16]",100.0,50.0


### AVPRA

In [63]:
# Load the pickled results; the next cell iterates over them and clusters
# element [1] of every entry.
# NOTE(review): unpickling can execute arbitrary code — only load HU.pickled
# when it comes from a trusted source.
obj = pd.read_pickle("HU.pickled")

In [64]:
# Score every pickled result: cluster it with k-means (one cluster per Louvain
# community) and compare the resulting clusters with the Louvain labels
# through normalized mutual information (NMI).
scores = []
for res in obj:
    start_time = time.perf_counter()  # monotonic clock, intended for elapsed-time measurements
    # res[1] is the data being clustered — presumably one feature row per node,
    # in the same node order as comms_l; TODO confirm against the pickle's producer.
    # random_state is fixed because k-means initialisation is random: without a
    # seed the NMI values below change on every run and cannot be reproduced.
    kmeans = KMeans(n_clusters=len(comms), random_state=0).fit(res[1])
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.perf_counter() - start_time}")

Iteration completed in 6.483170509338379
Iteration completed in 15.112923860549927
Iteration completed in 16.498359441757202
Iteration completed in 13.650741338729858
Iteration completed in 15.015783548355103
Iteration completed in 16.06110644340515
Iteration completed in 14.088645458221436
Iteration completed in 16.017553329467773
Iteration completed in 15.03245234489441
Iteration completed in 11.932594537734985
Iteration completed in 11.998290061950684
Iteration completed in 11.217200517654419
Iteration completed in 11.809659957885742
Iteration completed in 9.529192924499512
Iteration completed in 8.702170372009277
Iteration completed in 10.002746820449829
Iteration completed in 9.4221351146698
Iteration completed in 8.16770315170288
Iteration completed in 8.521818161010742
Iteration completed in 7.488947629928589
Iteration completed in 8.85522985458374


In [65]:
# Summary statistics (count, mean, std, quartiles) of the NMI scores collected
# by the loop above over the entries of `obj`.
pd.Series(scores).describe()

count    21.000000
mean      0.128810
std       0.084461
min       0.004935
25%       0.060311
50%       0.112388
75%       0.202541
max       0.271034
dtype: float64

In [66]:
# Best NMI, paired with the value found at the same position in a hand-written
# grid (0-9, then 10-30 in steps of 2). On ties the first occurrence wins.
# NOTE(review): the grid is presumably the parameter value behind each entry of
# `obj` — confirm that it matches the order and length of the pickled results.
max(scores), [*range(10), *range(10, 32, 2)][scores.index(max(scores))]

(0.2710336964691788, 30)