In [2]:
import pandas as pd
import networkx as nx
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.cluster import KMeans
import time

In [3]:
network = nx.read_edgelist("./RO_edges_norm.csv")
nodes = list(network.nodes())
len(nodes)

41773

# Louvain communities (resolution = 0.5)

In [4]:
comms = nx.algorithms.community.louvain_communities(network, resolution=0.5)
comms_dict = {}
for i in range(len(comms)):
    for k in comms[i]:
        comms_dict[k] = i
comms_l = [comms_dict[str(i)] for i in range(1, len(nodes) + 1)]

### Node2Vec

In [9]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./RO_n2v/RO_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [10]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./RO_n2v/RO_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 9.491493225097656
Iteration completed in 9.699246168136597
Iteration completed in 9.506585359573364
Iteration completed in 9.250694274902344
Iteration completed in 8.981053829193115
Iteration completed in 10.0979585647583
Iteration completed in 11.040759086608887
Iteration completed in 10.63192868232727
Iteration completed in 8.569806337356567
Iteration completed in 9.359720468521118
Iteration completed in 9.399878740310669
Iteration completed in 8.71059775352478
Iteration completed in 10.361395359039307
Iteration completed in 9.158201694488525
Iteration completed in 9.308228969573975
Iteration completed in 11.214880228042603
Iteration completed in 8.652141809463501
Iteration completed in 9.64376950263977
Iteration completed in 11.678454399108887
Iteration completed in 9.16599178314209
Iteration completed in 8.67477822303772
Iteration completed in 9.464624166488647
Iteration completed in 8.884791851043701
Iteration completed in 11.3349130153656


In [11]:
pd.Series(scores).describe()

count    24.000000
mean      0.556596
std       0.017207
min       0.522991
25%       0.547722
50%       0.555412
75%       0.568197
max       0.588118
dtype: float64

In [12]:
pd.DataFrame(list(zip(scores, exec_time, p, q, walk_num, walk_len)),
               columns =['NMI', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,p,q,walk_num,walk_len
17,0.522991,154.053255,1.0,2.0,80.0,5.0
4,0.524052,90.649346,0.5,1.0,40.0,5.0
16,0.530144,89.106015,1.0,2.0,40.0,5.0
1,0.537372,131.534873,1.0,1.0,80.0,5.0
9,0.544362,152.856183,0.5,0.5,80.0,5.0
13,0.54686,159.525569,1.0,0.5,80.0,5.0
0,0.548009,76.404218,1.0,1.0,40.0,5.0
6,0.551994,179.560204,0.5,1.0,40.0,10.0
18,0.552483,184.81163,1.0,2.0,40.0,10.0
5,0.552731,154.161536,0.5,1.0,80.0,5.0


### DeepWalk

In [13]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./RO_dw/RO_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [14]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./RO_dw/RO_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 8.828006505966187
Iteration completed in 8.00375509262085
Iteration completed in 6.940388441085815
Iteration completed in 7.279048919677734
Iteration completed in 7.810828447341919
Iteration completed in 7.10980749130249
Iteration completed in 7.884000301361084
Iteration completed in 6.795982360839844
Iteration completed in 7.811688423156738
Iteration completed in 6.315476655960083
Iteration completed in 6.825783967971802
Iteration completed in 7.517693281173706
Iteration completed in 7.429354429244995
Iteration completed in 6.60468602180481
Iteration completed in 7.091675281524658


In [15]:
pd.Series(scores).describe()

count    15.000000
mean      0.524241
std       0.075234
min       0.298486
25%       0.516397
50%       0.555453
75%       0.566799
max       0.586540
dtype: float64

In [16]:
pd.DataFrame(list(zip(scores, exec_time, walk_num, walk_len)),
               columns =['NMI', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,walk_num,walk_len
0,0.298486,25.814546,10.0,5.0
1,0.423897,49.297842,20.0,5.0
2,0.4865,94.867974,40.0,5.0
3,0.502409,178.922319,80.0,5.0
4,0.530385,59.892339,10.0,10.0
7,0.538937,446.217201,80.0,10.0
5,0.541912,119.407379,20.0,10.0
10,0.555453,514.222867,40.0,20.0
8,0.558562,130.357413,10.0,20.0
9,0.561664,256.848043,20.0,20.0


### M-NMF

In [17]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./RO_mnmf/RO_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [18]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./RO_mnmf/RO_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 5.257342338562012
Iteration completed in 4.726877450942993
Iteration completed in 5.860655307769775
Iteration completed in 4.10392165184021
Iteration completed in 4.05483603477478
Iteration completed in 4.2406907081604
Iteration completed in 4.907581329345703
Iteration completed in 4.880887031555176


In [19]:
pd.Series(scores).describe()

count    8.000000
mean     0.465357
std      0.038202
min      0.421642
25%      0.434275
50%      0.458917
75%      0.490901
max      0.522258
dtype: float64

In [21]:
pd.DataFrame(list(zip(scores, exec_time, dim, it)),
               columns =['NMI', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,Dimensions,Iterations
0,0.421642,47.664274,8.0,100.0
1,0.434208,144.387542,8.0,200.0
6,0.434298,1141.203295,64.0,100.0
7,0.43771,1883.460323,64.0,200.0
2,0.480123,207.895045,16.0,100.0
3,0.485495,336.308679,16.0,200.0
4,0.50712,487.69828,32.0,100.0
5,0.522258,772.527251,32.0,200.0


### DANMF

In [22]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./RO_danmf/RO_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [23]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./RO_danmf/RO_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 3.7245354652404785
Iteration completed in 3.854214906692505
Iteration completed in 3.693364143371582
Iteration completed in 4.92300009727478
Iteration completed in 3.8989744186401367
Iteration completed in 3.77597713470459
Iteration completed in 3.6386895179748535
Iteration completed in 3.6491336822509766
Iteration completed in 4.382140398025513
Iteration completed in 4.49935507774353
Iteration completed in 4.471053600311279
Iteration completed in 5.524165153503418
Iteration completed in 3.758983612060547
Iteration completed in 3.8209760189056396
Iteration completed in 4.391146421432495


In [24]:
pd.Series(scores).describe()

count    15.000000
mean      0.117922
std       0.004924
min       0.109276
25%       0.114139
50%       0.118929
75%       0.120539
max       0.125175
dtype: float64

In [25]:
pd.DataFrame(list(zip(scores, exec_time, lay, pre_it, it)),
               columns =['NMI', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,Layers,Pre-terations,Iterations
14,0.109276,712.865189,"[128, 32]",200.0,200.0
8,0.111224,215.285758,"[128, 32]",50.0,50.0
12,0.112829,136.182096,"[32, 8]",200.0,200.0
11,0.113046,436.150805,"[128, 32]",100.0,100.0
1,0.115231,50.047976,"[32, 8]",100.0,50.0
3,0.115659,77.28074,"[32, 8]",100.0,100.0
2,0.118563,68.263189,"[32, 8]",50.0,100.0
7,0.118929,169.919596,"[64, 16]",100.0,100.0
6,0.11946,139.50002,"[64, 16]",50.0,100.0
9,0.119576,313.552414,"[128, 32]",100.0,50.0


### AVPRA

In [43]:
obj = pd.read_pickle("RO.pickled")

In [44]:
scores = []
for res in obj:
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(res[1])
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 5.398656368255615
Iteration completed in 10.191475868225098
Iteration completed in 17.655574083328247
Iteration completed in 11.754129648208618
Iteration completed in 14.037821769714355
Iteration completed in 11.539588451385498
Iteration completed in 10.945523738861084
Iteration completed in 10.056859016418457
Iteration completed in 13.391870260238647
Iteration completed in 9.485108852386475
Iteration completed in 7.911432981491089
Iteration completed in 8.918169736862183
Iteration completed in 8.796579360961914
Iteration completed in 7.128889322280884
Iteration completed in 6.577194690704346
Iteration completed in 6.924615859985352
Iteration completed in 7.465944051742554
Iteration completed in 6.423766136169434
Iteration completed in 5.853344440460205
Iteration completed in 5.9321129322052
Iteration completed in 5.878757953643799


In [45]:
pd.Series(scores).describe()

count    21.000000
mean      0.150880
std       0.091509
min       0.008563
25%       0.075493
50%       0.139391
75%       0.241940
max       0.280005
dtype: float64

In [46]:
max(scores), (list(range(0, 10)) + list(range(10, 32, 2)))[scores.index(max(scores))]

(0.28000479911281273, 30)

# Leiden communities

In [84]:
### Identifying Leiden communities
from cdlib import algorithms
comms = algorithms.leiden(network)

In [85]:
comms_dict = comms.to_node_community_map()
comms_dict_ok = {}
for node in nodes:
    comms_dict_ok[node] = comms_dict[node][0]
comms_dict = comms_dict_ok
comms_l = [comms_dict[str(i)] for i in range(1, len(nodes) + 1)]
comms = list(pd.Series(comms_l).unique())

### Node2Vec

In [86]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./RO_n2v/RO_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [87]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./RO_n2v/RO_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 10.725978136062622
Iteration completed in 11.839703798294067
Iteration completed in 12.666911602020264
Iteration completed in 14.628453493118286
Iteration completed in 10.384884357452393
Iteration completed in 12.039611101150513
Iteration completed in 12.691972732543945
Iteration completed in 11.650309324264526
Iteration completed in 10.965243577957153
Iteration completed in 12.30301308631897
Iteration completed in 10.727742433547974
Iteration completed in 13.491225481033325
Iteration completed in 11.343313455581665
Iteration completed in 12.229260921478271
Iteration completed in 11.748340845108032
Iteration completed in 12.203614950180054
Iteration completed in 10.17048454284668
Iteration completed in 11.6322660446167
Iteration completed in 14.24555492401123
Iteration completed in 13.182060718536377
Iteration completed in 12.286027908325195
Iteration completed in 12.162220478057861
Iteration completed in 12.232765197753906
Iteration completed in 12.0726706981658

In [88]:
pd.Series(scores).describe()

count    24.000000
mean      0.662520
std       0.017732
min       0.623814
25%       0.651162
50%       0.663210
75%       0.675333
max       0.690276
dtype: float64

In [89]:
pd.DataFrame(list(zip(scores, exec_time, p, q, walk_num, walk_len)),
               columns =['NMI', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,p,q,walk_num,walk_len
5,0.623814,154.161536,0.5,1.0,80.0,5.0
17,0.633064,154.053255,1.0,2.0,80.0,5.0
16,0.639181,89.106015,1.0,2.0,40.0,5.0
4,0.643766,90.649346,0.5,1.0,40.0,5.0
8,0.644823,86.482981,0.5,0.5,40.0,5.0
9,0.645968,152.856183,0.5,0.5,80.0,5.0
1,0.652893,131.534873,1.0,1.0,80.0,5.0
21,0.656392,169.450061,2.0,1.0,80.0,5.0
6,0.659393,179.560204,0.5,1.0,40.0,10.0
0,0.661434,76.404218,1.0,1.0,40.0,5.0


### DeepWalk

In [90]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./RO_dw/RO_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [91]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./RO_dw/RO_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 13.096147775650024
Iteration completed in 12.843493700027466
Iteration completed in 10.968533039093018
Iteration completed in 11.565074682235718
Iteration completed in 9.746225357055664
Iteration completed in 10.93900990486145
Iteration completed in 9.09444284439087
Iteration completed in 8.5029616355896
Iteration completed in 9.246677875518799
Iteration completed in 9.05257773399353
Iteration completed in 8.917019605636597
Iteration completed in 10.17641806602478
Iteration completed in 9.3229341506958
Iteration completed in 9.75093674659729
Iteration completed in 8.146090745925903


In [92]:
pd.Series(scores).describe()

count    15.000000
mean      0.613622
std       0.078931
min       0.386319
25%       0.604415
50%       0.652326
75%       0.659082
max       0.667507
dtype: float64

In [93]:
pd.DataFrame(list(zip(scores, exec_time, walk_num, walk_len)),
               columns =['NMI', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,walk_num,walk_len
0,0.386319,25.814546,10.0,5.0
1,0.495905,49.297842,20.0,5.0
2,0.560628,94.867974,40.0,5.0
3,0.588142,178.922319,80.0,5.0
4,0.620688,59.892339,10.0,10.0
5,0.633841,119.407379,20.0,10.0
11,0.645661,1010.234067,80.0,20.0
6,0.652326,232.0977,40.0,10.0
7,0.654135,446.217201,80.0,10.0
10,0.656463,514.222867,40.0,20.0


### M-NMF

In [94]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./RO_mnmf/RO_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [95]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./RO_mnmf/RO_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 8.349448919296265
Iteration completed in 7.469577074050903
Iteration completed in 6.761230945587158
Iteration completed in 6.395488500595093
Iteration completed in 5.449428081512451
Iteration completed in 5.594452142715454
Iteration completed in 6.271721124649048
Iteration completed in 5.92054009437561


In [96]:
pd.Series(scores).describe()

count    8.000000
mean     0.542476
std      0.060877
min      0.451954
25%      0.515433
50%      0.555685
75%      0.582328
max      0.612204
dtype: float64

In [97]:
pd.DataFrame(list(zip(scores, exec_time, dim, it)),
               columns =['NMI', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,Dimensions,Iterations
0,0.451954,47.664274,8.0,100.0
1,0.453456,144.387542,8.0,200.0
2,0.536091,207.895045,16.0,100.0
3,0.542678,336.308679,16.0,200.0
6,0.568691,1141.203295,64.0,100.0
7,0.57729,1883.460323,64.0,200.0
4,0.59744,487.69828,32.0,100.0
5,0.612204,772.527251,32.0,200.0


### DANMF

In [98]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./RO_danmf/RO_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [99]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./RO_danmf/RO_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 4.722409963607788
Iteration completed in 5.588999509811401
Iteration completed in 4.506207466125488
Iteration completed in 4.884371042251587
Iteration completed in 4.809827566146851
Iteration completed in 4.980556011199951
Iteration completed in 4.657325506210327
Iteration completed in 4.6626362800598145
Iteration completed in 5.400195360183716
Iteration completed in 6.533634424209595
Iteration completed in 5.645020961761475
Iteration completed in 5.378397226333618
Iteration completed in 4.483187913894653
Iteration completed in 4.4436094760894775
Iteration completed in 5.192597389221191


In [100]:
pd.Series(scores).describe()

count    15.000000
mean      0.159335
std       0.012332
min       0.135363
25%       0.154548
50%       0.162609
75%       0.167206
max       0.173895
dtype: float64

In [101]:
pd.DataFrame(list(zip(scores, exec_time, lay, pre_it, it)),
               columns =['NMI', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,Layers,Pre-terations,Iterations
12,0.135363,136.182096,"[32, 8]",200.0,200.0
3,0.139173,77.28074,"[32, 8]",100.0,100.0
1,0.141468,50.047976,"[32, 8]",100.0,50.0
2,0.148323,68.263189,"[32, 8]",50.0,100.0
0,0.160772,39.073625,"[32, 8]",50.0,50.0
4,0.161121,81.475821,"[64, 16]",50.0,50.0
10,0.161809,318.435426,"[128, 32]",50.0,100.0
11,0.162609,436.150805,"[128, 32]",100.0,100.0
14,0.163586,712.865189,"[128, 32]",200.0,200.0
6,0.164565,139.50002,"[64, 16]",50.0,100.0


### AVPRA

In [102]:
obj = pd.read_pickle("RO.pickled")

In [103]:
scores = []
for res in obj:
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(res[1])
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 6.623449325561523
Iteration completed in 17.85555624961853
Iteration completed in 17.555610179901123
Iteration completed in 24.040161609649658
Iteration completed in 19.354718923568726
Iteration completed in 16.062849521636963
Iteration completed in 18.037023305892944
Iteration completed in 13.455401182174683
Iteration completed in 13.259505033493042
Iteration completed in 15.006057977676392
Iteration completed in 15.068992137908936
Iteration completed in 14.244449615478516
Iteration completed in 12.646777868270874
Iteration completed in 10.937339305877686
Iteration completed in 12.07513976097107
Iteration completed in 11.509159088134766
Iteration completed in 10.717851877212524
Iteration completed in 9.827609062194824
Iteration completed in 10.531992197036743
Iteration completed in 8.742370367050171
Iteration completed in 9.08379316329956


In [104]:
pd.Series(scores).describe()

count    21.000000
mean      0.169609
std       0.093179
min       0.012855
25%       0.095601
50%       0.175777
75%       0.259463
max       0.285345
dtype: float64

In [105]:
max(scores), (list(range(0, 10)) + list(range(10, 32, 2)))[scores.index(max(scores))]

(0.2853451222581785, 26)

# Standard Louvain

In [106]:
comms = nx.algorithms.community.louvain_communities(network)
comms_dict = {}
for i in range(len(comms)):
    for k in comms[i]:
        comms_dict[k] = i
comms_l = [comms_dict[str(i)] for i in range(1, len(nodes) + 1)]

### Node2Vec

In [107]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./RO_n2v/RO_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [108]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./RO_n2v/RO_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 13.840743064880371
Iteration completed in 11.61056113243103
Iteration completed in 12.616752624511719
Iteration completed in 12.13883376121521
Iteration completed in 11.390191078186035
Iteration completed in 12.293436765670776
Iteration completed in 13.702432632446289
Iteration completed in 12.455113172531128
Iteration completed in 11.916029453277588
Iteration completed in 13.45830225944519
Iteration completed in 11.996872663497925
Iteration completed in 12.100021362304688
Iteration completed in 11.54023289680481
Iteration completed in 14.523725509643555
Iteration completed in 13.08907175064087
Iteration completed in 13.248086929321289
Iteration completed in 12.213622331619263
Iteration completed in 12.660017728805542
Iteration completed in 12.578555583953857
Iteration completed in 14.994449377059937
Iteration completed in 14.0946946144104
Iteration completed in 14.995540618896484
Iteration completed in 15.284980773925781
Iteration completed in 14.58026909828186


In [109]:
pd.Series(scores).describe()

count    24.000000
mean      0.626787
std       0.014152
min       0.596040
25%       0.616535
50%       0.631365
75%       0.637042
max       0.648835
dtype: float64

In [110]:
pd.DataFrame(list(zip(scores, exec_time, p, q, walk_num, walk_len)),
               columns =['NMI', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,p,q,walk_num,walk_len
4,0.59604,90.649346,0.5,1.0,40.0,5.0
17,0.599328,154.053255,1.0,2.0,80.0,5.0
5,0.608449,154.161536,0.5,1.0,80.0,5.0
0,0.611586,76.404218,1.0,1.0,40.0,5.0
8,0.612728,86.482981,0.5,0.5,40.0,5.0
16,0.616292,89.106015,1.0,2.0,40.0,5.0
9,0.616616,152.856183,0.5,0.5,80.0,5.0
1,0.62009,131.534873,1.0,1.0,80.0,5.0
6,0.623141,179.560204,0.5,1.0,40.0,10.0
20,0.625123,97.455682,2.0,1.0,40.0,5.0


### DeepWalk

In [111]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./RO_dw/RO_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [112]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./RO_dw/RO_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 16.489747762680054
Iteration completed in 15.63152527809143
Iteration completed in 15.157397985458374
Iteration completed in 13.00760793685913
Iteration completed in 13.586605548858643
Iteration completed in 12.196266651153564
Iteration completed in 11.970504522323608
Iteration completed in 11.550731420516968
Iteration completed in 12.070453643798828
Iteration completed in 11.426562786102295
Iteration completed in 10.995787143707275
Iteration completed in 13.80608868598938
Iteration completed in 10.462146997451782
Iteration completed in 10.904747247695923
Iteration completed in 11.045874834060669


In [113]:
pd.Series(scores).describe()

count    15.000000
mean      0.586811
std       0.071873
min       0.386542
25%       0.574476
50%       0.618482
75%       0.628977
max       0.637466
dtype: float64

In [114]:
pd.DataFrame(list(zip(scores, exec_time, walk_num, walk_len)),
               columns =['NMI', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,walk_num,walk_len
0,0.386542,25.814546,10.0,5.0
1,0.473919,49.297842,20.0,5.0
2,0.532366,94.867974,40.0,5.0
3,0.561326,178.922319,80.0,5.0
4,0.587626,59.892339,10.0,10.0
6,0.615318,232.0977,40.0,10.0
5,0.617393,119.407379,20.0,10.0
7,0.618482,446.217201,80.0,10.0
10,0.624777,514.222867,40.0,20.0
11,0.625656,1010.234067,80.0,20.0


### M-NMF

In [115]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./RO_mnmf/RO_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [116]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./RO_mnmf/RO_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 9.03308916091919
Iteration completed in 9.302292823791504
Iteration completed in 9.707044839859009
Iteration completed in 9.318488836288452
Iteration completed in 7.747678995132446
Iteration completed in 8.014523029327393
Iteration completed in 6.900279998779297
Iteration completed in 6.845454931259155


In [117]:
pd.Series(scores).describe()

count    8.000000
mean     0.516168
std      0.061787
min      0.425251
25%      0.485559
50%      0.531171
75%      0.560421
max      0.582518
dtype: float64

In [118]:
pd.DataFrame(list(zip(scores, exec_time, dim, it)),
               columns =['NMI', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,Dimensions,Iterations
0,0.425251,47.664274,8.0,100.0
1,0.426895,144.387542,8.0,200.0
2,0.505114,207.895045,16.0,100.0
3,0.509608,336.308679,16.0,200.0
6,0.552734,1141.203295,64.0,100.0
7,0.557232,1883.460323,64.0,200.0
4,0.569989,487.69828,32.0,100.0
5,0.582518,772.527251,32.0,200.0


### DANMF

In [119]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./RO_danmf/RO_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [120]:
scores = []
for i in range(tests_num):
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(pd.read_csv("./RO_danmf/RO_emb_vectors"\
                                                           + str(i) + ".csv", header=None,\
                                                           delimiter=";").values.tolist())
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 5.580210447311401
Iteration completed in 6.042437791824341
Iteration completed in 5.778846263885498
Iteration completed in 5.7438695430755615
Iteration completed in 6.205195188522339
Iteration completed in 5.882572174072266
Iteration completed in 7.307689428329468
Iteration completed in 6.313513994216919
Iteration completed in 6.998629093170166
Iteration completed in 6.8258795738220215
Iteration completed in 7.024554491043091
Iteration completed in 7.195798873901367
Iteration completed in 5.6534998416900635
Iteration completed in 6.079115152359009
Iteration completed in 7.234593391418457


In [121]:
pd.Series(scores).describe()

count    15.000000
mean      0.170374
std       0.018132
min       0.140019
25%       0.154852
50%       0.176305
75%       0.180336
max       0.194480
dtype: float64

In [122]:
pd.DataFrame(list(zip(scores, exec_time, lay, pre_it, it)),
               columns =['NMI', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="NMI")

Unnamed: 0,NMI,Exec time,Layers,Pre-terations,Iterations
12,0.140019,136.182096,"[32, 8]",200.0,200.0
1,0.14446,50.047976,"[32, 8]",100.0,50.0
3,0.14491,77.28074,"[32, 8]",100.0,100.0
2,0.146433,68.263189,"[32, 8]",50.0,100.0
0,0.163272,39.073625,"[32, 8]",50.0,50.0
4,0.174095,81.475821,"[64, 16]",50.0,50.0
6,0.175498,139.50002,"[64, 16]",50.0,100.0
8,0.176305,215.285758,"[128, 32]",50.0,50.0
7,0.178955,169.919596,"[64, 16]",100.0,100.0
13,0.179419,313.978146,"[64, 16]",200.0,200.0


### AVPRA

In [123]:
obj = pd.read_pickle("RO.pickled")

In [124]:
scores = []
for res in obj:
    start_time = time.time()
    kmeans = KMeans(n_clusters=len(comms)).fit(res[1])
    clusters = kmeans.labels_
    scores.append(normalized_mutual_info_score(comms_l, clusters))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 9.629186630249023
Iteration completed in 25.714229106903076
Iteration completed in 30.6862576007843
Iteration completed in 26.324862003326416
Iteration completed in 26.517621517181396
Iteration completed in 26.4939386844635
Iteration completed in 24.620343923568726
Iteration completed in 20.0354962348938
Iteration completed in 19.08512830734253
Iteration completed in 18.93425726890564
Iteration completed in 16.46214246749878
Iteration completed in 15.695787906646729
Iteration completed in 14.2030029296875
Iteration completed in 18.122618198394775
Iteration completed in 13.168888330459595
Iteration completed in 13.217252016067505
Iteration completed in 12.265352964401245
Iteration completed in 12.959204912185669
Iteration completed in 12.98749852180481
Iteration completed in 11.435421466827393
Iteration completed in 10.461918354034424


In [125]:
pd.Series(scores).describe()

count    21.000000
mean      0.168113
std       0.092456
min       0.013844
25%       0.094657
50%       0.169857
75%       0.257608
max       0.287919
dtype: float64

In [126]:
max(scores), (list(range(0, 10)) + list(range(10, 32, 2)))[scores.index(max(scores))]

(0.2879185432852615, 30)