In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [4]:
G = nx.read_edgelist("./RO_edges_norm.csv")

In [5]:
### Calculating pagerank score for each node
i_time = time.time()
bet = nx.degree(G)
print(f"Computed degrees in: {time.time() - i_time}")

Computed degrees in: 8.153915405273438e-05


In [6]:
### Degree scores normalization
bet_l = []
max_v = max(dict(bet).values())
min_v = min(dict(bet).values())
for i in range(1, len(bet) + 1):
    bet_norm = (bet[str(i)] - min_v) / (max_v - min_v) 
    bet_l.append(bet_norm)

# Heterogeneous intervals

In [29]:
node_labels = []
for bet_v in bet_l:
    start_val = 0.0001
    i = 1
    while(True):
        if bet_v <= start_val:
            node_labels.append(i)
            break
        else:
            i += 1
            start_val *= 1.5

In [30]:
pd.Series(node_labels).value_counts()

17    8333
13    5632
1     5430
14    5056
18    4484
15    4404
16    3794
19    3096
20    1055
21     360
22     104
23      22
24       3
dtype: int64

### DW

In [35]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./dw/RO_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [36]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./dw/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 41.18915605545044
Iteration completed in 45.59377312660217
Iteration completed in 40.98237657546997
Iteration completed in 39.27055335044861
Iteration completed in 33.35329270362854
Iteration completed in 33.56506872177124
Iteration completed in 30.821011066436768
Iteration completed in 33.328843116760254
Iteration completed in 31.466905117034912
Iteration completed in 34.707722187042236
Iteration completed in 38.17539978027344
Iteration completed in 31.247819662094116
Iteration completed in 31.261706590652466
Iteration completed in 31.67702865600586
Iteration completed in 31.184784173965454


In [37]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.241524,0.125991
std,0.022495,0.01745
min,0.205027,0.096644
25%,0.220886,0.11653
50%,0.250748,0.130371
75%,0.259007,0.138257
max,0.268941,0.154484


In [38]:
pd.DataFrame(list(zip(f1_scores, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
4,0.205027,59.892339,10.0,10.0
1,0.208378,49.297842,20.0,5.0
5,0.213525,119.407379,20.0,10.0
2,0.218193,94.867974,40.0,5.0
8,0.223579,130.357413,10.0,20.0
0,0.23064,25.814546,10.0,5.0
12,0.248474,201.832897,10.0,30.0
6,0.250748,232.0977,40.0,10.0
3,0.251227,178.922319,80.0,5.0
13,0.257451,398.917009,20.0,30.0


In [39]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
1,0.096644,49.297842,20.0,5.0
4,0.096869,59.892339,10.0,10.0
0,0.102089,25.814546,10.0,5.0
5,0.115653,119.407379,20.0,10.0
8,0.117408,130.357413,10.0,20.0
2,0.123768,94.867974,40.0,5.0
10,0.124531,514.222867,40.0,20.0
13,0.130371,398.917009,20.0,30.0
11,0.132166,1010.234067,80.0,20.0
7,0.133842,446.217201,80.0,10.0


### N2V

In [40]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./n2v/RO_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [41]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./n2v/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 37.3010413646698
Iteration completed in 36.28186535835266
Iteration completed in 36.430126667022705
Iteration completed in 36.71477699279785
Iteration completed in 35.4477436542511
Iteration completed in 35.89234900474548
Iteration completed in 35.90138101577759
Iteration completed in 38.25239562988281
Iteration completed in 36.382261514663696
Iteration completed in 36.08791470527649
Iteration completed in 37.19866704940796
Iteration completed in 39.395142793655396
Iteration completed in 37.48548126220703
Iteration completed in 41.12639307975769
Iteration completed in 39.06504416465759
Iteration completed in 38.337937355041504
Iteration completed in 36.24987077713013
Iteration completed in 38.50272870063782
Iteration completed in 37.614858627319336
Iteration completed in 39.33120632171631
Iteration completed in 36.86940360069275
Iteration completed in 35.839380979537964
Iteration completed in 38.185890674591064
Iteration completed in 40.62711811065674


In [42]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,24.0,24.0
mean,0.325828,0.170058
std,0.015423,0.030055
min,0.290006,0.120639
25%,0.31523,0.155245
50%,0.329204,0.171721
75%,0.333992,0.185351
max,0.352962,0.222085


In [43]:
pd.DataFrame(list(zip(f1_scores, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
15,0.290006,367.378012,1.0,0.5,80.0,10.0
11,0.298025,358.826658,0.5,0.5,80.0,10.0
23,0.305087,396.724931,2.0,1.0,80.0,10.0
3,0.309635,317.039041,1.0,1.0,80.0,10.0
7,0.310233,334.074217,0.5,1.0,80.0,10.0
19,0.314782,350.874268,1.0,2.0,80.0,10.0
16,0.31538,89.106015,1.0,2.0,40.0,5.0
17,0.322202,154.053255,1.0,2.0,80.0,5.0
10,0.324835,188.911285,0.5,0.5,40.0,10.0
5,0.327588,154.161536,0.5,1.0,80.0,5.0


In [44]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,p,q,walk_num,walk_len
15,0.120639,367.378012,1.0,0.5,80.0,10.0
11,0.123169,358.826658,0.5,0.5,80.0,10.0
3,0.128178,317.039041,1.0,1.0,80.0,10.0
23,0.128808,396.724931,2.0,1.0,80.0,10.0
7,0.130572,334.074217,0.5,1.0,80.0,10.0
19,0.147801,350.874268,1.0,2.0,80.0,10.0
14,0.157727,186.44177,1.0,0.5,40.0,10.0
2,0.160375,158.189576,1.0,1.0,40.0,10.0
6,0.16078,179.560204,0.5,1.0,40.0,10.0
10,0.161797,188.911285,0.5,0.5,40.0,10.0


### MNMF

In [45]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./mnmf/RO_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [46]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./mnmf/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 5.703343868255615
Iteration completed in 4.958108186721802
Iteration completed in 8.31505537033081
Iteration completed in 7.522087574005127
Iteration completed in 8.112908363342285
Iteration completed in 7.423761606216431
Iteration completed in 10.670656681060791
Iteration completed in 9.411773443222046


In [47]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,8.0,8.0
mean,0.19623,0.095075
std,0.013314,0.005101
min,0.182765,0.088848
25%,0.186296,0.091708
50%,0.190784,0.094051
75%,0.20781,0.097819
max,0.21544,0.104269


In [48]:
pd.DataFrame(list(zip(f1_scores, exec_time, dim, it)),
               columns =['F1-micro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
1,0.182765,144.387542,8.0,200.0
0,0.183603,47.664274,8.0,100.0
2,0.187193,207.895045,16.0,100.0
3,0.189348,336.308679,16.0,200.0
5,0.19222,772.527251,32.0,200.0
4,0.205984,487.69828,32.0,100.0
6,0.213285,1141.203295,64.0,100.0
7,0.21544,1883.460323,64.0,200.0


In [49]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, dim, it)),
               columns =['F1-macro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Dimensions,Iterations
1,0.088848,144.387542,8.0,200.0
2,0.090335,207.895045,16.0,100.0
4,0.092165,487.69828,32.0,100.0
0,0.093915,47.664274,8.0,100.0
3,0.094187,336.308679,16.0,200.0
5,0.097197,772.527251,32.0,200.0
6,0.099682,1141.203295,64.0,100.0
7,0.104269,1883.460323,64.0,200.0


### DANMF

In [50]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./danmf/RO_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [51]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./danmf/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 10.11715841293335
Iteration completed in 10.18346643447876
Iteration completed in 9.798781156539917
Iteration completed in 9.90868353843689
Iteration completed in 11.498745441436768
Iteration completed in 11.673469543457031
Iteration completed in 10.870182752609253
Iteration completed in 10.918553590774536
Iteration completed in 14.469986200332642
Iteration completed in 14.864763975143433
Iteration completed in 13.666516304016113
Iteration completed in 14.000642538070679
Iteration completed in 9.588245868682861
Iteration completed in 10.311032772064209
Iteration completed in 13.440401554107666


In [52]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.339789,0.241362
std,0.011526,0.020403
min,0.320048,0.213229
25%,0.328785,0.227755
50%,0.341233,0.240157
75%,0.347098,0.253764
max,0.35751,0.279752


In [53]:
pd.DataFrame(list(zip(f1_scores, exec_time, lay, pre_it, it)),
               columns =['F1-micro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
3,0.320048,77.28074,"[32, 8]",100.0,100.0
12,0.325075,136.182096,"[32, 8]",200.0,200.0
1,0.32699,50.047976,"[32, 8]",100.0,50.0
2,0.327229,68.263189,"[32, 8]",50.0,100.0
13,0.330341,313.978146,"[64, 16]",200.0,200.0
0,0.33836,39.073625,"[32, 8]",50.0,50.0
6,0.340754,139.50002,"[64, 16]",50.0,100.0
7,0.341233,169.919596,"[64, 16]",100.0,100.0
9,0.344584,313.552414,"[128, 32]",100.0,50.0
4,0.345183,81.475821,"[64, 16]",50.0,50.0


In [54]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, lay, pre_it, it)),
               columns =['F1-macro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Layers,Pre-terations,Iterations
3,0.213229,77.28074,"[32, 8]",100.0,100.0
13,0.214717,313.978146,"[64, 16]",200.0,200.0
6,0.215474,139.50002,"[64, 16]",50.0,100.0
12,0.222176,136.182096,"[32, 8]",200.0,200.0
7,0.233333,169.919596,"[64, 16]",100.0,100.0
1,0.234913,50.047976,"[32, 8]",100.0,50.0
4,0.237692,81.475821,"[64, 16]",50.0,50.0
10,0.240157,318.435426,"[128, 32]",50.0,100.0
2,0.240433,68.263189,"[32, 8]",50.0,100.0
8,0.247608,215.285758,"[128, 32]",50.0,50.0


# Heterogeneous intervals 2

In [7]:
node_labels = []
for bet_v in bet_l:
    ok = False
    for i in range(1, 11):
        if(bet_v <= 0.005 * i): 
            node_labels.append(i)
            ok = True
            break
    if ok: continue
    for i in range(1, 6):
        if(bet_v <= 0.05 + 0.01 * i):
            node_labels.append(10 + i)
            ok = True
            break
    if ok: continue
    if(bet_v <= 0.2):
        node_labels.append(16)
        continue
    node_labels.append(17)

In [8]:
pd.Series(node_labels).value_counts()

2     5632
1     5430
4     5056
6     4404
8     3794
10    3257
16    3181
11    2741
12    2335
15    1961
13    1868
14    1479
17     635
dtype: int64

### DW

In [9]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./dw/RO_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [10]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./dw/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 29.870899438858032
Iteration completed in 27.628281116485596
Iteration completed in 23.784443616867065
Iteration completed in 22.09376335144043
Iteration completed in 24.20620584487915
Iteration completed in 22.254886388778687
Iteration completed in 23.451910257339478
Iteration completed in 21.99951148033142
Iteration completed in 21.511908769607544
Iteration completed in 21.324143171310425
Iteration completed in 21.444751024246216
Iteration completed in 22.73041605949402
Iteration completed in 24.7690327167511
Iteration completed in 24.83761215209961
Iteration completed in 23.516135692596436


In [11]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.180914,0.126064
std,0.018151,0.020142
min,0.148294,0.08968
25%,0.16942,0.119068
50%,0.185278,0.133456
75%,0.193836,0.138608
max,0.205386,0.152606


In [12]:
pd.DataFrame(list(zip(f1_scores, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
1,0.148294,49.297842,20.0,5.0
4,0.154877,59.892339,10.0,10.0
0,0.155715,25.814546,10.0,5.0
5,0.168761,119.407379,20.0,10.0
2,0.170078,94.867974,40.0,5.0
8,0.175105,130.357413,10.0,20.0
13,0.182166,398.917009,20.0,30.0
12,0.185278,201.832897,10.0,30.0
6,0.187911,232.0977,40.0,10.0
9,0.190305,256.848043,20.0,20.0


In [13]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
1,0.08968,49.297842,20.0,5.0
0,0.089737,25.814546,10.0,5.0
4,0.093811,59.892339,10.0,10.0
5,0.119027,119.407379,20.0,10.0
8,0.11911,130.357413,10.0,20.0
2,0.125993,94.867974,40.0,5.0
13,0.131432,398.917009,20.0,30.0
12,0.133456,201.832897,10.0,30.0
9,0.136419,256.848043,20.0,20.0
6,0.137092,232.0977,40.0,10.0


### N2V

In [14]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./n2v/RO_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [15]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./n2v/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 31.297869443893433
Iteration completed in 32.944129943847656
Iteration completed in 36.23192882537842
Iteration completed in 35.58752942085266
Iteration completed in 32.187859773635864
Iteration completed in 33.04634952545166
Iteration completed in 36.82083010673523
Iteration completed in 39.35503578186035
Iteration completed in 36.92699313163757
Iteration completed in 38.286959648132324
Iteration completed in 39.49258065223694
Iteration completed in 47.330089807510376
Iteration completed in 40.09919047355652
Iteration completed in 37.39872169494629
Iteration completed in 38.856419801712036
Iteration completed in 44.942622661590576
Iteration completed in 36.434019327163696
Iteration completed in 36.49421739578247
Iteration completed in 36.86852669715881
Iteration completed in 38.16060781478882
Iteration completed in 36.287532806396484
Iteration completed in 37.469218492507935
Iteration completed in 38.18002939224243
Iteration completed in 39.235544204711914


In [16]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,24.0,24.0
mean,0.256528,0.167434
std,0.01073,0.021274
min,0.23854,0.133089
25%,0.251436,0.156157
50%,0.257929,0.168404
75%,0.264542,0.17785
max,0.274207,0.201262


In [17]:
pd.DataFrame(list(zip(f1_scores, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
7,0.23854,334.074217,0.5,1.0,80.0,10.0
11,0.239258,358.826658,0.5,0.5,80.0,10.0
19,0.240455,350.874268,1.0,2.0,80.0,10.0
23,0.241293,396.724931,2.0,1.0,80.0,10.0
15,0.241412,367.378012,1.0,0.5,80.0,10.0
3,0.243447,317.039041,1.0,1.0,80.0,10.0
17,0.254099,154.053255,1.0,2.0,80.0,5.0
4,0.254817,90.649346,0.5,1.0,40.0,5.0
16,0.254817,89.106015,1.0,2.0,40.0,5.0
1,0.255895,131.534873,1.0,1.0,80.0,5.0


In [18]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,p,q,walk_num,walk_len
15,0.133089,367.378012,1.0,0.5,80.0,10.0
23,0.135952,396.724931,2.0,1.0,80.0,10.0
11,0.136156,358.826658,0.5,0.5,80.0,10.0
3,0.137924,317.039041,1.0,1.0,80.0,10.0
7,0.142162,334.074217,0.5,1.0,80.0,10.0
19,0.144597,350.874268,1.0,2.0,80.0,10.0
2,0.160011,158.189576,1.0,1.0,40.0,10.0
1,0.160706,131.534873,1.0,1.0,80.0,5.0
10,0.164962,188.911285,0.5,0.5,40.0,10.0
9,0.165594,152.856183,0.5,0.5,80.0,5.0


### MNMF

In [19]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./mnmf/RO_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [20]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./mnmf/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 5.474622964859009
Iteration completed in 4.551671981811523
Iteration completed in 7.84258508682251
Iteration completed in 7.130327939987183
Iteration completed in 8.136880874633789
Iteration completed in 6.92210578918457
Iteration completed in 9.815723180770874
Iteration completed in 8.574569702148438


In [21]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,8.0,8.0
mean,0.146619,0.08924
std,0.009907,0.005356
min,0.135847,0.078888
25%,0.136475,0.087684
50%,0.146978,0.09101
75%,0.155536,0.093106
max,0.158947,0.094054


In [22]:
pd.DataFrame(list(zip(f1_scores, exec_time, dim, it)),
               columns =['F1-micro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
3,0.135847,336.308679,16.0,200.0
1,0.136206,144.387542,8.0,200.0
0,0.136565,47.664274,8.0,100.0
2,0.14231,207.895045,16.0,100.0
5,0.151646,772.527251,32.0,200.0
4,0.155356,487.69828,32.0,100.0
6,0.156074,1141.203295,64.0,100.0
7,0.158947,1883.460323,64.0,200.0


In [23]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, dim, it)),
               columns =['F1-macro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Dimensions,Iterations
0,0.078888,47.664274,8.0,100.0
1,0.083506,144.387542,8.0,200.0
3,0.089076,336.308679,16.0,200.0
4,0.090831,487.69828,32.0,100.0
2,0.091189,207.895045,16.0,100.0
5,0.093024,772.527251,32.0,200.0
6,0.093351,1141.203295,64.0,100.0
7,0.094054,1883.460323,64.0,200.0


### DANMF

In [24]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./danmf/RO_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [25]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./danmf/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 9.438443183898926
Iteration completed in 9.412120580673218
Iteration completed in 9.220573902130127
Iteration completed in 9.225736379623413
Iteration completed in 10.656833410263062
Iteration completed in 10.753304481506348
Iteration completed in 10.076052188873291
Iteration completed in 10.178069353103638
Iteration completed in 13.181257963180542
Iteration completed in 13.723856210708618
Iteration completed in 12.577872514724731
Iteration completed in 12.912231206893921
Iteration completed in 8.703471183776855
Iteration completed in 9.634128093719482
Iteration completed in 12.537882566452026


In [26]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.272627,0.22211
std,0.011912,0.009632
min,0.251466,0.204961
25%,0.264273,0.215682
50%,0.277319,0.224042
75%,0.281388,0.228866
max,0.28857,0.235255


In [27]:
pd.DataFrame(list(zip(f1_scores, exec_time, lay, pre_it, it)),
               columns =['F1-micro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
12,0.251466,136.182096,"[32, 8]",200.0,200.0
1,0.255057,50.047976,"[32, 8]",100.0,50.0
3,0.262597,77.28074,"[32, 8]",100.0,100.0
2,0.262837,68.263189,"[32, 8]",50.0,100.0
0,0.265709,39.073625,"[32, 8]",50.0,50.0
13,0.266068,313.978146,"[64, 16]",200.0,200.0
7,0.266308,169.919596,"[64, 16]",100.0,100.0
5,0.277319,116.574285,"[64, 16]",100.0,50.0
6,0.278636,139.50002,"[64, 16]",50.0,100.0
11,0.278875,436.150805,"[128, 32]",100.0,100.0


In [28]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, lay, pre_it, it)),
               columns =['F1-macro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Layers,Pre-terations,Iterations
1,0.204961,50.047976,"[32, 8]",100.0,50.0
12,0.2053,136.182096,"[32, 8]",200.0,200.0
2,0.212793,68.263189,"[32, 8]",50.0,100.0
13,0.214628,313.978146,"[64, 16]",200.0,200.0
3,0.216736,77.28074,"[32, 8]",100.0,100.0
0,0.219967,39.073625,"[32, 8]",50.0,50.0
7,0.221796,169.919596,"[64, 16]",100.0,100.0
5,0.224042,116.574285,"[64, 16]",100.0,50.0
6,0.225362,139.50002,"[64, 16]",50.0,100.0
11,0.22676,436.150805,"[128, 32]",100.0,100.0
