In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [2]:
G = nx.read_edgelist("./RO_edges_norm.csv")

In [3]:
### Calculating pagerank score for each node
i_time = time.time()
bet = nx.pagerank(G)
print(f"Computed pagerank in: {time.time() - i_time}")

Computed pagerank in: 0.5009195804595947


In [4]:
### Pagerank scores normalization
bet_l = []
max_v = max(bet.values())
min_v = min(bet.values())
for i in range(1, len(bet) + 1):
    bet_norm = (bet[str(i)] - min_v) / (max_v - min_v) 
    bet_l.append(bet_norm)

# 0.05 sized intervals

In [6]:
node_labels = []
for bet_v in bet_l:
    for i in range(1, 21):
        if(bet_v <= 0.05 * i): 
            node_labels.append(i)
            break

In [7]:
pd.Series(node_labels).value_counts()

1     18472
2     14612
3      5594
4      1696
5       704
6       341
7       145
8        73
9        48
10       32
11       14
12       13
13       10
14        5
15        4
20        3
16        3
17        2
19        1
18        1
dtype: int64

### DW

In [22]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./dw/RO_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [24]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./dw/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 21.447399616241455
Iteration completed in 25.852694034576416
Iteration completed in 26.772785425186157
Iteration completed in 24.61933445930481
Iteration completed in 23.165640592575073
Iteration completed in 23.05930519104004
Iteration completed in 24.69600534439087
Iteration completed in 22.168159008026123
Iteration completed in 23.70447611808777
Iteration completed in 22.94480061531067
Iteration completed in 24.363035202026367
Iteration completed in 23.470039129257202
Iteration completed in 22.361435651779175
Iteration completed in 22.343160390853882
Iteration completed in 24.00101375579834


In [25]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.489232,0.077207
std,0.023955,0.009023
min,0.443088,0.056893
25%,0.474028,0.073098
50%,0.497905,0.075615
75%,0.505625,0.086849
max,0.52316,0.08837


In [26]:
pd.DataFrame(list(zip(f1_scores, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
4,0.443088,59.892339,10.0,10.0
1,0.452902,49.297842,20.0,5.0
2,0.455416,94.867974,40.0,5.0
3,0.473369,178.922319,80.0,5.0
5,0.474686,119.407379,20.0,10.0
8,0.486415,130.357413,10.0,20.0
14,0.497307,833.834561,40.0,30.0
10,0.497905,514.222867,40.0,20.0
7,0.500898,446.217201,80.0,10.0
6,0.504249,232.0977,40.0,10.0


In [27]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
1,0.056893,49.297842,20.0,5.0
4,0.066117,59.892339,10.0,10.0
10,0.072467,514.222867,40.0,20.0
5,0.072558,119.407379,20.0,10.0
2,0.073638,94.867974,40.0,5.0
0,0.074355,25.814546,10.0,5.0
12,0.074949,201.832897,10.0,30.0
9,0.075615,256.848043,20.0,20.0
8,0.076032,130.357413,10.0,20.0
14,0.078117,833.834561,40.0,30.0


### N2V

In [28]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./n2v/RO_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [32]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./n2v/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 27.079452514648438
Iteration completed in 28.29560923576355
Iteration completed in 27.508986473083496
Iteration completed in 30.99925684928894
Iteration completed in 27.39255142211914
Iteration completed in 27.878321170806885
Iteration completed in 26.62145185470581
Iteration completed in 29.420247077941895
Iteration completed in 28.12168598175049
Iteration completed in 28.221131324768066
Iteration completed in 28.3894305229187
Iteration completed in 31.31156849861145
Iteration completed in 27.275516033172607
Iteration completed in 28.57336735725403
Iteration completed in 29.76218056678772
Iteration completed in 33.34655046463013
Iteration completed in 28.19192361831665
Iteration completed in 28.13232135772705
Iteration completed in 26.851027011871338
Iteration completed in 31.6556179523468
Iteration completed in 29.502801418304443
Iteration completed in 29.88929581642151
Iteration completed in 29.08273458480835
Iteration completed in 32.629250288009644


In [33]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,24.0,24.0
mean,0.686031,0.132071
std,0.050477,0.031441
min,0.592938,0.081906
25%,0.658348,0.110935
50%,0.695691,0.136205
75%,0.724746,0.148096
max,0.754638,0.190117


In [34]:
pd.DataFrame(list(zip(f1_scores, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
15,0.592938,367.378012,1.0,0.5,80.0,10.0
11,0.605147,358.826658,0.5,0.5,80.0,10.0
23,0.605984,396.724931,2.0,1.0,80.0,10.0
3,0.609695,317.039041,1.0,1.0,80.0,10.0
7,0.623818,334.074217,0.5,1.0,80.0,10.0
19,0.628725,350.874268,1.0,2.0,80.0,10.0
14,0.668223,186.44177,1.0,0.5,40.0,10.0
10,0.671694,188.911285,0.5,0.5,40.0,10.0
6,0.679354,179.560204,0.5,1.0,40.0,10.0
2,0.680551,158.189576,1.0,1.0,40.0,10.0


In [35]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,p,q,walk_num,walk_len
3,0.081906,317.039041,1.0,1.0,80.0,10.0
23,0.08555,396.724931,2.0,1.0,80.0,10.0
15,0.088162,367.378012,1.0,0.5,80.0,10.0
7,0.092186,334.074217,0.5,1.0,80.0,10.0
19,0.092393,350.874268,1.0,2.0,80.0,10.0
10,0.109324,188.911285,0.5,0.5,40.0,10.0
11,0.111472,358.826658,0.5,0.5,80.0,10.0
6,0.113372,179.560204,0.5,1.0,40.0,10.0
2,0.11953,158.189576,1.0,1.0,40.0,10.0
18,0.121599,184.81163,1.0,2.0,40.0,10.0


### MNMF

In [36]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./mnmf/RO_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [37]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./mnmf/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 4.100853681564331
Iteration completed in 3.5905792713165283
Iteration completed in 6.083541393280029
Iteration completed in 6.895989656448364
Iteration completed in 6.461395978927612
Iteration completed in 5.750373601913452
Iteration completed in 8.215923309326172
Iteration completed in 8.843704223632812


In [38]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,8.0,8.0
mean,0.440649,0.060015
std,0.012351,0.005742
min,0.423938,0.052613
25%,0.433064,0.054027
50%,0.440694,0.061651
75%,0.448983,0.064188
max,0.460203,0.06714


In [39]:
pd.DataFrame(list(zip(f1_scores, exec_time, dim, it)),
               columns =['F1-micro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
3,0.423938,336.308679,16.0,200.0
1,0.425972,144.387542,8.0,200.0
0,0.435428,47.664274,8.0,100.0
5,0.437582,772.527251,32.0,200.0
2,0.443806,207.895045,16.0,100.0
4,0.448833,487.69828,32.0,100.0
7,0.449431,1883.460323,64.0,200.0
6,0.460203,1141.203295,64.0,100.0


In [40]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, dim, it)),
               columns =['F1-macro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Dimensions,Iterations
6,0.052613,1141.203295,64.0,100.0
5,0.053622,772.527251,32.0,200.0
1,0.054163,144.387542,8.0,200.0
4,0.061247,487.69828,32.0,100.0
7,0.062054,1883.460323,64.0,200.0
3,0.063735,336.308679,16.0,200.0
2,0.065547,207.895045,16.0,100.0
0,0.06714,47.664274,8.0,100.0


### DANMF

In [41]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./danmf/RO_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [42]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./danmf/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 8.107707500457764
Iteration completed in 8.951408386230469
Iteration completed in 8.384973287582397
Iteration completed in 7.865986585617065
Iteration completed in 10.253839015960693
Iteration completed in 9.362311601638794
Iteration completed in 9.63219165802002
Iteration completed in 9.471336364746094
Iteration completed in 13.232384204864502
Iteration completed in 11.948678016662598
Iteration completed in 11.01918649673462
Iteration completed in 11.161719560623169
Iteration completed in 9.36759090423584
Iteration completed in 9.222419738769531
Iteration completed in 11.981130838394165


In [43]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.5508,0.108673
std,0.009712,0.011218
min,0.536924,0.092214
25%,0.542669,0.10255
50%,0.548175,0.105083
75%,0.559485,0.113059
max,0.567804,0.135313


In [44]:
pd.DataFrame(list(zip(f1_scores, exec_time, lay, pre_it, it)),
               columns =['F1-micro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
1,0.536924,50.047976,"[32, 8]",100.0,50.0
3,0.5386,77.28074,"[32, 8]",100.0,100.0
2,0.541472,68.263189,"[32, 8]",50.0,100.0
13,0.541472,313.978146,"[64, 16]",200.0,200.0
12,0.543866,136.182096,"[32, 8]",200.0,200.0
0,0.546379,39.073625,"[32, 8]",50.0,50.0
7,0.546858,169.919596,"[64, 16]",100.0,100.0
6,0.548175,139.50002,"[64, 16]",50.0,100.0
4,0.553321,81.475821,"[64, 16]",50.0,50.0
9,0.5538,313.552414,"[128, 32]",100.0,50.0


In [45]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, lay, pre_it, it)),
               columns =['F1-macro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Layers,Pre-terations,Iterations
10,0.092214,318.435426,"[128, 32]",50.0,100.0
1,0.097682,50.047976,"[32, 8]",100.0,50.0
7,0.100388,169.919596,"[64, 16]",100.0,100.0
12,0.101454,136.182096,"[32, 8]",200.0,200.0
14,0.103645,712.865189,"[128, 32]",200.0,200.0
5,0.104649,116.574285,"[64, 16]",100.0,50.0
13,0.104665,313.978146,"[64, 16]",200.0,200.0
2,0.105083,68.263189,"[32, 8]",50.0,100.0
6,0.107018,139.50002,"[64, 16]",50.0,100.0
11,0.10776,436.150805,"[128, 32]",100.0,100.0


# Heterogeneous intervals

In [5]:
node_labels = []
for bet_v in bet_l:
    ok = False
    for i in range(1, 11):
        if(bet_v <= 0.005 * i): 
            node_labels.append(i)
            ok = True
            break
    if ok: continue
    for i in range(1, 6):
        if(bet_v <= 0.05 + 0.01 * i):
            node_labels.append(10 + i)
            ok = True
            break
    if ok: continue
    if(bet_v <= 0.2):
        node_labels.append(16)
        continue
    node_labels.append(17)

In [6]:
pd.Series(node_labels).value_counts()

16    7290
11    3756
12    3328
13    3035
14    2429
15    2064
4     2016
6     1992
10    1980
9     1969
5     1941
8     1858
7     1853
2     1819
3     1740
17    1399
1     1304
dtype: int64

### DW

In [7]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./dw/RO_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [8]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./dw/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 25.024813890457153
Iteration completed in 28.12222671508789
Iteration completed in 24.758379220962524
Iteration completed in 23.683719158172607
Iteration completed in 25.215585231781006
Iteration completed in 23.930057525634766
Iteration completed in 23.096420526504517
Iteration completed in 23.21975874900818
Iteration completed in 25.317180633544922
Iteration completed in 25.248738288879395
Iteration completed in 23.838085412979126
Iteration completed in 24.46137762069702
Iteration completed in 26.463995933532715
Iteration completed in 25.724291801452637
Iteration completed in 26.070491790771484


In [9]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.183706,0.093694
std,0.012999,0.011651
min,0.159904,0.071989
25%,0.170557,0.085255
50%,0.188151,0.098699
75%,0.194135,0.103673
max,0.199402,0.1072


In [10]:
pd.DataFrame(list(zip(f1_scores, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
1,0.159904,49.297842,20.0,5.0
4,0.167205,59.892339,10.0,10.0
5,0.170078,119.407379,20.0,10.0
8,0.170197,130.357413,10.0,20.0
2,0.170916,94.867974,40.0,5.0
12,0.177858,201.832897,10.0,30.0
3,0.187792,178.922319,80.0,5.0
9,0.188151,256.848043,20.0,20.0
0,0.189228,25.814546,10.0,5.0
6,0.192938,232.0977,40.0,10.0


In [11]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
1,0.071989,49.297842,20.0,5.0
4,0.076455,59.892339,10.0,10.0
5,0.080458,119.407379,20.0,10.0
8,0.082914,130.357413,10.0,20.0
2,0.087595,94.867974,40.0,5.0
12,0.090149,201.832897,10.0,30.0
0,0.091412,25.814546,10.0,5.0
10,0.098699,514.222867,40.0,20.0
9,0.098722,256.848043,20.0,20.0
13,0.100061,398.917009,20.0,30.0


### N2V

In [12]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./n2v/RO_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [13]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./n2v/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 29.478130102157593
Iteration completed in 30.385087490081787
Iteration completed in 29.52339744567871
Iteration completed in 32.73075079917908
Iteration completed in 30.258061408996582
Iteration completed in 30.054909467697144
Iteration completed in 28.6137433052063
Iteration completed in 29.21599841117859
Iteration completed in 27.280558586120605
Iteration completed in 27.6164333820343
Iteration completed in 27.828235387802124
Iteration completed in 29.63239097595215
Iteration completed in 28.293842554092407
Iteration completed in 29.974453449249268
Iteration completed in 30.285539627075195
Iteration completed in 33.60927605628967
Iteration completed in 32.21125292778015
Iteration completed in 31.314322233200073
Iteration completed in 31.313287258148193
Iteration completed in 33.026856422424316
Iteration completed in 31.22510290145874
Iteration completed in 30.72917914390564
Iteration completed in 31.690518379211426
Iteration completed in 34.134116888046265


In [14]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,24.0,24.0
mean,0.298998,0.225309
std,0.037416,0.037594
min,0.231957,0.15438
25%,0.283722,0.210005
50%,0.30383,0.231925
75%,0.326152,0.253281
max,0.348294,0.27554


In [15]:
pd.DataFrame(list(zip(f1_scores, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
15,0.231957,367.378012,1.0,0.5,80.0,10.0
23,0.235308,396.724931,2.0,1.0,80.0,10.0
11,0.236864,358.826658,0.5,0.5,80.0,10.0
3,0.243926,317.039041,1.0,1.0,80.0,10.0
19,0.250509,350.874268,1.0,2.0,80.0,10.0
7,0.250868,334.074217,0.5,1.0,80.0,10.0
10,0.294674,188.911285,0.5,0.5,40.0,10.0
22,0.296709,206.661722,2.0,1.0,40.0,10.0
14,0.297307,186.44177,1.0,0.5,40.0,10.0
2,0.299342,158.189576,1.0,1.0,40.0,10.0


In [16]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,p,q,walk_num,walk_len
15,0.15438,367.378012,1.0,0.5,80.0,10.0
23,0.161132,396.724931,2.0,1.0,80.0,10.0
11,0.164072,358.826658,0.5,0.5,80.0,10.0
3,0.169828,317.039041,1.0,1.0,80.0,10.0
7,0.176873,334.074217,0.5,1.0,80.0,10.0
19,0.181634,350.874268,1.0,2.0,80.0,10.0
22,0.219462,206.661722,2.0,1.0,40.0,10.0
14,0.220485,186.44177,1.0,0.5,40.0,10.0
10,0.224154,188.911285,0.5,0.5,40.0,10.0
6,0.226686,179.560204,0.5,1.0,40.0,10.0


### MNMF

In [17]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./mnmf/RO_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [18]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./mnmf/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 5.615429401397705
Iteration completed in 4.281301736831665
Iteration completed in 8.248645544052124
Iteration completed in 7.452203273773193
Iteration completed in 8.162068367004395
Iteration completed in 7.23846697807312
Iteration completed in 10.150762796401978
Iteration completed in 9.400005578994751


In [19]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,8.0,8.0
mean,0.14988,0.08293
std,0.011987,0.004693
min,0.135248,0.077579
25%,0.141083,0.079945
50%,0.148713,0.081174
75%,0.161191,0.0875
max,0.164093,0.088967


In [20]:
pd.DataFrame(list(zip(f1_scores, exec_time, dim, it)),
               columns =['F1-micro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
1,0.135248,144.387542,8.0,200.0
3,0.136326,336.308679,16.0,200.0
0,0.142669,47.664274,8.0,100.0
2,0.142669,207.895045,16.0,100.0
5,0.154758,772.527251,32.0,200.0
4,0.160742,487.69828,32.0,100.0
7,0.162537,1883.460323,64.0,200.0
6,0.164093,1141.203295,64.0,100.0


In [21]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, dim, it)),
               columns =['F1-macro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Dimensions,Iterations
1,0.077579,144.387542,8.0,200.0
3,0.077983,336.308679,16.0,200.0
4,0.080599,487.69828,32.0,100.0
0,0.081107,47.664274,8.0,100.0
2,0.08124,207.895045,16.0,100.0
5,0.087014,772.527251,32.0,200.0
6,0.088955,1141.203295,64.0,100.0
7,0.088967,1883.460323,64.0,200.0


### DANMF

In [22]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./danmf/RO_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [23]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./danmf/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 9.799651145935059
Iteration completed in 9.610137224197388
Iteration completed in 9.32373309135437
Iteration completed in 9.479339361190796
Iteration completed in 11.4720139503479
Iteration completed in 10.9597749710083
Iteration completed in 10.511912822723389
Iteration completed in 10.549936056137085
Iteration completed in 14.131357192993164
Iteration completed in 14.34254240989685
Iteration completed in 13.032772302627563
Iteration completed in 13.148672103881836
Iteration completed in 8.99653172492981
Iteration completed in 9.874654531478882
Iteration completed in 12.618547916412354


In [24]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.229818,0.1567
std,0.006583,0.004447
min,0.21927,0.149258
25%,0.224357,0.153239
50%,0.230999,0.155967
75%,0.234829,0.160262
max,0.239497,0.165249


In [25]:
pd.DataFrame(list(zip(f1_scores, exec_time, lay, pre_it, it)),
               columns =['F1-micro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
3,0.21927,77.28074,"[32, 8]",100.0,100.0
1,0.220826,50.047976,"[32, 8]",100.0,50.0
6,0.221664,139.50002,"[64, 16]",50.0,100.0
2,0.221903,68.263189,"[32, 8]",50.0,100.0
7,0.22681,169.919596,"[64, 16]",100.0,100.0
12,0.227289,136.182096,"[32, 8]",200.0,200.0
4,0.230999,81.475821,"[64, 16]",50.0,50.0
11,0.230999,436.150805,"[128, 32]",100.0,100.0
13,0.231119,313.978146,"[64, 16]",200.0,200.0
0,0.232675,39.073625,"[32, 8]",50.0,50.0


In [26]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, lay, pre_it, it)),
               columns =['F1-macro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Layers,Pre-terations,Iterations
6,0.149258,139.50002,"[64, 16]",50.0,100.0
1,0.152483,50.047976,"[32, 8]",100.0,50.0
3,0.152753,77.28074,"[32, 8]",100.0,100.0
7,0.152973,169.919596,"[64, 16]",100.0,100.0
2,0.153506,68.263189,"[32, 8]",50.0,100.0
12,0.153591,136.182096,"[32, 8]",200.0,200.0
13,0.155006,313.978146,"[64, 16]",200.0,200.0
4,0.155967,81.475821,"[64, 16]",50.0,50.0
11,0.157194,436.150805,"[128, 32]",100.0,100.0
0,0.158823,39.073625,"[32, 8]",50.0,50.0


# Heterogeneous pt2

In [72]:
node_labels = []
for bet_v in bet_l:
    start_val = 0.0001
    i = 1
    while(True):
        if bet_v <= start_val:
            node_labels.append(i)
            break
        else:
            i += 1
            start_val *= 1.5

In [73]:
pd.Series(node_labels).value_counts()

18    8601
17    8135
19    5757
16    5562
15    3844
14    2579
20    2221
13    1314
12    1109
11     874
21     766
10     475
22     197
9      173
8       75
23      46
7       18
24      10
6        7
4        4
5        3
1        2
3        1
dtype: int64

### DW

In [78]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./dw/RO_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [79]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./dw/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 26.460387468338013
Iteration completed in 27.230192184448242
Iteration completed in 26.443514585494995
Iteration completed in 25.417888164520264
Iteration completed in 26.701968669891357
Iteration completed in 26.09171986579895
Iteration completed in 24.98211121559143
Iteration completed in 24.796937227249146
Iteration completed in 26.416234970092773
Iteration completed in 25.73472261428833
Iteration completed in 24.832193613052368
Iteration completed in 25.15277910232544
Iteration completed in 26.329046726226807
Iteration completed in 25.317633628845215
Iteration completed in 25.40834617614746


In [80]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.215264,0.071189
std,0.008314,0.011069
min,0.198324,0.054729
25%,0.208618,0.063288
50%,0.217475,0.069782
75%,0.221245,0.079494
max,0.229324,0.090441


In [81]:
pd.DataFrame(list(zip(f1_scores, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
1,0.198324,49.297842,20.0,5.0
2,0.20778,94.867974,40.0,5.0
12,0.207899,201.832897,10.0,30.0
5,0.208259,119.407379,20.0,10.0
4,0.208977,59.892339,10.0,10.0
8,0.210293,130.357413,10.0,20.0
9,0.213046,256.848043,20.0,20.0
10,0.217475,514.222867,40.0,20.0
13,0.218312,398.917009,20.0,30.0
6,0.218791,232.0977,40.0,10.0


In [82]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
5,0.054729,119.407379,20.0,10.0
4,0.055089,59.892339,10.0,10.0
12,0.059195,201.832897,10.0,30.0
8,0.061899,130.357413,10.0,20.0
1,0.064678,49.297842,20.0,5.0
2,0.068809,94.867974,40.0,5.0
6,0.069543,232.0977,40.0,10.0
9,0.069782,256.848043,20.0,20.0
13,0.070678,398.917009,20.0,30.0
3,0.074249,178.922319,80.0,5.0


### N2V

In [83]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./n2v/RO_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [84]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./n2v/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 28.835572719573975
Iteration completed in 29.61811399459839
Iteration completed in 28.943288326263428
Iteration completed in 29.635768175125122
Iteration completed in 28.82369351387024
Iteration completed in 29.545791149139404
Iteration completed in 29.036212682724
Iteration completed in 29.9149489402771
Iteration completed in 28.861021280288696
Iteration completed in 29.830989837646484
Iteration completed in 29.346426010131836
Iteration completed in 30.809333324432373
Iteration completed in 28.71877121925354
Iteration completed in 29.389601945877075
Iteration completed in 28.89900541305542
Iteration completed in 30.82696008682251
Iteration completed in 28.798459768295288
Iteration completed in 29.76260209083557
Iteration completed in 28.93027901649475
Iteration completed in 29.433732509613037
Iteration completed in 28.597168922424316
Iteration completed in 29.603020906448364
Iteration completed in 28.661301374435425
Iteration completed in 30.13076901435852


In [85]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,24.0,24.0
mean,0.395831,0.200445
std,0.062914,0.032225
min,0.27684,0.136918
25%,0.371095,0.177627
50%,0.409037,0.208237
75%,0.440036,0.224117
max,0.487493,0.257224


In [86]:
pd.DataFrame(list(zip(f1_scores, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
15,0.27684,367.378012,1.0,0.5,80.0,10.0
23,0.287373,396.724931,2.0,1.0,80.0,10.0
11,0.289886,358.826658,0.5,0.5,80.0,10.0
3,0.296828,317.039041,1.0,1.0,80.0,10.0
7,0.314782,334.074217,0.5,1.0,80.0,10.0
19,0.331777,350.874268,1.0,2.0,80.0,10.0
14,0.384201,186.44177,1.0,0.5,40.0,10.0
10,0.385877,188.911285,0.5,0.5,40.0,10.0
2,0.395811,158.189576,1.0,1.0,40.0,10.0
22,0.39617,206.661722,2.0,1.0,40.0,10.0


In [87]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,p,q,walk_num,walk_len
3,0.136918,317.039041,1.0,1.0,80.0,10.0
7,0.148302,334.074217,0.5,1.0,80.0,10.0
23,0.150275,396.724931,2.0,1.0,80.0,10.0
15,0.156841,367.378012,1.0,0.5,80.0,10.0
19,0.162595,350.874268,1.0,2.0,80.0,10.0
11,0.169611,358.826658,0.5,0.5,80.0,10.0
14,0.1803,186.44177,1.0,0.5,40.0,10.0
22,0.189321,206.661722,2.0,1.0,40.0,10.0
21,0.196883,169.450061,2.0,1.0,80.0,5.0
18,0.199029,184.81163,1.0,2.0,40.0,10.0


### MNMF

In [88]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./mnmf/RO_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [89]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./mnmf/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 4.817265510559082
Iteration completed in 4.971844673156738
Iteration completed in 7.1407012939453125
Iteration completed in 6.486069440841675
Iteration completed in 7.277744770050049
Iteration completed in 7.461440801620483
Iteration completed in 9.229758024215698
Iteration completed in 8.323115110397339


In [90]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,8.0,8.0
mean,0.198459,0.095683
std,0.009238,0.02182
min,0.187313,0.072116
25%,0.193806,0.080035
50%,0.198444,0.095306
75%,0.201257,0.100989
max,0.216278,0.141312


In [91]:
pd.DataFrame(list(zip(f1_scores, exec_time, dim, it)),
               columns =['F1-micro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
3,0.187313,336.308679,16.0,200.0
0,0.187433,47.664274,8.0,100.0
2,0.195931,207.895045,16.0,100.0
1,0.198085,144.387542,8.0,200.0
4,0.198803,487.69828,32.0,100.0
5,0.200598,772.527251,32.0,200.0
7,0.203232,1883.460323,64.0,200.0
6,0.216278,1141.203295,64.0,100.0


In [92]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, dim, it)),
               columns =['F1-macro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Dimensions,Iterations
0,0.072116,47.664274,8.0,100.0
1,0.07614,144.387542,8.0,200.0
3,0.081333,336.308679,16.0,200.0
2,0.092136,207.895045,16.0,100.0
5,0.098475,772.527251,32.0,200.0
7,0.100001,1883.460323,64.0,200.0
6,0.103953,1141.203295,64.0,100.0
4,0.141312,487.69828,32.0,100.0


### DANMF

In [93]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./danmf/RO_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [94]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./danmf/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 8.855279207229614
Iteration completed in 8.81141185760498
Iteration completed in 9.465211391448975
Iteration completed in 8.410084962844849
Iteration completed in 9.779217720031738
Iteration completed in 9.733139514923096
Iteration completed in 10.349646091461182
Iteration completed in 9.494848251342773
Iteration completed in 12.486294746398926
Iteration completed in 12.715497970581055
Iteration completed in 12.343581438064575
Iteration completed in 11.80415678024292
Iteration completed in 8.267218112945557
Iteration completed in 8.822057008743286
Iteration completed in 12.177658319473267


In [95]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.261911,0.16626
std,0.009865,0.021913
min,0.245003,0.13848
25%,0.254817,0.147455
50%,0.260802,0.164792
75%,0.270736,0.177581
max,0.277558,0.21598


In [96]:
pd.DataFrame(list(zip(f1_scores, exec_time, lay, pre_it, it)),
               columns =['F1-micro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
1,0.245003,50.047976,"[32, 8]",100.0,50.0
3,0.248953,77.28074,"[32, 8]",100.0,100.0
6,0.252304,139.50002,"[64, 16]",50.0,100.0
13,0.254578,313.978146,"[64, 16]",200.0,200.0
2,0.255057,68.263189,"[32, 8]",50.0,100.0
12,0.259126,136.182096,"[32, 8]",200.0,200.0
0,0.259485,39.073625,"[32, 8]",50.0,50.0
9,0.260802,313.552414,"[128, 32]",100.0,50.0
4,0.262837,81.475821,"[64, 16]",50.0,50.0
7,0.263794,169.919596,"[64, 16]",100.0,100.0


In [97]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, lay, pre_it, it)),
               columns =['F1-macro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Layers,Pre-terations,Iterations
6,0.13848,139.50002,"[64, 16]",50.0,100.0
11,0.143668,436.150805,"[128, 32]",100.0,100.0
13,0.14459,313.978146,"[64, 16]",200.0,200.0
12,0.146651,136.182096,"[32, 8]",200.0,200.0
10,0.14826,318.435426,"[128, 32]",50.0,100.0
8,0.156695,215.285758,"[128, 32]",50.0,50.0
9,0.157968,313.552414,"[128, 32]",100.0,50.0
2,0.164792,68.263189,"[32, 8]",50.0,100.0
3,0.166326,77.28074,"[32, 8]",100.0,100.0
0,0.171103,39.073625,"[32, 8]",50.0,50.0
