In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [2]:
G = nx.read_edgelist("./HR_edges_norm.csv")

In [3]:
### Calculating pagerank score for each node
i_time = time.time()
bet = nx.degree(G)
print(f"Computed degrees in: {time.time() - i_time}")

Computed degrees in: 0.0001049041748046875


In [4]:
### Pagerank scores normalization
bet_l = []
max_v = max(dict(bet).values())
min_v = min(dict(bet).values())
for i in range(1, len(bet) + 1):
    bet_norm = (bet[str(i)] - min_v) / (max_v - min_v) 
    bet_l.append(bet_norm)

# Heterogeneous intervals

In [27]:
node_labels = []
for bet_v in bet_l:
    start_val = 0.0001
    i = 1
    while(True):
        if bet_v <= start_val:
            node_labels.append(i)
            break
        else:
            i += 1
            start_val *= 1.5

In [28]:
pd.Series(node_labels).value_counts()

17    7602
16    7186
18    7146
15    6526
14    6289
13    4977
19    3084
9     2716
11    2693
12    2654
1     2330
20    1034
21     267
22      60
23       6
24       3
dtype: int64

### DW

In [29]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./dw/HR_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [30]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./dw/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 41.972649574279785
Iteration completed in 50.70474600791931
Iteration completed in 42.63912844657898
Iteration completed in 39.74735188484192
Iteration completed in 49.32630157470703
Iteration completed in 42.29257583618164
Iteration completed in 39.79845213890076
Iteration completed in 39.260494232177734
Iteration completed in 39.0065860748291
Iteration completed in 36.245434045791626
Iteration completed in 34.92654085159302
Iteration completed in 35.71336507797241
Iteration completed in 38.553425312042236
Iteration completed in 39.42339730262756
Iteration completed in 39.99510860443115


In [31]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.170591,0.113687
std,0.013959,0.01735
min,0.143289,0.082049
25%,0.163353,0.098287
50%,0.174164,0.11849
75%,0.177691,0.126702
max,0.192304,0.135731


In [32]:
pd.DataFrame(list(zip(f1_scores, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
1,0.143289,72.052979,20.0,5.0
4,0.14787,90.918856,10.0,10.0
8,0.156115,188.168018,10.0,20.0
12,0.162987,310.089163,10.0,30.0
5,0.16372,171.306192,20.0,10.0
9,0.167568,379.942767,20.0,20.0
13,0.170957,699.778587,20.0,30.0
2,0.174164,142.735676,40.0,5.0
11,0.176088,1482.992684,80.0,20.0
14,0.176088,1403.005183,40.0,30.0


In [33]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
1,0.082049,72.052979,20.0,5.0
4,0.091483,90.918856,10.0,10.0
12,0.094917,310.089163,10.0,30.0
5,0.095435,171.306192,20.0,10.0
2,0.101139,142.735676,40.0,5.0
8,0.105132,188.168018,10.0,20.0
0,0.117371,38.241227,10.0,5.0
7,0.11849,670.22348,80.0,10.0
6,0.120739,341.508353,40.0,10.0
10,0.122201,734.545967,40.0,20.0


### N2V

In [34]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./n2v/HR_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [35]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./n2v/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 50.204670667648315
Iteration completed in 56.33204650878906
Iteration completed in 59.94807720184326
Iteration completed in 53.60779356956482
Iteration completed in 51.446401596069336
Iteration completed in 58.629199743270874
Iteration completed in 57.88118553161621
Iteration completed in 53.58245897293091
Iteration completed in 48.83371114730835
Iteration completed in 54.16690444946289
Iteration completed in 54.54361605644226
Iteration completed in 48.463982582092285
Iteration completed in 46.44059085845947
Iteration completed in 48.34978795051575
Iteration completed in 46.41758704185486
Iteration completed in 44.469151973724365
Iteration completed in 44.33087611198425
Iteration completed in 56.511547803878784
Iteration completed in 61.39825391769409
Iteration completed in 54.00870418548584
Iteration completed in 49.51289367675781
Iteration completed in 54.21238613128662
Iteration completed in 54.403727293014526
Iteration completed in 52.99719190597534


In [36]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,24.0,24.0
mean,0.210551,0.142332
std,0.022401,0.018949
min,0.185708,0.110274
25%,0.195671,0.126223
50%,0.201832,0.147434
75%,0.223568,0.159517
max,0.261567,0.172585


In [37]:
pd.DataFrame(list(zip(f1_scores, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
17,0.185708,340.624793,1.0,2.0,80.0,5.0
5,0.187082,349.895019,0.5,1.0,80.0,5.0
19,0.187723,695.624031,1.0,2.0,80.0,10.0
9,0.191754,336.869814,0.5,0.5,80.0,5.0
13,0.192854,346.081429,1.0,0.5,80.0,5.0
6,0.195053,397.617808,0.5,1.0,40.0,10.0
1,0.195877,289.630887,1.0,1.0,80.0,5.0
7,0.195969,687.228601,0.5,1.0,80.0,10.0
15,0.19716,696.772516,1.0,0.5,80.0,10.0
23,0.198626,709.758869,2.0,1.0,80.0,10.0


In [38]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,p,q,walk_num,walk_len
19,0.110274,695.624031,1.0,2.0,80.0,10.0
15,0.115052,696.772516,1.0,0.5,80.0,10.0
5,0.117141,349.895019,0.5,1.0,80.0,5.0
23,0.119669,709.758869,2.0,1.0,80.0,10.0
7,0.120771,687.228601,0.5,1.0,80.0,10.0
13,0.126028,346.081429,1.0,0.5,80.0,5.0
9,0.126288,336.869814,0.5,0.5,80.0,5.0
3,0.127087,575.860253,1.0,1.0,80.0,10.0
1,0.128384,289.630887,1.0,1.0,80.0,5.0
17,0.135286,340.624793,1.0,2.0,80.0,5.0


### MNMF

In [39]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./mnmf/HR_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [40]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./mnmf/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 9.709742307662964
Iteration completed in 8.634826183319092
Iteration completed in 14.762834787368774
Iteration completed in 13.816940784454346
Iteration completed in 16.303472995758057
Iteration completed in 14.838946104049683
Iteration completed in 22.738414764404297


In [41]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,7.0,7.0
mean,0.145082,0.082579
std,0.006145,0.007103
min,0.13935,0.074126
25%,0.140403,0.076578
50%,0.14219,0.0822
75%,0.149519,0.088002
max,0.154191,0.092566


In [42]:
pd.DataFrame(list(zip(f1_scores, exec_time, dim, it)),
               columns =['F1-micro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
0,0.13935,423.98246,8.0,100.0
3,0.139716,3089.716942,16.0,200.0
1,0.14109,1231.952486,8.0,200.0
2,0.14219,1873.740525,16.0,100.0
5,0.146496,11782.491975,32.0,200.0
4,0.152542,6102.296241,32.0,100.0
6,0.154191,18979.924733,64.0,100.0


In [43]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, dim, it)),
               columns =['F1-macro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Dimensions,Iterations
0,0.074126,423.98246,8.0,100.0
1,0.076254,1231.952486,8.0,200.0
2,0.076903,1873.740525,16.0,100.0
3,0.0822,3089.716942,16.0,200.0
4,0.087616,6102.296241,32.0,100.0
5,0.088389,11782.491975,32.0,200.0
6,0.092566,18979.924733,64.0,100.0


### DANMF

In [44]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./danmf/HR_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [45]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./danmf/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 14.553501844406128
Iteration completed in 14.810252904891968
Iteration completed in 14.372995138168335
Iteration completed in 14.639358282089233
Iteration completed in 16.803764581680298
Iteration completed in 16.86903405189514
Iteration completed in 16.114516973495483
Iteration completed in 16.16460084915161
Iteration completed in 21.35441827774048
Iteration completed in 22.185779333114624
Iteration completed in 20.773714780807495
Iteration completed in 20.448589324951172
Iteration completed in 13.708588361740112
Iteration completed in 15.131217956542969
Iteration completed in 19.555410861968994


In [46]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.336515,0.290674
std,0.011543,0.029356
min,0.316079,0.236638
25%,0.330279,0.274105
50%,0.336693,0.289584
75%,0.343335,0.310757
max,0.357581,0.353919


In [47]:
pd.DataFrame(list(zip(f1_scores, exec_time, lay, pre_it, it)),
               columns =['F1-micro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
7,0.316079,653.606651,"[64, 16]",100.0,100.0
6,0.319743,608.053479,"[64, 16]",50.0,100.0
13,0.322126,1321.520216,"[64, 16]",100.0,50.0
12,0.330188,560.116846,"[32, 8]",50.0,50.0
5,0.330371,382.492228,"[64, 16]",100.0,50.0
4,0.33541,336.803176,"[64, 16]",50.0,50.0
3,0.335685,278.286608,"[32, 8]",100.0,100.0
8,0.336693,615.958076,"[128, 32]",50.0,50.0
2,0.33935,268.152193,"[32, 8]",50.0,100.0
0,0.340266,151.062392,"[32, 8]",50.0,50.0


In [48]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, lay, pre_it, it)),
               columns =['F1-macro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Layers,Pre-terations,Iterations
7,0.236638,653.606651,"[64, 16]",100.0,100.0
6,0.253677,608.053479,"[64, 16]",50.0,100.0
12,0.26365,560.116846,"[32, 8]",50.0,50.0
10,0.270276,1088.522675,"[128, 32]",50.0,100.0
5,0.277933,382.492228,"[64, 16]",100.0,50.0
4,0.283556,336.803176,"[64, 16]",50.0,50.0
13,0.286221,1321.520216,"[64, 16]",100.0,50.0
14,0.289584,2270.463403,"[128, 32]",50.0,100.0
0,0.292959,151.062392,"[32, 8]",50.0,50.0
11,0.29325,1242.205607,"[128, 32]",100.0,100.0


# Heterogeneous intervals 2

In [5]:
node_labels = []
for bet_v in bet_l:
    ok = False
    for i in range(1, 11):
        if(bet_v <= 0.005 * i): 
            node_labels.append(i)
            ok = True
            break
    if ok: continue
    for i in range(1, 6):
        if(bet_v <= 0.05 + 0.01 * i):
            node_labels.append(10 + i)
            ok = True
            break
    if ok: continue
    if(bet_v <= 0.2):
        node_labels.append(16)
        continue
    node_labels.append(17)

In [6]:
pd.Series(node_labels).value_counts()

1     7739
2     5214
3     4663
11    4171
4     4043
16    3965
5     3526
6     3000
12    2916
7     2586
8     2448
13    2422
9     2152
10    1902
14    1859
15    1478
17     489
dtype: int64

### DW

In [7]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./dw/HR_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [8]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./dw/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 64.70161843299866
Iteration completed in 69.29632997512817
Iteration completed in 44.89647054672241
Iteration completed in 42.06488490104675
Iteration completed in 54.79595994949341
Iteration completed in 45.73439979553223
Iteration completed in 42.016987800598145
Iteration completed in 44.04422688484192
Iteration completed in 46.02486491203308
Iteration completed in 41.92940044403076
Iteration completed in 37.64289832115173
Iteration completed in 38.52453374862671
Iteration completed in 39.19635605812073
Iteration completed in 38.305323362350464
Iteration completed in 37.039862394332886


In [9]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.168435,0.087504
std,0.020535,0.012266
min,0.140357,0.063007
25%,0.146679,0.078474
50%,0.17618,0.091407
75%,0.184975,0.096537
max,0.198809,0.105259


In [10]:
pd.DataFrame(list(zip(f1_scores, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
5,0.140357,171.306192,20.0,10.0
1,0.142831,72.052979,20.0,5.0
8,0.143747,188.168018,10.0,20.0
4,0.146496,90.918856,10.0,10.0
2,0.146862,142.735676,40.0,5.0
12,0.15355,310.089163,10.0,30.0
9,0.173889,379.942767,20.0,20.0
3,0.17618,288.967294,80.0,5.0
6,0.176454,341.508353,40.0,10.0
0,0.176821,38.241227,10.0,5.0


In [11]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
4,0.063007,90.918856,10.0,10.0
1,0.071082,72.052979,20.0,5.0
5,0.074903,171.306192,20.0,10.0
2,0.078,142.735676,40.0,5.0
8,0.078947,188.168018,10.0,20.0
12,0.080547,310.089163,10.0,30.0
9,0.08928,379.942767,20.0,20.0
0,0.091407,38.241227,10.0,5.0
11,0.093414,1482.992684,80.0,20.0
14,0.096065,1403.005183,40.0,30.0


### N2V

In [12]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./n2v/HR_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [13]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./n2v/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 46.971102237701416
Iteration completed in 50.776880979537964
Iteration completed in 54.3988471031189
Iteration completed in 51.90022802352905
Iteration completed in 49.04723906517029
Iteration completed in 59.46029305458069
Iteration completed in 61.4671745300293
Iteration completed in 59.85696744918823
Iteration completed in 57.533819913864136
Iteration completed in 73.70807218551636
Iteration completed in 68.11724257469177
Iteration completed in 61.77933692932129
Iteration completed in 64.02046084403992
Iteration completed in 78.00820088386536
Iteration completed in 76.35859274864197
Iteration completed in 74.50392556190491
Iteration completed in 61.148441791534424
Iteration completed in 64.63411235809326
Iteration completed in 65.84258317947388
Iteration completed in 57.760801792144775
Iteration completed in 54.294764041900635
Iteration completed in 60.35124158859253
Iteration completed in 62.082377910614014
Iteration completed in 58.926884174346924


In [14]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,24.0,24.0
mean,0.193312,0.102437
std,0.026251,0.019792
min,0.15639,0.077835
25%,0.176798,0.088103
50%,0.189281,0.098882
75%,0.210765,0.114128
max,0.247366,0.144829


In [15]:
pd.DataFrame(list(zip(f1_scores, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
19,0.15639,695.624031,1.0,2.0,80.0,10.0
7,0.157765,687.228601,0.5,1.0,80.0,10.0
3,0.161246,575.860253,1.0,1.0,80.0,10.0
23,0.163903,709.758869,2.0,1.0,80.0,10.0
15,0.164911,696.772516,1.0,0.5,80.0,10.0
11,0.173156,700.444418,0.5,0.5,80.0,10.0
17,0.178012,340.624793,1.0,2.0,80.0,5.0
5,0.179478,349.895019,0.5,1.0,80.0,5.0
13,0.180577,346.081429,1.0,0.5,80.0,5.0
9,0.183234,336.869814,0.5,0.5,80.0,5.0


In [16]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,p,q,walk_num,walk_len
19,0.077835,695.624031,1.0,2.0,80.0,10.0
7,0.07972,687.228601,0.5,1.0,80.0,10.0
3,0.080379,575.860253,1.0,1.0,80.0,10.0
23,0.080984,709.758869,2.0,1.0,80.0,10.0
15,0.081059,696.772516,1.0,0.5,80.0,10.0
5,0.088025,349.895019,0.5,1.0,80.0,5.0
11,0.088129,700.444418,0.5,0.5,80.0,10.0
17,0.088267,340.624793,1.0,2.0,80.0,5.0
13,0.089549,346.081429,1.0,0.5,80.0,5.0
9,0.09338,336.869814,0.5,0.5,80.0,5.0


### MNMF

In [17]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./mnmf/HR_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [18]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./mnmf/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 10.207851648330688
Iteration completed in 9.45390272140503
Iteration completed in 15.901008367538452
Iteration completed in 15.083305358886719
Iteration completed in 18.061274766921997
Iteration completed in 18.584495782852173
Iteration completed in 28.222505569458008


In [19]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,7.0,7.0
mean,0.129036,0.067059
std,0.01706,0.005199
min,0.106367,0.058212
25%,0.116125,0.06478
50%,0.127623,0.066403
75%,0.142419,0.071087
max,0.152176,0.073068


In [20]:
pd.DataFrame(list(zip(f1_scores, exec_time, dim, it)),
               columns =['F1-micro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
1,0.106367,1231.952486,8.0,200.0
0,0.115071,423.98246,8.0,100.0
3,0.117178,3089.716942,16.0,200.0
2,0.127623,1873.740525,16.0,100.0
4,0.14164,6102.296241,32.0,100.0
5,0.143197,11782.491975,32.0,200.0
6,0.152176,18979.924733,64.0,100.0


In [21]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, dim, it)),
               columns =['F1-macro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Dimensions,Iterations
1,0.058212,1231.952486,8.0,200.0
0,0.063683,423.98246,8.0,100.0
2,0.065877,1873.740525,16.0,100.0
3,0.066403,3089.716942,16.0,200.0
4,0.070055,6102.296241,32.0,100.0
5,0.072119,11782.491975,32.0,200.0
6,0.073068,18979.924733,64.0,100.0


### DANMF

In [22]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./danmf/HR_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [23]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./danmf/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 15.64420771598816
Iteration completed in 15.664472341537476
Iteration completed in 15.001234292984009
Iteration completed in 15.396608829498291
Iteration completed in 17.533756494522095
Iteration completed in 17.62829899787903
Iteration completed in 18.364481449127197
Iteration completed in 18.311678886413574
Iteration completed in 23.762133836746216
Iteration completed in 24.116300106048584
Iteration completed in 22.37822675704956
Iteration completed in 23.355974197387695
Iteration completed in 14.763397216796875
Iteration completed in 15.788581609725952
Iteration completed in 20.21233558654785


In [24]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.291342,0.209528
std,0.009058,0.010821
min,0.279066,0.195227
25%,0.28241,0.203291
50%,0.291525,0.207494
75%,0.297984,0.217252
max,0.304993,0.231009


In [25]:
pd.DataFrame(list(zip(f1_scores, exec_time, lay, pre_it, it)),
               columns =['F1-micro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
7,0.279066,653.606651,"[64, 16]",100.0,100.0
6,0.280348,608.053479,"[64, 16]",50.0,100.0
5,0.280531,382.492228,"[64, 16]",100.0,50.0
12,0.281814,560.116846,"[32, 8]",50.0,50.0
13,0.283005,1321.520216,"[64, 16]",100.0,50.0
2,0.287861,268.152193,"[32, 8]",50.0,100.0
4,0.290884,336.803176,"[64, 16]",50.0,50.0
1,0.291525,164.564006,"[32, 8]",100.0,50.0
10,0.291617,1088.522675,"[128, 32]",50.0,100.0
3,0.295831,278.286608,"[32, 8]",100.0,100.0


In [26]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, lay, pre_it, it)),
               columns =['F1-macro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Layers,Pre-terations,Iterations
13,0.195227,1321.520216,"[64, 16]",100.0,50.0
12,0.195371,560.116846,"[32, 8]",50.0,50.0
5,0.195388,382.492228,"[64, 16]",100.0,50.0
7,0.202987,653.606651,"[64, 16]",100.0,100.0
6,0.203596,608.053479,"[64, 16]",50.0,100.0
4,0.20362,336.803176,"[64, 16]",50.0,50.0
14,0.205315,2270.463403,"[128, 32]",50.0,100.0
8,0.207494,615.958076,"[128, 32]",50.0,50.0
10,0.209496,1088.522675,"[128, 32]",50.0,100.0
1,0.216805,164.564006,"[32, 8]",100.0,50.0
