In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [2]:
G = nx.read_edgelist("./HR_edges_norm.csv")

In [3]:
### Calculating pagerank score for each node
i_time = time.time()
bet = nx.pagerank(G)
print(f"Computed pagerank in: {time.time() - i_time}")

Computed pagerank in: 2.130267858505249


In [4]:
### Pagerank scores normalization
bet_l = []
max_v = max(bet.values())
min_v = min(bet.values())
for i in range(1, len(bet) + 1):
    bet_norm = (bet[str(i)] - min_v) / (max_v - min_v)
    bet_l.append(bet_norm)

# 0.05 sized intervals

In [5]:
node_labels = []
for bet_v in bet_l:
    for i in range(1, 21):
        if(bet_v <= 0.05 * i): 
            node_labels.append(i)
            break

In [6]:
pd.Series(node_labels).value_counts()

1     33183
2     16824
3      3428
4       739
5       216
6        93
7        41
8        23
9        10
10        7
11        3
12        2
20        2
13        1
15        1
dtype: int64

### DW

In [7]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./dw/HR_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [8]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./dw/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 48.43126845359802
Iteration completed in 60.47226572036743
Iteration completed in 48.39790630340576
Iteration completed in 45.981465339660645
Iteration completed in 61.62443566322327
Iteration completed in 47.98409938812256
Iteration completed in 46.33664345741272
Iteration completed in 44.399330377578735
Iteration completed in 45.831849813461304
Iteration completed in 45.15523958206177
Iteration completed in 43.91900134086609
Iteration completed in 45.407368898391724
Iteration completed in 45.46156072616577
Iteration completed in 46.855947494506836
Iteration completed in 46.558838844299316


In [9]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.638394,0.112588
std,0.017912,0.023827
min,0.600458,0.07374
25%,0.634952,0.097017
50%,0.642602,0.1134
75%,0.648191,0.122139
max,0.66175,0.163473


In [10]:
pd.DataFrame(list(zip(f1_scores, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
4,0.600458,90.918856,10.0,10.0
1,0.602016,72.052979,20.0,5.0
2,0.621255,142.735676,40.0,5.0
8,0.631516,188.168018,10.0,20.0
12,0.638388,310.089163,10.0,30.0
5,0.640586,171.306192,20.0,10.0
11,0.641961,1482.992684,80.0,20.0
13,0.642602,699.778587,20.0,30.0
9,0.644068,379.942767,20.0,20.0
7,0.647824,670.22348,80.0,10.0


In [11]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
4,0.07374,90.918856,10.0,10.0
1,0.085418,72.052979,20.0,5.0
6,0.092287,341.508353,40.0,10.0
10,0.096994,734.545967,40.0,20.0
2,0.09704,142.735676,40.0,5.0
11,0.100613,1482.992684,80.0,20.0
9,0.110057,379.942767,20.0,20.0
0,0.1134,38.241227,10.0,5.0
3,0.115556,288.967294,80.0,5.0
5,0.116812,171.306192,20.0,10.0


### N2V

In [12]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./n2v/HR_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [13]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./n2v/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 54.24337387084961
Iteration completed in 51.986976146698
Iteration completed in 52.48609805107117
Iteration completed in 66.19632124900818
Iteration completed in 52.11771249771118
Iteration completed in 52.25530767440796
Iteration completed in 52.42127466201782
Iteration completed in 65.69098162651062
Iteration completed in 52.53951287269592
Iteration completed in 51.934669971466064
Iteration completed in 52.86302876472473
Iteration completed in 63.24782180786133
Iteration completed in 51.2488272190094
Iteration completed in 50.79641342163086
Iteration completed in 52.47851252555847
Iteration completed in 63.93342971801758
Iteration completed in 51.04504656791687
Iteration completed in 51.37673473358154
Iteration completed in 52.290300130844116
Iteration completed in 66.37624478340149
Iteration completed in 50.624030351638794
Iteration completed in 51.6905357837677
Iteration completed in 63.46743154525757
Iteration completed in 74.72592425346375


In [14]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,24.0,24.0
mean,0.669847,0.115203
std,0.058714,0.031094
min,0.595511,0.064155
25%,0.626088,0.093667
50%,0.659047,0.117723
75%,0.7,0.136237
max,0.770866,0.177814


In [15]:
pd.DataFrame(list(zip(f1_scores, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
7,0.595511,687.228601,0.5,1.0,80.0,10.0
23,0.597618,709.758869,2.0,1.0,80.0,10.0
19,0.602382,695.624031,1.0,2.0,80.0,10.0
11,0.604672,700.444418,0.5,0.5,80.0,10.0
3,0.608062,575.860253,1.0,1.0,80.0,10.0
15,0.608704,696.772516,1.0,0.5,80.0,10.0
10,0.631883,388.473305,0.5,0.5,40.0,10.0
6,0.635639,397.617808,0.5,1.0,40.0,10.0
2,0.646175,331.992402,1.0,1.0,40.0,10.0
18,0.652405,383.037323,1.0,2.0,40.0,10.0


In [16]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,p,q,walk_num,walk_len
15,0.064155,696.772516,1.0,0.5,80.0,10.0
19,0.07094,695.624031,1.0,2.0,80.0,10.0
11,0.073904,700.444418,0.5,0.5,80.0,10.0
23,0.074366,709.758869,2.0,1.0,80.0,10.0
7,0.074499,687.228601,0.5,1.0,80.0,10.0
3,0.076736,575.860253,1.0,1.0,80.0,10.0
5,0.099311,349.895019,0.5,1.0,80.0,5.0
10,0.10794,388.473305,0.5,0.5,40.0,10.0
6,0.109913,397.617808,0.5,1.0,40.0,10.0
14,0.113018,397.794394,1.0,0.5,40.0,10.0


### MNMF

In [17]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./mnmf/HR_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [18]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./mnmf/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 11.314960241317749
Iteration completed in 9.990081310272217
Iteration completed in 17.019996404647827
Iteration completed in 15.424174785614014
Iteration completed in 19.41280508041382
Iteration completed in 17.855528593063354
Iteration completed in 26.80041241645813


In [19]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,7.0,7.0
mean,0.593744,0.084934
std,0.008954,0.012953
min,0.580669,0.069857
25%,0.58841,0.078596
50%,0.594503,0.083475
75%,0.599725,0.086841
max,0.604764,0.110331


In [20]:
pd.DataFrame(list(zip(f1_scores, exec_time, dim, it)),
               columns =['F1-micro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
1,0.580669,1231.952486,8.0,200.0
0,0.582959,423.98246,8.0,100.0
2,0.593862,1873.740525,16.0,100.0
3,0.594503,3089.716942,16.0,200.0
4,0.59945,6102.296241,32.0,100.0
5,0.6,11782.491975,32.0,200.0
6,0.604764,18979.924733,64.0,100.0


In [21]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, dim, it)),
               columns =['F1-macro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Dimensions,Iterations
0,0.069857,423.98246,8.0,100.0
1,0.073881,1231.952486,8.0,200.0
6,0.083312,18979.924733,64.0,100.0
5,0.083475,11782.491975,32.0,200.0
2,0.086571,1873.740525,16.0,100.0
3,0.087112,3089.716942,16.0,200.0
4,0.110331,6102.296241,32.0,100.0


### DANMF

In [22]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./danmf/HR_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [23]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./danmf/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 13.692284345626831
Iteration completed in 13.157453775405884
Iteration completed in 12.76981520652771
Iteration completed in 13.180540800094604
Iteration completed in 15.978979110717773
Iteration completed in 16.04534935951233
Iteration completed in 15.394814252853394
Iteration completed in 15.824065446853638
Iteration completed in 21.784692764282227
Iteration completed in 21.98330545425415
Iteration completed in 20.809836864471436
Iteration completed in 20.381986379623413
Iteration completed in 12.990750312805176
Iteration completed in 15.111840009689331
Iteration completed in 19.303121089935303


In [24]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.735679,0.186151
std,0.007164,0.028336
min,0.723866,0.147634
25%,0.73115,0.168106
50%,0.735227,0.183073
75%,0.739304,0.201728
max,0.752268,0.250234


In [25]:
pd.DataFrame(list(zip(f1_scores, exec_time, lay, pre_it, it)),
               columns =['F1-micro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
5,0.723866,382.492228,"[64, 16]",100.0,50.0
12,0.72634,560.116846,"[32, 8]",50.0,50.0
7,0.728814,653.606651,"[64, 16]",100.0,100.0
13,0.730463,1321.520216,"[64, 16]",100.0,50.0
6,0.731837,608.053479,"[64, 16]",50.0,100.0
4,0.734036,336.803176,"[64, 16]",50.0,50.0
14,0.735135,2270.463403,"[128, 32]",50.0,100.0
9,0.735227,775.608052,"[128, 32]",100.0,50.0
8,0.736418,615.958076,"[128, 32]",50.0,50.0
11,0.737792,1242.205607,"[128, 32]",100.0,100.0


In [26]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, lay, pre_it, it)),
               columns =['F1-macro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Layers,Pre-terations,Iterations
11,0.147634,1242.205607,"[128, 32]",100.0,100.0
6,0.150515,608.053479,"[64, 16]",50.0,100.0
9,0.155793,775.608052,"[128, 32]",100.0,50.0
12,0.156485,560.116846,"[32, 8]",50.0,50.0
4,0.179726,336.803176,"[64, 16]",50.0,50.0
1,0.180278,164.564006,"[32, 8]",100.0,50.0
7,0.181879,653.606651,"[64, 16]",100.0,100.0
3,0.183073,278.286608,"[32, 8]",100.0,100.0
0,0.183786,151.062392,"[32, 8]",50.0,50.0
14,0.184256,2270.463403,"[128, 32]",50.0,100.0


# Heterogeneous intervals

In [6]:
node_labels = []
for bet_v in bet_l:
    ok = False
    for i in range(1, 11):
        if(bet_v <= 0.005 * i): 
            node_labels.append(i)
            ok = True
            break
    if ok: continue
    for i in range(1, 6):
        if(bet_v <= 0.05 + 0.01 * i):
            node_labels.append(10 + i)
            ok = True
            break
    if ok: continue
    if(bet_v <= 0.2):
        node_labels.append(16)
        continue
    node_labels.append(17)

In [7]:
pd.Series(node_labels).value_counts()

11    4941
12    4221
16    4167
4     3936
5     3841
3     3718
6     3702
7     3592
13    3379
2     3265
8     3182
9     2905
10    2717
14    2513
1     2325
15    1770
17     399
dtype: int64

### DW

In [8]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./dw/HR_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [9]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./dw/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 31.407286643981934
Iteration completed in 36.53315472602844
Iteration completed in 34.47620177268982
Iteration completed in 35.86431384086609
Iteration completed in 35.67830038070679
Iteration completed in 34.36532258987427
Iteration completed in 33.09548234939575
Iteration completed in 33.72660756111145
Iteration completed in 34.24086666107178
Iteration completed in 33.62365388870239
Iteration completed in 33.12623620033264
Iteration completed in 33.33815050125122
Iteration completed in 34.37244939804077
Iteration completed in 34.31217384338379
Iteration completed in 35.921796798706055


In [10]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.116549,0.097348
std,0.008735,0.007511
min,0.106093,0.088878
25%,0.11104,0.092694
50%,0.115346,0.094952
75%,0.11924,0.102489
max,0.141915,0.117501


In [11]:
pd.DataFrame(list(zip(f1_scores, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
1,0.106093,72.052979,20.0,5.0
8,0.107375,188.168018,10.0,20.0
2,0.109116,142.735676,40.0,5.0
5,0.109208,171.306192,20.0,10.0
13,0.112872,699.778587,20.0,30.0
12,0.113972,310.089163,10.0,30.0
9,0.114705,379.942767,20.0,20.0
4,0.115346,90.918856,10.0,10.0
11,0.116537,1482.992684,80.0,20.0
6,0.117361,341.508353,40.0,10.0


In [12]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
8,0.088878,188.168018,10.0,20.0
9,0.089542,379.942767,20.0,20.0
2,0.090892,142.735676,40.0,5.0
5,0.092407,171.306192,20.0,10.0
12,0.092981,310.089163,10.0,30.0
1,0.093012,72.052979,20.0,5.0
4,0.093381,90.918856,10.0,10.0
13,0.094952,699.778587,20.0,30.0
6,0.095675,341.508353,40.0,10.0
11,0.100244,1482.992684,80.0,20.0


### N2V

In [13]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./n2v/HR_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [14]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./n2v/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 38.66967463493347
Iteration completed in 41.579580307006836
Iteration completed in 44.513519525527954
Iteration completed in 45.644200563430786
Iteration completed in 38.976370334625244
Iteration completed in 42.787004232406616
Iteration completed in 44.79375457763672
Iteration completed in 44.269792556762695
Iteration completed in 38.621713638305664
Iteration completed in 41.475958585739136
Iteration completed in 46.05716276168823
Iteration completed in 44.32974934577942
Iteration completed in 38.9140887260437
Iteration completed in 41.69163155555725
Iteration completed in 43.362054109573364
Iteration completed in 43.543259143829346
Iteration completed in 38.01498818397522
Iteration completed in 40.78573775291443
Iteration completed in 44.076995849609375
Iteration completed in 43.78540873527527
Iteration completed in 38.53304886817932
Iteration completed in 39.791279792785645
Iteration completed in 44.96688628196716
Iteration completed in 44.59403681755066


In [15]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,24.0,24.0
mean,0.194411,0.16745
std,0.035598,0.029292
min,0.138891,0.122236
25%,0.177668,0.150019
50%,0.195373,0.16792
75%,0.214498,0.182187
max,0.247916,0.209225


In [16]:
pd.DataFrame(list(zip(f1_scores, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
7,0.138891,687.228601,0.5,1.0,80.0,10.0
3,0.140541,575.860253,1.0,1.0,80.0,10.0
15,0.140999,696.772516,1.0,0.5,80.0,10.0
23,0.146679,709.758869,2.0,1.0,80.0,10.0
11,0.148145,700.444418,0.5,0.5,80.0,10.0
19,0.150802,695.624031,1.0,2.0,80.0,10.0
5,0.186624,349.895019,0.5,1.0,80.0,5.0
10,0.189556,388.473305,0.5,0.5,40.0,10.0
6,0.190472,397.617808,0.5,1.0,40.0,10.0
9,0.192854,336.869814,0.5,0.5,80.0,5.0


In [17]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,p,q,walk_num,walk_len
7,0.122236,687.228601,0.5,1.0,80.0,10.0
3,0.123323,575.860253,1.0,1.0,80.0,10.0
15,0.123354,696.772516,1.0,0.5,80.0,10.0
11,0.127959,700.444418,0.5,0.5,80.0,10.0
23,0.128058,709.758869,2.0,1.0,80.0,10.0
19,0.13063,695.624031,1.0,2.0,80.0,10.0
5,0.156482,349.895019,0.5,1.0,80.0,5.0
17,0.163082,340.624793,1.0,2.0,80.0,5.0
9,0.16367,336.869814,0.5,0.5,80.0,5.0
1,0.165763,289.630887,1.0,1.0,80.0,5.0


### MNMF

In [18]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./mnmf/HR_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [19]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./mnmf/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 7.795051097869873
Iteration completed in 7.408318519592285
Iteration completed in 12.130119323730469
Iteration completed in 11.499408960342407
Iteration completed in 13.906658172607422
Iteration completed in 12.81519865989685
Iteration completed in 19.438565731048584


In [20]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,7.0,7.0
mean,0.085583,0.070281
std,0.007628,0.005706
min,0.077966,0.06305
25%,0.079157,0.06649
50%,0.082639,0.06775
75%,0.092396,0.075582
max,0.095373,0.077025


In [21]:
pd.DataFrame(list(zip(f1_scores, exec_time, dim, it)),
               columns =['F1-micro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
0,0.077966,423.98246,8.0,100.0
1,0.078791,1231.952486,8.0,200.0
2,0.079524,1873.740525,16.0,100.0
3,0.082639,3089.716942,16.0,200.0
4,0.09006,6102.296241,32.0,100.0
5,0.094732,11782.491975,32.0,200.0
6,0.095373,18979.924733,64.0,100.0


In [22]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, dim, it)),
               columns =['F1-macro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Dimensions,Iterations
0,0.06305,423.98246,8.0,100.0
1,0.065298,1231.952486,8.0,200.0
2,0.067683,1873.740525,16.0,100.0
3,0.06775,3089.716942,16.0,200.0
4,0.074283,6102.296241,32.0,100.0
5,0.076882,11782.491975,32.0,200.0
6,0.077025,18979.924733,64.0,100.0


### DANMF

In [23]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./danmf/HR_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [24]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./danmf/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 12.492459535598755
Iteration completed in 13.019482851028442
Iteration completed in 11.88072943687439
Iteration completed in 12.505606412887573
Iteration completed in 14.409048080444336
Iteration completed in 14.109324216842651
Iteration completed in 13.944173097610474
Iteration completed in 13.763452291488647
Iteration completed in 18.657817602157593
Iteration completed in 19.25072717666626
Iteration completed in 17.574079751968384
Iteration completed in 16.44072961807251
Iteration completed in 10.372945070266724
Iteration completed in 11.894161462783813
Iteration completed in 14.55647349357605


In [25]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.173675,0.162242
std,0.005851,0.005868
min,0.163353,0.154507
25%,0.170133,0.157348
50%,0.174072,0.160667
75%,0.176775,0.166469
max,0.18754,0.174728


In [26]:
pd.DataFrame(list(zip(f1_scores, exec_time, lay, pre_it, it)),
               columns =['F1-micro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
13,0.163353,1321.520216,"[64, 16]",100.0,50.0
6,0.167751,608.053479,"[64, 16]",50.0,100.0
12,0.167751,560.116846,"[32, 8]",50.0,50.0
4,0.170041,336.803176,"[64, 16]",50.0,50.0
7,0.170224,653.606651,"[64, 16]",100.0,100.0
5,0.170316,382.492228,"[64, 16]",100.0,50.0
14,0.173889,2270.463403,"[128, 32]",50.0,100.0
2,0.174072,268.152193,"[32, 8]",50.0,100.0
3,0.174164,278.286608,"[32, 8]",100.0,100.0
10,0.17508,1088.522675,"[128, 32]",50.0,100.0


In [27]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, lay, pre_it, it)),
               columns =['F1-macro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Layers,Pre-terations,Iterations
6,0.154507,608.053479,"[64, 16]",50.0,100.0
13,0.155094,1321.520216,"[64, 16]",100.0,50.0
8,0.157205,615.958076,"[128, 32]",50.0,50.0
7,0.157219,653.606651,"[64, 16]",100.0,100.0
12,0.157478,560.116846,"[32, 8]",50.0,50.0
14,0.15941,2270.463403,"[128, 32]",50.0,100.0
5,0.159887,382.492228,"[64, 16]",100.0,50.0
11,0.160667,1242.205607,"[128, 32]",100.0,100.0
4,0.162386,336.803176,"[64, 16]",50.0,50.0
3,0.164879,278.286608,"[32, 8]",100.0,100.0


# Heterogeneous pt2

In [71]:
node_labels = []
for bet_v in bet_l:
    start_val = 0.0001
    i = 1
    while(True):
        if bet_v <= start_val:
            node_labels.append(i)
            break
        else:
            i += 1
            start_val *= 1.5

In [72]:
pd.Series(node_labels).value_counts()

17    10852
16     9571
18     9154
15     7382
14     5082
19     3578
13     3124
12     1853
11     1103
20      935
10      712
9       416
8       248
21      207
7       147
6        84
22       50
5        38
4        17
23        7
1         5
3         4
2         2
24        2
dtype: int64

### DW

In [77]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./dw/HR_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [78]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./dw/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 35.991700410842896
Iteration completed in 37.04088497161865
Iteration completed in 32.7099826335907
Iteration completed in 29.9983172416687
Iteration completed in 30.82905125617981
Iteration completed in 31.640153408050537
Iteration completed in 29.49881076812744
Iteration completed in 30.036474227905273
Iteration completed in 29.88347363471985
Iteration completed in 29.449329137802124
Iteration completed in 30.915624141693115
Iteration completed in 29.42163586616516
Iteration completed in 29.58678650856018
Iteration completed in 29.94039297103882
Iteration completed in 31.5815908908844


In [79]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.217126,0.066935
std,0.012061,0.007016
min,0.189464,0.056218
25%,0.2153,0.061467
50%,0.21924,0.066447
75%,0.222858,0.071672
max,0.237746,0.077804


In [80]:
pd.DataFrame(list(zip(f1_scores, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
1,0.189464,72.052979,20.0,5.0
4,0.19377,90.918856,10.0,10.0
5,0.211544,171.306192,20.0,10.0
8,0.214475,188.168018,10.0,20.0
11,0.216125,1482.992684,80.0,20.0
14,0.216125,1403.005183,40.0,30.0
12,0.218232,310.089163,10.0,30.0
13,0.21924,699.778587,20.0,30.0
7,0.220797,670.22348,80.0,10.0
6,0.221988,341.508353,40.0,10.0


In [81]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
5,0.056218,171.306192,20.0,10.0
3,0.058461,288.967294,80.0,5.0
12,0.059729,310.089163,10.0,30.0
8,0.061433,188.168018,10.0,20.0
11,0.061502,1482.992684,80.0,20.0
1,0.061527,72.052979,20.0,5.0
13,0.065754,699.778587,20.0,30.0
6,0.066447,341.508353,40.0,10.0
7,0.06938,670.22348,80.0,10.0
4,0.069923,90.918856,10.0,10.0


### N2V

In [82]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./n2v/HR_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [83]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./n2v/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 35.214709758758545
Iteration completed in 37.044636487960815
Iteration completed in 39.446361780166626
Iteration completed in 38.890400648117065
Iteration completed in 35.76163601875305
Iteration completed in 37.269383668899536
Iteration completed in 38.256773948669434
Iteration completed in 40.4636435508728
Iteration completed in 35.67170262336731
Iteration completed in 37.25701189041138
Iteration completed in 37.81233310699463
Iteration completed in 40.53065466880798
Iteration completed in 35.60635828971863
Iteration completed in 36.6661434173584
Iteration completed in 39.125120401382446
Iteration completed in 38.67731189727783
Iteration completed in 35.68360209465027
Iteration completed in 36.95314121246338
Iteration completed in 37.9499135017395
Iteration completed in 40.826730728149414
Iteration completed in 35.47918438911438
Iteration completed in 36.309470891952515
Iteration completed in 37.16052794456482
Iteration completed in 40.14266490936279


In [84]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,24.0,24.0
mean,0.285078,0.141451
std,0.055298,0.02119
min,0.207604,0.10611
25%,0.253756,0.12404
50%,0.279066,0.151734
75%,0.310857,0.157507
max,0.374439,0.167658


In [85]:
pd.DataFrame(list(zip(f1_scores, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
3,0.207604,575.860253,1.0,1.0,80.0,10.0
19,0.21246,695.624031,1.0,2.0,80.0,10.0
7,0.212918,687.228601,0.5,1.0,80.0,10.0
23,0.214384,709.758869,2.0,1.0,80.0,10.0
15,0.223637,696.772516,1.0,0.5,80.0,10.0
11,0.224553,700.444418,0.5,0.5,80.0,10.0
10,0.263491,388.473305,0.5,0.5,40.0,10.0
5,0.266056,349.895019,0.5,1.0,80.0,5.0
9,0.272011,336.869814,0.5,0.5,80.0,5.0
6,0.272194,397.617808,0.5,1.0,40.0,10.0


In [86]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,p,q,walk_num,walk_len
11,0.10611,700.444418,0.5,0.5,80.0,10.0
3,0.10653,575.860253,1.0,1.0,80.0,10.0
19,0.108713,695.624031,1.0,2.0,80.0,10.0
23,0.110242,709.758869,2.0,1.0,80.0,10.0
7,0.113515,687.228601,0.5,1.0,80.0,10.0
9,0.118255,336.869814,0.5,0.5,80.0,5.0
15,0.125969,696.772516,1.0,0.5,80.0,10.0
5,0.126808,349.895019,0.5,1.0,80.0,5.0
10,0.131126,388.473305,0.5,0.5,40.0,10.0
1,0.142692,289.630887,1.0,1.0,80.0,5.0


### MNMF

In [87]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./mnmf/HR_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [88]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./mnmf/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 7.290286302566528
Iteration completed in 6.530550003051758
Iteration completed in 12.423574924468994
Iteration completed in 10.140306234359741
Iteration completed in 12.177462339401245
Iteration completed in 11.129866600036621
Iteration completed in 17.102630853652954


In [89]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,7.0,7.0
mean,0.185315,0.063121
std,0.008397,0.007928
min,0.173523,0.054914
25%,0.178287,0.05637
50%,0.188914,0.061033
75%,0.192442,0.069946
max,0.193312,0.073267


In [90]:
pd.DataFrame(list(zip(f1_scores, exec_time, dim, it)),
               columns =['F1-micro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
0,0.173523,423.98246,8.0,100.0
1,0.176454,1231.952486,8.0,200.0
3,0.180119,3089.716942,16.0,200.0
2,0.188914,1873.740525,16.0,100.0
6,0.192396,18979.924733,64.0,100.0
4,0.192487,6102.296241,32.0,100.0
5,0.193312,11782.491975,32.0,200.0


In [91]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, dim, it)),
               columns =['F1-macro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Dimensions,Iterations
0,0.054914,423.98246,8.0,100.0
3,0.055736,3089.716942,16.0,200.0
1,0.057004,1231.952486,8.0,200.0
6,0.061033,18979.924733,64.0,100.0
5,0.067006,11782.491975,32.0,200.0
2,0.072887,1873.740525,16.0,100.0
4,0.073267,6102.296241,32.0,100.0


### DANMF

In [92]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./danmf/HR_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [93]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./danmf/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 12.884506464004517
Iteration completed in 13.155307054519653
Iteration completed in 12.255965232849121
Iteration completed in 12.14141035079956
Iteration completed in 14.529887437820435
Iteration completed in 13.134157657623291
Iteration completed in 12.407177448272705
Iteration completed in 12.622478246688843
Iteration completed in 17.93863558769226
Iteration completed in 17.31910490989685
Iteration completed in 15.781837940216064
Iteration completed in 18.292691230773926
Iteration completed in 10.54819130897522
Iteration completed in 11.604425191879272
Iteration completed in 14.725618600845337


In [94]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.291208,0.148604
std,0.006398,0.011462
min,0.280073,0.132627
25%,0.288869,0.139646
50%,0.2929,0.145866
75%,0.295877,0.155602
max,0.299496,0.17012


In [95]:
pd.DataFrame(list(zip(f1_scores, exec_time, lay, pre_it, it)),
               columns =['F1-micro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
5,0.280073,382.492228,"[64, 16]",100.0,50.0
12,0.280257,560.116846,"[32, 8]",50.0,50.0
7,0.280715,653.606651,"[64, 16]",100.0,100.0
6,0.288685,608.053479,"[64, 16]",50.0,100.0
2,0.289052,268.152193,"[32, 8]",50.0,100.0
13,0.290884,1321.520216,"[64, 16]",100.0,50.0
10,0.2918,1088.522675,"[128, 32]",50.0,100.0
3,0.2929,278.286608,"[32, 8]",100.0,100.0
4,0.293266,336.803176,"[64, 16]",50.0,50.0
0,0.294549,151.062392,"[32, 8]",50.0,50.0


In [96]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, lay, pre_it, it)),
               columns =['F1-macro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Layers,Pre-terations,Iterations
8,0.132627,615.958076,"[128, 32]",50.0,50.0
13,0.1351,1321.520216,"[64, 16]",100.0,50.0
0,0.138309,151.062392,"[32, 8]",50.0,50.0
6,0.13874,608.053479,"[64, 16]",50.0,100.0
14,0.140552,2270.463403,"[128, 32]",50.0,100.0
7,0.145482,653.606651,"[64, 16]",100.0,100.0
12,0.145573,560.116846,"[32, 8]",50.0,50.0
5,0.145866,382.492228,"[64, 16]",100.0,50.0
2,0.146092,268.152193,"[32, 8]",50.0,100.0
11,0.149592,1242.205607,"[128, 32]",100.0,100.0
