In [2]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [3]:
G = nx.read_edgelist("./HU_edges_norm.csv")

In [4]:
### Calculating pagerank score for each node
i_time = time.time()
bet = nx.pagerank(G)
print(f"Computed pagerank in: {time.time() - i_time}")

Computed pagerank in: 0.7576441764831543


In [5]:
### Pagerank scores normalization
bet_l = []
max_v = max(bet.values())
min_v = min(bet.values())
for i in range(1, len(bet) + 1):
    bet_norm = (bet[str(i)] - min_v) / (max_v - min_v) 
    bet_l.append(bet_norm)

# 0.05 sized intervals

In [6]:
node_labels = []
for bet_v in bet_l:
    for i in range(1, 21):
        if(bet_v <= 0.05 * i): 
            node_labels.append(i)
            break

In [7]:
pd.Series(node_labels).value_counts()

2     15024
1     11661
3     10785
4      5871
5      2354
6       942
7       434
8       223
9       114
10       49
11       27
12       22
13       13
14        6
16        5
17        4
15        2
20        1
18        1
dtype: int64

### DW

In [13]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./dw/HU_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [14]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./dw/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 36.87530732154846
Iteration completed in 33.998384952545166
Iteration completed in 33.79210162162781
Iteration completed in 33.80909180641174
Iteration completed in 35.26128005981445
Iteration completed in 33.75284266471863
Iteration completed in 32.32858347892761
Iteration completed in 32.881632566452026
Iteration completed in 33.87336778640747
Iteration completed in 33.18852639198303
Iteration completed in 32.399771213531494
Iteration completed in 36.95977568626404
Iteration completed in 34.00840091705322
Iteration completed in 37.460846185684204
Iteration completed in 36.22792339324951


In [15]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.347132,0.08465
std,0.017675,0.009598
min,0.318363,0.06673
25%,0.336927,0.077275
50%,0.34655,0.08473
75%,0.357751,0.088962
max,0.378313,0.103347


In [16]:
pd.DataFrame(list(zip(f1_scores, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
5,0.318363,170.240475,20.0,10.0
8,0.322886,185.26939,10.0,20.0
2,0.323096,133.75116,40.0,5.0
4,0.333403,87.543224,10.0,10.0
3,0.34045,266.070136,80.0,5.0
12,0.342554,229.253428,10.0,30.0
1,0.345078,68.210582,20.0,5.0
6,0.34655,334.773742,40.0,10.0
7,0.352335,681.768971,80.0,10.0
9,0.353387,369.003128,20.0,20.0


In [17]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
1,0.06673,68.210582,20.0,5.0
8,0.076325,185.26939,10.0,20.0
2,0.076853,133.75116,40.0,5.0
4,0.07695,87.543224,10.0,10.0
0,0.077599,36.506493,10.0,5.0
14,0.079948,972.867248,40.0,30.0
6,0.081365,334.773742,40.0,10.0
3,0.08473,266.070136,80.0,5.0
5,0.087673,170.240475,20.0,10.0
13,0.088557,476.520527,20.0,30.0


### N2V

In [18]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./n2v/HU_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [19]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./n2v/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 45.69408202171326
Iteration completed in 44.33357548713684
Iteration completed in 53.79358696937561
Iteration completed in 46.44942569732666
Iteration completed in 40.7738516330719
Iteration completed in 39.07487440109253
Iteration completed in 40.39059090614319
Iteration completed in 43.68697166442871
Iteration completed in 40.47248911857605
Iteration completed in 41.052043437957764
Iteration completed in 42.34433674812317
Iteration completed in 45.32552123069763
Iteration completed in 39.78373384475708
Iteration completed in 42.120227098464966
Iteration completed in 45.666741132736206
Iteration completed in 43.79891633987427
Iteration completed in 40.6240348815918
Iteration completed in 40.247065782547
Iteration completed in 44.818599462509155
Iteration completed in 47.848918437957764
Iteration completed in 40.55660319328308
Iteration completed in 41.48391127586365
Iteration completed in 42.4899685382843
Iteration completed in 44.28393816947937


In [20]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,24.0,24.0
mean,0.516705,0.125669
std,0.059417,0.028326
min,0.414598,0.078488
25%,0.487879,0.111394
50%,0.530658,0.126074
75%,0.553297,0.141545
max,0.600337,0.176509


In [21]:
pd.DataFrame(list(zip(f1_scores, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
15,0.414598,499.899741,1.0,0.5,80.0,10.0
11,0.416176,468.974333,0.5,0.5,80.0,10.0
23,0.425747,500.880916,2.0,1.0,80.0,10.0
7,0.426904,485.59117,0.5,1.0,80.0,10.0
3,0.432899,436.778427,1.0,1.0,80.0,10.0
19,0.447728,461.019866,1.0,2.0,80.0,10.0
14,0.501262,257.42323,1.0,0.5,40.0,10.0
10,0.507888,254.258841,0.5,0.5,40.0,10.0
2,0.510728,235.38871,1.0,1.0,40.0,10.0
6,0.516092,253.726269,0.5,1.0,40.0,10.0


In [22]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,p,q,walk_num,walk_len
3,0.078488,436.778427,1.0,1.0,80.0,10.0
15,0.083573,499.899741,1.0,0.5,80.0,10.0
7,0.084086,485.59117,0.5,1.0,80.0,10.0
11,0.084856,468.974333,0.5,0.5,80.0,10.0
23,0.087833,500.880916,2.0,1.0,80.0,10.0
19,0.103401,461.019866,1.0,2.0,80.0,10.0
10,0.114058,254.258841,0.5,0.5,40.0,10.0
22,0.117804,248.845797,2.0,1.0,40.0,10.0
14,0.11988,257.42323,1.0,0.5,40.0,10.0
17,0.121192,208.078459,1.0,2.0,80.0,5.0


### MNMF

In [23]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./mnmf/HU_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [24]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./mnmf/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 7.297024250030518
Iteration completed in 7.169738531112671
Iteration completed in 10.908040523529053
Iteration completed in 9.720978021621704
Iteration completed in 11.502106428146362
Iteration completed in 11.243109941482544
Iteration completed in 15.372917890548706
Iteration completed in 15.420313119888306


In [25]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,8.0,8.0
mean,0.302048,0.0625
std,0.013116,0.006937
min,0.285339,0.052747
25%,0.291728,0.057523
50%,0.299379,0.062092
75%,0.31505,0.06582
max,0.317417,0.074123


In [26]:
pd.DataFrame(list(zip(f1_scores, exec_time, dim, it)),
               columns =['F1-micro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
0,0.285339,95.28386,8.0,100.0
1,0.291334,285.491338,8.0,200.0
3,0.291859,701.013073,16.0,200.0
2,0.293437,421.509037,16.0,100.0
5,0.305322,2139.761206,32.0,200.0
4,0.314262,1211.796356,32.0,100.0
6,0.317417,3373.030116,64.0,100.0
7,0.317417,5875.845202,64.0,200.0


In [27]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, dim, it)),
               columns =['F1-macro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Dimensions,Iterations
1,0.052747,285.491338,8.0,200.0
0,0.057252,95.28386,8.0,100.0
3,0.057614,701.013073,16.0,200.0
2,0.061185,421.509037,16.0,100.0
4,0.062999,1211.796356,32.0,100.0
6,0.064597,3373.030116,64.0,100.0
7,0.069487,5875.845202,64.0,200.0
5,0.074123,2139.761206,32.0,200.0


### DANMF

In [28]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./danmf/HU_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [29]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./danmf/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 12.886220932006836
Iteration completed in 13.048490047454834
Iteration completed in 11.247031450271606
Iteration completed in 12.97658395767212
Iteration completed in 15.074119329452515
Iteration completed in 13.178959846496582
Iteration completed in 12.120028734207153
Iteration completed in 13.68842077255249
Iteration completed in 18.41219735145569
Iteration completed in 17.6296169757843
Iteration completed in 17.19794511795044
Iteration completed in 17.83065414428711
Iteration completed in 11.177123546600342
Iteration completed in 12.565053462982178
Iteration completed in 16.865437269210815


In [30]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.414795,0.132781
std,0.009682,0.021277
min,0.396508,0.10422
25%,0.407552,0.113463
50%,0.417753,0.132285
75%,0.421435,0.153688
max,0.426904,0.168115


In [31]:
pd.DataFrame(list(zip(f1_scores, exec_time, lay, pre_it, it)),
               columns =['F1-micro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
3,0.396508,115.569006,"[32, 8]",100.0,100.0
12,0.40082,216.932015,"[32, 8]",50.0,50.0
2,0.402503,108.151318,"[32, 8]",50.0,100.0
13,0.404081,516.619871,"[64, 16]",100.0,50.0
7,0.411022,254.67847,"[64, 16]",100.0,100.0
4,0.414703,128.863256,"[64, 16]",50.0,50.0
5,0.416176,174.576857,"[64, 16]",100.0,50.0
9,0.417753,438.769644,"[128, 32]",100.0,50.0
6,0.419016,220.051627,"[64, 16]",50.0,100.0
1,0.419121,71.049077,"[32, 8]",100.0,50.0


In [32]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, lay, pre_it, it)),
               columns =['F1-macro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Layers,Pre-terations,Iterations
11,0.10422,591.802661,"[128, 32]",100.0,100.0
9,0.105738,438.769644,"[128, 32]",100.0,50.0
12,0.110008,216.932015,"[32, 8]",50.0,50.0
1,0.112812,71.049077,"[32, 8]",100.0,50.0
14,0.114113,964.998477,"[128, 32]",50.0,100.0
2,0.120721,108.151318,"[32, 8]",50.0,100.0
8,0.129488,295.975242,"[128, 32]",50.0,50.0
7,0.132285,254.67847,"[64, 16]",100.0,100.0
3,0.133227,115.569006,"[32, 8]",100.0,100.0
6,0.139052,220.051627,"[64, 16]",50.0,100.0


# Heterogeneous intervals

In [6]:
node_labels = []
for bet_v in bet_l:
    ok = False
    for i in range(1, 11):
        if(bet_v <= 0.005 * i): 
            node_labels.append(i)
            ok = True
            break
    if ok: continue
    for i in range(1, 6):
        if(bet_v <= 0.05 + 0.01 * i):
            node_labels.append(10 + i)
            ok = True
            break
    if ok: continue
    if(bet_v <= 0.2):
        node_labels.append(16)
        continue
    node_labels.append(17)

In [7]:
pd.Series(node_labels).value_counts()

16    16656
17     4197
11     3126
13     3078
12     3068
14     2932
15     2820
10     1532
9      1504
8      1459
7      1389
6      1294
5      1189
4      1009
3       956
2       811
1       518
dtype: int64

### DW

In [8]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./dw/HU_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [9]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./dw/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 30.115837574005127
Iteration completed in 32.476500511169434
Iteration completed in 31.107210636138916
Iteration completed in 31.13835573196411
Iteration completed in 32.01785063743591
Iteration completed in 30.698253393173218
Iteration completed in 29.62899661064148
Iteration completed in 31.757540941238403
Iteration completed in 31.27263879776001
Iteration completed in 30.905681848526
Iteration completed in 30.75070023536682
Iteration completed in 31.000481367111206
Iteration completed in 31.11618661880493
Iteration completed in 31.192432165145874
Iteration completed in 31.375728845596313


In [10]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.347209,0.0628
std,0.005534,0.0124
min,0.33761,0.042738
25%,0.34329,0.054281
50%,0.347918,0.061188
75%,0.350757,0.072073
max,0.357909,0.084346


In [11]:
pd.DataFrame(list(zip(f1_scores, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
9,0.33761,369.003128,20.0,20.0
10,0.341502,623.870464,40.0,20.0
11,0.341502,1184.07436,80.0,20.0
12,0.34308,229.253428,10.0,30.0
3,0.3435,266.070136,80.0,5.0
6,0.3435,334.773742,40.0,10.0
5,0.345499,170.240475,20.0,10.0
7,0.347918,681.768971,80.0,10.0
13,0.348549,476.520527,20.0,30.0
8,0.3496,185.26939,10.0,20.0


In [12]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
1,0.042738,68.210582,20.0,5.0
4,0.044954,87.543224,10.0,10.0
5,0.051818,170.240475,20.0,10.0
2,0.053921,133.75116,40.0,5.0
8,0.054641,185.26939,10.0,20.0
12,0.057821,229.253428,10.0,30.0
9,0.058608,369.003128,20.0,20.0
3,0.061188,266.070136,80.0,5.0
6,0.063777,334.773742,40.0,10.0
0,0.069298,36.506493,10.0,5.0


### N2V

In [13]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./n2v/HU_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [14]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./n2v/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 38.341655254364014
Iteration completed in 38.568734884262085
Iteration completed in 41.22926902770996
Iteration completed in 44.116750717163086
Iteration completed in 38.37522339820862
Iteration completed in 38.18707871437073
Iteration completed in 38.648335218429565
Iteration completed in 42.00413537025452
Iteration completed in 39.402199268341064
Iteration completed in 40.688517808914185
Iteration completed in 40.05982494354248
Iteration completed in 38.78739881515503
Iteration completed in 34.25416707992554
Iteration completed in 34.26397180557251
Iteration completed in 34.61343479156494
Iteration completed in 37.97807240486145
Iteration completed in 34.07952070236206
Iteration completed in 33.774298667907715
Iteration completed in 34.117981910705566
Iteration completed in 36.5487699508667
Iteration completed in 33.81985139846802
Iteration completed in 33.71766519546509
Iteration completed in 34.44379687309265
Iteration completed in 37.91183376312256


In [15]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,24.0,24.0
mean,0.402214,0.191967
std,0.022848,0.039004
min,0.364325,0.108316
25%,0.392354,0.181027
50%,0.40487,0.210624
75%,0.419463,0.217255
max,0.438263,0.227043


In [16]:
pd.DataFrame(list(zip(f1_scores, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
11,0.364325,468.974333,0.5,0.5,80.0,10.0
15,0.365587,499.899741,1.0,0.5,80.0,10.0
23,0.367375,500.880916,2.0,1.0,80.0,10.0
3,0.368006,436.778427,1.0,1.0,80.0,10.0
7,0.37074,485.59117,0.5,1.0,80.0,10.0
19,0.373265,461.019866,1.0,2.0,80.0,10.0
10,0.398717,254.258841,0.5,0.5,40.0,10.0
14,0.399558,257.42323,1.0,0.5,40.0,10.0
6,0.399663,253.726269,0.5,1.0,40.0,10.0
22,0.403555,248.845797,2.0,1.0,40.0,10.0


In [17]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,p,q,walk_num,walk_len
15,0.108316,499.899741,1.0,0.5,80.0,10.0
11,0.116046,468.974333,0.5,0.5,80.0,10.0
3,0.124314,436.778427,1.0,1.0,80.0,10.0
23,0.129331,500.880916,2.0,1.0,80.0,10.0
7,0.141753,485.59117,0.5,1.0,80.0,10.0
19,0.148796,461.019866,1.0,2.0,80.0,10.0
10,0.191771,254.258841,0.5,0.5,40.0,10.0
14,0.204335,257.42323,1.0,0.5,40.0,10.0
5,0.20547,219.425555,0.5,1.0,80.0,5.0
22,0.206355,248.845797,2.0,1.0,40.0,10.0


### MNMF

In [18]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./mnmf/HU_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [19]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./mnmf/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 6.630945920944214
Iteration completed in 5.545548915863037
Iteration completed in 9.699987173080444
Iteration completed in 8.95397162437439
Iteration completed in 10.044917106628418
Iteration completed in 9.206128358840942
Iteration completed in 13.85706877708435
Iteration completed in 11.859480142593384


In [20]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,8.0,8.0
mean,0.338544,0.06152
std,0.014257,0.006277
min,0.315419,0.053456
25%,0.328565,0.058391
50%,0.341397,0.05994
75%,0.34939,0.064349
max,0.354018,0.072874


In [21]:
pd.DataFrame(list(zip(f1_scores, exec_time, dim, it)),
               columns =['F1-micro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
1,0.315419,285.491338,8.0,200.0
3,0.324464,701.013073,16.0,200.0
0,0.329933,95.28386,8.0,100.0
2,0.336979,421.509037,16.0,100.0
5,0.345814,2139.761206,32.0,200.0
7,0.347918,5875.845202,64.0,200.0
6,0.353807,3373.030116,64.0,100.0
4,0.354018,1211.796356,32.0,100.0


In [22]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, dim, it)),
               columns =['F1-macro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Dimensions,Iterations
1,0.053456,285.491338,8.0,200.0
0,0.055874,95.28386,8.0,100.0
4,0.05923,1211.796356,32.0,100.0
5,0.059818,2139.761206,32.0,200.0
2,0.060061,421.509037,16.0,100.0
3,0.063275,701.013073,16.0,200.0
6,0.067571,3373.030116,64.0,100.0
7,0.072874,5875.845202,64.0,200.0


### DANMF

In [23]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./danmf/HU_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [24]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./danmf/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 9.555464744567871
Iteration completed in 11.706939458847046
Iteration completed in 10.054538249969482
Iteration completed in 10.239539623260498
Iteration completed in 12.150494813919067
Iteration completed in 12.246707439422607
Iteration completed in 11.720378398895264
Iteration completed in 11.9693443775177
Iteration completed in 17.89612627029419
Iteration completed in 16.123289823532104
Iteration completed in 14.894972085952759
Iteration completed in 14.909425973892212
Iteration completed in 10.122385263442993
Iteration completed in 10.829087018966675
Iteration completed in 13.640929937362671


In [25]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.363196,0.125618
std,0.00625,0.008027
min,0.350126,0.111753
25%,0.360486,0.120386
50%,0.363378,0.126308
75%,0.368795,0.132098
max,0.369899,0.139036


In [26]:
pd.DataFrame(list(zip(f1_scores, exec_time, lay, pre_it, it)),
               columns =['F1-micro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
2,0.350126,108.151318,"[32, 8]",50.0,100.0
13,0.352335,516.619871,"[64, 16]",100.0,50.0
3,0.357383,115.569006,"[32, 8]",100.0,100.0
1,0.359802,71.049077,"[32, 8]",100.0,50.0
12,0.36117,216.932015,"[32, 8]",50.0,50.0
0,0.362537,59.408877,"[32, 8]",50.0,50.0
5,0.363063,174.576857,"[64, 16]",100.0,50.0
7,0.363378,254.67847,"[64, 16]",100.0,100.0
4,0.363799,128.863256,"[64, 16]",50.0,50.0
8,0.368321,295.975242,"[128, 32]",50.0,50.0


In [27]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, lay, pre_it, it)),
               columns =['F1-macro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Layers,Pre-terations,Iterations
7,0.111753,254.67847,"[64, 16]",100.0,100.0
13,0.114853,516.619871,"[64, 16]",100.0,50.0
3,0.119335,115.569006,"[32, 8]",100.0,100.0
6,0.120219,220.051627,"[64, 16]",50.0,100.0
4,0.120554,128.863256,"[64, 16]",50.0,50.0
5,0.120593,174.576857,"[64, 16]",100.0,50.0
9,0.121359,438.769644,"[128, 32]",100.0,50.0
2,0.126308,108.151318,"[32, 8]",50.0,100.0
14,0.127951,964.998477,"[128, 32]",50.0,100.0
1,0.127997,71.049077,"[32, 8]",100.0,50.0


# Heterogeneous pt2

In [59]:
node_labels = []
for bet_v in bet_l:
    start_val = 0.0001
    i = 1
    while(True):
        if bet_v <= start_val:
            node_labels.append(i)
            break
        else:
            i += 1
            start_val *= 1.5

In [60]:
pd.Series(node_labels).value_counts()

19    10817
18     9691
20     7536
17     6811
16     4195
15     2391
21     2341
14     1298
13      771
22      503
12      464
11      289
10      213
9        89
23       70
8        29
24       11
7        10
6         4
5         2
1         1
2         1
4         1
dtype: int64

### DW

In [65]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./dw/HU_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [66]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./dw/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 26.379839420318604
Iteration completed in 27.457216262817383
Iteration completed in 26.016130208969116
Iteration completed in 26.315436840057373
Iteration completed in 26.593748569488525
Iteration completed in 25.763792037963867
Iteration completed in 26.11982822418213
Iteration completed in 25.709057807922363
Iteration completed in 26.167402505874634
Iteration completed in 25.180063009262085
Iteration completed in 26.137375831604004
Iteration completed in 26.201395511627197
Iteration completed in 25.406941652297974
Iteration completed in 26.68206024169922
Iteration completed in 25.860597848892212


In [67]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.239952,0.069469
std,0.012262,0.015462
min,0.212768,0.048118
25%,0.238273,0.06246
50%,0.242953,0.067419
75%,0.249001,0.070399
max,0.251683,0.115942


In [68]:
pd.DataFrame(list(zip(f1_scores, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
1,0.212768,68.210582,20.0,5.0
4,0.220025,87.543224,10.0,10.0
0,0.221603,36.506493,10.0,5.0
5,0.237274,170.240475,20.0,10.0
11,0.239272,1184.07436,80.0,20.0
6,0.240429,334.773742,40.0,10.0
14,0.242112,972.867248,40.0,30.0
8,0.242953,185.26939,10.0,20.0
2,0.245372,133.75116,40.0,5.0
9,0.245688,369.003128,20.0,20.0


In [69]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
1,0.048118,68.210582,20.0,5.0
4,0.054572,87.543224,10.0,10.0
3,0.060791,266.070136,80.0,5.0
5,0.062087,170.240475,20.0,10.0
2,0.062833,133.75116,40.0,5.0
9,0.064945,369.003128,20.0,20.0
11,0.066749,1184.07436,80.0,20.0
8,0.067419,185.26939,10.0,20.0
12,0.067911,229.253428,10.0,30.0
10,0.069609,623.870464,40.0,20.0


### N2V

In [70]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./n2v/HU_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [71]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./n2v/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 29.406182289123535
Iteration completed in 30.16091537475586
Iteration completed in 33.16117286682129
Iteration completed in 33.104949951171875
Iteration completed in 30.118155479431152
Iteration completed in 31.42403221130371
Iteration completed in 33.218652963638306
Iteration completed in 33.638853549957275
Iteration completed in 30.005422830581665
Iteration completed in 32.34707021713257
Iteration completed in 31.278759956359863
Iteration completed in 33.80241107940674
Iteration completed in 31.514585733413696
Iteration completed in 31.970560550689697
Iteration completed in 32.19869685173035
Iteration completed in 33.956992626190186
Iteration completed in 32.01994967460632
Iteration completed in 31.430691242218018
Iteration completed in 32.03079533576965
Iteration completed in 33.40922999382019
Iteration completed in 31.07397222518921
Iteration completed in 31.366692304611206
Iteration completed in 32.018227338790894
Iteration completed in 32.197065591812134


In [72]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,24.0,24.0
mean,0.347913,0.187943
std,0.066401,0.027744
min,0.237905,0.13897
25%,0.322755,0.170256
50%,0.354175,0.191832
75%,0.389409,0.20648
max,0.452146,0.232689


In [73]:
pd.DataFrame(list(zip(f1_scores, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
15,0.237905,499.899741,1.0,0.5,80.0,10.0
11,0.240955,468.974333,0.5,0.5,80.0,10.0
7,0.249264,485.59117,0.5,1.0,80.0,10.0
23,0.250526,500.880916,2.0,1.0,80.0,10.0
3,0.250736,436.778427,1.0,1.0,80.0,10.0
19,0.266828,461.019866,1.0,2.0,80.0,10.0
10,0.341397,254.258841,0.5,0.5,40.0,10.0
14,0.341607,257.42323,1.0,0.5,40.0,10.0
6,0.342448,253.726269,0.5,1.0,40.0,10.0
2,0.345814,235.38871,1.0,1.0,40.0,10.0


In [74]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,p,q,walk_num,walk_len
23,0.13897,500.880916,2.0,1.0,80.0,10.0
15,0.144618,499.899741,1.0,0.5,80.0,10.0
11,0.144868,468.974333,0.5,0.5,80.0,10.0
7,0.150212,485.59117,0.5,1.0,80.0,10.0
19,0.153315,461.019866,1.0,2.0,80.0,10.0
3,0.153901,436.778427,1.0,1.0,80.0,10.0
1,0.175707,192.255094,1.0,1.0,80.0,5.0
8,0.180936,120.73153,0.5,0.5,40.0,5.0
16,0.181867,124.044357,1.0,2.0,40.0,5.0
9,0.183262,214.085608,0.5,0.5,80.0,5.0


### MNMF

In [75]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./mnmf/HU_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [76]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./mnmf/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 7.442131996154785
Iteration completed in 5.371655702590942
Iteration completed in 8.981937408447266
Iteration completed in 9.202645063400269
Iteration completed in 9.357093572616577
Iteration completed in 8.342885255813599
Iteration completed in 12.417726993560791
Iteration completed in 11.830991268157959


In [77]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,8.0,8.0
mean,0.210152,0.08775
std,0.009488,0.020671
min,0.198254,0.062918
25%,0.202671,0.07069
50%,0.208982,0.085565
75%,0.21779,0.106458
max,0.224337,0.114068


In [78]:
pd.DataFrame(list(zip(f1_scores, exec_time, dim, it)),
               columns =['F1-micro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
0,0.198254,95.28386,8.0,100.0
3,0.201409,701.013073,16.0,200.0
1,0.203092,285.491338,8.0,200.0
2,0.204985,421.509037,16.0,100.0
5,0.212979,2139.761206,32.0,200.0
7,0.217501,5875.845202,64.0,200.0
6,0.218658,3373.030116,64.0,100.0
4,0.224337,1211.796356,32.0,100.0


In [79]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, dim, it)),
               columns =['F1-macro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Dimensions,Iterations
1,0.062918,285.491338,8.0,200.0
3,0.06405,701.013073,16.0,200.0
2,0.072904,421.509037,16.0,100.0
6,0.085508,3373.030116,64.0,100.0
5,0.085623,2139.761206,32.0,200.0
7,0.104453,5875.845202,64.0,200.0
4,0.112474,1211.796356,32.0,100.0
0,0.114068,95.28386,8.0,100.0


### DANMF

In [80]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./danmf/HU_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [81]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./danmf/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 8.214516639709473
Iteration completed in 9.0414719581604
Iteration completed in 8.628135681152344
Iteration completed in 8.565783500671387
Iteration completed in 9.660575151443481
Iteration completed in 9.819814920425415
Iteration completed in 10.07233214378357
Iteration completed in 9.436601638793945
Iteration completed in 13.635911703109741
Iteration completed in 13.041648387908936
Iteration completed in 13.870630741119385
Iteration completed in 12.384145259857178
Iteration completed in 8.45549464225769
Iteration completed in 10.480361223220825
Iteration completed in 11.771739959716797


In [82]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.282997,0.150851
std,0.008392,0.029205
min,0.267038,0.109045
25%,0.279291,0.134749
50%,0.284602,0.141167
75%,0.286969,0.160218
max,0.295751,0.209306


In [83]:
pd.DataFrame(list(zip(f1_scores, exec_time, lay, pre_it, it)),
               columns =['F1-micro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
3,0.267038,115.569006,"[32, 8]",100.0,100.0
13,0.268511,516.619871,"[64, 16]",100.0,50.0
1,0.273349,71.049077,"[32, 8]",100.0,50.0
12,0.279028,216.932015,"[32, 8]",50.0,50.0
6,0.279554,220.051627,"[64, 16]",50.0,100.0
7,0.28292,254.67847,"[64, 16]",100.0,100.0
2,0.283235,108.151318,"[32, 8]",50.0,100.0
4,0.284602,128.863256,"[64, 16]",50.0,50.0
5,0.285654,174.576857,"[64, 16]",100.0,50.0
11,0.286075,591.802661,"[128, 32]",100.0,100.0


In [84]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, lay, pre_it, it)),
               columns =['F1-macro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Layers,Pre-terations,Iterations
3,0.109045,115.569006,"[32, 8]",100.0,100.0
7,0.116796,254.67847,"[64, 16]",100.0,100.0
13,0.12354,516.619871,"[64, 16]",100.0,50.0
9,0.132222,438.769644,"[128, 32]",100.0,50.0
5,0.137275,174.576857,"[64, 16]",100.0,50.0
11,0.13773,591.802661,"[128, 32]",100.0,100.0
0,0.140965,59.408877,"[32, 8]",50.0,50.0
2,0.141167,108.151318,"[32, 8]",50.0,100.0
1,0.152259,71.049077,"[32, 8]",100.0,50.0
10,0.154041,453.801099,"[128, 32]",50.0,100.0
