In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

In [2]:
G = nx.read_edgelist("./HU_edges_norm.csv")

In [3]:
### Calculating pagerank score for each node
i_time = time.time()
bet = nx.degree(G)
print(f"Computed degrees in: {time.time() - i_time}")

Computed degrees in: 0.00015544891357421875


In [4]:
### Pagerank scores normalization
bet_l = []
max_v = max(dict(bet).values())
min_v = min(dict(bet).values())
for i in range(1, len(bet) + 1):
    bet_norm = (bet[str(i)] - min_v) / (max_v - min_v) 
    bet_l.append(bet_norm)

# Heterogeneous intervals

In [27]:
node_labels = []
for bet_v in bet_l:
    start_val = 0.0001
    i = 1
    while(True):
        if bet_v <= start_val:
            node_labels.append(i)
            break
        else:
            i += 1
            start_val *= 1.5

In [28]:
pd.Series(node_labels).value_counts()

17    9215
19    8755
18    7036
20    3938
14    3683
15    3655
16    3535
13    3411
1     2701
21    1265
22     288
23      49
24       7
dtype: int64

### DW

In [30]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./dw/HU_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [31]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./dw/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 38.87402820587158
Iteration completed in 35.20384073257446
Iteration completed in 31.01824188232422
Iteration completed in 30.92821168899536
Iteration completed in 35.477919578552246
Iteration completed in 32.00752639770508
Iteration completed in 32.52510094642639
Iteration completed in 32.41688561439514
Iteration completed in 32.86967849731445
Iteration completed in 31.808719873428345
Iteration completed in 32.258827924728394
Iteration completed in 33.3677339553833
Iteration completed in 33.9418044090271
Iteration completed in 38.061683893203735
Iteration completed in 32.8541305065155


In [32]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.239335,0.125044
std,0.017331,0.021157
min,0.202146,0.091187
25%,0.231437,0.108021
50%,0.243479,0.135347
75%,0.252051,0.14113
max,0.260412,0.152305


In [33]:
pd.DataFrame(list(zip(f1_scores, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
0,0.202146,36.506493,10.0,5.0
1,0.211296,68.210582,20.0,5.0
4,0.220236,87.543224,10.0,10.0
8,0.227177,185.26939,10.0,20.0
2,0.235696,133.75116,40.0,5.0
5,0.238115,170.240475,20.0,10.0
12,0.240429,229.253428,10.0,30.0
9,0.243479,369.003128,20.0,20.0
13,0.246634,476.520527,20.0,30.0
14,0.247686,972.867248,40.0,30.0


In [34]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
1,0.091187,68.210582,20.0,5.0
0,0.09346,36.506493,10.0,5.0
4,0.096008,87.543224,10.0,10.0
2,0.106018,133.75116,40.0,5.0
5,0.110024,170.240475,20.0,10.0
8,0.118907,185.26939,10.0,20.0
12,0.122965,229.253428,10.0,30.0
6,0.135347,334.773742,40.0,10.0
9,0.135589,369.003128,20.0,20.0
13,0.136475,476.520527,20.0,30.0


### N2V

In [35]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./n2v/HU_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [36]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./n2v/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 41.58656859397888
Iteration completed in 44.54573941230774
Iteration completed in 48.143094539642334
Iteration completed in 46.87595009803772
Iteration completed in 43.26291108131409
Iteration completed in 43.086708545684814
Iteration completed in 47.086190938949585
Iteration completed in 48.07143568992615
Iteration completed in 42.14041495323181
Iteration completed in 43.60881853103638
Iteration completed in 44.603023052215576
Iteration completed in 44.4342098236084
Iteration completed in 41.49005722999573
Iteration completed in 40.40008330345154
Iteration completed in 40.70808720588684
Iteration completed in 39.227330446243286
Iteration completed in 36.974361419677734
Iteration completed in 39.27074432373047
Iteration completed in 44.09617257118225
Iteration completed in 45.76300883293152
Iteration completed in 40.59103012084961
Iteration completed in 43.08279633522034
Iteration completed in 46.42841625213623
Iteration completed in 45.49763870239258


In [37]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,24.0,24.0
mean,0.275741,0.165481
std,0.023591,0.022877
min,0.235907,0.122853
25%,0.265382,0.150436
50%,0.275452,0.172785
75%,0.288336,0.180468
max,0.316891,0.197737


In [38]:
pd.DataFrame(list(zip(f1_scores, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
3,0.235907,436.778427,1.0,1.0,80.0,10.0
7,0.238746,485.59117,0.5,1.0,80.0,10.0
15,0.243164,499.899741,1.0,0.5,80.0,10.0
19,0.24411,461.019866,1.0,2.0,80.0,10.0
11,0.245477,468.974333,0.5,0.5,80.0,10.0
23,0.256942,500.880916,2.0,1.0,80.0,10.0
5,0.268195,219.425555,0.5,1.0,80.0,5.0
17,0.271771,208.078459,1.0,2.0,80.0,5.0
6,0.272402,253.726269,0.5,1.0,40.0,10.0
9,0.272928,214.085608,0.5,0.5,80.0,5.0


In [39]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,p,q,walk_num,walk_len
15,0.122853,499.899741,1.0,0.5,80.0,10.0
3,0.123998,436.778427,1.0,1.0,80.0,10.0
7,0.131896,485.59117,0.5,1.0,80.0,10.0
23,0.13695,500.880916,2.0,1.0,80.0,10.0
11,0.138044,468.974333,0.5,0.5,80.0,10.0
19,0.138847,461.019866,1.0,2.0,80.0,10.0
5,0.154298,219.425555,0.5,1.0,80.0,5.0
9,0.158368,214.085608,0.5,0.5,80.0,5.0
17,0.159942,208.078459,1.0,2.0,80.0,5.0
13,0.162778,214.618617,1.0,0.5,80.0,5.0


### MNMF

In [40]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./mnmf/HU_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [41]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./mnmf/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 8.013367652893066
Iteration completed in 6.535597801208496
Iteration completed in 11.479940414428711
Iteration completed in 10.339855194091797
Iteration completed in 12.148955821990967
Iteration completed in 10.935511112213135
Iteration completed in 16.399409532546997
Iteration completed in 13.937598943710327


In [42]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,8.0,8.0
mean,0.185173,0.087333
std,0.0131,0.006835
min,0.169752,0.078917
25%,0.17459,0.083713
50%,0.184318,0.085708
75%,0.192706,0.091293
max,0.203934,0.097361


In [43]:
pd.DataFrame(list(zip(f1_scores, exec_time, dim, it)),
               columns =['F1-micro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
2,0.169752,421.509037,16.0,100.0
0,0.17175,95.28386,8.0,100.0
1,0.175536,285.491338,8.0,200.0
3,0.18069,701.013073,16.0,200.0
5,0.187947,2139.761206,32.0,200.0
4,0.189525,1211.796356,32.0,100.0
6,0.202251,3373.030116,64.0,100.0
7,0.203934,5875.845202,64.0,200.0


In [44]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, dim, it)),
               columns =['F1-macro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Dimensions,Iterations
2,0.078917,421.509037,16.0,100.0
0,0.079902,95.28386,8.0,100.0
1,0.084984,285.491338,8.0,200.0
3,0.085615,701.013073,16.0,200.0
4,0.085802,1211.796356,32.0,100.0
5,0.089545,2139.761206,32.0,200.0
7,0.096536,5875.845202,64.0,200.0
6,0.097361,3373.030116,64.0,100.0


### DANMF

In [45]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./danmf/HU_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [46]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./danmf/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 10.955102682113647
Iteration completed in 11.783040761947632
Iteration completed in 10.01301622390747
Iteration completed in 11.206810712814331
Iteration completed in 12.876270771026611
Iteration completed in 12.710163831710815
Iteration completed in 11.920438766479492
Iteration completed in 12.169365406036377
Iteration completed in 17.824352979660034
Iteration completed in 17.489765167236328
Iteration completed in 16.623816967010498
Iteration completed in 17.391331434249878
Iteration completed in 10.902516841888428
Iteration completed in 12.261860847473145
Iteration completed in 15.568785905838013


In [47]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.331854,0.260481
std,0.010939,0.026501
min,0.310475,0.226862
25%,0.325147,0.246332
50%,0.330038,0.253416
75%,0.341607,0.267006
max,0.351283,0.323291


In [48]:
pd.DataFrame(list(zip(f1_scores, exec_time, lay, pre_it, it)),
               columns =['F1-micro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
3,0.310475,115.569006,"[32, 8]",100.0,100.0
7,0.318258,254.67847,"[64, 16]",100.0,100.0
2,0.324674,108.151318,"[32, 8]",50.0,100.0
6,0.324989,220.051627,"[64, 16]",50.0,100.0
13,0.325305,516.619871,"[64, 16]",100.0,50.0
0,0.328881,59.408877,"[32, 8]",50.0,50.0
12,0.329617,216.932015,"[32, 8]",50.0,50.0
1,0.330038,71.049077,"[32, 8]",100.0,50.0
5,0.330984,174.576857,"[64, 16]",100.0,50.0
4,0.331721,128.863256,"[64, 16]",50.0,50.0


In [49]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, lay, pre_it, it)),
               columns =['F1-macro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Layers,Pre-terations,Iterations
7,0.226862,254.67847,"[64, 16]",100.0,100.0
4,0.23592,128.863256,"[64, 16]",50.0,50.0
13,0.23894,516.619871,"[64, 16]",100.0,50.0
2,0.246309,108.151318,"[32, 8]",50.0,100.0
5,0.246355,174.576857,"[64, 16]",100.0,50.0
3,0.247178,115.569006,"[32, 8]",100.0,100.0
11,0.248557,591.802661,"[128, 32]",100.0,100.0
10,0.253416,453.801099,"[128, 32]",50.0,100.0
6,0.257676,220.051627,"[64, 16]",50.0,100.0
0,0.260117,59.408877,"[32, 8]",50.0,50.0


# Heterogeneous intervals 2

In [5]:
node_labels = []
for bet_v in bet_l:
    ok = False
    for i in range(1, 11):
        if(bet_v <= 0.005 * i): 
            node_labels.append(i)
            ok = True
            break
    if ok: continue
    for i in range(1, 6):
        if(bet_v <= 0.05 + 0.01 * i):
            node_labels.append(10 + i)
            ok = True
            break
    if ok: continue
    if(bet_v <= 0.2):
        node_labels.append(16)
        continue
    node_labels.append(17)

In [6]:
pd.Series(node_labels).value_counts()

16    10235
15     4038
4      3683
6      3655
8      3535
10     3422
2      3411
11     3015
12     2778
1      2701
13     2588
14     2346
17     2131
dtype: int64

### DW

In [7]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./dw/HU_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [8]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./dw/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 39.13630533218384
Iteration completed in 36.79685163497925
Iteration completed in 35.38496422767639
Iteration completed in 34.8969886302948
Iteration completed in 40.336549043655396
Iteration completed in 35.68955111503601
Iteration completed in 46.55622625350952
Iteration completed in 42.90240740776062
Iteration completed in 33.33965611457825
Iteration completed in 32.55997347831726
Iteration completed in 32.27424716949463
Iteration completed in 32.44959330558777
Iteration completed in 32.440887689590454
Iteration completed in 32.288761615753174
Iteration completed in 32.18465280532837


In [9]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.239132,0.116215
std,0.01662,0.017918
min,0.207404,0.081776
25%,0.235276,0.110229
50%,0.245583,0.119909
75%,0.250578,0.127667
max,0.257573,0.137506


In [10]:
pd.DataFrame(list(zip(f1_scores, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
4,0.207404,87.543224,10.0,10.0
1,0.208772,68.210582,20.0,5.0
0,0.212873,36.506493,10.0,5.0
2,0.234224,133.75116,40.0,5.0
8,0.236327,185.26939,10.0,20.0
5,0.236643,170.240475,20.0,10.0
12,0.245267,229.253428,10.0,30.0
10,0.245583,623.870464,40.0,20.0
9,0.247581,369.003128,20.0,20.0
13,0.247791,476.520527,20.0,30.0


In [11]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
0,0.081776,36.506493,10.0,5.0
1,0.084994,68.210582,20.0,5.0
4,0.090742,87.543224,10.0,10.0
8,0.109351,185.26939,10.0,20.0
5,0.111108,170.240475,20.0,10.0
2,0.115398,133.75116,40.0,5.0
12,0.116072,229.253428,10.0,30.0
9,0.119909,369.003128,20.0,20.0
6,0.125449,334.773742,40.0,10.0
13,0.125673,476.520527,20.0,30.0


### N2V

In [12]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./n2v/HU_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [13]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./n2v/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 43.82649540901184
Iteration completed in 42.90385580062866
Iteration completed in 46.68896269798279
Iteration completed in 43.24722194671631
Iteration completed in 37.8803448677063
Iteration completed in 38.90583395957947
Iteration completed in 40.768834829330444
Iteration completed in 42.686275482177734
Iteration completed in 36.16690731048584
Iteration completed in 35.93584966659546
Iteration completed in 37.88871669769287
Iteration completed in 40.45774269104004
Iteration completed in 37.43344211578369
Iteration completed in 38.16169023513794
Iteration completed in 41.72797250747681
Iteration completed in 43.24136686325073
Iteration completed in 39.85399889945984
Iteration completed in 44.175307750701904
Iteration completed in 46.7670373916626
Iteration completed in 50.3098201751709
Iteration completed in 48.13193416595459
Iteration completed in 53.128920793533325
Iteration completed in 46.11293339729309
Iteration completed in 49.41914987564087


In [14]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,24.0,24.0
mean,0.280974,0.154073
std,0.025043,0.020676
min,0.235065,0.120173
25%,0.270956,0.143273
50%,0.291912,0.157396
75%,0.296934,0.164373
max,0.311527,0.190123


In [15]:
pd.DataFrame(list(zip(f1_scores, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
15,0.235065,499.899741,1.0,0.5,80.0,10.0
3,0.236958,436.778427,1.0,1.0,80.0,10.0
7,0.238746,485.59117,0.5,1.0,80.0,10.0
11,0.241165,468.974333,0.5,0.5,80.0,10.0
19,0.242427,461.019866,1.0,2.0,80.0,10.0
23,0.245162,500.880916,2.0,1.0,80.0,10.0
9,0.279554,214.085608,0.5,0.5,80.0,5.0
14,0.284497,257.42323,1.0,0.5,40.0,10.0
17,0.289125,208.078459,1.0,2.0,80.0,5.0
1,0.289861,192.255094,1.0,1.0,80.0,5.0


In [16]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,p,q,walk_num,walk_len
19,0.120173,461.019866,1.0,2.0,80.0,10.0
15,0.120541,499.899741,1.0,0.5,80.0,10.0
7,0.122872,485.59117,0.5,1.0,80.0,10.0
3,0.123027,436.778427,1.0,1.0,80.0,10.0
11,0.123483,468.974333,0.5,0.5,80.0,10.0
23,0.135904,500.880916,2.0,1.0,80.0,10.0
9,0.14573,214.085608,0.5,0.5,80.0,5.0
1,0.153078,192.255094,1.0,1.0,80.0,5.0
17,0.153347,208.078459,1.0,2.0,80.0,5.0
14,0.154686,257.42323,1.0,0.5,40.0,10.0


### MNMF

In [17]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./mnmf/HU_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [18]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./mnmf/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 10.717861652374268
Iteration completed in 10.21182894706726
Iteration completed in 17.41844654083252
Iteration completed in 16.334556579589844
Iteration completed in 18.308497667312622
Iteration completed in 14.793391942977905
Iteration completed in 17.975183963775635
Iteration completed in 21.75452423095703


In [19]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,8.0,8.0
mean,0.188552,0.08154
std,0.014092,0.006158
min,0.168069,0.071533
25%,0.180979,0.078302
50%,0.187316,0.080664
75%,0.198754,0.085731
max,0.21056,0.091111


In [20]:
pd.DataFrame(list(zip(f1_scores, exec_time, dim, it)),
               columns =['F1-micro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
0,0.168069,95.28386,8.0,100.0
1,0.173643,285.491338,8.0,200.0
3,0.183424,701.013073,16.0,200.0
2,0.184687,421.509037,16.0,100.0
5,0.189945,2139.761206,32.0,200.0
7,0.198464,5875.845202,64.0,200.0
4,0.199621,1211.796356,32.0,100.0
6,0.21056,3373.030116,64.0,100.0


In [21]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, dim, it)),
               columns =['F1-macro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Dimensions,Iterations
0,0.071533,95.28386,8.0,100.0
1,0.077341,285.491338,8.0,200.0
5,0.078623,2139.761206,32.0,200.0
2,0.079951,421.509037,16.0,100.0
4,0.081377,1211.796356,32.0,100.0
3,0.085268,701.013073,16.0,200.0
7,0.087119,5875.845202,64.0,200.0
6,0.091111,3373.030116,64.0,100.0


### DANMF

In [22]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./danmf/HU_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [23]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
f1_scores = []
f1_scores_macro = []
for i in range(tests_num):
    start_time = time.time()
    data = pd.read_csv("./danmf/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()
 
    X_data = list(map(lambda x: sorted(x, reverse=True), data))
    y_data = node_labels

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    print(f"Iteration completed in {time.time() - start_time}")

Iteration completed in 14.056573629379272
Iteration completed in 13.326437950134277
Iteration completed in 12.362221002578735
Iteration completed in 12.773163080215454
Iteration completed in 17.15690016746521
Iteration completed in 15.831449031829834
Iteration completed in 16.470604419708252
Iteration completed in 15.7089102268219
Iteration completed in 26.63672637939453
Iteration completed in 22.0832941532135
Iteration completed in 18.716891050338745
Iteration completed in 19.08872127532959
Iteration completed in 12.271103858947754
Iteration completed in 14.065866231918335
Iteration completed in 19.48688006401062


In [24]:
df = pd.DataFrame(columns = ["Micro", 'Macro'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro
count,15.0,15.0
mean,0.306226,0.21208
std,0.010547,0.01005
min,0.287127,0.192461
25%,0.299853,0.206267
50%,0.305427,0.2102
75%,0.311895,0.21753
max,0.328881,0.233273


In [25]:
pd.DataFrame(list(zip(f1_scores, exec_time, lay, pre_it, it)),
               columns =['F1-micro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
3,0.287127,115.569006,"[32, 8]",100.0,100.0
2,0.293858,108.151318,"[32, 8]",50.0,100.0
6,0.296066,220.051627,"[64, 16]",50.0,100.0
7,0.299011,254.67847,"[64, 16]",100.0,100.0
13,0.300694,516.619871,"[64, 16]",100.0,50.0
1,0.301851,71.049077,"[32, 8]",100.0,50.0
12,0.30427,216.932015,"[32, 8]",50.0,50.0
14,0.305427,964.998477,"[128, 32]",50.0,100.0
0,0.309424,59.408877,"[32, 8]",50.0,50.0
11,0.31016,591.802661,"[128, 32]",100.0,100.0


In [26]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, lay, pre_it, it)),
               columns =['F1-macro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Layers,Pre-terations,Iterations
3,0.192461,115.569006,"[32, 8]",100.0,100.0
7,0.202353,254.67847,"[64, 16]",100.0,100.0
2,0.204525,108.151318,"[32, 8]",50.0,100.0
6,0.204895,220.051627,"[64, 16]",50.0,100.0
13,0.207639,516.619871,"[64, 16]",100.0,50.0
14,0.209153,964.998477,"[128, 32]",50.0,100.0
1,0.209357,71.049077,"[32, 8]",100.0,50.0
5,0.2102,174.576857,"[64, 16]",100.0,50.0
12,0.211571,216.932015,"[32, 8]",50.0,50.0
11,0.21481,591.802661,"[128, 32]",100.0,100.0
