In [4]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
from cdlib import algorithms
import random
import csv
%matplotlib inline

In [5]:
### Identifying Leiden communities
# Load the graph from the edge-list file. No node type is passed, and later cells look
# nodes up with str(i), so node ids are handled as strings throughout the notebook.
network = nx.read_edgelist("./RO_edges_norm.csv")
nodes = network.nodes()
# Leiden partition computed through cdlib; it is turned into the classification labels below.
# NOTE(review): no seed is passed here — confirm whether the partition is stable across runs.
comms = algorithms.leiden(network)

In [6]:
# Flatten the node -> [community ids] mapping to node -> first community id,
# keeping exactly one label for every node of the graph.
memberships = comms.to_node_community_map()
comms_dict_ok = {node: memberships[node][0] for node in nodes}
comms_dict = comms_dict_ok

### N2V

In [7]:
### Getting total tests number
# Scan the N2V run log once: count the "Test" lines and collect, per run, the
# embedding time and the p / q / walk_length / num_walk settings.
tests_num = 0
exec_time, p, q, walk_len, walk_num = [], [], [], [], []
# (marker looked for in the line, destination list, fixed-column slice holding the number)
n2v_fields = (
    ("Embedding", exec_time, slice(31, -2)),
    ("p:", p, slice(3, None)),
    ("q:", q, slice(3, None)),
    ("walk_length:", walk_len, slice(13, -1)),
    ("num_walk:", walk_num, slice(9, -1)),
)
with open("./RO_n2v/RO_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        # A line may match several markers; every match is recorded, in table order.
        for marker, values, columns in n2v_fields:
            if marker in line:
                values.append(float(line[columns]))

In [9]:
f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []

# Ground-truth labels: the Leiden community of nodes "1".."N". They do not depend on the
# embedding under test, so they are built once instead of once per iteration. Direct
# indexing (instead of .get) makes a missing node id fail here with a KeyError rather
# than silently producing a None label.
# assumes row k of every embedding file belongs to node str(k + 1) — TODO confirm
y_data = [comms_dict[str(node_id)] for node_id in range(1, len(comms_dict) + 1)]

for i in range(tests_num):
    ### Random forest classifier creation with 70 trees
    clf = RandomForestClassifier(n_estimators=70)
    s_t = time.time()
    # Input: embedding vectors produced by the i-th N2V run (";"-separated, no header)
    X_data = pd.read_csv("./RO_n2v/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values

    # Split the data into training set and test set
    # NOTE: neither the split nor the forest is seeded, so scores change between runs
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### F1 metrics (micro, macro and weighted averages)
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    print("Completed iteration in:", time.time() - s_t)

Completed iteration in: 33.66484761238098
Completed iteration in: 33.060821771621704
Completed iteration in: 33.314762592315674
Completed iteration in: 33.3178346157074
Completed iteration in: 33.90935254096985
Completed iteration in: 33.448808431625366
Completed iteration in: 33.293044328689575
Completed iteration in: 33.25424885749817
Completed iteration in: 32.574159145355225
Completed iteration in: 35.70158553123474
Completed iteration in: 36.46818423271179
Completed iteration in: 35.61125707626343
Completed iteration in: 36.12198328971863
Completed iteration in: 32.88347601890564
Completed iteration in: 33.01584601402283
Completed iteration in: 32.56550049781799
Completed iteration in: 34.03177356719971
Completed iteration in: 33.948387145996094
Completed iteration in: 32.205408334732056
Completed iteration in: 33.246999979019165
Completed iteration in: 32.90428304672241
Completed iteration in: 32.91075849533081
Completed iteration in: 33.0196328163147
Completed iteration in: 38.3

In [10]:
# Summary statistics (count, mean, std, quartiles, ...) of the three F1 averages over all runs
df = pd.DataFrame(
    {
        "Micro": pd.Series(f1_scores).describe(),
        "Macro": pd.Series(f1_scores_macro).describe(),
        "Weigh": pd.Series(f1_scores_weigh).describe(),
    }
)
df

Unnamed: 0,Micro,Macro,Weigh
count,24.0,24.0,24.0
mean,0.870756,0.857189,0.872509
std,0.012072,0.023471,0.011655
min,0.844285,0.817469,0.847357
25%,0.864004,0.839161,0.86626
50%,0.868701,0.856592,0.870596
75%,0.874686,0.868655,0.876385
max,0.893716,0.902176,0.894382


In [11]:
# N2V runs ordered by ascending micro-F1, next to their embedding time and settings
n2v_micro = pd.DataFrame(
    list(zip(f1_scores, exec_time, p, q, walk_num, walk_len)),
    columns=['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len'],
)
n2v_micro.sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
7,0.844285,334.074217,0.5,1.0,80.0,10.0
19,0.852783,350.874268,1.0,2.0,80.0,10.0
5,0.859605,154.161536,0.5,1.0,80.0,5.0
11,0.86152,358.826658,0.5,0.5,80.0,10.0
3,0.863674,317.039041,1.0,1.0,80.0,10.0
18,0.863914,184.81163,1.0,2.0,40.0,10.0
22,0.864034,206.661722,2.0,1.0,40.0,10.0
6,0.86535,179.560204,0.5,1.0,40.0,10.0
17,0.866906,154.053255,1.0,2.0,80.0,5.0
10,0.867744,188.911285,0.5,0.5,40.0,10.0


In [12]:
# N2V runs ordered by ascending macro-F1, next to their embedding time and settings
n2v_macro = pd.DataFrame(
    list(zip(f1_scores_macro, exec_time, p, q, walk_num, walk_len)),
    columns=['F1-macro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len'],
)
n2v_macro.sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,p,q,walk_num,walk_len
15,0.817469,367.378012,1.0,0.5,80.0,10.0
19,0.819708,350.874268,1.0,2.0,80.0,10.0
3,0.82832,317.039041,1.0,1.0,80.0,10.0
13,0.833604,159.525569,1.0,0.5,80.0,5.0
18,0.834143,184.81163,1.0,2.0,40.0,10.0
7,0.836962,334.074217,0.5,1.0,80.0,10.0
1,0.839893,131.534873,1.0,1.0,80.0,5.0
2,0.845408,158.189576,1.0,1.0,40.0,10.0
11,0.850839,358.826658,0.5,0.5,80.0,10.0
10,0.851483,188.911285,0.5,0.5,40.0,10.0


In [13]:
# N2V runs ordered by ascending weighted F1, next to their embedding time and settings
n2v_weigh = pd.DataFrame(
    list(zip(f1_scores_weigh, exec_time, p, q, walk_num, walk_len)),
    columns=['F1-weigh', 'Exec time', 'p', 'q', 'walk_num', 'walk_len'],
)
n2v_weigh.sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,p,q,walk_num,walk_len
7,0.847357,334.074217,0.5,1.0,80.0,10.0
19,0.855248,350.874268,1.0,2.0,80.0,10.0
5,0.862211,154.161536,0.5,1.0,80.0,5.0
11,0.86295,358.826658,0.5,0.5,80.0,10.0
3,0.865825,317.039041,1.0,1.0,80.0,10.0
18,0.865863,184.81163,1.0,2.0,40.0,10.0
22,0.866393,206.661722,2.0,1.0,40.0,10.0
6,0.866621,179.560204,0.5,1.0,40.0,10.0
17,0.868811,154.053255,1.0,2.0,80.0,5.0
10,0.869334,188.911285,0.5,0.5,40.0,10.0


### DW

In [14]:
### Getting total tests number
# Scan the DW run log once: count the "Test" lines and collect, per run, the
# walk_length / num_walk settings and the embedding time.
tests_num = 0
exec_time, walk_num, walk_len = [], [], []
# (marker looked for in the line, destination list, fixed-column slice holding the number)
dw_fields = (
    ("walk_length:", walk_len, slice(13, -1)),
    ("num_walk:", walk_num, slice(9, -1)),
    ("Embedding", exec_time, slice(31, -2)),
)
with open("./RO_dw/RO_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        # A line may match several markers; every match is recorded, in table order.
        for marker, values, columns in dw_fields:
            if marker in line:
                values.append(float(line[columns]))

In [15]:
f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []

# Ground-truth labels: the Leiden community of nodes "1".."N". They do not depend on the
# embedding under test, so they are built once instead of once per iteration. Direct
# indexing (instead of .get) makes a missing node id fail here with a KeyError rather
# than silently producing a None label.
# assumes row k of every embedding file belongs to node str(k + 1) — TODO confirm
y_data = [comms_dict[str(node_id)] for node_id in range(1, len(comms_dict) + 1)]

for i in range(tests_num):
    ### Random forest classifier creation with 70 trees
    clf = RandomForestClassifier(n_estimators=70)
    s_t = time.time()
    # Input: embedding vectors produced by the i-th DW run (";"-separated, no header)
    X_data = pd.read_csv("./RO_dw/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values

    # Split the data into training set and test set
    # NOTE: neither the split nor the forest is seeded, so scores change between runs
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### F1 metrics (micro, macro and weighted averages)
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    print("Completed iteration in:", time.time() - s_t)

Completed iteration in: 31.25110125541687
Completed iteration in: 30.972058534622192
Completed iteration in: 29.98250126838684
Completed iteration in: 30.41923689842224
Completed iteration in: 30.747735500335693
Completed iteration in: 30.229262590408325
Completed iteration in: 30.065194845199585
Completed iteration in: 29.759159088134766
Completed iteration in: 30.53494691848755
Completed iteration in: 30.191242694854736
Completed iteration in: 30.06365728378296
Completed iteration in: 29.843146085739136
Completed iteration in: 31.135734796524048
Completed iteration in: 30.55040216445923
Completed iteration in: 30.433812379837036


In [16]:
# Summary statistics (count, mean, std, quartiles, ...) of the three F1 averages over all runs
df = pd.DataFrame(
    {
        "Micro": pd.Series(f1_scores).describe(),
        "Macro": pd.Series(f1_scores_macro).describe(),
        "Weigh": pd.Series(f1_scores_weigh).describe(),
    }
)
df

Unnamed: 0,Micro,Macro,Weigh
count,15.0,15.0,15.0
mean,0.835931,0.828938,0.839026
std,0.055493,0.064807,0.053083
min,0.681269,0.676354,0.693359
25%,0.82687,0.813979,0.829493
50%,0.856014,0.855299,0.858269
75%,0.871873,0.869649,0.874042
max,0.875763,0.884363,0.877601


In [17]:
# DW runs ordered by ascending micro-F1, next to their embedding time and walk settings
dw_micro = pd.DataFrame(
    list(zip(f1_scores, exec_time, walk_num, walk_len)),
    columns=['F1-micro', 'Exec time', 'walk_num', 'walk_len'],
)
dw_micro.sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
0,0.681269,25.814546,10.0,5.0
1,0.752723,49.297842,20.0,5.0
2,0.791861,94.867974,40.0,5.0
3,0.813046,178.922319,80.0,5.0
4,0.840694,59.892339,10.0,10.0
5,0.851227,119.407379,20.0,10.0
7,0.85386,446.217201,80.0,10.0
6,0.856014,232.0977,40.0,10.0
8,0.862358,130.357413,10.0,20.0
12,0.868462,201.832897,10.0,30.0


In [18]:
# DW runs ordered by ascending macro-F1, next to their embedding time and walk settings
dw_macro = pd.DataFrame(
    list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
    columns=['F1-macro', 'Exec time', 'walk_num', 'walk_len'],
)
dw_macro.sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
0,0.676354,25.814546,10.0,5.0
1,0.697361,49.297842,20.0,5.0
2,0.785363,94.867974,40.0,5.0
3,0.797358,178.922319,80.0,5.0
4,0.8306,59.892339,10.0,10.0
12,0.830824,201.832897,10.0,30.0
5,0.85471,119.407379,20.0,10.0
13,0.855299,398.917009,20.0,30.0
9,0.857296,256.848043,20.0,20.0
7,0.859609,446.217201,80.0,10.0


In [19]:
# DW runs ordered by ascending weighted F1, next to their embedding time and walk settings
dw_weigh = pd.DataFrame(
    list(zip(f1_scores_weigh, exec_time, walk_num, walk_len)),
    columns=['F1-weigh', 'Exec time', 'walk_num', 'walk_len'],
)
dw_weigh.sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,walk_num,walk_len
0,0.693359,25.814546,10.0,5.0
1,0.756983,49.297842,20.0,5.0
2,0.796528,94.867974,40.0,5.0
3,0.81561,178.922319,80.0,5.0
4,0.843376,59.892339,10.0,10.0
5,0.853531,119.407379,20.0,10.0
7,0.856036,446.217201,80.0,10.0
6,0.858269,232.0977,40.0,10.0
8,0.864127,130.357413,10.0,20.0
12,0.869995,201.832897,10.0,30.0


### MNMF

In [20]:
### Getting total tests number
# Scan the MNMF run log once: count the "Test" lines and collect, per run, the
# dimensions / iterations settings and the embedding time.
tests_num = 0
exec_time, dim, it = [], [], []
# (marker looked for in the line, destination list, fixed-column slice holding the number)
mnmf_fields = (
    ("dimensions", dim, slice(12, None)),
    ("iterations:", it, slice(12, None)),
    ("Embedding", exec_time, slice(31, -2)),
)
with open("./RO_mnmf/RO_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        # A line may match several markers; every match is recorded, in table order.
        for marker, values, columns in mnmf_fields:
            if marker in line:
                values.append(float(line[columns]))

In [21]:
f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []

# Ground-truth labels: the Leiden community of nodes "1".."N". They do not depend on the
# embedding under test, so they are built once instead of once per iteration. Direct
# indexing (instead of .get) makes a missing node id fail here with a KeyError rather
# than silently producing a None label.
# assumes row k of every embedding file belongs to node str(k + 1) — TODO confirm
y_data = [comms_dict[str(node_id)] for node_id in range(1, len(comms_dict) + 1)]

for i in range(tests_num):
    ### Random forest classifier creation with 70 trees
    clf = RandomForestClassifier(n_estimators=70)
    s_t = time.time()
    # Input: embedding vectors produced by the i-th MNMF run (";"-separated, no header)
    X_data = pd.read_csv("./RO_mnmf/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values

    # Split the data into training set and test set
    # NOTE: neither the split nor the forest is seeded, so scores change between runs
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### F1 metrics (micro, macro and weighted averages)
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    print("Completed iteration in:", time.time() - s_t)

Completed iteration in: 6.175420045852661
Completed iteration in: 5.406421184539795
Completed iteration in: 10.865065813064575
Completed iteration in: 9.282708644866943
Completed iteration in: 15.469955205917358
Completed iteration in: 12.890292882919312
Completed iteration in: 25.571574926376343
Completed iteration in: 19.732340574264526


In [22]:
# Summary statistics (count, mean, std, quartiles, ...) of the three F1 averages over all runs
df = pd.DataFrame(
    {
        "Micro": pd.Series(f1_scores).describe(),
        "Macro": pd.Series(f1_scores_macro).describe(),
        "Weigh": pd.Series(f1_scores_weigh).describe(),
    }
)
df

Unnamed: 0,Micro,Macro,Weigh
count,8.0,8.0,8.0
mean,0.742175,0.606125,0.734691
std,0.14331,0.169908,0.150241
min,0.508677,0.343271,0.490102
25%,0.674387,0.519316,0.663964
50%,0.788929,0.654659,0.782585
75%,0.855266,0.738135,0.853204
max,0.866188,0.76761,0.865468


In [23]:
# MNMF runs ordered by ascending micro-F1, next to their embedding time and settings
mnmf_micro = pd.DataFrame(
    list(zip(f1_scores, exec_time, dim, it)),
    columns=['F1-micro', 'Exec time', 'Dimensions', 'Iterations'],
)
mnmf_micro.sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
1,0.508677,144.387542,8.0,200.0
0,0.554279,47.664274,8.0,100.0
3,0.714423,336.308679,16.0,200.0
2,0.731777,207.895045,16.0,100.0
4,0.84608,487.69828,32.0,100.0
5,0.852543,772.527251,32.0,200.0
7,0.863435,1883.460323,64.0,200.0
6,0.866188,1141.203295,64.0,100.0


In [24]:
# MNMF runs ordered by ascending macro-F1, next to their embedding time and settings
mnmf_macro = pd.DataFrame(
    list(zip(f1_scores_macro, exec_time, dim, it)),
    columns=['F1-macro', 'Exec time', 'Dimensions', 'Iterations'],
)
mnmf_macro.sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Dimensions,Iterations
1,0.343271,144.387542,8.0,200.0
0,0.379064,47.664274,8.0,100.0
3,0.566067,336.308679,16.0,200.0
2,0.576888,207.895045,16.0,100.0
4,0.732431,487.69828,32.0,100.0
5,0.734436,772.527251,32.0,200.0
7,0.74923,1883.460323,64.0,200.0
6,0.76761,1141.203295,64.0,100.0


In [25]:
# MNMF runs ordered by ascending weighted F1, next to their embedding time and settings
mnmf_weigh = pd.DataFrame(
    list(zip(f1_scores_weigh, exec_time, dim, it)),
    columns=['F1-weigh', 'Exec time', 'Dimensions', 'Iterations'],
)
mnmf_weigh.sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,Dimensions,Iterations
1,0.490102,144.387542,8.0,200.0
0,0.538368,47.664274,8.0,100.0
3,0.705829,336.308679,16.0,200.0
2,0.721486,207.895045,16.0,100.0
4,0.843684,487.69828,32.0,100.0
5,0.850115,772.527251,32.0,200.0
7,0.862474,1883.460323,64.0,200.0
6,0.865468,1141.203295,64.0,100.0


### DANMF

In [26]:
### Getting total tests number
# Scan the DANMF run log once: count the "Test" lines and collect, per run, the
# layers (kept as text), pre_iterations, iterations and the embedding time.
tests_num = 0
exec_time, lay, it, pre_it = [], [], [], []
# (line predicate, destination list, fixed-column slice, converter applied to the slice)
danmf_fields = (
    (lambda text: "layers:" in text, lay, slice(8, -1), str),
    (lambda text: "pre_iterations:" in text, pre_it, slice(16, -1), float),
    # "iterations:" is also a substring of "pre_iterations:", so those lines are excluded here
    (lambda text: "iterations:" in text and "pre_iterations" not in text, it, slice(11, -1), float),
    (lambda text: "Embedding" in text, exec_time, slice(31, -2), float),
)
with open("./RO_danmf/RO_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        for matches, values, columns, convert in danmf_fields:
            if matches(line):
                values.append(convert(line[columns]))

In [27]:
f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []

# Ground-truth labels: the Leiden community of nodes "1".."N". They do not depend on the
# embedding under test, so they are built once instead of once per iteration. Direct
# indexing (instead of .get) makes a missing node id fail here with a KeyError rather
# than silently producing a None label.
# assumes row k of every embedding file belongs to node str(k + 1) — TODO confirm
y_data = [comms_dict[str(node_id)] for node_id in range(1, len(comms_dict) + 1)]

for i in range(tests_num):
    ### Random forest classifier creation with 70 trees
    clf = RandomForestClassifier(n_estimators=70)
    s_t = time.time()
    # Input: embedding vectors produced by the i-th DANMF run (";"-separated, no header)
    X_data = pd.read_csv("./RO_danmf/RO_emb_vectors" + str(i) + ".csv", header=None, sep=";").values

    # Split the data into training set and test set
    # NOTE: neither the split nor the forest is seeded, so scores change between runs
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### F1 metrics (micro, macro and weighted averages)
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    print("Completed iteration in:", time.time() - s_t)

Completed iteration in: 12.27226209640503
Completed iteration in: 11.992717027664185
Completed iteration in: 11.950240135192871
Completed iteration in: 11.944530010223389
Completed iteration in: 15.095505714416504
Completed iteration in: 15.357762098312378
Completed iteration in: 15.906373262405396
Completed iteration in: 15.026423215866089
Completed iteration in: 26.274548053741455
Completed iteration in: 27.068122625350952
Completed iteration in: 24.83404302597046
Completed iteration in: 25.613719940185547
Completed iteration in: 11.308794021606445
Completed iteration in: 13.901341676712036
Completed iteration in: 24.727815628051758


In [28]:
# Summary statistics (count, mean, std, quartiles, ...) of the three F1 averages over all runs
df = pd.DataFrame(
    {
        "Micro": pd.Series(f1_scores).describe(),
        "Macro": pd.Series(f1_scores_macro).describe(),
        "Weigh": pd.Series(f1_scores_weigh).describe(),
    }
)
df

Unnamed: 0,Micro,Macro,Weigh
count,15.0,15.0,15.0
mean,0.651745,0.541325,0.638837
std,0.083026,0.091051,0.087293
min,0.533932,0.412477,0.516431
25%,0.552663,0.432968,0.535216
50%,0.676002,0.555966,0.663932
75%,0.728486,0.622479,0.719974
max,0.744105,0.65346,0.736194


In [29]:
# DANMF runs ordered by ascending micro-F1, next to their embedding time and settings
danmf_micro = pd.DataFrame(
    list(zip(f1_scores, exec_time, lay, pre_it, it)),
    columns=['F1-micro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations'],
)
danmf_micro.sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
2,0.533932,68.263189,"[32, 8]",50.0,100.0
3,0.53848,77.28074,"[32, 8]",100.0,100.0
12,0.541352,136.182096,"[32, 8]",200.0,200.0
1,0.549252,50.047976,"[32, 8]",100.0,50.0
0,0.556074,39.073625,"[32, 8]",50.0,50.0
6,0.667864,139.50002,"[64, 16]",50.0,100.0
7,0.674446,169.919596,"[64, 16]",100.0,100.0
13,0.676002,313.978146,"[64, 16]",200.0,200.0
4,0.677558,81.475821,"[64, 16]",50.0,50.0
5,0.687732,116.574285,"[64, 16]",100.0,50.0


In [30]:
# DANMF runs ordered by ascending macro-F1, next to their embedding time and settings
danmf_macro = pd.DataFrame(
    list(zip(f1_scores_macro, exec_time, lay, pre_it, it)),
    columns=['F1-macro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations'],
)
danmf_macro.sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Layers,Pre-terations,Iterations
3,0.412477,77.28074,"[32, 8]",100.0,100.0
2,0.422265,68.263189,"[32, 8]",50.0,100.0
1,0.428753,50.047976,"[32, 8]",100.0,50.0
0,0.431144,39.073625,"[32, 8]",50.0,50.0
12,0.434793,136.182096,"[32, 8]",200.0,200.0
7,0.546286,169.919596,"[64, 16]",100.0,100.0
6,0.552624,139.50002,"[64, 16]",50.0,100.0
13,0.555966,313.978146,"[64, 16]",200.0,200.0
4,0.565131,81.475821,"[64, 16]",50.0,50.0
5,0.588362,116.574285,"[64, 16]",100.0,50.0


In [31]:
# DANMF runs ordered by ascending weighted F1, next to their embedding time and settings
danmf_weigh = pd.DataFrame(
    list(zip(f1_scores_weigh, exec_time, lay, pre_it, it)),
    columns=['F1-weigh', 'Exec time', 'Layers', 'Pre-terations', 'Iterations'],
)
danmf_weigh.sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,Layers,Pre-terations,Iterations
2,0.516431,68.263189,"[32, 8]",50.0,100.0
3,0.518896,77.28074,"[32, 8]",100.0,100.0
12,0.522458,136.182096,"[32, 8]",200.0,200.0
1,0.531655,50.047976,"[32, 8]",100.0,50.0
0,0.538776,39.073625,"[32, 8]",50.0,50.0
6,0.655042,139.50002,"[64, 16]",50.0,100.0
7,0.661659,169.919596,"[64, 16]",100.0,100.0
4,0.663932,81.475821,"[64, 16]",50.0,50.0
13,0.664704,313.978146,"[64, 16]",200.0,200.0
5,0.675592,116.574285,"[64, 16]",100.0,50.0


### AVPRA

In [32]:
### Reading VLs from file
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
# The cells below iterate over obj and use element [1] of every item as the feature matrix.
obj = pd.read_pickle("./RO.pickled") 

In [33]:
### Random forest classifier creation with 70 trees
# The same estimator object is re-fitted from scratch for every item of obj.
clf = RandomForestClassifier(n_estimators=70)
start_time = time.time()
accuracies = []
f1_scores_macro = []
f1_scores_weigh = []

# Ground-truth labels: the Leiden community of nodes "1".."N". They are the same for
# every item, so they are built once instead of once per iteration. Direct indexing
# (instead of .get) makes a missing node id fail here with a KeyError rather than
# silently producing a None label.
# assumes row k of res[1] belongs to node str(k + 1) — TODO confirm
y_data = [comms_dict[str(node_id)] for node_id in range(1, len(comms_dict) + 1)]

for res in obj:
    s_time = time.time()
    # Input: element [1] of the item is used as the feature matrix
    X_data = res[1]

    # Split the data into training set and test set
    # NOTE: neither the split nor the forest is seeded, so scores change between runs
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric (plus macro and weighted F1)
    accuracies.append(metrics.accuracy_score(y_test, y_pred))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))

    print(f"Iteration completed in {time.time() - s_time}")
end_time = time.time()

Iteration completed in 4.4722900390625
Iteration completed in 12.09586763381958
Iteration completed in 20.341968297958374
Iteration completed in 26.611642122268677
Iteration completed in 30.516812086105347
Iteration completed in 31.165623903274536
Iteration completed in 31.37542700767517
Iteration completed in 30.765419483184814
Iteration completed in 30.13176202774048
Iteration completed in 30.482144594192505
Iteration completed in 31.360885858535767
Iteration completed in 30.7687828540802
Iteration completed in 30.70772075653076
Iteration completed in 29.6639187335968
Iteration completed in 29.365875720977783
Iteration completed in 28.843959093093872
Iteration completed in 28.62280583381653
Iteration completed in 28.03406071662903
Iteration completed in 28.120938777923584
Iteration completed in 27.55914330482483
Iteration completed in 27.584564685821533


In [34]:
### Helpers returning the index / indices of the largest value(s) of a list
def get10maxidx(l, k=10):
    """Return the indices of the ``k`` largest values of ``l``, largest value first.

    Parameters
    ----------
    l : sequence of mutually comparable values.
    k : int, optional
        How many indices to return. Defaults to 10, the original fixed size.

    Returns
    -------
    list of int
        At most ``k`` indices; fewer when ``l`` has fewer than ``k`` items.
        Equal values are ordered by descending index.

    Raises
    ------
    ValueError
        If ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative")
    # Sorting (value, index) pairs in reverse ranks by value first, then by index.
    ranked = sorted(zip(l, range(len(l))), reverse=True)
    return [idx for _, idx in ranked[:k]]
def getmaxidx(l):
    """Return the position of the first occurrence of the largest value in the list ``l``."""
    largest = max(l)
    return l.index(largest)

In [35]:
### 10MWL classification
# The same estimator object is re-fitted from scratch for every item of obj.
clf = RandomForestClassifier(n_estimators=70)
start_time2 = time.time()
accuracies2 = []
f1_scores2_macro = []
f1_scores2_weigh = []

# Ground-truth labels: the Leiden community of nodes "1".."N". They are the same for
# every item, so they are built once instead of once per iteration. Direct indexing
# (instead of .get) makes a missing node id fail here with a KeyError rather than
# silently producing a None label.
# assumes row k of res[1] belongs to node str(k + 1) — TODO confirm
y_data = [comms_dict[str(node_id)] for node_id in range(1, len(comms_dict) + 1)]

for res in obj:
    # Input: each row of res[1] is replaced by the indices of its 10 largest entries
    X_data = [get10maxidx(row) for row in res[1]]

    # Split the data into training set and test set
    # NOTE: neither the split nor the forest is seeded, so scores change between runs
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric (plus macro and weighted F1)
    accuracies2.append(metrics.accuracy_score(y_test, y_pred))
    f1_scores2_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores2_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))

end_time2 = time.time()

In [36]:
### MWL classification
# The same estimator object is re-fitted from scratch for every item of obj.
clf = RandomForestClassifier(n_estimators=70)
start_time3 = time.time()
accuracies3 = []
f1_scores3_macro = []
f1_scores3_weigh = []

# Ground-truth labels: the Leiden community of nodes "1".."N". They are the same for
# every item, so they are built once instead of once per iteration. Direct indexing
# (instead of .get) makes a missing node id fail here with a KeyError rather than
# silently producing a None label.
# assumes row k of res[1] belongs to node str(k + 1) — TODO confirm
y_data = [comms_dict[str(node_id)] for node_id in range(1, len(comms_dict) + 1)]

for res in obj:
    # Input: each row of res[1] is reduced to a single feature, the index of its largest entry
    X_data = [[getmaxidx(row)] for row in res[1]]

    # Split the data into training set and test set
    # NOTE: neither the split nor the forest is seeded, so scores change between runs
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric (plus macro and weighted F1)
    accuracies3.append(metrics.accuracy_score(y_test, y_pred))
    f1_scores3_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores3_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))

end_time3 = time.time()

In [37]:
# Reads the same edge-list file that the Leiden cell already loaded into `network`.
# NOTE(review): G is not referenced by the cells that follow — confirm it is still needed.
G = nx.read_edgelist("./RO_edges_norm.csv")

In [38]:
# Switch matplotlib to the pgf backend and typeset all figure text with LaTeX.
# NOTE(review): this runs after pyplot was imported and `%matplotlib inline` was set in
# the first cell; presumably plt.show() no longer displays figures inline — verify.
import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",  # LaTeX engine used by the pgf backend
    'font.family': 'serif',
    'text.usetex': True,          # figure text is rendered through LaTeX
    'pgf.rcfonts': False
})

### Comparison macro

In [39]:
# Plot F1-macro comparison of the three AVPRA feature variants
# x positions: 0..9, then every second value from 10 to 30 (one per item of obj)
l = list(range(0,10)) + list(range(10, 32, 2))
fig, ax = plt.subplots(figsize=(10, 6))
for scores, series_label in (
    (f1_scores_macro, "AVPRA F1-score-macro"),
    (f1_scores2_macro, "AVPRA 10MWL F1-score-macro"),
    (f1_scores3_macro, "AVPRA MWL F1-score-macro"),
):
    ax.plot(l, scores, "o", label=series_label, markersize=10)

# Dashed vertical marker at x = 18 (presumably the graph diameter — verify)
ax.axvline(x=18, label="Diametro", linestyle="--")

ax.set_xlabel("Iterazione", fontsize=20)
ax.set_ylabel("F1-score", fontsize=20)
ax.legend(loc="right", prop={'size': 16})
ax.tick_params(axis="both", labelsize=16)
ax.set_ylim(0, 1)

fig.savefig("F1_RO_AVPRA_all_macro.png", dpi=500)
plt.show()

In [40]:
# Best macro-F1 of the full AVPRA features and the x position at which it was obtained
best_macro = max(f1_scores_macro)
x_positions = list(range(0,10)) + list(range(10, 32, 2))
best_macro, x_positions[f1_scores_macro.index(best_macro)]

(0.9213696257220697, 22)

In [41]:
# Best macro-F1 of the 10MWL features and the x position at which it was obtained
best_macro2 = max(f1_scores2_macro)
x_positions = list(range(0,10)) + list(range(10, 32, 2))
best_macro2, x_positions[f1_scores2_macro.index(best_macro2)]

(0.31267536205228147, 8)

In [42]:
# Best macro-F1 of the MWL feature and the x position at which it was obtained
best_macro3 = max(f1_scores3_macro)
x_positions = list(range(0,10)) + list(range(10, 32, 2))
best_macro3, x_positions[f1_scores3_macro.index(best_macro3)]

(0.008658767197270005, 1)

### Comparison weighted

In [43]:
# Plot F1-weighted comparison of the three AVPRA feature variants
# x positions: 0..9, then every second value from 10 to 30 (one per item of obj)
l = list(range(0,10)) + list(range(10, 32, 2))
fig, ax = plt.subplots(figsize=(10, 6))
for scores, series_label in (
    (f1_scores_weigh, "AVPRA F1-score-weighted"),
    (f1_scores2_weigh, "AVPRA 10MWL F1-score-weighted"),
    (f1_scores3_weigh, "AVPRA MWL F1-score-weighted"),
):
    ax.plot(l, scores, "o", label=series_label, markersize=10)

# Dashed vertical marker at x = 18 (presumably the graph diameter — verify)
ax.axvline(x=18, label="Diametro", linestyle="--")

ax.set_xlabel("Iterazione", fontsize=20)
ax.set_ylabel("F1-score", fontsize=20)
ax.legend(loc="right", prop={'size': 16})
ax.tick_params(axis="both", labelsize=16)
ax.set_ylim(0, 1)

fig.savefig("F1_RO_AVPRA_all_weighted.png", dpi=500)
plt.show()

In [44]:
# Best weighted F1 of the full AVPRA features and the x position at which it was obtained
best_weigh = max(f1_scores_weigh)
x_positions = list(range(0,10)) + list(range(10, 32, 2))
best_weigh, x_positions[f1_scores_weigh.index(best_weigh)]

(0.9180146339672548, 22)

In [45]:
# Best weighted F1 of the 10MWL features and the x position at which it was obtained
best_weigh2 = max(f1_scores2_weigh)
x_positions = list(range(0,10)) + list(range(10, 32, 2))
best_weigh2, x_positions[f1_scores2_weigh.index(best_weigh2)]

(0.3010901162704476, 9)

In [46]:
# Best weighted F1 of the MWL feature and the x position at which it was obtained
best_weigh3 = max(f1_scores3_weigh)
x_positions = list(range(0,10)) + list(range(10, 32, 2))
best_weigh3, x_positions[f1_scores3_weigh.index(best_weigh3)]

(0.04363979123946014, 1)

In [47]:
# Plot AVPRA accuracy together with its macro-F1
# x positions: 0..9, then every second value from 10 to 30 (one per item of obj)
l = list(range(0,10)) + list(range(10, 32, 2))
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(l, accuracies, "o", label="AVPRA Accuratezza", markersize=10)
ax.plot(l, f1_scores_macro, "x", label="AVPRA F1-score-macro", color="blue", markersize=12)

# Dashed vertical marker at x = 18 (presumably the graph diameter — verify)
ax.axvline(x=18, label="Diametro", linestyle="--")

ax.set_xlabel("Iterazione", fontsize=20)
ax.set_ylabel("Accuratezza/F1-Score", fontsize=20)
ax.legend(loc="right", prop={'size': 16})
ax.tick_params(axis="both", labelsize=16)
ax.set_ylim(0, 1)

fig.savefig("F1_RO_AVPRA_macro.png", dpi=500)
plt.show()

In [48]:
# Plot AVPRA accuracy together with its weighted F1
# x positions: 0..9, then every second value from 10 to 30 (one per item of obj)
l = list(range(0,10)) + list(range(10, 32, 2))
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(l, accuracies, "o", label="AVPRA Accuratezza", markersize=10)
ax.plot(l, f1_scores_weigh, "x", label="AVPRA F1-score-weighted", color="blue", markersize=12)

# Dashed vertical marker at x = 18 (presumably the graph diameter — verify)
ax.axvline(x=18, label="Diametro", linestyle="--")

ax.set_xlabel("Iterazione", fontsize=20)
ax.set_ylabel("Accuratezza/F1-Score", fontsize=20)
ax.legend(loc="right", prop={'size': 16})
ax.tick_params(axis="both", labelsize=16)
ax.set_ylim(0, 1)

fig.savefig("F1_RO_AVPRA_weighted.png", dpi=500)
plt.show()

In [49]:
# Best accuracy of the full AVPRA features and the x position (from l) at which it was obtained
best_acc = max(accuracies)
best_acc, l[accuracies.index(best_acc)]

(0.9180131657690006, 22)

In [50]:
# Best accuracy of the 10MWL features and the x position (from l) at which it was obtained
best_acc2 = max(accuracies2)
best_acc2, l[accuracies2.index(best_acc2)]

(0.3509275882704967, 14)

In [51]:
# Best accuracy of the MWL feature and the x position (from l) at which it was obtained
best_acc3 = max(accuracies3)
best_acc3, l[accuracies3.index(best_acc3)]

(0.13847995212447636, 22)

In [52]:
# Plot accuracy comparison of the three AVPRA feature variants
# x positions: 0..9, then every second value from 10 to 30 (one per item of obj)
l = list(range(0,10)) + list(range(10, 32, 2))
fig, ax = plt.subplots(figsize=(10, 6))
for scores, series_label in (
    (accuracies, "AVPRA Accuratezza"),
    (accuracies2, "AVPRA 10MWL Accuratezza"),
    (accuracies3, "AVPRA MWL Accuratezza"),
):
    ax.plot(l, scores, "o", label=series_label, markersize=10)

# Dashed vertical marker at x = 18 (presumably the graph diameter — verify)
ax.axvline(x=18, label="Diametro", linestyle="--")

ax.set_xlabel("Iterazione", fontsize=20)
ax.set_ylabel("Accuratezza", fontsize=20)
ax.legend(loc="right", prop={'size': 16})
ax.tick_params(axis="both", labelsize=16)
ax.set_ylim(0, 1)

fig.savefig("F1_RO_AVPRA_all_micro.png", dpi=500)
plt.show()

### Only F1 micro

In [53]:
# Plot the accuracy of the full AVPRA features on its own
# x positions: 0..9, then every second value from 10 to 30 (one per item of obj)
l = list(range(0,10)) + list(range(10, 32, 2))
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(l, accuracies, "o", label="AVPRA Accuratezza", markersize=10)

# Dashed vertical marker at x = 18 (presumably the graph diameter — verify)
ax.axvline(x=18, label="Diametro", linestyle="--")

ax.set_xlabel("Iterazione", fontsize=20)
ax.set_ylabel("Accuratezza", fontsize=20)
ax.legend(loc="right", prop={'size': 16})
ax.tick_params(axis="both", labelsize=16)
ax.set_ylim(0, 1)

fig.savefig("F1_RO_AVPRA_micro.png", dpi=500)
plt.show()