In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
from cdlib import algorithms
import random
import csv
%matplotlib inline

Note: to be able to use all crisp methods, you need to install some additional packages:  {'wurlitzer', 'graph_tool'}
Note: to be able to use all overlapping methods, you need to install some additional packages:  {'ASLPAw'}
Note: to be able to use all bipartite methods, you need to install some additional packages:  {'wurlitzer'}


In [2]:
### Identifying Leiden communities
network = nx.read_edgelist("./HR_edges_norm.csv")
nodes = network.nodes()
comms = algorithms.leiden(network)

In [3]:
comms_dict = comms.to_node_community_map()
comms_dict_ok = {}
for node in nodes:
    comms_dict_ok[node] = comms_dict[node][0]
comms_dict = comms_dict_ok

### N2V

In [4]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./HR_n2v/HR_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [5]:
f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []

for i in range(tests_num):
    ### Random forest classifier creation with 70 trees
    clf = RandomForestClassifier(n_estimators=70)
    start_time = time.time()
    s_t = time.time()
    # Input 
    X_data = pd.read_csv("./HR_n2v/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()

    
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    print("Completed iteration in:", time.time() - s_t)

    end_time = time.time()

Completed iteration in: 44.800278186798096
Completed iteration in: 44.174320220947266
Completed iteration in: 43.47666931152344
Completed iteration in: 44.21663451194763
Completed iteration in: 45.48612189292908
Completed iteration in: 44.15342593193054
Completed iteration in: 44.492429971694946
Completed iteration in: 44.02188038825989
Completed iteration in: 46.43574619293213
Completed iteration in: 45.71853590011597
Completed iteration in: 44.50842809677124
Completed iteration in: 44.89097738265991
Completed iteration in: 48.4673593044281
Completed iteration in: 45.316184759140015
Completed iteration in: 46.69828939437866
Completed iteration in: 47.35734510421753
Completed iteration in: 46.63527297973633
Completed iteration in: 44.04982662200928
Completed iteration in: 41.98779892921448
Completed iteration in: 40.65975546836853
Completed iteration in: 42.51383066177368
Completed iteration in: 40.26958394050598
Completed iteration in: 40.2605619430542
Completed iteration in: 40.48995

In [6]:
df = pd.DataFrame(columns = ["Micro", 'Macro', 'Weigh'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Weigh"] = pd.Series(f1_scores_weigh).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro,Weigh
count,24.0,24.0,24.0
mean,0.903772,0.898187,0.905052
std,0.019629,0.02444,0.019279
min,0.873843,0.857463,0.875545
25%,0.892327,0.884539,0.893742
50%,0.901466,0.897334,0.902649
75%,0.912849,0.909917,0.914139
max,0.935502,0.937019,0.936115


In [7]:
pd.DataFrame(list(zip(f1_scores, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
7,0.873843,687.228601,0.5,1.0,80.0,10.0
11,0.878058,700.444418,0.5,0.5,80.0,10.0
3,0.87934,575.860253,1.0,1.0,80.0,10.0
19,0.880989,695.624031,1.0,2.0,80.0,10.0
15,0.881173,696.772516,1.0,0.5,80.0,10.0
23,0.886761,709.758869,2.0,1.0,80.0,10.0
6,0.894182,397.617808,0.5,1.0,40.0,10.0
10,0.896106,388.473305,0.5,0.5,40.0,10.0
2,0.897755,331.992402,1.0,1.0,40.0,10.0
18,0.898672,383.037323,1.0,2.0,40.0,10.0


In [8]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,p,q,walk_num,walk_len
7,0.857463,687.228601,0.5,1.0,80.0,10.0
11,0.86297,700.444418,0.5,0.5,80.0,10.0
19,0.868242,695.624031,1.0,2.0,80.0,10.0
15,0.869806,696.772516,1.0,0.5,80.0,10.0
3,0.871442,575.860253,1.0,1.0,80.0,10.0
23,0.877616,709.758869,2.0,1.0,80.0,10.0
22,0.886847,400.727971,2.0,1.0,40.0,10.0
18,0.887616,383.037323,1.0,2.0,40.0,10.0
10,0.88832,388.473305,0.5,0.5,40.0,10.0
6,0.890717,397.617808,0.5,1.0,40.0,10.0


In [9]:
pd.DataFrame(list(zip(f1_scores_weigh, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-weigh', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,p,q,walk_num,walk_len
7,0.875545,687.228601,0.5,1.0,80.0,10.0
11,0.879575,700.444418,0.5,0.5,80.0,10.0
3,0.880964,575.860253,1.0,1.0,80.0,10.0
19,0.882661,695.624031,1.0,2.0,80.0,10.0
15,0.883067,696.772516,1.0,0.5,80.0,10.0
23,0.888067,709.758869,2.0,1.0,80.0,10.0
6,0.895634,397.617808,0.5,1.0,40.0,10.0
10,0.897078,388.473305,0.5,0.5,40.0,10.0
2,0.898873,331.992402,1.0,1.0,40.0,10.0
18,0.90003,383.037323,1.0,2.0,40.0,10.0


### DW

In [10]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./HR_dw/HR_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [11]:
f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []
for i in range(tests_num):
    ### Random forest classifier creation with 70 trees
    clf = RandomForestClassifier(n_estimators=70)
    start_time = time.time()
    s_t = time.time()
    # Input 
    X_data = pd.read_csv("./HR_dw/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()

    
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    print("Completed iteration in:", time.time() - s_t)

    end_time = time.time()

Completed iteration in: 31.960910081863403
Completed iteration in: 34.23184013366699
Completed iteration in: 33.839301109313965
Completed iteration in: 33.2787561416626
Completed iteration in: 34.77603054046631
Completed iteration in: 32.49629068374634
Completed iteration in: 31.91238045692444
Completed iteration in: 32.476274490356445
Completed iteration in: 34.00492000579834
Completed iteration in: 34.09256148338318
Completed iteration in: 34.71637940406799
Completed iteration in: 33.064688205718994
Completed iteration in: 34.185839891433716
Completed iteration in: 33.71802759170532
Completed iteration in: 33.25294303894043


In [12]:
df = pd.DataFrame(columns = ["Micro", 'Macro', 'Weigh'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Weigh"] = pd.Series(f1_scores_weigh).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro,Weigh
count,15.0,15.0,15.0
mean,0.912084,0.913807,0.914113
std,0.016412,0.016359,0.015651
min,0.861383,0.86481,0.865443
25%,0.908749,0.908928,0.911151
50%,0.91727,0.919069,0.919056
75%,0.921209,0.924165,0.922865
max,0.926798,0.927905,0.928023


In [13]:
pd.DataFrame(list(zip(f1_scores, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
0,0.861383,38.241227,10.0,5.0
1,0.898946,72.052979,20.0,5.0
2,0.900779,142.735676,40.0,5.0
3,0.901695,288.967294,80.0,5.0
6,0.915804,341.508353,40.0,10.0
5,0.916903,171.306192,20.0,10.0
4,0.917087,90.918856,10.0,10.0
12,0.91727,310.089163,10.0,30.0
7,0.917728,670.22348,80.0,10.0
8,0.917728,188.168018,10.0,20.0


In [14]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
0,0.86481,38.241227,10.0,5.0
2,0.89834,142.735676,40.0,5.0
1,0.900922,72.052979,20.0,5.0
3,0.906095,288.967294,80.0,5.0
10,0.911762,734.545967,40.0,20.0
4,0.916881,90.918856,10.0,10.0
6,0.917879,341.508353,40.0,10.0
8,0.919069,188.168018,10.0,20.0
12,0.921182,310.089163,10.0,30.0
14,0.923087,1403.005183,40.0,30.0


In [15]:
pd.DataFrame(list(zip(f1_scores_weigh, exec_time, walk_num, walk_len)),
               columns =['F1-weigh', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,walk_num,walk_len
0,0.865443,38.241227,10.0,5.0
1,0.901983,72.052979,20.0,5.0
2,0.903427,142.735676,40.0,5.0
3,0.904629,288.967294,80.0,5.0
6,0.917674,341.508353,40.0,10.0
4,0.918552,90.918856,10.0,10.0
5,0.919044,171.306192,20.0,10.0
12,0.919056,310.089163,10.0,30.0
8,0.919123,188.168018,10.0,20.0
7,0.91962,670.22348,80.0,10.0


### MNMF

In [16]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./HR_mnmf/HR_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [17]:
f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []

for i in range(tests_num):
    ### Random forest classifier creation with 70 trees
    clf = RandomForestClassifier(n_estimators=70)
    start_time = time.time()
    s_t = time.time()
    # Input 
    X_data = pd.read_csv("./HR_mnmf/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()

    
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric    
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    print("Completed iteration in:", time.time() - s_t)

    end_time = time.time()

Completed iteration in: 7.915022850036621
Completed iteration in: 6.984973907470703
Completed iteration in: 16.609859943389893
Completed iteration in: 14.213777780532837
Completed iteration in: 23.987975358963013
Completed iteration in: 20.347594261169434
Completed iteration in: 39.47210383415222


In [18]:
df = pd.DataFrame(columns = ["Micro", 'Macro', 'Weigh'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Weigh"] = pd.Series(f1_scores_weigh).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro,Weigh
count,7.0,7.0,7.0
mean,0.858805,0.831926,0.855876
std,0.07448,0.114319,0.079565
min,0.738983,0.663901,0.727282
25%,0.826111,0.77259,0.821556
50%,0.894091,0.88317,0.893788
75%,0.907467,0.909182,0.907774
max,0.911406,0.912868,0.911399


In [19]:
pd.DataFrame(list(zip(f1_scores, exec_time, dim, it)),
               columns =['F1-micro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
1,0.738983,1231.952486,8.0,200.0
0,0.762712,423.98246,8.0,100.0
2,0.88951,1873.740525,16.0,100.0
3,0.894091,3089.716942,16.0,200.0
4,0.906551,6102.296241,32.0,100.0
6,0.908383,18979.924733,64.0,100.0
5,0.911406,11782.491975,32.0,200.0


In [20]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, dim, it)),
               columns =['F1-macro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Dimensions,Iterations
0,0.663901,423.98246,8.0,100.0
1,0.667702,1231.952486,8.0,200.0
2,0.877478,1873.740525,16.0,100.0
3,0.88317,3089.716942,16.0,200.0
5,0.905931,11782.491975,32.0,200.0
6,0.912433,18979.924733,64.0,100.0
4,0.912868,6102.296241,32.0,100.0


In [21]:
pd.DataFrame(list(zip(f1_scores_weigh, exec_time, dim, it)),
               columns =['F1-weigh', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,Dimensions,Iterations
1,0.727282,1231.952486,8.0,200.0
0,0.753799,423.98246,8.0,100.0
2,0.889313,1873.740525,16.0,100.0
3,0.893788,3089.716942,16.0,200.0
4,0.90672,6102.296241,32.0,100.0
6,0.908828,18979.924733,64.0,100.0
5,0.911399,11782.491975,32.0,200.0


### DANMF

In [22]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./HR_danmf/HR_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [23]:
f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []
for i in range(tests_num):
    ### Random forest classifier creation with 70 trees
    clf = RandomForestClassifier(n_estimators=70)
    start_time = time.time()
    s_t = time.time()
    # Input 
    X_data = pd.read_csv("./HR_danmf/HR_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()

    
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    print("Completed iteration in:", time.time() - s_t)

    end_time = time.time()

Completed iteration in: 15.479259967803955
Completed iteration in: 15.600039958953857
Completed iteration in: 15.11432147026062
Completed iteration in: 16.068597555160522
Completed iteration in: 17.423462629318237
Completed iteration in: 16.534188508987427
Completed iteration in: 15.334161758422852
Completed iteration in: 15.800604104995728
Completed iteration in: 24.9135639667511
Completed iteration in: 26.32099223136902
Completed iteration in: 27.219987630844116
Completed iteration in: 26.511469841003418
Completed iteration in: 12.46737265586853
Completed iteration in: 15.169278860092163
Completed iteration in: 26.914104461669922


In [24]:
df = pd.DataFrame(columns = ["Micro", 'Macro', 'Weigh'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Weigh"] = pd.Series(f1_scores_weigh).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro,Weigh
count,15.0,15.0,15.0
mean,0.802284,0.753125,0.7997
std,0.063328,0.088239,0.066012
min,0.669446,0.581243,0.660495
25%,0.744022,0.6725,0.739282
50%,0.825011,0.774174,0.823163
75%,0.857902,0.820136,0.857911
max,0.865231,0.861645,0.865162


In [25]:
pd.DataFrame(list(zip(f1_scores, exec_time, lay, pre_it, it)),
               columns =['F1-micro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
12,0.669446,560.116846,"[32, 8]",50.0,50.0
2,0.720018,268.152193,"[32, 8]",50.0,100.0
3,0.736967,278.286608,"[32, 8]",100.0,100.0
0,0.740632,151.062392,"[32, 8]",50.0,50.0
1,0.747412,164.564006,"[32, 8]",100.0,50.0
13,0.80055,1321.520216,"[64, 16]",100.0,50.0
6,0.81759,608.053479,"[64, 16]",50.0,100.0
7,0.825011,653.606651,"[64, 16]",100.0,100.0
5,0.829867,382.492228,"[64, 16]",100.0,50.0
4,0.836464,336.803176,"[64, 16]",50.0,50.0


In [26]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, lay, pre_it, it)),
               columns =['F1-macro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Layers,Pre-terations,Iterations
12,0.581243,560.116846,"[32, 8]",50.0,50.0
2,0.626107,268.152193,"[32, 8]",50.0,100.0
1,0.660728,164.564006,"[32, 8]",100.0,50.0
0,0.666296,151.062392,"[32, 8]",50.0,50.0
3,0.678704,278.286608,"[32, 8]",100.0,100.0
6,0.750304,608.053479,"[64, 16]",50.0,100.0
7,0.767402,653.606651,"[64, 16]",100.0,100.0
13,0.774174,1321.520216,"[64, 16]",100.0,50.0
5,0.806907,382.492228,"[64, 16]",100.0,50.0
14,0.807227,2270.463403,"[128, 32]",50.0,100.0


In [27]:
pd.DataFrame(list(zip(f1_scores_weigh, exec_time, lay, pre_it, it)),
               columns =['F1-weigh', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,Layers,Pre-terations,Iterations
12,0.660495,560.116846,"[32, 8]",50.0,50.0
2,0.713962,268.152193,"[32, 8]",50.0,100.0
3,0.732673,278.286608,"[32, 8]",100.0,100.0
0,0.735638,151.062392,"[32, 8]",50.0,50.0
1,0.742927,164.564006,"[32, 8]",100.0,50.0
13,0.796761,1321.520216,"[64, 16]",100.0,50.0
6,0.815227,608.053479,"[64, 16]",50.0,100.0
7,0.823163,653.606651,"[64, 16]",100.0,100.0
5,0.829165,382.492228,"[64, 16]",100.0,50.0
4,0.835639,336.803176,"[64, 16]",50.0,50.0


### AVPRA

In [31]:
### Reading VLs from file
obj = pd.read_pickle("./HR.pickled") 

In [32]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
start_time = time.time()
accuracies = []
f1_scores_macro = []
f1_scores_weigh = []
for res in obj:
    s_time = time.time()
    # Input 
    X_data = res[1]
    
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    accuracies.append(metrics.accuracy_score(y_test, y_pred))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))

    print(f"Iteration completed in {time.time() - s_time}")
end_time = time.time()

Iteration completed in 4.876799583435059
Iteration completed in 14.329792261123657
Iteration completed in 21.53307867050171
Iteration completed in 26.061728477478027
Iteration completed in 26.932313919067383
Iteration completed in 27.34579825401306
Iteration completed in 27.770158767700195
Iteration completed in 28.428438663482666
Iteration completed in 28.402973413467407
Iteration completed in 25.306987285614014
Iteration completed in 23.28538990020752
Iteration completed in 23.538427591323853
Iteration completed in 22.333151817321777
Iteration completed in 21.98496437072754
Iteration completed in 21.77078127861023
Iteration completed in 21.449708938598633
Iteration completed in 21.402557373046875
Iteration completed in 21.226351499557495
Iteration completed in 20.880239248275757
Iteration completed in 20.613039255142212
Iteration completed in 20.265904426574707


In [33]:
### Function that returns the 10 / 1 index of the maximum values of a list
def get10maxidx(l):
    return list(map(lambda x: x[1], sorted(zip(l, range(0, len(l))), reverse=True)[:10]))
def getmaxidx(l):
    return l.index(max(l))

In [34]:
### 10MWL classification
clf = RandomForestClassifier(n_estimators=70)
start_time2 = time.time()
accuracies2 = []
f1_scores2_macro = []
f1_scores2_weigh = []
for res in obj:
    # Input 
    X_data = list(map(lambda x: get10maxidx(x), res[1]))

    
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    accuracies2.append(metrics.accuracy_score(y_test, y_pred))
    f1_scores2_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores2_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    
end_time2 = time.time()

In [35]:
### MWL classification
clf = RandomForestClassifier(n_estimators=70)
start_time3 = time.time()
accuracies3 = []
f1_scores3_macro = []
f1_scores3_weigh = []
for res in obj:
    # Input 
    X_data = list(map(lambda x: [getmaxidx(x)], res[1]))

    
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    accuracies3.append(metrics.accuracy_score(y_test, y_pred))
    f1_scores3_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores3_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    
end_time3 = time.time()

In [36]:
import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False
})

### Comparison macro

In [37]:
# Plot F1-macro comparison
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 32, 2))
plt.plot(l, f1_scores_macro, "o", label="AVPRA F1-score-macro", markersize=10)
plt.plot(l, f1_scores2_macro, "o", label="AVPRA 10MWL F1-score-macro", markersize=10)
plt.plot(l, f1_scores3_macro, "o", label="AVPRA MWL F1-score-macro", markersize=10)

plt.axvline(x=12, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("F1-score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_HR_AVPRA_all_macro.png", dpi=500)
plt.show()

In [38]:
max(f1_scores_macro), (list(range(0,10)) + list(range(10, 32, 2)))[f1_scores_macro.index(max(f1_scores_macro))]

(0.9428818123386012, 14)

In [39]:
max(f1_scores2_macro), (list(range(0,10)) + list(range(10, 32, 2)))[f1_scores2_macro.index(max(f1_scores2_macro))]

(0.32811739042480675, 6)

In [40]:
max(f1_scores3_macro), (list(range(0,10)) + list(range(10, 32, 2)))[f1_scores3_macro.index(max(f1_scores3_macro))]

(0.027761439207270224, 4)

### Comparison weighted

In [41]:
# Plot F1-macro comparison
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 32, 2))
plt.plot(l, f1_scores_weigh, "o", label="AVPRA F1-score-weighted", markersize=10)
plt.plot(l, f1_scores2_weigh, "o", label="AVPRA 10MWL F1-score-weighted", markersize=10)
plt.plot(l, f1_scores3_weigh, "o", label="AVPRA MWL F1-score-weighted", markersize=10)

plt.axvline(x=12, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("F1-score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_HR_AVPRA_all_weighted.png", dpi=500)
plt.show()

In [42]:
max(f1_scores_weigh), (list(range(0,10)) + list(range(10, 32, 2)))[f1_scores_weigh.index(max(f1_scores_weigh))]

(0.9448805973603183, 14)

In [43]:
max(f1_scores2_weigh), (list(range(0,10)) + list(range(10, 32, 2)))[f1_scores2_weigh.index(max(f1_scores2_weigh))]

(0.38093071133288003, 8)

In [44]:
max(f1_scores3_weigh), (list(range(0,10)) + list(range(10, 32, 2)))[f1_scores3_weigh.index(max(f1_scores3_weigh))]

(0.05863492887137417, 0)

In [45]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 32, 2))
plt.plot(l, accuracies, "o", label="AVPRA Accuratezza", markersize=10)
plt.plot(l, f1_scores_macro, "x", label="AVPRA F1-score-macro", color="blue", markersize=12)

plt.axvline(x=12, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza/F1-Score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_HR_AVPRA_macro.png", dpi=500)
plt.show()

In [46]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 32, 2))
plt.plot(l, accuracies, "o", label="AVPRA Accuratezza", markersize=10)
plt.plot(l, f1_scores_weigh, "x", label="AVPRA F1-score-weighted", color="blue", markersize=12)

plt.axvline(x=12, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza/F1-Score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_HR_AVPRA_weighted.png", dpi=500)
plt.show()

In [47]:
max(accuracies), l[accuracies.index(max(accuracies))]

(0.9446633073751718, 14)

In [48]:
max(accuracies2), l[accuracies2.index(max(accuracies2))]

(0.41694915254237286, 8)

In [49]:
max(accuracies3), l[accuracies3.index(max(accuracies3))]

(0.16601007787448466, 20)

In [50]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 32, 2))
plt.plot(l, accuracies, "o", label="AVPRA Accuratezza", markersize=10)
plt.plot(l, accuracies2, "o", label="AVPRA 10MWL Accuratezza", markersize=10)
plt.plot(l, accuracies3, "o", label="AVPRA MWL Accuratezza", markersize=10)

plt.axvline(x=12, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_HR_AVPRA_all_micro.png", dpi=500)
plt.show()

### Only F1 micro

In [51]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 32, 2))
plt.plot(l, accuracies, "o", label="AVPRA Accuratezza", markersize=10)

plt.axvline(x=12, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_HR_AVPRA_micro.png", dpi=500)
plt.show()