In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
from cdlib import algorithms
import random
import csv
%matplotlib inline

Note: to be able to use all crisp methods, you need to install some additional packages:  {'wurlitzer', 'graph_tool'}
Note: to be able to use all overlapping methods, you need to install some additional packages:  {'ASLPAw'}
Note: to be able to use all bipartite methods, you need to install some additional packages:  {'wurlitzer'}


In [2]:
### Identifying Leiden communities
network = nx.read_edgelist("./HU_edges_norm.csv")
nodes = network.nodes()
comms = algorithms.leiden(network)

In [3]:
comms_dict = comms.to_node_community_map()
comms_dict_ok = {}
for node in nodes:
    comms_dict_ok[node] = comms_dict[node][0]
comms_dict = comms_dict_ok

### N2V

In [4]:
### Getting total tests number
tests_num = 0
exec_time = []
p = []
q = []
walk_len = []
walk_num = []
with open("./HU_n2v/HU_n2v_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))
        if "p:" in line:
            p.append(float(line[3:]))
        if "q:" in line:
            q.append(float(line[3:]))
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))

In [5]:
f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []

for i in range(tests_num):
    ### Random forest classifier creation with 70 trees
    clf = RandomForestClassifier(n_estimators=70)
    start_time = time.time()
    s_t = time.time()
    # Input 
    X_data = pd.read_csv("./HU_n2v/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()

    
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    print("Completed iteration in:", time.time() - s_t)

    end_time = time.time()

Completed iteration in: 38.1764600276947
Completed iteration in: 39.015941858291626
Completed iteration in: 39.07096719741821
Completed iteration in: 39.82663154602051
Completed iteration in: 39.848137855529785
Completed iteration in: 40.72675657272339
Completed iteration in: 40.357197284698486
Completed iteration in: 40.29876661300659
Completed iteration in: 40.507424116134644
Completed iteration in: 40.068660736083984
Completed iteration in: 39.58502721786499
Completed iteration in: 40.74253487586975
Completed iteration in: 40.17141914367676
Completed iteration in: 39.931631088256836
Completed iteration in: 40.08144283294678
Completed iteration in: 40.87869739532471
Completed iteration in: 42.74984955787659
Completed iteration in: 41.03916096687317
Completed iteration in: 40.43833136558533
Completed iteration in: 40.94141912460327
Completed iteration in: 39.97137999534607
Completed iteration in: 37.00688624382019
Completed iteration in: 35.90319895744324
Completed iteration in: 36.64

In [6]:
df = pd.DataFrame(columns = ["Micro", 'Macro', 'Weigh'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Weigh"] = pd.Series(f1_scores_weigh).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro,Weigh
count,24.0,24.0,24.0
mean,0.836129,0.821749,0.836681
std,0.023407,0.028997,0.02343
min,0.800484,0.773736,0.800188
25%,0.819704,0.802356,0.819969
50%,0.829459,0.819608,0.830199
75%,0.844394,0.832798,0.845104
max,0.880101,0.872849,0.880766


In [7]:
pd.DataFrame(list(zip(f1_scores, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,p,q,walk_num,walk_len
19,0.800484,461.019866,1.0,2.0,80.0,10.0
7,0.811527,485.59117,0.5,1.0,80.0,10.0
11,0.813946,468.974333,0.5,0.5,80.0,10.0
15,0.814262,499.899741,1.0,0.5,80.0,10.0
3,0.814788,436.778427,1.0,1.0,80.0,10.0
18,0.818363,248.378643,1.0,2.0,40.0,10.0
6,0.820151,253.726269,0.5,1.0,40.0,10.0
23,0.822886,500.880916,2.0,1.0,80.0,10.0
2,0.825621,235.38871,1.0,1.0,40.0,10.0
17,0.827303,208.078459,1.0,2.0,80.0,5.0


In [8]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,p,q,walk_num,walk_len
3,0.773736,436.778427,1.0,1.0,80.0,10.0
15,0.778214,499.899741,1.0,0.5,80.0,10.0
19,0.787685,461.019866,1.0,2.0,80.0,10.0
10,0.788761,254.258841,0.5,0.5,40.0,10.0
7,0.793874,485.59117,0.5,1.0,80.0,10.0
11,0.798746,468.974333,0.5,0.5,80.0,10.0
23,0.803559,500.880916,2.0,1.0,80.0,10.0
2,0.807268,235.38871,1.0,1.0,40.0,10.0
18,0.811257,248.378643,1.0,2.0,40.0,10.0
6,0.812614,253.726269,0.5,1.0,40.0,10.0


In [9]:
pd.DataFrame(list(zip(f1_scores_weigh, exec_time, p, q, walk_num, walk_len)),
               columns =['F1-weigh', 'Exec time', 'p', 'q', 'walk_num', 'walk_len']).sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,p,q,walk_num,walk_len
19,0.800188,461.019866,1.0,2.0,80.0,10.0
7,0.811415,485.59117,0.5,1.0,80.0,10.0
15,0.814931,499.899741,1.0,0.5,80.0,10.0
11,0.815053,468.974333,0.5,0.5,80.0,10.0
3,0.815591,436.778427,1.0,1.0,80.0,10.0
6,0.819765,253.726269,0.5,1.0,40.0,10.0
18,0.820037,248.378643,1.0,2.0,40.0,10.0
23,0.822032,500.880916,2.0,1.0,80.0,10.0
2,0.826221,235.38871,1.0,1.0,40.0,10.0
17,0.828249,208.078459,1.0,2.0,80.0,5.0


### DW

In [10]:
### Getting total tests number
tests_num = 0
exec_time = []
walk_num = []
walk_len = []
with open("./HU_dw/HU_dw_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "walk_length:" in line:
            walk_len.append(float(line[13:-1]))
        if "num_walk:" in line:
            walk_num.append(float(line[9:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [11]:
f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []
for i in range(tests_num):
    ### Random forest classifier creation with 70 trees
    clf = RandomForestClassifier(n_estimators=70)
    start_time = time.time()
    s_t = time.time()
    # Input 
    X_data = pd.read_csv("./HU_dw/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()

    
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    print("Completed iteration in:", time.time() - s_t)

    end_time = time.time()

Completed iteration in: 28.015589952468872
Completed iteration in: 27.11214303970337
Completed iteration in: 26.63985848426819
Completed iteration in: 26.624447107315063
Completed iteration in: 26.999606132507324
Completed iteration in: 28.488025903701782
Completed iteration in: 28.98057794570923
Completed iteration in: 30.547727823257446
Completed iteration in: 30.310967206954956
Completed iteration in: 30.134315967559814
Completed iteration in: 29.950615167617798
Completed iteration in: 29.0375075340271
Completed iteration in: 28.653651237487793
Completed iteration in: 29.14015555381775
Completed iteration in: 29.180437326431274


In [12]:
df = pd.DataFrame(columns = ["Micro", 'Macro', 'Weigh'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Weigh"] = pd.Series(f1_scores_weigh).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro,Weigh
count,15.0,15.0,15.0
mean,0.838613,0.828079,0.839207
std,0.048383,0.050727,0.04756
min,0.68984,0.681056,0.694615
25%,0.831037,0.819695,0.830223
50%,0.857488,0.843695,0.858554
75%,0.866639,0.860071,0.867107
max,0.87053,0.872437,0.871412


In [13]:
pd.DataFrame(list(zip(f1_scores, exec_time, walk_num, walk_len)),
               columns =['F1-micro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,walk_num,walk_len
0,0.68984,36.506493,10.0,5.0
1,0.781868,68.210582,20.0,5.0
2,0.811527,133.75116,40.0,5.0
3,0.824043,266.070136,80.0,5.0
4,0.838031,87.543224,10.0,10.0
7,0.853281,681.768971,80.0,10.0
5,0.854859,170.240475,20.0,10.0
6,0.857488,334.773742,40.0,10.0
9,0.863694,369.003128,20.0,20.0
14,0.863799,972.867248,40.0,30.0


In [14]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, walk_num, walk_len)),
               columns =['F1-macro', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,walk_num,walk_len
0,0.681056,36.506493,10.0,5.0
3,0.774765,266.070136,80.0,5.0
1,0.7774,68.210582,20.0,5.0
2,0.80268,133.75116,40.0,5.0
4,0.83671,87.543224,10.0,10.0
11,0.836789,1184.07436,80.0,20.0
5,0.840981,170.240475,20.0,10.0
7,0.843695,681.768971,80.0,10.0
13,0.853944,476.520527,20.0,30.0
9,0.85754,369.003128,20.0,20.0


In [15]:
pd.DataFrame(list(zip(f1_scores_weigh, exec_time, walk_num, walk_len)),
               columns =['F1-weigh', 'Exec time', 'walk_num', 'walk_len']).sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,walk_num,walk_len
0,0.694615,36.506493,10.0,5.0
1,0.782867,68.210582,20.0,5.0
2,0.810436,133.75116,40.0,5.0
3,0.822662,266.070136,80.0,5.0
4,0.837785,87.543224,10.0,10.0
7,0.853975,681.768971,80.0,10.0
5,0.854641,170.240475,20.0,10.0
6,0.858554,334.773742,40.0,10.0
9,0.863978,369.003128,20.0,20.0
14,0.864372,972.867248,40.0,30.0


### MNMF

In [16]:
### Getting total tests number
tests_num = 0
exec_time = []
dim = []
it = []
with open("./HU_mnmf/HU_mnmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "dimensions" in line:
            dim.append(float(line[12:]))
        if "iterations:" in line:
            it.append(float(line[12:]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [17]:
f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []

for i in range(tests_num):
    ### Random forest classifier creation with 70 trees
    clf = RandomForestClassifier(n_estimators=70)
    start_time = time.time()
    s_t = time.time()
    # Input 
    X_data = pd.read_csv("./HU_mnmf/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()

    
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric    
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    print("Completed iteration in:", time.time() - s_t)

    end_time = time.time()

Completed iteration in: 7.536314010620117
Completed iteration in: 6.682097434997559
Completed iteration in: 14.365582466125488
Completed iteration in: 12.534231424331665
Completed iteration in: 20.661597967147827
Completed iteration in: 16.951598644256592
Completed iteration in: 37.47861862182617
Completed iteration in: 28.83633017539978


In [18]:
df = pd.DataFrame(columns = ["Micro", 'Macro', 'Weigh'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Weigh"] = pd.Series(f1_scores_weigh).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro,Weigh
count,8.0,8.0,8.0
mean,0.784957,0.72162,0.779085
std,0.10712,0.145744,0.11301
min,0.612958,0.507244,0.597397
25%,0.748133,0.646998,0.740094
50%,0.825778,0.753834,0.821837
75%,0.861774,0.846167,0.860301
max,0.872949,0.857104,0.871979


In [19]:
pd.DataFrame(list(zip(f1_scores, exec_time, dim, it)),
               columns =['F1-micro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Dimensions,Iterations
1,0.612958,285.491338,8.0,200.0
0,0.625473,95.28386,8.0,100.0
3,0.78902,701.013073,16.0,200.0
2,0.796382,421.509037,16.0,100.0
4,0.855175,1211.796356,32.0,100.0
5,0.859697,2139.761206,32.0,200.0
6,0.868006,3373.030116,64.0,100.0
7,0.872949,5875.845202,64.0,200.0


In [20]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, dim, it)),
               columns =['F1-macro', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Dimensions,Iterations
1,0.507244,285.491338,8.0,200.0
0,0.513726,95.28386,8.0,100.0
3,0.691423,701.013073,16.0,200.0
2,0.697924,421.509037,16.0,100.0
4,0.809745,1211.796356,32.0,100.0
5,0.844436,2139.761206,32.0,200.0
6,0.851361,3373.030116,64.0,100.0
7,0.857104,5875.845202,64.0,200.0


In [21]:
pd.DataFrame(list(zip(f1_scores_weigh, exec_time, dim, it)),
               columns =['F1-weigh', 'Exec time', 'Dimensions', 'Iterations']).sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,Dimensions,Iterations
1,0.597397,285.491338,8.0,200.0
0,0.611317,95.28386,8.0,100.0
3,0.78302,701.013073,16.0,200.0
2,0.790753,421.509037,16.0,100.0
4,0.852921,1211.796356,32.0,100.0
5,0.857953,2139.761206,32.0,200.0
6,0.867342,3373.030116,64.0,100.0
7,0.871979,5875.845202,64.0,200.0


### DANMF

In [22]:
### Getting total tests number
tests_num = 0
exec_time = []
lay = []
it = []
pre_it = []
with open("./HU_danmf/HU_danmf_info.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if "Test" in line:
            tests_num += 1
        if "layers:" in line:
            lay.append(line[8:-1])
        if "pre_iterations:" in line:
            pre_it.append(float(line[16:-1]))
        if "iterations:" in line and "pre_iterations" not in line:
            it.append(float(line[11:-1]))
        if "Embedding" in line:
            exec_time.append(float(line[31:-2]))

In [23]:
f1_scores = []
f1_scores_macro = []
f1_scores_weigh = []
for i in range(tests_num):
    ### Random forest classifier creation with 70 trees
    clf = RandomForestClassifier(n_estimators=70)
    start_time = time.time()
    s_t = time.time()
    # Input 
    X_data = pd.read_csv("./HU_danmf/HU_emb_vectors" + str(i) + ".csv", header=None, sep=";").values.tolist()

    
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    f1_scores.append(metrics.f1_score(y_test, y_pred, average="micro"))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    print("Completed iteration in:", time.time() - s_t)

    end_time = time.time()

Completed iteration in: 12.22700047492981
Completed iteration in: 12.796839475631714
Completed iteration in: 11.711814165115356
Completed iteration in: 11.911804437637329
Completed iteration in: 14.88362741470337
Completed iteration in: 15.106140375137329
Completed iteration in: 14.879892826080322
Completed iteration in: 14.84758734703064
Completed iteration in: 26.148178100585938
Completed iteration in: 26.92383074760437
Completed iteration in: 25.10265851020813
Completed iteration in: 25.89030170440674
Completed iteration in: 11.699205160140991
Completed iteration in: 14.916791200637817
Completed iteration in: 25.597542762756348


In [24]:
df = pd.DataFrame(columns = ["Micro", 'Macro', 'Weigh'])
df["Micro"] = pd.Series(f1_scores).describe()
df["Weigh"] = pd.Series(f1_scores_weigh).describe()
df["Macro"] = pd.Series(f1_scores_macro).describe()
df

Unnamed: 0,Micro,Macro,Weigh
count,15.0,15.0,15.0
mean,0.642792,0.587511,0.626522
std,0.102584,0.104639,0.108222
min,0.44268,0.396512,0.419814
25%,0.544804,0.492519,0.520351
50%,0.664493,0.589148,0.648967
75%,0.736958,0.678086,0.725701
max,0.757678,0.732899,0.75153


In [25]:
pd.DataFrame(list(zip(f1_scores, exec_time, lay, pre_it, it)),
               columns =['F1-micro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-micro")

Unnamed: 0,F1-micro,Exec time,Layers,Pre-terations,Iterations
2,0.44268,108.151318,"[32, 8]",50.0,100.0
0,0.516828,59.408877,"[32, 8]",50.0,50.0
3,0.520404,115.569006,"[32, 8]",100.0,100.0
12,0.530501,216.932015,"[32, 8]",50.0,50.0
1,0.559108,71.049077,"[32, 8]",100.0,50.0
7,0.657867,254.67847,"[64, 16]",100.0,100.0
13,0.662705,516.619871,"[64, 16]",100.0,50.0
5,0.664493,174.576857,"[64, 16]",100.0,50.0
6,0.671435,220.051627,"[64, 16]",50.0,100.0
4,0.690576,128.863256,"[64, 16]",50.0,50.0


In [26]:
pd.DataFrame(list(zip(f1_scores_macro, exec_time, lay, pre_it, it)),
               columns =['F1-macro', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-macro")

Unnamed: 0,F1-macro,Exec time,Layers,Pre-terations,Iterations
2,0.396512,108.151318,"[32, 8]",50.0,100.0
12,0.462709,216.932015,"[32, 8]",50.0,50.0
3,0.470833,115.569006,"[32, 8]",100.0,100.0
0,0.491693,59.408877,"[32, 8]",50.0,50.0
1,0.493345,71.049077,"[32, 8]",100.0,50.0
7,0.586282,254.67847,"[64, 16]",100.0,100.0
13,0.586489,516.619871,"[64, 16]",100.0,50.0
6,0.589148,220.051627,"[64, 16]",50.0,100.0
5,0.600367,174.576857,"[64, 16]",100.0,50.0
4,0.625842,128.863256,"[64, 16]",50.0,50.0


In [27]:
pd.DataFrame(list(zip(f1_scores_weigh, exec_time, lay, pre_it, it)),
               columns =['F1-weigh', 'Exec time', 'Layers', 'Pre-terations', 'Iterations']).sort_values(by="F1-weigh")

Unnamed: 0,F1-weigh,Exec time,Layers,Pre-terations,Iterations
2,0.419814,108.151318,"[32, 8]",50.0,100.0
3,0.495736,115.569006,"[32, 8]",100.0,100.0
0,0.496879,59.408877,"[32, 8]",50.0,50.0
12,0.50669,216.932015,"[32, 8]",50.0,50.0
1,0.534012,71.049077,"[32, 8]",100.0,50.0
7,0.641575,254.67847,"[64, 16]",100.0,100.0
13,0.64609,516.619871,"[64, 16]",100.0,50.0
5,0.648967,174.576857,"[64, 16]",100.0,50.0
6,0.653749,220.051627,"[64, 16]",50.0,100.0
4,0.675526,128.863256,"[64, 16]",50.0,50.0


### AVPRA

In [28]:
### Reading VLs from file
obj = pd.read_pickle("./HU.pickled") 

In [29]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
start_time = time.time()
accuracies = []
f1_scores_macro = []
f1_scores_weigh = []
for res in obj:
    s_time = time.time()
    # Input 
    X_data = res[1]
    
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    accuracies.append(metrics.accuracy_score(y_test, y_pred))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))

    print(f"Iteration completed in {time.time() - s_time}")
end_time = time.time()

Iteration completed in 3.8904829025268555
Iteration completed in 10.818247556686401
Iteration completed in 17.23558259010315
Iteration completed in 21.91325569152832
Iteration completed in 22.98432755470276
Iteration completed in 23.460631608963013
Iteration completed in 24.459009647369385
Iteration completed in 24.818222045898438
Iteration completed in 25.802737712860107
Iteration completed in 21.24305486679077
Iteration completed in 20.679200410842896
Iteration completed in 20.348020315170288
Iteration completed in 21.70822238922119
Iteration completed in 22.393040895462036
Iteration completed in 22.223485708236694
Iteration completed in 21.975518703460693
Iteration completed in 22.38759207725525
Iteration completed in 21.7547767162323
Iteration completed in 21.408528804779053
Iteration completed in 21.106276273727417
Iteration completed in 21.166898488998413


In [30]:
### Function that returns the 10 / 1 index of the maximum values of a list
def get10maxidx(l):
    return list(map(lambda x: x[1], sorted(zip(l, range(0, len(l))), reverse=True)[:10]))
def getmaxidx(l):
    return l.index(max(l))

In [31]:
### 10MWL classification
clf = RandomForestClassifier(n_estimators=70)
start_time2 = time.time()
accuracies2 = []
f1_scores2_macro = []
f1_scores2_weigh = []
for res in obj:
    # Input 
    X_data = list(map(lambda x: get10maxidx(x), res[1]))

    
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    accuracies2.append(metrics.accuracy_score(y_test, y_pred))
    f1_scores2_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores2_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    
end_time2 = time.time()

In [32]:
### MWL classification
clf = RandomForestClassifier(n_estimators=70)
start_time3 = time.time()
accuracies3 = []
f1_scores3_macro = []
f1_scores3_weigh = []
for res in obj:
    # Input 
    X_data = list(map(lambda x: [getmaxidx(x)], res[1]))

    
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    accuracies3.append(metrics.accuracy_score(y_test, y_pred))
    f1_scores3_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores3_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    
end_time3 = time.time()

In [33]:
G = nx.read_edgelist("./HU_edges_norm.csv")

In [34]:
import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False
})

### Comparison macro

In [35]:
# Plot F1-macro comparison
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 32, 2))
plt.plot(l, f1_scores_macro, "o", label="AVPRA F1-score-macro", markersize=10)
plt.plot(l, f1_scores2_macro, "o", label="AVPRA 10MWL F1-score-macro", markersize=10)
plt.plot(l, f1_scores3_macro, "o", label="AVPRA MWL F1-score-macro", markersize=10)

plt.axvline(x=14, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("F1-score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_HU_AVPRA_all_macro.png", dpi=500)
plt.show()

In [36]:
max(f1_scores_macro), (list(range(0,10)) + list(range(10, 32, 2)))[f1_scores_macro.index(max(f1_scores_macro))]

(0.9009806671735232, 16)

In [37]:
max(f1_scores2_macro), (list(range(0,10)) + list(range(10, 32, 2)))[f1_scores2_macro.index(max(f1_scores2_macro))]

(0.1745617725630631, 6)

In [38]:
max(f1_scores3_macro), (list(range(0,10)) + list(range(10, 32, 2)))[f1_scores3_macro.index(max(f1_scores3_macro))]

(0.012626402923672024, 2)

### Comparison weighted

In [39]:
# Plot F1-macro comparison
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 32, 2))
plt.plot(l, f1_scores_weigh, "o", label="AVPRA F1-score-weighted", markersize=10)
plt.plot(l, f1_scores2_weigh, "o", label="AVPRA 10MWL F1-score-weighted", markersize=10)
plt.plot(l, f1_scores3_weigh, "o", label="AVPRA MWL F1-score-weighted", markersize=10)

plt.axvline(x=14, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("F1-score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_HU_AVPRA_all_weighted.png", dpi=500)
plt.show()

In [40]:
max(f1_scores_weigh), (list(range(0,10)) + list(range(10, 32, 2)))[f1_scores_weigh.index(max(f1_scores_weigh))]

(0.9065950857194097, 16)

In [41]:
max(f1_scores2_weigh), (list(range(0,10)) + list(range(10, 32, 2)))[f1_scores2_weigh.index(max(f1_scores2_weigh))]

(0.1776802872379407, 4)

In [42]:
max(f1_scores3_weigh), (list(range(0,10)) + list(range(10, 32, 2)))[f1_scores3_weigh.index(max(f1_scores3_weigh))]

(0.04897347545999005, 2)

In [43]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 32, 2))
plt.plot(l, accuracies, "o", label="AVPRA Accuratezza", markersize=10)
plt.plot(l, f1_scores_macro, "x", label="AVPRA F1-score-macro", color="blue", markersize=12)

plt.axvline(x=14, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza/F1-Score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_HU_AVPRA_macro.png", dpi=500)
plt.show()

In [44]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 32, 2))
plt.plot(l, accuracies, "o", label="AVPRA Accuratezza", markersize=10)
plt.plot(l, f1_scores_weigh, "x", label="AVPRA F1-score-weighted", color="blue", markersize=12)

plt.axvline(x=14, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza/F1-Score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_HU_AVPRA_weighted.png", dpi=500)
plt.show()

In [45]:
max(accuracies), l[accuracies.index(max(accuracies))]

(0.9069204880100967, 16)

In [46]:
max(accuracies2), l[accuracies2.index(max(accuracies2))]

(0.23085822465292385, 7)

In [47]:
max(accuracies3), l[accuracies3.index(max(accuracies3))]

(0.16543962978544383, 20)

In [48]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 32, 2))
plt.plot(l, accuracies, "o", label="AVPRA Accuratezza", markersize=10)
plt.plot(l, accuracies2, "o", label="AVPRA 10MWL Accuratezza", markersize=10)
plt.plot(l, accuracies3, "o", label="AVPRA MWL Accuratezza", markersize=10)

plt.axvline(x=14, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_HU_AVPRA_all_micro.png", dpi=500)
plt.show()

### Only F1 micro

In [49]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 32, 2))
plt.plot(l, accuracies, "o", label="AVPRA Accuratezza", markersize=10)

plt.axvline(x=14, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_HU_AVPRA_micro.png", dpi=500)
plt.show()