In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

### HR

In [2]:
### Reading VLs from file
obj = pd.read_pickle("./HR.pickled") 

In [3]:
data = pd.read_csv("./HR_comms.txt", sep=" ", header=None)
comms_dict = {}
for row in data.iterrows():
    comms_dict[str(row[1][0])] = row[1][1]

In [4]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
start_time = time.time()
accuracies = []
f1_scores_macro = []
f1_scores_weigh = []
for res in obj:
    s_time = time.time()
    # Input 
    X_data = res[1]
    # Output communities defined by Louvain algorithm
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    accuracies.append(metrics.accuracy_score(y_test, y_pred))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))

    print(f"Iteration completed in {time.time() - s_time}")
end_time = time.time()

Iteration completed in 4.431243896484375
Iteration completed in 12.981043577194214
Iteration completed in 20.591996669769287
Iteration completed in 23.150129079818726
Iteration completed in 25.824406147003174
Iteration completed in 26.07561683654785
Iteration completed in 26.873189449310303
Iteration completed in 27.52234721183777
Iteration completed in 28.056808948516846
Iteration completed in 27.064818143844604
Iteration completed in 27.81883955001831
Iteration completed in 27.21990466117859
Iteration completed in 26.774442434310913
Iteration completed in 27.287861108779907
Iteration completed in 26.69911241531372
Iteration completed in 27.022515058517456
Iteration completed in 26.242228269577026
Iteration completed in 26.259974718093872
Iteration completed in 26.122005939483643
Iteration completed in 26.56774377822876
Iteration completed in 25.257197380065918


In [5]:
### Function that returns the 10 / 1 index of the maximum values of a list
def get10maxidx(l):
    return list(map(lambda x: x[1], sorted(zip(l, range(0, len(l))), reverse=True)[:10]))
def getmaxidx(l):
    return l.index(max(l))

In [6]:
### 10MWL classification
clf = RandomForestClassifier(n_estimators=70)
start_time2 = time.time()
accuracies2 = []
f1_scores2_macro = []
f1_scores2_weigh = []
for res in obj:
    # Input 
    X_data = list(map(lambda x: get10maxidx(x), res[1]))

    # Output communities defined by Louvain algorithm
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    accuracies2.append(metrics.accuracy_score(y_test, y_pred))
    f1_scores2_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores2_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    
end_time2 = time.time()

In [7]:
### MWL classification
clf = RandomForestClassifier(n_estimators=70)
start_time3 = time.time()
accuracies3 = []
f1_scores3_macro = []
f1_scores3_weigh = []
for res in obj:
    # Input 
    X_data = list(map(lambda x: [getmaxidx(x)], res[1]))

    # Output communities defined by Louvain algorithm
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    accuracies3.append(metrics.accuracy_score(y_test, y_pred))
    f1_scores3_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores3_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    
end_time3 = time.time()

In [9]:
import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False
})

### Comparison macro

In [10]:
# Plot F1-macro comparison
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 32, 2))
plt.plot(l, f1_scores_macro, "o", label="AVPRA F1-score-macro", markersize=10)
plt.plot(l, f1_scores2_macro, "o", label="AVPRA 10MWL F1-score-macro", markersize=10)
plt.plot(l, f1_scores3_macro, "o", label="AVPRA MWL F1-score-macro", markersize=10)

plt.axvline(x=12, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("F1-score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_HR_AVPRA_all_macro.png", dpi=500)
plt.show()

  plt.show()


In [11]:
max(f1_scores_macro), (list(range(0,10)) + list(range(10, 32, 2)))[f1_scores_macro.index(max(f1_scores_macro))]

(0.934652928358529, 12)

In [12]:
max(f1_scores2_macro), (list(range(0,10)) + list(range(10, 32, 2)))[f1_scores2_macro.index(max(f1_scores2_macro))]

(0.33598107579833747, 5)

In [13]:
max(f1_scores3_macro), (list(range(0,10)) + list(range(10, 32, 2)))[f1_scores3_macro.index(max(f1_scores3_macro))]

(0.039308119147952014, 5)

### Comparison weighted

In [14]:
# Plot F1-macro comparison
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 32, 2))
plt.plot(l, f1_scores_weigh, "o", label="AVPRA F1-score-weighted", markersize=10)
plt.plot(l, f1_scores2_weigh, "o", label="AVPRA 10MWL F1-score-weighted", markersize=10)
plt.plot(l, f1_scores3_weigh, "o", label="AVPRA MWL F1-score-weighted", markersize=10)

plt.axvline(x=12, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("F1-score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_HR_AVPRA_all_weighted.png", dpi=500)
plt.show()

  plt.show()


In [15]:
max(f1_scores_weigh), (list(range(0,10)) + list(range(10, 32, 2)))[f1_scores_weigh.index(max(f1_scores_weigh))]

(0.9312446539501439, 16)

In [16]:
max(f1_scores2_weigh), (list(range(0,10)) + list(range(10, 32, 2)))[f1_scores2_weigh.index(max(f1_scores2_weigh))]

(0.37026009679725896, 6)

In [17]:
max(f1_scores3_weigh), (list(range(0,10)) + list(range(10, 32, 2)))[f1_scores3_weigh.index(max(f1_scores3_weigh))]

(0.05766769306158939, 0)

In [18]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 32, 2))
plt.plot(l, accuracies, "o", label="AVPRA Accuratezza", markersize=10)
plt.plot(l, f1_scores_macro, "x", label="AVPRA F1-score-macro", color="blue", markersize=12)

plt.axvline(x=12, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza/F1-Score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_HR_AVPRA_macro.png", dpi=500)
plt.show()

  plt.show()


In [19]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 32, 2))
plt.plot(l, accuracies, "o", label="AVPRA Accuratezza", markersize=10)
plt.plot(l, f1_scores_weigh, "x", label="AVPRA F1-score-weighted", color="blue", markersize=12)

plt.axvline(x=12, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza/F1-Score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_HR_AVPRA_weighted.png", dpi=500)
plt.show()

  plt.show()


In [20]:
max(accuracies), l[accuracies.index(max(accuracies))]

(0.9310123683005039, 16)

In [21]:
max(accuracies2), l[accuracies2.index(max(accuracies2))]

(0.416491067338525, 7)

In [22]:
max(accuracies3), l[accuracies3.index(max(accuracies3))]

(0.15959688502061384, 14)

In [23]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 32, 2))
plt.plot(l, accuracies, "o", label="AVPRA Accuratezza", markersize=10)
plt.plot(l, accuracies2, "o", label="AVPRA 10MWL Accuratezza", markersize=10)
plt.plot(l, accuracies3, "o", label="AVPRA MWL Accuratezza", markersize=10)

plt.axvline(x=12, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_comparisons_HR.png", dpi=500)
plt.show()

  plt.show()


### Only F1 micro

In [24]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 32, 2))
plt.plot(l, accuracies, "o", label="AVPRA Accuratezza", markersize=10)

plt.axvline(x=12, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_HR_AVPRA_micro.png", dpi=500)
plt.show()

  plt.show()
