In [2]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

### HU

In [3]:
### Reading VLs from file
# NOTE(review): unpickling can execute arbitrary code stored in the file, so
# "./HU.pickled" must come from a trusted source.
# `obj` is iterated by the classification cells below; each of its entries is
# indexed with [1] to obtain the feature rows (presumably one entry per AVPRA
# iteration -- TODO confirm).
obj = pd.read_pickle("./HU.pickled")

In [4]:
# Space-separated file without header: column 0 becomes the dictionary key
# (stored as a string), column 1 the value that is later used as the class
# label (presumably node id -> community -- verify against the file).
data = pd.read_csv("./HU_comms.txt", sep=" ", header=None)
comms_dict = {str(record[0]): record[1] for _, record in data.iterrows()}

In [5]:
### Random forest classifier creation with 70 trees
# A fixed seed makes both the train/test split and the forest reproducible:
# the scores no longer change between "Restart & Run All" executions, and
# every iteration is evaluated on the same held-out samples.
RANDOM_STATE = 42
clf = RandomForestClassifier(n_estimators=70, random_state=RANDOM_STATE)
start_time = time.time()

# Output communities defined by Louvain algorithm.
# The labels do not depend on the loop variable, so they are built once
# instead of being rebuilt on every pass.
# NOTE(review): .get() yields None for a missing key "1".."N" -- this assumes
# the keys of comms_dict are exactly those consecutive ids.
y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

accuracies = []
f1_scores_macro = []
f1_scores_weigh = []
for res in obj:
    s_time = time.time()
    # Input: the second element of each entry is used as the feature matrix
    X_data = res[1]

    # Split the data into training set (80%) and test set (20%)
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=RANDOM_STATE
    )

    # fit() retrains the forest from scratch, so reusing `clf` is safe
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric (plus macro- and weighted-averaged F1)
    accuracies.append(metrics.accuracy_score(y_test, y_pred))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))

    print(f"Iteration completed in {time.time() - s_time}")
end_time = time.time()

Iteration completed in 4.015400409698486
Iteration completed in 11.809913635253906
Iteration completed in 18.670093774795532
Iteration completed in 22.881443977355957
Iteration completed in 25.653970956802368
Iteration completed in 26.712540864944458
Iteration completed in 25.01539421081543
Iteration completed in 24.569973707199097
Iteration completed in 24.539327383041382
Iteration completed in 24.451976537704468
Iteration completed in 24.614954471588135
Iteration completed in 24.409750938415527
Iteration completed in 23.800541639328003
Iteration completed in 23.664335012435913
Iteration completed in 23.413392782211304
Iteration completed in 23.213799476623535
Iteration completed in 22.838446140289307
Iteration completed in 22.746626138687134
Iteration completed in 22.24212074279785
Iteration completed in 23.016165494918823
Iteration completed in 23.29863953590393


In [6]:
### Helper functions returning the indices of the 10 largest values / the index of the largest value of a list
def get10maxidx(l, k=10):
    """Return the positions of the ``k`` largest values of ``l``, largest first.

    Parameters
    ----------
    l : sized iterable
        Values to rank; they must be mutually comparable.
    k : int, optional
        Number of positions to return. Defaults to 10, which reproduces the
        original fixed behaviour.

    Returns
    -------
    list of int
        At most ``k`` positions, ordered by decreasing value. Equal values
        are ordered by decreasing position, because the ``(value, position)``
        pairs are sorted in reverse. Fewer than ``k`` positions are returned
        when ``l`` is shorter than ``k``.

    Raises
    ------
    ValueError
        If ``k`` is negative (a negative slice bound would silently drop
        entries from the end instead of limiting the result).
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    ranked = sorted(zip(l, range(len(l))), reverse=True)
    return [position for _, position in ranked[:k]]
def getmaxidx(l):
    """Return the position of the largest value of ``l``.

    Works on any sequence supporting ``len()`` and integer indexing (lists,
    tuples, NumPy arrays, ...), not only on objects that provide ``.index``,
    and scans the data once instead of twice.

    Parameters
    ----------
    l : sequence
        Non-empty sequence of mutually comparable values.

    Returns
    -------
    int
        Position of the maximum; the first one when the maximum is repeated.

    Raises
    ------
    ValueError
        If ``l`` is empty.
    """
    # max() keeps the first maximal candidate, i.e. the lowest position.
    return max(range(len(l)), key=lambda position: l[position])

In [7]:
### 10MWL classification
# A fixed seed makes both the train/test split and the forest reproducible:
# the scores no longer change between "Restart & Run All" executions, and
# every iteration is evaluated on the same held-out samples.
RANDOM_STATE = 42
clf = RandomForestClassifier(n_estimators=70, random_state=RANDOM_STATE)
start_time2 = time.time()

# Output communities defined by Louvain algorithm.
# The labels do not depend on the loop variable, so they are built once
# instead of being rebuilt on every pass.
y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

accuracies2 = []
f1_scores2_macro = []
f1_scores2_weigh = []
for res in obj:
    # Input: every row is reduced to the positions of its 10 largest values
    X_data = [get10maxidx(row) for row in res[1]]

    # Split the data into training set (80%) and test set (20%)
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=RANDOM_STATE
    )

    # fit() retrains the forest from scratch, so reusing `clf` is safe
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric (plus macro- and weighted-averaged F1)
    accuracies2.append(metrics.accuracy_score(y_test, y_pred))
    f1_scores2_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores2_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))

end_time2 = time.time()

In [8]:
### MWL classification
# A fixed seed makes both the train/test split and the forest reproducible:
# the scores no longer change between "Restart & Run All" executions, and
# every iteration is evaluated on the same held-out samples.
RANDOM_STATE = 42
clf = RandomForestClassifier(n_estimators=70, random_state=RANDOM_STATE)
start_time3 = time.time()

# Output communities defined by Louvain algorithm.
# The labels do not depend on the loop variable, so they are built once
# instead of being rebuilt on every pass.
y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

accuracies3 = []
f1_scores3_macro = []
f1_scores3_weigh = []
for res in obj:
    # Input: every row is reduced to a single feature, the position of its
    # largest value
    X_data = [[getmaxidx(row)] for row in res[1]]

    # Split the data into training set (80%) and test set (20%)
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=RANDOM_STATE
    )

    # fit() retrains the forest from scratch, so reusing `clf` is safe
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric (plus macro- and weighted-averaged F1)
    accuracies3.append(metrics.accuracy_score(y_test, y_pred))
    f1_scores3_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores3_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))

end_time3 = time.time()

In [10]:
import matplotlib

# Select the pgf backend and let LaTeX (pdflatex) typeset all figure text.
# NOTE(review): pgf is a non-interactive backend, so the plt.show() calls in
# the plotting cells presumably only emit a warning; the figures are obtained
# through plt.savefig().
matplotlib.use("pgf")
latex_rc = {
    "pgf.texsystem": "pdflatex",
    "font.family": "serif",
    "text.usetex": True,
    "pgf.rcfonts": False,
}
matplotlib.rcParams.update(latex_rc)

### Comparison macro

In [25]:
# Macro-averaged F1 of the three classifiers (full rows, 10MWL, MWL)
plt.figure(figsize=(10, 6))
# x positions: 0..9 in steps of 1, then 10..30 in steps of 2
l = [*range(10), *range(10, 32, 2)]
for series, legend_label in (
    (f1_scores_macro, "AVPRA F1-score-macro"),
    (f1_scores2_macro, "AVPRA 10MWL F1-score-macro"),
    (f1_scores3_macro, "AVPRA MWL F1-score-macro"),
):
    plt.plot(l, series, "o", label=legend_label, markersize=10)

# Dashed vertical reference line at x = 14
plt.axvline(x=14, linestyle="--", label="Diametro")

# Axis titles, legend and tick label sizes
plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("F1-score", fontsize=20)
plt.legend(loc="right", prop={"size": 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0, 1)

plt.savefig("F1_HU_AVPRA_all_macro.png", dpi=500)
plt.show()

  plt.show()


In [12]:
# Best macro-F1 of the full-row classifier and the x position where it first occurs
best_f1_macro = max(f1_scores_macro)
best_f1_macro, [*range(10), *range(10, 32, 2)][f1_scores_macro.index(best_f1_macro)]

(0.8880725549110196, 20)

In [13]:
# Best macro-F1 of the 10MWL classifier and the x position where it first occurs
best_f1_macro_10mwl = max(f1_scores2_macro)
best_f1_macro_10mwl, [*range(10), *range(10, 32, 2)][f1_scores2_macro.index(best_f1_macro_10mwl)]

(0.13956091722896538, 5)

In [14]:
# Best macro-F1 of the MWL classifier and the x position where it first occurs
best_f1_macro_mwl = max(f1_scores3_macro)
best_f1_macro_mwl, [*range(10), *range(10, 32, 2)][f1_scores3_macro.index(best_f1_macro_mwl)]

(0.016516245154059143, 2)

### Comparison weighted

In [26]:
# Weighted-averaged F1 of the three classifiers (full rows, 10MWL, MWL)
plt.figure(figsize=(10, 6))
# x positions: 0..9 in steps of 1, then 10..30 in steps of 2
l = [*range(10), *range(10, 32, 2)]
for series, legend_label in (
    (f1_scores_weigh, "AVPRA F1-score-weighted"),
    (f1_scores2_weigh, "AVPRA 10MWL F1-score-weighted"),
    (f1_scores3_weigh, "AVPRA MWL F1-score-weighted"),
):
    plt.plot(l, series, "o", label=legend_label, markersize=10)

# Dashed vertical reference line at x = 14
plt.axvline(x=14, linestyle="--", label="Diametro")

# Axis titles, legend and tick label sizes
plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("F1-score", fontsize=20)
plt.legend(loc="right", prop={"size": 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0, 1)

plt.savefig("F1_HU_AVPRA_all_weighted.png", dpi=500)
plt.show()

  plt.show()


In [16]:
# Best weighted F1 of the full-row classifier and the x position where it first occurs
best_f1_weighted = max(f1_scores_weigh)
best_f1_weighted, [*range(10), *range(10, 32, 2)][f1_scores_weigh.index(best_f1_weighted)]

(0.8901613465622639, 20)

In [17]:
# Best weighted F1 of the 10MWL classifier and the x position where it first occurs
best_f1_weighted_10mwl = max(f1_scores2_weigh)
best_f1_weighted_10mwl, [*range(10), *range(10, 32, 2)][f1_scores2_weigh.index(best_f1_weighted_10mwl)]

(0.19370354914585863, 5)

In [18]:
# Best weighted F1 of the MWL classifier and the x position where it first occurs
best_f1_weighted_mwl = max(f1_scores3_weigh)
best_f1_weighted_mwl, [*range(10), *range(10, 32, 2)][f1_scores3_weigh.index(best_f1_weighted_mwl)]

(0.05919622298084826, 2)

In [19]:
# Accuracy and macro-averaged F1 of the full-row classifier on one figure
plt.figure(figsize=(10, 6))
# x positions: 0..9 in steps of 1, then 10..30 in steps of 2
l = [*range(10), *range(10, 32, 2)]
plt.plot(l, accuracies, "o", markersize=10, label="AVPRA Accuratezza")
plt.plot(
    l,
    f1_scores_macro,
    "x",
    markersize=12,
    color="blue",
    label="AVPRA F1-score-macro",
)

# Dashed vertical reference line at x = 14
plt.axvline(x=14, linestyle="--", label="Diametro")

# Axis titles, legend and tick label sizes
plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza/F1-Score", fontsize=20)
plt.legend(loc="right", prop={"size": 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0, 1)

plt.savefig("F1_HU_AVPRA_macro.png", dpi=500)
plt.show()

  plt.show()


In [20]:
# Accuracy and weighted-averaged F1 of the full-row classifier on one figure
plt.figure(figsize=(10, 6))
# x positions: 0..9 in steps of 1, then 10..30 in steps of 2
l = [*range(10), *range(10, 32, 2)]
plt.plot(l, accuracies, "o", markersize=10, label="AVPRA Accuratezza")
plt.plot(
    l,
    f1_scores_weigh,
    "x",
    markersize=12,
    color="blue",
    label="AVPRA F1-score-weighted",
)

# Dashed vertical reference line at x = 14
plt.axvline(x=14, linestyle="--", label="Diametro")

# Axis titles, legend and tick label sizes
plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza/F1-Score", fontsize=20)
plt.legend(loc="right", prop={"size": 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0, 1)

plt.savefig("F1_HU_AVPRA_weighted.png", dpi=500)
plt.show()

  plt.show()


In [21]:
# Best accuracy of the full-row classifier and its x position; `l` is the
# x-axis list left behind by the plotting cells
best_accuracy = max(accuracies)
best_accuracy, l[accuracies.index(best_accuracy)]

(0.8896718552797644, 20)

In [22]:
# Best accuracy of the 10MWL classifier and its x position; `l` is the
# x-axis list left behind by the plotting cells
best_accuracy_10mwl = max(accuracies2)
best_accuracy_10mwl, l[accuracies2.index(best_accuracy_10mwl)]

(0.24453092132940682, 6)

In [23]:
# Best accuracy of the MWL classifier and its x position; `l` is the
# x-axis list left behind by the plotting cells
best_accuracy_mwl = max(accuracies3)
best_accuracy_mwl, l[accuracies3.index(best_accuracy_mwl)]

(0.17700883466554482, 28)

In [29]:
# Accuracy of the three classifiers (full rows, 10MWL, MWL)
plt.figure(figsize=(10, 6))
# x positions: 0..9 in steps of 1, then 10..30 in steps of 2
l = [*range(10), *range(10, 32, 2)]
for series, legend_label in (
    (accuracies, "AVPRA Accuratezza"),
    (accuracies2, "AVPRA 10MWL Accuratezza"),
    (accuracies3, "AVPRA MWL Accuratezza"),
):
    plt.plot(l, series, "o", label=legend_label, markersize=10)

# Dashed vertical reference line at x = 14
plt.axvline(x=14, linestyle="--", label="Diametro")

# Axis titles, legend and tick label sizes
plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza", fontsize=20)
plt.legend(loc="right", prop={"size": 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0, 1)

plt.savefig("F1_HU_AVPRA_all_micro.png", dpi=500)
plt.show()

  plt.show()


### Only micro F1

In [28]:
# Accuracy of the full-row classifier alone
plt.figure(figsize=(10, 6))
# x positions: 0..9 in steps of 1, then 10..30 in steps of 2
l = [*range(10), *range(10, 32, 2)]
plt.plot(l, accuracies, "o", markersize=10, label="AVPRA Accuratezza")

# Dashed vertical reference line at x = 14
plt.axvline(x=14, linestyle="--", label="Diametro")

# Axis titles, legend and tick label sizes
plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza", fontsize=20)
plt.legend(loc="right", prop={"size": 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0, 1)

plt.savefig("F1_HU_AVPRA_micro.png", dpi=500)
plt.show()

  plt.show()
