In [1]:
### Import useful libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import metrics
import networkx as nx
import time
import random
import csv
%matplotlib inline

### Only lang

In [2]:
l = list(range(0, 10)) + list(range(10, 30, 2))

In [3]:
### Reading VLs from file
obj = pd.read_pickle("./Only_lang/log_trial_0_LPStates.pickled")\
    + pd.read_pickle("./Only_lang/log_trial_1_LPStates.pickled")[1:]\
        +  pd.read_pickle("./Only_lang/log_trial_2_LPStates.pickled")[1:]\
            + pd.read_pickle("./Only_lang/log_trial_3_LPStates.pickled")[1:]

In [4]:
data = pd.read_csv("./twitch_comms.csv", sep=" ", header=None)
comms_dict = {}
for row in data.iterrows():
    comms_dict[str(row[1][0])] = row[1][1]

In [5]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
start_time = time.time()
accuracies = []
f1_scores_macro = []
f1_scores_weigh = []
for res in obj:
    s_time = time.time()
    # Input 
    X_data = res[1]
    # Output communities defined by Louvain algorithm
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    accuracies.append(metrics.accuracy_score(y_test, y_pred))
    f1_scores_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))

    print(f"Iteration completed in {time.time() - s_time}")
end_time = time.time()

Iteration completed in 3.797091007232666
Iteration completed in 12.508708000183105
Iteration completed in 51.834871768951416
Iteration completed in 52.75838112831116
Iteration completed in 53.3145968914032
Iteration completed in 53.30100679397583
Iteration completed in 53.077017307281494
Iteration completed in 54.61321949958801
Iteration completed in 49.390111684799194
Iteration completed in 50.49282169342041
Iteration completed in 43.771607875823975
Iteration completed in 48.585078954696655
Iteration completed in 47.025930643081665
Iteration completed in 45.32200050354004
Iteration completed in 50.47292685508728
Iteration completed in 49.30631709098816
Iteration completed in 48.08205485343933
Iteration completed in 50.38036823272705
Iteration completed in 52.70094299316406
Iteration completed in 53.58765196800232
Iteration completed in 52.1880145072937
Iteration completed in 48.52446794509888
Iteration completed in 45.81927704811096
Iteration completed in 46.41480350494385
Iteration c

In [6]:
### Function that returns the 10 / 1 index of the maximum values of a list
def get10maxidx(l):
    return list(map(lambda x: x[1], sorted(zip(l, range(0, len(l))), reverse=True)[:10]))
def getmaxidx(l):
    return l.index(max(l))

In [7]:
### 10MWL classification
clf = RandomForestClassifier(n_estimators=70)
start_time2 = time.time()
accuracies2 = []
f1_scores2_macro = []
f1_scores2_weigh = []
for res in obj:
    # Input 
    X_data = list(map(lambda x: get10maxidx(x), res[1]))

    # Output communities defined by Louvain algorithm
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    accuracies2.append(metrics.accuracy_score(y_test, y_pred))
    f1_scores2_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores2_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    
end_time2 = time.time()

In [8]:
### MWL classification
clf = RandomForestClassifier(n_estimators=70)
start_time3 = time.time()
accuracies3 = []
f1_scores3_macro = []
f1_scores3_weigh = []
for res in obj:
    # Input 
    X_data = list(map(lambda x: [getmaxidx(x)], res[1]))

    # Output communities defined by Louvain algorithm
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    accuracies3.append(metrics.accuracy_score(y_test, y_pred))
    f1_scores3_macro.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores3_weigh.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    
end_time3 = time.time()

In [9]:
import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False
})

### Comparison macro

In [10]:
# Plot F1-macro comparison
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 30, 2))
plt.plot(l, [f1_scores_macro[i] for i in l], "o", label="AVPRA F1-score-macro", markersize=10)
plt.plot(l, [f1_scores2_macro[i] for i in l], "o", label="AVPRA 10MWL F1-score-macro", markersize=10)
plt.plot(l, [f1_scores3_macro[i] for i in l], "o", label="AVPRA MWL F1-score-macro", markersize=10)

plt.axvline(x=7, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("F1-score", fontsize=20)
plt.legend(loc="lower right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_twitch_onlyL_AVPRA_all_macro.png", dpi=500)
plt.show()

  plt.show()


In [11]:
max(f1_scores_macro), f1_scores_macro.index(max(f1_scores_macro))

(0.8739236820477795, 10)

In [12]:
max(f1_scores2_macro), f1_scores2_macro.index(max(f1_scores2_macro))

(0.7219649344059615, 5)

In [13]:
max(f1_scores3_macro), f1_scores3_macro.index(max(f1_scores3_macro))

(0.6342339832223101, 1)

### Comparison weighted

In [14]:
# Plot F1-macro comparison
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 30, 2))
plt.plot(l, [f1_scores_weigh[i] for i in l], "o", label="AVPRA F1-score-weighted", markersize=10)
plt.plot(l, [f1_scores2_weigh[i] for i in l], "o", label="AVPRA 10MWL F1-score-weighted", markersize=10)
plt.plot(l, [f1_scores3_weigh[i] for i in l], "o", label="AVPRA MWL F1-score-weighted", markersize=10)

plt.axvline(x=7, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("F1-score", fontsize=20)
plt.legend(loc="lower right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_twitch_onlyL_AVPRA_all_weighted.png", dpi=500)
plt.show()

  plt.show()


In [15]:
max(f1_scores_weigh), f1_scores_weigh.index(max(f1_scores_weigh))

(0.8338172346497804, 11)

In [16]:
max(f1_scores2_weigh), f1_scores2_weigh.index(max(f1_scores2_weigh))

(0.633068269396643, 5)

In [17]:
max(f1_scores3_weigh), f1_scores3_weigh.index(max(f1_scores3_weigh))

(0.3613434339331933, 1)

In [18]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 30, 2))
plt.plot(l, [accuracies[i] for i in l], "o", label="AVPRA Accuratezza", markersize=10)
plt.plot(l, [f1_scores_macro[i] for i in l], "x", label="AVPRA F1-score-macro", color="blue", markersize=12)

plt.axvline(x=7, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza/F1-Score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_twitch_onlyL_AVPRA_macro.png", dpi=500)
plt.show()

  plt.show()


In [19]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 30, 2))
plt.plot(l, [accuracies[i] for i in l], "o", label="AVPRA Accuratezza", markersize=10)
plt.plot(l, [f1_scores_weigh[i] for i in l], "x", label="AVPRA F1-score-weighted", color="blue", markersize=12)

plt.axvline(x=7, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza/F1-Score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_twitch_onlyL_AVPRA_weighted.png", dpi=500)
plt.show()

  plt.show()


In [20]:
max(accuracies), l[accuracies.index(max(accuracies))]

(0.8394848764238765, 12)

In [21]:
max(accuracies2), l[accuracies2.index(max(accuracies2))]

(0.6444695595277042, 8)

In [22]:
max(accuracies3), l[accuracies3.index(max(accuracies3))]

(0.4918954287243851, 0)

In [23]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 30, 2))
plt.plot(l, [accuracies[i] for i in l], "o", label="AVPRA Accuratezza", markersize=10)
plt.plot(l, [accuracies2[i] for i in l], "o", label="AVPRA 10MWL Accuratezza", markersize=10)
plt.plot(l, [accuracies3[i] for i in l], "o", label="AVPRA MWL Accuratezza", markersize=10)

plt.axvline(x=7, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza", fontsize=20)
plt.legend(loc="lower right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_comparisons_twitch_onlyF.png", dpi=500)
plt.show()

  plt.show()


# All features

In [24]:
### Reading VLs from file
obj = pd.read_pickle("./All_feat/log_trial_0_LPStates.pickled")\
    + pd.read_pickle("./All_feat/log_trial_1_LPStates.pickled")[1:]\
        +  pd.read_pickle("./All_feat/log_trial_2_LPStates.pickled")[1:]\
            + pd.read_pickle("./All_feat/log_trial_3_LPStates.pickled")[1:]

In [25]:
### Random forest classifier creation with 70 trees
clf = RandomForestClassifier(n_estimators=70)
start_time = time.time()
accuracies_allf = []
f1_scores_macro_allf = []
f1_scores_weigh_allf = []
for res in [obj[i] for i in l]:
    s_time = time.time()
    # Input 
    X_data = res[1]
    # Output communities defined by Louvain algorithm
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    accuracies_allf.append(metrics.accuracy_score(y_test, y_pred))
    f1_scores_macro_allf.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores_weigh_allf.append(metrics.f1_score(y_test, y_pred, average="weighted"))

    print(f"Iteration completed in {time.time() - s_time}")
end_time = time.time()

Iteration completed in 4.777518272399902
Iteration completed in 17.610280990600586
Iteration completed in 57.12461590766907
Iteration completed in 69.41503095626831
Iteration completed in 57.900755643844604
Iteration completed in 68.97751665115356
Iteration completed in 61.19676685333252
Iteration completed in 61.75340533256531
Iteration completed in 54.12453317642212
Iteration completed in 60.472928285598755
Iteration completed in 57.43724083900452
Iteration completed in 60.124637842178345
Iteration completed in 53.83508777618408
Iteration completed in 62.17167949676514
Iteration completed in 57.288023710250854
Iteration completed in 56.26481294631958
Iteration completed in 56.660359621047974
Iteration completed in 72.802161693573
Iteration completed in 127.16159868240356
Iteration completed in 135.9566969871521


In [26]:
### Function that returns the 10 / 1 index of the maximum values of a list
def get10maxidx(l):
    return list(map(lambda x: x[1], sorted(zip(l, range(0, len(l))), reverse=True)[:10]))
def getmaxidx(l):
    return l.index(max(l))

In [27]:
### 10MWL classification
clf = RandomForestClassifier(n_estimators=70)
start_time2 = time.time()
accuracies2_allf = []
f1_scores2_macro_allf = []
f1_scores2_weigh_allf = []
for res in [obj[i] for i in l]:
    # Input 
    X_data = list(map(lambda x: get10maxidx(x), res[1]))

    # Output communities defined by Louvain algorithm
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    accuracies2_allf.append(metrics.accuracy_score(y_test, y_pred))
    f1_scores2_macro_allf.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores2_weigh_allf.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    
end_time2 = time.time()

In [28]:
### MWL classification
clf = RandomForestClassifier(n_estimators=70)
start_time3 = time.time()
accuracies3_allf = []
f1_scores3_macro_allf = []
f1_scores3_weigh_allf = []
for res in [obj[i] for i in l]:
    # Input 
    X_data = list(map(lambda x: [getmaxidx(x)], res[1]))

    # Output communities defined by Louvain algorithm
    y_data = [comms_dict.get(str(i)) for i in range(1, len(comms_dict) + 1)]

    # Split the data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    ### Accuracy metric
    accuracies3_allf.append(metrics.accuracy_score(y_test, y_pred))
    f1_scores3_macro_allf.append(metrics.f1_score(y_test, y_pred, average="macro"))
    f1_scores3_weigh_allf.append(metrics.f1_score(y_test, y_pred, average="weighted"))
    
end_time3 = time.time()

In [29]:
import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False
})

### Comparison macro

In [31]:
# Plot F1-macro comparison
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 30, 2))
plt.plot(l, f1_scores_macro_allf, "o", label="AVPRA F1-score-macro", markersize=10)
plt.plot(l, f1_scores2_macro_allf, "o", label="AVPRA 10MWL F1-score-macro", markersize=10)
plt.plot(l, f1_scores3_macro_allf, "o", label="AVPRA MWL F1-score-macro", markersize=10)

plt.axvline(x=7, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("F1-score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_twitch_allF_AVPRA_all_macro.png", dpi=500)
plt.show()

  plt.show()


In [32]:
max(f1_scores_macro_allf), f1_scores_macro_allf.index(max(f1_scores_macro_allf))

(0.8683384922391197, 3)

In [33]:
max(f1_scores2_macro_allf), f1_scores2_macro_allf.index(max(f1_scores2_macro_allf))

(0.7127304792607018, 6)

In [34]:
max(f1_scores3_macro_allf), f1_scores3_macro_allf.index(max(f1_scores3_macro_allf))

(0.587931180907374, 0)

### Comparison weighted

In [36]:
# Plot F1-macro comparison
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 30, 2))
plt.plot(l, f1_scores_weigh_allf, "o", label="AVPRA F1-score-weighted", markersize=10)
plt.plot(l, f1_scores2_weigh_allf, "o", label="AVPRA 10MWL F1-score-weighted", markersize=10)
plt.plot(l, f1_scores3_weigh_allf, "o", label="AVPRA MWL F1-score-weighted", markersize=10)

plt.axvline(x=7, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("F1-score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_twitch_allF_AVPRA_all_weighted.png", dpi=500)
plt.show()

  plt.show()


In [37]:
max(f1_scores_weigh_allf), f1_scores_weigh_allf.index(max(f1_scores_weigh_allf))

(0.8514162046363131, 7)

In [38]:
max(f1_scores2_weigh_allf), f1_scores2_weigh_allf.index(max(f1_scores2_weigh_allf))

(0.6702277740066446, 3)

In [39]:
max(f1_scores3_weigh_allf), f1_scores3_weigh_allf.index(max(f1_scores3_weigh_allf))

(0.3578809312879908, 0)

In [40]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 30, 2))
plt.plot(l, accuracies_allf, "o", label="AVPRA Accuratezza", markersize=10)
plt.plot(l, f1_scores_macro_allf, "x", label="AVPRA F1-score-macro", color="blue", markersize=12)

plt.axvline(x=7, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza/F1-Score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_twitch_allF_AVPRA_macro.png", dpi=500)
plt.show()

  plt.show()


In [41]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 30, 2))
plt.plot(l, accuracies_allf, "o", label="AVPRA Accuratezza", markersize=10)
plt.plot(l, f1_scores_weigh_allf, "x", label="AVPRA F1-score-weighted", color="blue", markersize=12)

plt.axvline(x=7, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza/F1-Score", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_twitch_allF_AVPRA_weighted.png", dpi=500)
plt.show()

  plt.show()


In [42]:
max(accuracies_allf), l[accuracies_allf.index(max(accuracies_allf))]

(0.8551289296017607, 7)

In [43]:
max(accuracies2_allf), l[accuracies2_allf.index(max(accuracies2_allf))]

(0.6921749992564613, 3)

In [44]:
max(accuracies3_allf), l[accuracies3_allf.index(max(accuracies3_allf))]

(0.4895755881390715, 0)

In [46]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 30, 2))
plt.plot(l, accuracies_allf, "o", label="AVPRA Accuratezza", markersize=10)
plt.plot(l, accuracies2_allf, "o", label="AVPRA 10MWL Accuratezza", markersize=10)
plt.plot(l, accuracies3_allf, "o", label="AVPRA MWL Accuratezza", markersize=10)

plt.axvline(x=7, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza", fontsize=20)
plt.legend(loc="lower right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_comparisons_twitch_allF.png", dpi=500)
plt.show()

  plt.show()


### Only micro

In [47]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 30, 2))
plt.plot(l, accuracies_allf, "o", label="AVPRA Accuratezza", markersize=10)

plt.axvline(x=7, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_twitch_allF_micro.png", dpi=500)
plt.show()

  plt.show()


In [49]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 30, 2))
plt.plot(l, [accuracies[i] for i in l], "o", label="AVPRA Accuratezza", markersize=10)

plt.axvline(x=7, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_twitch_onlyL_micro.png", dpi=500)
plt.show()

  plt.show()


In [50]:
# Plot accuracy graph
plt.figure(figsize=(10, 6))
l = list(range(0,10)) + list(range(10, 30, 2))
plt.plot(l, [accuracies[i] for i in l], "o", label="AVPRA Accuratezza", markersize=10)
plt.plot(l, accuracies_allf, "o", label="AVPRA* Accuratezza", markersize=10)

plt.axvline(x=7, label="Diametro", linestyle="--")

plt.xlabel("Iterazione", fontsize=20)
plt.ylabel("Accuratezza", fontsize=20)
plt.legend(loc="right", prop={'size': 16})
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylim(0,1)

plt.savefig("F1_twitch_AVPRA_+_STAR.png", dpi=500)
plt.show()

  plt.show()
