In [None]:
# import and dataset loading
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, accuracy_score, classification_report, roc_curve, auc
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

# Load the domain-name dataset from CSV into a DataFrame. Later cells read its
# "bigrams" and "label_multiclass" columns.
dataset = pd.read_csv(filepath_or_buffer="example_dataset.csv")

In [None]:
# FastText embedding of the bigram sequences.
# Train an unsupervised skip-gram model on the bigram corpus, then encode every
# row of dataset["bigrams"] as one flat float32 vector of length max_len * dim:
# the per-token vectors laid end to end, zero-padded on the right.
import fasttext

train_data_path = "bigrams.txt"
dim = 3  # embedding size per token; used for both training and the encoding below
model = fasttext.train_unsupervised(train_data_path, model="skipgram", dim=dim, epoch=20)
words = model.get_words()
dict_skipgram = {w: model.get_word_vector(w) for w in words}

domain_names = dataset["bigrams"].to_numpy()
# Tokenise once; the token lists are reused for max_len and for the encoding.
tokenized_names = [name.split() for name in domain_names]
max_len = max(len(tokens) for tokens in tokenized_names)

embedded_domain_names = []
for tokens in tokenized_names:
    # Rows that are never written stay zero. That covers both the right padding
    # and tokens that are missing from the model vocabulary.
    embedded = np.zeros((max_len, dim), dtype=np.single)
    for position, token in enumerate(tokens):
        vector = dict_skipgram.get(token)
        if vector is not None:
            embedded[position] = vector
    # Flatten row by row so the layout is token0 | token1 | ... | padding.
    embedded_domain_names.append(embedded.ravel())

In [None]:
# Correlation heatmap of the embedding components.
# The flat embedding vectors are collected into a DataFrame here (one row per
# domain name, one column per embedding component) so that this cell also runs
# on a fresh kernel.
embedded_domain_names_df = pd.DataFrame(embedded_domain_names)
plt.figure(figsize=(30,15))
# Columns that are constant (e.g. all-zero padding) produce NaN correlations,
# which the heatmap leaves blank.
ax1 = sns.heatmap(embedded_domain_names_df.corr(),
                 cmap="Blues",annot=True,annot_kws={"size": 2})
ax1.set_title("Correlation between embedding components")

In [None]:
# Encode the class labels as integers and split the data into train / test sets.
# Families are numbered from 1, in sorted order of their names.
families = sorted(set(dataset["label_multiclass"]))
family_dict = {family: code for code, family in enumerate(families, start=1)}
true_labels = [family_dict[label] for label in dataset["label_multiclass"].to_numpy()]
# Fixed random_state so that every run produces the same split.
train, test, label_train, label_test = train_test_split(
    embedded_domain_names,
    true_labels,
    test_size=0.1,
    random_state=30,
)

In [None]:
# Shared settings and result accumulators for the classification experiments.
random_seed = 30  # same value as the random_state of the train/test split
kfold = StratifiedKFold(n_splits=5)
cross_validation_scores = []  # one array of per-fold accuracies per classifier
classification_reports = []
confusion_matrices = []
roc_aucs = []  # ROC curve data
true_pos_rates = []
false_pos_rates = []

# Classifiers to compare; names[i] is the display name of classifiers[i].
# Every estimator with internal randomness gets an explicit random_state so
# that repeated runs of the notebook report the same scores.
names = ["SVC", "KNN", "Naive Bayes", "Decision Tree", "Random Forest", "AdaBoost", "Gradient Boosting", "MLP"]
classifiers = [
    SVC(),
    KNeighborsClassifier(n_neighbors=5),
    GaussianNB(),
    DecisionTreeClassifier(random_state=random_seed),
    RandomForestClassifier(n_estimators=10, random_state=random_seed),
    AdaBoostClassifier(n_estimators=10, random_state=random_seed),
    GradientBoostingClassifier(n_estimators=10, random_state=random_seed),
    MLPClassifier(random_state=random_seed),
]
assert len(names) == len(classifiers), "names and classifiers must stay aligned"

In [None]:
# Train and evaluate every classifier: hold-out accuracy, cross-validated
# accuracy on the training set, a row-normalised confusion matrix and a
# classification report.

# Start from empty accumulators so re-running this cell does not append
# duplicate results.
cross_validation_scores.clear()
confusion_matrices.clear()
classification_reports.clear()

# Pin the class order explicitly. Without `labels`, the matrix only covers the
# classes that occur in label_test / predictions, which may be fewer than the
# names in family_dict and would then not line up with the tick labels.
class_ids = list(family_dict.values())
class_names = list(family_dict.keys())

for name, classif in zip(names, classifiers):
    # fit the model and predict on the hold-out set
    classif.fit(train, label_train)
    predictions = classif.predict(test)
    print(f"{name}. Accuracy: {accuracy_score(label_test, predictions)}")
    # cross-validation on the training set
    cv_score = cross_val_score(classif, train, label_train, scoring="accuracy", cv=kfold)
    cross_validation_scores.append(cv_score)
    print("Done K-Fold cross-validation")
    # confusion matrix (each row normalised over the true class)
    disp = ConfusionMatrixDisplay.from_predictions(
        label_test,
        predictions,
        labels=class_ids,
        display_labels=class_names,
        cmap=plt.cm.Blues,
        normalize="true",
    )
    disp.ax_.set_title(f"{name}. Confusion matrix")
    confusion_matrices.append(disp)
    print("Computed confusion matrix")
    # classification report
    class_report = classification_report(label_test, predictions)
    classification_reports.append(class_report)
    print("Done classification report")

In [None]:
# Summarise the cross-validation results: mean accuracy and standard deviation
# of the per-fold scores, one entry per classifier.
cross_val_mean_acc = [fold_scores.mean() for fold_scores in cross_validation_scores]
cross_val_std = [fold_scores.std() for fold_scores in cross_validation_scores]
cv_df = pd.DataFrame(
    {
        "CrossValMeans": cross_val_mean_acc,
        "CrossValerrors": cross_val_std,
        "Algorithm": names,
    }
)
print(cv_df)
# Horizontal bar chart of the mean accuracies with the standard deviation as
# error bars.
plt.figure(figsize=(12,6))
sns.barplot(
    x="CrossValMeans",
    y="Algorithm",
    data=cv_df,
    palette="Set2",
    orient="h",
    xerr=cross_val_std,
)
plt.xlabel("Mean Accuracy")
plt.title("Cross validation scores")

In [None]:
# Confusion matrices with raw (non-normalised) counts, one heatmap per classifier.
# The counts are recomputed from the fitted classifiers: the objects stored in
# `confusion_matrices` are display objects built with normalize="true", so they
# hold fractions and cannot be formatted with fmt="d".
sns.set(font_scale=1.4)
n_cols = 2
n_rows = (len(names) + n_cols - 1) // n_cols  # enough rows for every classifier
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 4 * n_rows), squeeze=False)
# Fix the class order so every matrix has the same shape and axis meaning.
class_ids = list(family_dict.values())
for ax, name, classif in zip(axes.flat, names, classifiers):
    counts = confusion_matrix(label_test, classif.predict(test), labels=class_ids)
    sns.heatmap(counts, annot=True, fmt="d", cmap="Blues", ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(f"{name} classifier")
# Hide any grid cell left over when the number of classifiers is odd.
for ax in axes.flat[len(names):]:
    ax.set_visible(False)
fig.tight_layout()

In [None]:
# Print the stored classification report of every classifier under its name.
for idx, name in enumerate(names):
    print(f"{name} Classification Report:")
    print(classification_reports[idx])