In [None]:
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from imblearn.over_sampling import ADASYN
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


In [None]:
# Load the modelling datasets: the training CSV first, then the test CSV.
# Both files are read with an explicit comma separator.
df_train, df_test = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in (
        'Data_for_Modelling/df_train.csv',
        'Data_for_Modelling/df_test.csv',
    )
)

In [None]:
# Model definitions
# Candidate classifiers, keyed by the display name used in logs and plot legends.
# Both estimators are stochastic (bootstrap / column sampling), so each one is
# seeded to make a Restart & Run All reproduce the same numbers. 42 matches the
# seed already used for the StratifiedKFold splitter in the training cell.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42),
}

In [None]:
# Oversampling with ADASYN
# Only the training frame is resampled; df_test is not touched here.
# 'Response' is the target column, every other column is used as a feature.
# The sampler is seeded so the generated synthetic samples (and therefore all
# downstream metrics) are identical across runs.
adasyn = ADASYN(random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(
    df_train.drop(columns=['Response']),
    df_train['Response'],
)

In [None]:
# Randomized hyper-parameter search, cross-validation and test-set evaluation
RANDOM_STATE = 42  # one seed for the CV splits and the parameter sampling

# Search space per model, keyed by the same names as `models`.
# Each entry only lists parameters accepted by that estimator; a model name
# missing from this dict raises KeyError in the loop below.
PARAM_DISTRIBUTIONS = {
    "Random Forest": {
        "n_estimators": [100, 200, 300, 500],
        "max_depth": [None, 5, 10, 20],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "max_features": ["sqrt", "log2"],
    },
    "XGBoost": {
        "n_estimators": [100, 200, 300, 500],
        "max_depth": [3, 5, 7, 9],
        "learning_rate": [0.01, 0.05, 0.1, 0.2],
        "subsample": [0.6, 0.8, 1.0],
        "colsample_bytree": [0.6, 0.8, 1.0],
    },
}

# Loop-invariant inputs: build them once instead of once per model / per metric.
X_test = df_test.drop(columns=['Response'])
y_test = df_test['Response']
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)

fig, ax = plt.subplots()

for name, model in models.items():
    print(f"Training {name}...")
    search = RandomizedSearchCV(
        model,
        param_distributions=PARAM_DISTRIBUTIONS[name],
        n_iter=10,
        scoring='accuracy',
        n_jobs=-1,
        cv=cv,
        random_state=RANDOM_STATE,  # makes the sampled candidates repeatable
    )
    search.fit(X_train_resampled, y_train_resampled)
    print(f"Best parameters: {search.best_params_}")
    best_model = search.best_estimator_

    # Model evaluation on the training data.
    # NOTE: X_train_resampled was oversampled before being split into folds, so
    # validation folds contain synthetic samples derived from the whole training
    # set; treat these scores as optimistic. cross_val_score fits clones, so
    # best_model itself (refit on all resampled data) is left unchanged.
    scores = cross_val_score(best_model, X_train_resampled, y_train_resampled, cv=cv, scoring='accuracy')
    print(f"Cross-validation Accuracy scores: {scores}")

    # Model evaluation on the test data
    y_pred = best_model.predict(X_test)
    print(f"Test accuracy for {name}: {accuracy_score(y_test, y_pred)}")
    print(f"Test precision for {name}: {precision_score(y_test, y_pred)}")
    print(f"Test recall for {name}: {recall_score(y_test, y_pred)}")
    print(f"Test F1 score for {name}: {f1_score(y_test, y_pred)}")

    # ROC curve and AUC, computed from the probability of the second class
    # (column 1 of predict_proba).
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    auc = roc_auc_score(y_test, y_pred_proba)
    ax.plot(fpr, tpr, label=f"{name} (AUC = {auc:.2f})")

# Diagonal reference line: TPR == FPR
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc='lower right')
plt.show()