### Routine per il fine-tuning del modello

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import uniform, randint, loguniform
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold, RandomizedSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (classification_report, confusion_matrix, roc_curve, auc, 
                           precision_recall_curve, average_precision_score, f1_score,
                           accuracy_score, precision_score, recall_score)
import xgboost as xgb
from xgboost import plot_importance
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')

# Plot style configuration
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")


In [2]:
# Read the features and the labels from the Parquet files under
# processed_data/binary_classification/data_w_features.
features_df_full = pd.read_parquet('processed_data/binary_classification/data_w_features/combined_features.parquet')
labels_df_full = pd.read_parquet('processed_data/binary_classification/data_w_features/labels_binary_stats_features_only.parquet')

# Rows drawn per class for each balanced sample (two classes => twice as many rows).
N_PER_CLASS_TUNING = 20000    # 40,000 rows: hyperparameter search and evaluation
N_PER_CLASS_FINAL = 250000    # 500,000 rows: final fit


def sample_balanced_indices(labels, n_per_class, random_state=42):
    """Return the index labels of a class-balanced random sample.

    Parameters
    ----------
    labels : pd.DataFrame
        Frame whose first column holds the class label.
    n_per_class : int
        Number of rows drawn (without replacement) from every class.
        ``DataFrame.sample`` raises ``ValueError`` if a class has fewer rows.
    random_state : int
        Seed passed to ``DataFrame.sample`` for every class.

    Returns
    -------
    pd.Index
        Original index labels of the sampled rows, grouped class by class.
    """
    return (
        labels.groupby(labels.iloc[:, 0])
        .apply(lambda group: group.sample(n=n_per_class, random_state=random_state))
        # Level 0 of the resulting MultiIndex is the class, level 1 the original row label.
        .index.get_level_values(1)
    )


# NOTE(review): the .loc lookups below assume features and labels share the same
# index labels — confirm against the preprocessing step that wrote the files.

# Balanced sample used for tuning / evaluation.
sampled_indices = sample_balanced_indices(labels_df_full, N_PER_CLASS_TUNING)
features_df = features_df_full.loc[sampled_indices].reset_index(drop=True)
labels_df = labels_df_full.loc[sampled_indices].reset_index(drop=True)

# Larger balanced sample used for the final fit.
sampled_indices = sample_balanced_indices(labels_df_full, N_PER_CLASS_FINAL)
features_df_to_finalize = features_df_full.loc[sampled_indices].reset_index(drop=True)
labels_df_to_finalize = labels_df_full.loc[sampled_indices].reset_index(drop=True)

# Free the full tables: only the two samples are used from here on.
del features_df_full, labels_df_full, sampled_indices

In [3]:
# Display the sampled feature table as the cell output.
features_df

Unnamed: 0,mean,std,min,max,median,range,skew,kurtosis,entropy,gini,...,q3_center_gini_diff,q4_center_gini_diff,q1_center_skew_diff,q2_center_skew_diff,q3_center_skew_diff,q4_center_skew_diff,q1_center_kurtosis_diff,q2_center_kurtosis_diff,q3_center_kurtosis_diff,q4_center_kurtosis_diff
0,3.121861e+08,4.368290e+08,55475660.0,2.189250e+09,124440480.0,2.133774e+09,2.455066,5.700624,4.649168,0.592030,...,0.191344,0.072571,1.233015,1.042956,1.646591,1.955638,3.938969,3.203617,5.804933,7.593583
1,1.354588e+08,1.098083e+08,64222012.0,5.633133e+08,81622784.0,4.990913e+08,2.036508,3.416886,5.051091,0.362163,...,0.103872,0.113342,1.120336,1.008453,0.200831,0.825555,3.424652,2.796656,0.143012,1.988379
2,1.479500e+08,1.864786e+08,51559340.0,1.193175e+09,78317840.0,1.141616e+09,3.345967,11.576972,4.844822,0.448503,...,0.064170,0.021108,1.208350,1.218808,0.670641,1.253292,6.030317,5.873643,2.879516,5.967516
3,1.077022e+08,8.848704e+07,58411136.0,5.154657e+08,73867872.0,4.570545e+08,2.970041,8.342006,5.076176,0.312177,...,0.021291,-0.074557,1.600993,1.426247,1.449779,1.873399,6.974224,5.901201,5.852446,8.404259
4,2.326736e+08,2.927024e+08,53648888.0,1.385398e+09,88280284.0,1.331750e+09,2.138952,3.908845,4.739701,0.553272,...,0.218619,0.170345,0.729945,0.768305,0.888435,1.415825,1.769937,1.969880,2.234781,4.623714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,7.004602e+07,2.515253e+07,47208740.0,1.873781e+08,62798850.0,1.401694e+08,2.798890,8.433389,5.246677,0.155405,...,0.006115,-0.062499,1.012151,1.451165,1.360036,1.249935,4.460397,5.819824,5.555737,5.388493
39996,1.035851e+08,6.763751e+07,47960028.0,3.633204e+08,80051012.0,3.153604e+08,2.143769,4.099183,5.133810,0.298381,...,-0.160923,-0.244436,0.777543,0.460487,1.917853,1.974445,2.373211,1.234803,7.637541,9.716307
39997,7.851728e+07,4.033808e+07,46933660.0,3.048775e+08,64682736.0,2.579438e+08,3.206119,11.574380,5.202330,0.202654,...,-0.012065,-0.089952,1.401116,1.506127,1.620153,1.817997,6.596219,7.174042,7.663633,9.074289
39998,7.170292e+07,1.845804e+07,51168392.0,1.681244e+08,65231598.0,1.169561e+08,2.807870,8.972333,5.270458,0.112257,...,-0.016714,-0.043487,1.051580,0.841895,1.079389,0.844559,6.356772,4.276291,5.778038,5.545435


In [4]:
# Row counts of the sampled features and labels (shown as a tuple).
len(features_df), len(labels_df)

(40000, 40000)

In [5]:
from collections import Counter
# Count how many sampled rows carry each value of the first label column.
Counter(labels_df.iloc[:, 0])

Counter({0: 20000, 1: 20000})

In [6]:

class XGBoostPipeline:
    """Tune, train, evaluate and visualise an XGBoost binary classifier.

    The usual entry point is ``run_complete_pipeline()``. The stages can also be
    called one by one, in this order: ``prepare_data`` -> ``hyperparameter_tuning``
    -> ``train_final_model`` -> ``plot_all_visualizations``; each stage reads
    attributes set by the previous ones.
    """

    def __init__(self, features_df, labels_df, test_size=0.2, random_state=42, n_jobs=-1):
        """
        Store the data and the configuration of the pipeline.

        Parameters:
        -----------
        features_df : pd.DataFrame
            DataFrame holding the features (its column names label the plots)
        labels_df : pd.DataFrame, pd.Series or array-like
            Labels; flattened to a 1-D NumPy array
        test_size : float
            Fraction of the dataset held out as test set
        random_state : int
            Seed used for every split, for the search and for the models
        n_jobs : int
            Parallelism used by the randomized search and by the final model
            (-1 = all cores)
        """
        self.features_df = features_df
        # One flat array regardless of the container the labels arrive in.
        self.labels_df = np.asarray(labels_df).ravel()
        self.test_size = test_size
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.model = None
        self.best_params = None
        self.cv_results = None

    def prepare_data(self):
        """Split the data into stratified train/test sets and standardise the features."""
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.features_df, self.labels_df,
            test_size=self.test_size,
            random_state=self.random_state,
            stratify=self.labels_df
        )

        # The scaler is fitted on the training split only and then applied to the test split.
        self.scaler = StandardScaler()
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)

        print("Dimensioni dataset:")
        print(f"Train set: {self.X_train.shape[0]} campioni")
        print(f"Test set: {self.X_test.shape[0]} campioni")
        print(f"Numero di features: {self.X_train.shape[1]}")
        print("\nDistribuzione classi nel training set:")
        print(pd.Series(self.y_train).value_counts(normalize=True))

    def hyperparameter_tuning(self, cv_folds=5, verbose=True, n_iter=250):
        """Run a randomized hyperparameter search scored by cross-validated ROC-AUC.

        Parameters
        ----------
        cv_folds : int
            Number of stratified folds.
        verbose : bool
            When true the search logs every fit (scikit-learn verbosity 2),
            otherwise it is silent.
        n_iter : int
            Number of parameter combinations sampled.

        Sets ``param_distributions``, ``grid_search``, ``best_params`` and
        ``cv_results``, prints the best combination and plots a search summary.
        Requires ``prepare_data()`` to have been called.
        """
        # scipy's uniform(loc, scale) samples from [loc, loc + scale].
        self.param_distributions = {
            'n_estimators': randint(50, 2000),
            'max_depth': randint(2, 20),
            'learning_rate': loguniform(0.0001, 1.0),
            'subsample': uniform(0.3, 0.7),  # between 0.3 and 1.0
            'colsample_bytree': uniform(0.3, 0.7),
            'colsample_bylevel': uniform(0.3, 0.7),
            'colsample_bynode': uniform(0.3, 0.7),
            'gamma': loguniform(1e-10, 10),
            'reg_alpha': loguniform(1e-10, 1000),
            'reg_lambda': loguniform(1e-10, 1000),
            'min_child_weight': randint(0, 50),
            'scale_pos_weight': uniform(0.1, 20),
            'max_delta_step': randint(0, 20),
            # NOTE(review): 'gblinear' is not a tree booster; the gain/cover
            # importance plots presumably fail if it wins the search — confirm.
            'booster': ['gbtree', 'dart', 'gblinear'],
        }

        # Base model. It stays single-threaded because the search below already
        # runs candidates in parallel; parallelising both levels oversubscribes the CPU.
        base_model = xgb.XGBClassifier(
            objective='binary:logistic',
            eval_metric='logloss',
            random_state=self.random_state,
            n_jobs=1
        )

        # Stratified K-Fold keeps the class proportions in every fold.
        stratified_kfold = StratifiedKFold(
            n_splits=cv_folds,
            shuffle=True,
            random_state=self.random_state
        )

        # Randomized search (the attribute keeps its historical name `grid_search`).
        self.grid_search = RandomizedSearchCV(
            estimator=base_model,
            param_distributions=self.param_distributions,
            n_iter=n_iter,
            cv=stratified_kfold,
            scoring='roc_auc',
            n_jobs=self.n_jobs,
            verbose=2 if verbose else 0,
            random_state=self.random_state,
            return_train_score=True
        )

        # Run the search
        self.grid_search.fit(self.X_train_scaled, self.y_train)

        # Store the results before plotting so they survive a plotting failure.
        self.best_params = self.grid_search.best_params_
        self.cv_results = pd.DataFrame(self.grid_search.cv_results_)

        print("\nMigliori parametri trovati:")
        for param, value in self.best_params.items():
            print(f"  {param}: {value}")
        print(f"\nMiglior score ROC-AUC (CV): {self.grid_search.best_score_:.4f}")

        # Show the search summary
        self._plot_hyperparameter_results()

    def train_final_model(self, val_size=0.1, early_stopping_rounds=50):
        """Train the final model with the best parameters and print test-set metrics.

        Parameters
        ----------
        val_size : float
            Fraction of the training split set aside as validation set for
            early stopping.
        early_stopping_rounds : int
            Training stops when the validation log-loss has not improved for
            this many rounds.

        Raises
        ------
        RuntimeError
            If ``best_params`` has not been set.

        Early stopping monitors a validation set carved out of the training
        split, so the test set is used only for the reported metrics.
        """
        if self.best_params is None:
            raise RuntimeError(
                "best_params is not set: call hyperparameter_tuning() first "
                "or assign best_params manually"
            )

        print("\n=== ADDESTRAMENTO MODELLO FINALE ===")

        X_fit, X_val, y_fit, y_val = train_test_split(
            self.X_train_scaled, self.y_train,
            test_size=val_size,
            random_state=self.random_state,
            stratify=self.y_train
        )

        # Fixed settings come last so they win over any tuned key with the same name.
        # early_stopping_rounds is an estimator parameter in current XGBoost
        # releases; passing it to fit() is no longer supported.
        model_params = {
            **self.best_params,
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'random_state': self.random_state,
            'n_jobs': self.n_jobs,
            'early_stopping_rounds': early_stopping_rounds,
        }
        self.model = xgb.XGBClassifier(**model_params)

        # validation_0 = fitting data, validation_1 = validation data
        # (the last entry is the one monitored by early stopping).
        eval_set = [(X_fit, y_fit), (X_val, y_val)]
        self.model.fit(X_fit, y_fit, eval_set=eval_set, verbose=False)

        # Predictions on the untouched test set
        self.y_pred = self.model.predict(self.X_test_scaled)
        self.y_pred_proba = self.model.predict_proba(self.X_test_scaled)[:, 1]

        # Metrics
        print("\nMetriche sul test set:")
        print(f"Accuracy: {accuracy_score(self.y_test, self.y_pred):.4f}")
        print(f"Precision: {precision_score(self.y_test, self.y_pred):.4f}")
        print(f"Recall: {recall_score(self.y_test, self.y_pred):.4f}")
        print(f"F1-Score: {f1_score(self.y_test, self.y_pred):.4f}")
        print(f"ROC-AUC: {auc(*roc_curve(self.y_test, self.y_pred_proba)[:2]):.4f}")

    def plot_all_visualizations(self):
        """Produce every evaluation plot; requires ``train_final_model()`` first."""
        print("\n=== GENERAZIONE VISUALIZZAZIONI ===")

        # 1. Feature importance
        self._plot_feature_importance()

        # 2. ROC curve
        self._plot_roc_curve()

        # 3. Precision-recall curve
        self._plot_pr_curve()

        # 4. Confusion matrix
        self._plot_confusion_matrix()

        # 5. Distribution of the predicted probabilities
        self._plot_probability_distribution()

        # 6. Learning curves
        self._plot_learning_curves()

        # 7. Calibration plot
        self._plot_calibration_curve()

        # 8. Classification report heatmap
        self._plot_classification_report()

    @staticmethod
    def _numeric_param_columns(cv_results):
        """Return the ``param_*`` columns of ``cv_results`` as floats.

        Columns that hold no numeric value at all (e.g. the booster name) are
        dropped; non-numeric entries in the remaining columns become NaN.
        """
        param_cols = [col for col in cv_results.columns if col.startswith('param_')]
        numeric = cv_results[param_cols].apply(pd.to_numeric, errors='coerce')
        return numeric.dropna(axis=1, how='all').astype(float)

    @staticmethod
    def _report_to_frame(report):
        """Turn a ``classification_report`` dict into a frame of per-row metrics.

        The scalar 'accuracy' row and the 'support' column are removed so that
        only precision / recall / f1-score values remain.
        """
        frame = pd.DataFrame(report).transpose()
        return frame.drop(index='accuracy', columns='support', errors='ignore')

    def _plot_hyperparameter_results(self):
        """Plot a four-panel summary of the hyperparameter search."""
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))

        # Only numeric parameters can be drawn on a heatmap or correlated with the score.
        numeric_params = self._numeric_param_columns(self.cv_results)
        scores = self.cv_results['mean_test_score']

        # Top 20 combinations
        top_results = self.cv_results.nlargest(20, 'mean_test_score')

        # 1. Heatmap of the best combinations (one column per combination)
        ax = axes[0, 0]
        heatmap_data = numeric_params.loc[top_results.index]
        if not heatmap_data.empty:
            heatmap_data.index = pd.Index(
                top_results['mean_test_score'].round(4).to_numpy(),
                name='mean_test_score'
            )
            sns.heatmap(heatmap_data.T, annot=True, fmt='.3g', cmap='YlOrRd', ax=ax)
        ax.set_title('Top 20 Combinazioni di Parametri')

        # 2. Relative importance: absolute Pearson correlation with the CV score
        ax = axes[0, 1]
        param_importance = {}
        for col in numeric_params.columns:
            correlation = numeric_params[col].corr(scores)
            if pd.notna(correlation):
                param_importance[col[len('param_'):]] = abs(correlation)

        if param_importance:
            pd.Series(param_importance).sort_values().plot(kind='barh', ax=ax)
            ax.set_title('Importanza Relativa dei Parametri (Correlazione con Score)')
            ax.set_xlabel('Correlazione Assoluta')

        # 3. Score of every sampled combination, in sampling order
        ax = axes[1, 0]
        ax.plot(range(len(self.cv_results)), self.cv_results['mean_test_score'])
        ax.axhline(y=self.grid_search.best_score_, color='r', linestyle='--', label='Best Score')
        ax.set_xlabel('Iterazione')
        ax.set_ylabel('ROC-AUC Score')
        ax.set_title('Convergenza della Grid Search')
        ax.legend()

        # 4. Fit time versus performance
        ax = axes[1, 1]
        if 'mean_fit_time' in self.cv_results.columns:
            scatter = ax.scatter(self.cv_results['mean_fit_time'],
                                 self.cv_results['mean_test_score'],
                                 c=self.cv_results['mean_test_score'],
                                 cmap='viridis', alpha=0.6)
            ax.set_xlabel('Tempo di Training (secondi)')
            ax.set_ylabel('ROC-AUC Score')
            ax.set_title('Trade-off Tempo vs Performance')
            plt.colorbar(scatter, ax=ax)

        plt.tight_layout()
        plt.show()

    def _plot_feature_importance(self):
        """Plot weight / gain / cover importances and the top-15 features."""
        fig, axes = plt.subplots(1, 3, figsize=(20, 6))

        # 1. XGBoost native importance
        ax = axes[0]
        plot_importance(self.model, max_num_features=20, ax=ax, importance_type='weight')
        ax.set_title('Feature Importance - Weight (Frequenza di utilizzo)')

        # 2. Gain importance
        ax = axes[1]
        plot_importance(self.model, max_num_features=20, ax=ax, importance_type='gain')
        ax.set_title('Feature Importance - Gain (Miglioramento medio)')

        # 3. Cover importance
        ax = axes[2]
        plot_importance(self.model, max_num_features=20, ax=ax, importance_type='cover')
        ax.set_title('Feature Importance - Cover (Copertura media)')

        plt.tight_layout()
        plt.show()

        # Combined chart of the top features; the model was trained on a plain
        # array, so the names are taken from the original feature frame.
        fig, ax = plt.subplots(figsize=(10, 8))
        feature_importance = pd.DataFrame({
            'feature': self.features_df.columns,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False).head(15)

        sns.barplot(data=feature_importance, y='feature', x='importance', ax=ax)
        ax.set_title('Top 15 Features più Importanti')
        ax.set_xlabel('Importance Score')
        plt.tight_layout()
        plt.show()

    def _plot_roc_curve(self):
        """Plot the ROC curve and mark the threshold maximising TPR - FPR."""
        fig, ax = plt.subplots(figsize=(10, 8))

        fpr, tpr, thresholds = roc_curve(self.y_test, self.y_pred_proba)
        roc_auc = auc(fpr, tpr)

        # ROC curve plus the chance diagonal
        ax.plot(fpr, tpr, color='darkorange', lw=2,
                label=f'ROC curve (AUC = {roc_auc:.3f})')
        ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')

        # Operating point with the largest TPR - FPR
        optimal_idx = np.argmax(tpr - fpr)
        optimal_threshold = thresholds[optimal_idx]
        ax.scatter(fpr[optimal_idx], tpr[optimal_idx], color='red', s=100,
                   label=f'Optimal point (threshold={optimal_threshold:.3f})')

        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('Receiver Operating Characteristic (ROC) Curve')
        ax.legend(loc="lower right")
        ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

    def _plot_pr_curve(self):
        """Plot the precision-recall curve with the positive-rate baseline."""
        fig, ax = plt.subplots(figsize=(10, 8))

        precision, recall, thresholds = precision_recall_curve(self.y_test, self.y_pred_proba)
        average_precision = average_precision_score(self.y_test, self.y_pred_proba)

        # PR curve
        ax.plot(recall, precision, color='blue', lw=2,
                label=f'PR curve (AP = {average_precision:.3f})')

        # Baseline: share of positive samples in the test set
        baseline = np.sum(self.y_test) / len(self.y_test)
        ax.axhline(y=baseline, color='red', linestyle='--',
                   label=f'Baseline (Random) = {baseline:.3f}')

        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('Recall')
        ax.set_ylabel('Precision')
        ax.set_title('Precision-Recall Curve')
        ax.legend(loc="lower left")
        ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

    def _plot_confusion_matrix(self):
        """Plot the confusion matrix as raw counts and row-normalised."""
        fig, axes = plt.subplots(1, 2, figsize=(15, 6))

        cm = confusion_matrix(self.y_test, self.y_pred)

        # 1. Absolute counts
        ax = axes[0]
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
        ax.set_title('Confusion Matrix - Valori Assoluti')

        # 2. Each row divided by its total (share of every actual class)
        ax = axes[1]
        cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        sns.heatmap(cm_normalized, annot=True, fmt='.2%', cmap='Greens', ax=ax)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
        ax.set_title('Confusion Matrix - Normalizzata per Riga')

        plt.tight_layout()
        plt.show()

    def _plot_probability_distribution(self):
        """Plot the predicted probabilities of each true class (histogram and box plot)."""
        fig, axes = plt.subplots(1, 2, figsize=(15, 6))

        # 1. Histogram of the probabilities per class
        ax = axes[0]
        ax.hist(self.y_pred_proba[self.y_test == 0], bins=50, alpha=0.5,
                label='Classe 0', color='blue', density=True)
        ax.hist(self.y_pred_proba[self.y_test == 1], bins=50, alpha=0.5,
                label='Classe 1', color='red', density=True)
        ax.axvline(x=0.5, color='black', linestyle='--', label='Threshold=0.5')
        ax.set_xlabel('Probabilità Predetta')
        ax.set_ylabel('Densità')
        ax.set_title('Distribuzione delle Probabilità per Classe')
        ax.legend()

        # 2. Box plot of the probabilities
        ax = axes[1]
        data_to_plot = [self.y_pred_proba[self.y_test == 0],
                        self.y_pred_proba[self.y_test == 1]]
        ax.boxplot(data_to_plot)
        # Tick labels are set separately: the boxplot keyword for them was
        # renamed across Matplotlib releases.
        ax.set_xticklabels(['Classe 0', 'Classe 1'])
        ax.set_ylabel('Probabilità Predetta')
        ax.set_title('Box Plot delle Probabilità per Classe')
        ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

    def _plot_learning_curves(self):
        """Plot the log-loss recorded on the fitting and validation sets during training."""
        fig, ax = plt.subplots(figsize=(10, 8))

        # Evaluation history recorded by fit(): validation_0 is the fitting data,
        # validation_1 the validation data used for early stopping.
        results = self.model.evals_result()

        if results:
            epochs = len(results['validation_0']['logloss'])
            x_axis = range(0, epochs)

            ax.plot(x_axis, results['validation_0']['logloss'], label='Train')
            ax.plot(x_axis, results['validation_1']['logloss'], label='Validation')
            ax.set_xlabel('Epoca')
            ax.set_ylabel('Log Loss')
            ax.set_title('Learning Curves - Log Loss durante il Training')
            ax.legend()
            ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

    def _plot_calibration_curve(self):
        """Plot the reliability diagram of the predicted probabilities."""
        from sklearn.calibration import calibration_curve

        fig, ax = plt.subplots(figsize=(10, 8))

        # Calibration curve over 10 probability bins
        fraction_of_positives, mean_predicted_value = calibration_curve(
            self.y_test, self.y_pred_proba, n_bins=10
        )

        ax.plot(mean_predicted_value, fraction_of_positives, 's-', label='XGBoost')
        ax.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')

        ax.set_xlabel('Mean Predicted Probability')
        ax.set_ylabel('Fraction of Positives')
        ax.set_title('Calibration Plot (Reliability Diagram)')
        ax.legend()
        ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

    def _plot_classification_report(self):
        """Show the classification report (without accuracy and support) as a heatmap."""
        fig, ax = plt.subplots(figsize=(8, 6))

        # Build the classification report
        report = classification_report(self.y_test, self.y_pred, output_dict=True)

        # Drop the 'accuracy' row and the 'support' column by name.
        df_report = self._report_to_frame(report)

        # Heatmap
        sns.heatmap(df_report, annot=True, fmt='.3f', cmap='YlGnBu', ax=ax)
        ax.set_title('Classification Report Heatmap')

        plt.tight_layout()
        plt.show()

    def run_complete_pipeline(self, cv_folds=5, n_iter=250):
        """Run every stage in order.

        Parameters
        ----------
        cv_folds : int
            Number of cross-validation folds used by the search.
        n_iter : int
            Number of parameter combinations sampled by the search.

        Returns
        -------
        tuple
            ``(model, best_params)``: the fitted classifier and the winning
            hyperparameters.
        """
        print("=== AVVIO PIPELINE COMPLETA ===\n")

        # 1. Data preparation
        self.prepare_data()

        # 2. Hyperparameter search
        self.hyperparameter_tuning(cv_folds=cv_folds, n_iter=n_iter)

        # 3. Final model training
        self.train_final_model()

        # 4. Visualisations
        self.plot_all_visualizations()

        print("\n=== PIPELINE COMPLETATA ===")

        return self.model, self.best_params


In [7]:


# Build the pipeline on the balanced sample and run every stage;
# run_complete_pipeline returns the fitted model and the best hyperparameters.
pipeline = XGBoostPipeline(features_df, labels_df)
model, best_params = pipeline.run_complete_pipeline()


=== AVVIO PIPELINE COMPLETA ===

Dimensioni dataset:
Train set: 32000 campioni
Test set: 8000 campioni
Numero di features: 134

Distribuzione classi nel training set:
0    0.5
1    0.5
Name: proportion, dtype: float64
Fitting 5 folds for each of 250 candidates, totalling 1250 fits


KeyboardInterrupt: 

In [None]:
import mlflow
import mlflow.sklearn
import mlflow.xgboost

# Log the winning hyperparameters and the fitted model inside an explicit run.
# The context manager closes the run when the block ends, so re-running this
# cell opens a fresh run instead of writing into a still-active implicit one
# (where logging a parameter again with a different value is rejected).
# NOTE(review): the model expects standardised inputs; the fitted scaler is
# not logged here — confirm whether it should be stored alongside the model.
with mlflow.start_run():
    mlflow.log_params(best_params)
    mlflow.xgboost.log_model(model, "xgboost_model")


In [None]:

# To run only some of the stages, call them one by one.
# Each stage reads attributes set by the previous one (e.g. the tuning step
# uses the scaled training data produced by prepare_data).
pipeline = XGBoostPipeline(features_df, labels_df)
pipeline.prepare_data()
pipeline.hyperparameter_tuning()
pipeline.train_final_model()
pipeline.plot_all_visualizations()


In [None]:
# Fit the final XGBoost model on the large balanced sample with the best
# hyperparameters found by the search, then persist it.
# Requires `pipeline` to have completed hyperparameter_tuning().
import pickle

# A fresh scaler is fitted on the final sample so that the scaler held by
# `pipeline` (fitted on its own training split) is left untouched.
final_scaler = StandardScaler()
X_full = final_scaler.fit_transform(features_df_to_finalize)
if isinstance(labels_df_to_finalize, pd.DataFrame):
    y_full = labels_df_to_finalize.values.ravel()
else:
    y_full = labels_df_to_finalize.values

# All rows are used for fitting: there is no hold-out here, hence no early stopping.
final_model = xgb.XGBClassifier(
    **pipeline.best_params,
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=pipeline.random_state,
    n_jobs=-1
)
final_model.fit(X_full, y_full)

# The scaler is stored together with the model because predictions need the
# same standardisation that was applied to the training data.
with open('xgboost_model_final.pkl', 'wb') as model_file:
    pickle.dump({'scaler': final_scaler, 'model': final_model}, model_file)