## 0. Introduction
Objectif: Fine-tuner le modèle XGBoost en explorant différents hyperparamètres.

Traçabilité et reproductibilité via MLflow.

## 1. Import & configuration

In [11]:
import os
import sys

import mlflow
import mlflow.xgboost
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from matplotlib import pyplot as plt
from rich import print
from sklearn.metrics import (
    RocCurveDisplay,
    PrecisionRecallDisplay,
)
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier

# Dataset + constantes
sys.path.insert(0, "..")
from config import PROCESSED_DATA_PATH, TARGET_COL, SEED, FIG_DIR, CV_STRATEGY
from utils import compute_train_test_metrics

# Load the processed dataset and separate the target from the features.
df = pd.read_csv(PROCESSED_DATA_PATH)
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

# --- Partition the float64/int64 columns into two groups:
#     "binary"  -> columns holding exactly two distinct values
#     "numeric" -> every other float64/int64 column
# (An int64 column whose values are exactly {0, 1} necessarily has two distinct
# values, so the two-distinct-values test alone covers that case.)
numeric_cols = list(X.select_dtypes(include=["float64", "int64"]).columns)
binary_cols = [name for name in numeric_cols if X[name].nunique() == 2]
numeric_cols = [name for name in numeric_cols if name not in binary_cols]

# Upper-case aliases used by the following cells.
NUMERIC_COLS, BINARY_COLS = numeric_cols, binary_cols

print("[bold cyan]üìä Colonnes d√©tect√©es par feature engineering:[/bold cyan]")
print(f"   - Num√©riques: {NUMERIC_COLS}")
print(f"   - Binaires: {BINARY_COLS}")

# Stratified 80/20 hold-out split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)

# MLflow configuration: local file store, experiment name and author tag.
mlflow.set_tracking_uri("file:../mlruns")
mlflow.set_experiment("Rugby Kicks - XGBoost Finetuning")
mlflow.set_experiment_tag("author", "Xavier")

# Close any run left open (e.g. by a previous, interrupted execution).
if mlflow.active_run():
    mlflow.end_run()

# Features and target side by side, wrapped as MLflow datasets for log_input.
X_trainfull = pd.concat([X_train, y_train], axis="columns")
X_testfull = pd.concat([X_test, y_test], axis="columns")

train_dataset = mlflow.data.from_pandas(
    X_trainfull,
    source=PROCESSED_DATA_PATH,
    name="Training_set",
)
test_dataset = mlflow.data.from_pandas(
    X_testfull,
    source=PROCESSED_DATA_PATH,
    name="Test_set",
)

print("[bold green]‚úÖ Configuration pr√™te[/bold green]")

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(


## 2. Preprocessing & Feature Scaling

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Column-wise preprocessing: standardise the numeric columns, forward the
# binary columns unchanged, and drop any column that is in neither list.
preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ("scaler", StandardScaler(), NUMERIC_COLS),
        ("passthrough", "passthrough", BINARY_COLS),
    ],
    remainder="drop",
)

# Output column order follows the transformer order: numeric first, then binary.
feature_names = NUMERIC_COLS + BINARY_COLS

# The transformer is fitted on the training split only; the test split is
# merely transformed, so no test statistics leak into the scaler.
# Results are wrapped back into DataFrames that keep the original row index.
X_train_processed = pd.DataFrame(
    preprocessing_pipeline.fit_transform(X_train),
    columns=feature_names,
    index=X_train.index,
)
X_test_processed = pd.DataFrame(
    preprocessing_pipeline.transform(X_test),
    columns=feature_names,
    index=X_test.index,
)

print("[bold green]‚úÖ Pipeline de preprocessing cr√©√©e et appliqu√©e[/bold green]")
print(f"   - Colonnes num√©riques standardis√©es: {NUMERIC_COLS}")
print(f"   - Colonnes binaires conserv√©es: {BINARY_COLS}")
print(f"   - X_train shape: {X_train_processed.shape}")
print(f"   - X_test shape: {X_test_processed.shape}")

## 2.5 SMOTE - Synthetic Minority Over-sampling

In [13]:
# --- Balance the training classes with SMOTE and report the class counts
#     before and after resampling.
print("[bold cyan]üîÑ Application de SMOTE (Synthetic Minority Over-sampling)...[/bold cyan]")

# Class distribution of the original training split.
n_class0_before = (y_train == 0).sum()
n_class1_before = (y_train == 1).sum()
print("\n[bold]Avant SMOTE:[/bold]")
print(f"  - Classe 0: {n_class0_before} samples")
print(f"  - Classe 1: {n_class1_before} samples")
print(f"  - Ratio: {n_class0_before / n_class1_before:.2f}:1")

# Resample the preprocessed training data (seeded for reproducibility).
smote = SMOTE(random_state=SEED)
X_train_smote, y_train_smote = smote.fit_resample(X_train_processed, y_train)

# Class distribution after resampling.
n_class0_after = (y_train_smote == 0).sum()
n_class1_after = (y_train_smote == 1).sum()
print("\n[bold]Apr√®s SMOTE:[/bold]")
print(f"  - Classe 0: {n_class0_after} samples")
print(f"  - Classe 1: {n_class1_after} samples")
print(f"  - Ratio: {n_class0_after / n_class1_after:.2f}:1")

print("\n[bold green]‚úÖ SMOTE appliqu√© avec succ√®s[/bold green]")
print(f"  X_train_smote shape: {X_train_smote.shape}")
print(f"  y_train_smote shape: {y_train_smote.shape}")

## 3. XGBoost Fine-tuning with GridSearchCV

In [14]:
# Positive-class weight handed to XGBoost. This is a hand-picked constant,
# NOT derived from the class counts of y_train.
scale_pos_weight = 0.7

# --- Fixed (non-searched) XGBoost parameters, shared by every grid candidate.
base_params = {
    "scale_pos_weight": scale_pos_weight,
    "random_state": SEED,
    "eval_metric": "logloss",
    "n_jobs": -1,
}

# --- Reduced parameter grid: 2*2*2*2*1*1 = 16 combinations.
# The previous grid tried 3 values for each of these 6 parameters (3**6 = 729).
param_grid = {
    "n_estimators": [250, 350],           # 2 values (previously 3)
    "max_depth": [5, 6],                  # 2 values (previously 3)
    "learning_rate": [0.05, 0.1],         # 2 values (previously 3)
    "subsample": [0.8, 0.9],              # 2 values (previously 3)
    "colsample_bytree": [0.8],            # 1 value (previously 3)
    "min_child_weight": [3],              # 1 value (previously 3) - benchmark value
}

# Size of the search: number of candidates and number of model fits under CV.
total_combos = int(np.prod([len(v) for v in param_grid.values()]))
total_cv_fits = total_combos * CV_STRATEGY.get_n_splits()

# Reduction relative to the previous full grid, computed instead of hard-coded
# so the message stays correct if the grid is edited.
previous_total_combos = 3 ** len(param_grid)
reduction_pct = 100 * (1 - total_combos / previous_total_combos)

print("[bold cyan]üîç Grille de param√®tres R√âDUITE pour GridSearchCV:[/bold cyan]")
for param, values in param_grid.items():
    print(f"   - {param}: {values}")
print(f"\n   ‚úÖ Total combinations: {total_combos}")
print(f"   ‚úÖ Total CV fits (avec CV={CV_STRATEGY.get_n_splits()}): {total_cv_fits}")
print(f"   ‚úÖ ~{reduction_pct:.0f}% moins de combinaisons qu'avant!")

In [None]:
# Fine-tuning run: grid search over XGBoost hyper-parameters, evaluation on the
# hold-out split, and logging of parameters, metrics, model and figures to MLflow.

# Make sure the figure directory exists before any plt.savefig call below.
os.makedirs(FIG_DIR, exist_ok=True)

with mlflow.start_run(
    run_name="XGBoost_GridSearchCV_SMOTE_Optimized_scale_pos_weight_0.7",
    description="Fine-tuning XGBoost avec grille r√©duite + SMOTE (16 combos) avec scale_pos_weight=0.7",
):

    # --- Run inputs and descriptive tags
    mlflow.log_input(train_dataset, context="training")
    mlflow.log_input(test_dataset, context="testing")
    mlflow.set_tag("model_type", "xgboost_finetuned")
    mlflow.set_tag("optimization_method", "gridsearchcv")
    mlflow.set_tag("resampling", "SMOTE")
    mlflow.set_tag("search_strategy", "reduced_grid_option1")

    # --- Log the fixed (non-searched) parameters
    base_params_log = base_params.copy()
    base_params_log["scale_pos_weight"] = round(scale_pos_weight, 2)
    mlflow.log_params({f"base_{k}": str(v) for k, v in base_params_log.items()})

    # --- Log the searched grid
    for param, values in param_grid.items():
        mlflow.log_param(f"grid_{param}", str(values))

    # --- GridSearchCV
    # SMOTE is a pipeline step so that, for every CV split, synthetic samples
    # are generated from the training folds only and the validation fold holds
    # real observations. Resampling the whole training set before the search
    # lets synthetic neighbours of validation points into the training folds
    # and inflates the CV score.
    xgb_base = XGBClassifier(**base_params)
    search_pipeline = ImbPipeline(
        steps=[
            ("smote", SMOTE(random_state=SEED)),
            ("xgb", xgb_base),
        ]
    )
    grid_search = GridSearchCV(
        estimator=search_pipeline,
        # Route every grid entry to the "xgb" step of the pipeline.
        param_grid={f"xgb__{name}": values for name, values in param_grid.items()},
        cv=CV_STRATEGY,
        scoring="f1_weighted",  # weighted F1 as the model-selection metric
        n_jobs=-1,
        verbose=1,
    )

    print("[bold cyan]üöÄ Lancement de GridSearchCV avec SMOTE par fold (grille r√©duite)...[/bold cyan]")
    grid_search.fit(X_train_processed, y_train)

    # --- Retrieve the best model
    # The refit pipeline applies SMOTE (same seed) to the full training split
    # and then fits XGBoost; the fitted XGBoost step is the model that is
    # evaluated and logged. Parameter names are reported without the step prefix.
    best_model = grid_search.best_estimator_.named_steps["xgb"]
    best_params = {
        name.split("__", 1)[1]: value
        for name, value in grid_search.best_params_.items()
    }
    best_cv_score = grid_search.best_score_

    print(f"[bold green]‚úÖ GridSearchCV termin√©[/bold green]")
    print(f"   - Best CV Score (F1_weighted, SMOTE par fold): {best_cv_score:.4f}")
    print(f"[bold yellow]üèÜ Best Parameters found:[/bold yellow]")
    for param, value in best_params.items():
        print(f"   - {param}: {value}")

    # --- Log the selected parameters and the CV score
    mlflow.log_params({f"best_{k}": str(v) for k, v in best_params.items()})
    mlflow.log_metric("best_cv_score_f1_weighted_smote", best_cv_score)

    # --- Final predictions
    # IMPORTANT: predictions are made on the original (non-resampled) data.
    y_pred_train = best_model.predict(X_train_processed)
    y_proba_train = best_model.predict_proba(X_train_processed)[:, 1]

    y_pred = best_model.predict(X_test_processed)
    y_proba = best_model.predict_proba(X_test_processed)[:, 1]

    # --- Train/test metrics, confusion matrix and FP/FN rates
    # (computed by the project helper imported from utils)
    final_metrics_test, cm, (fp_rate, fn_rate) = compute_train_test_metrics(
        y_train, y_pred_train, y_proba_train,
        y_test, y_pred, y_proba
    )

    # --- Log metrics
    mlflow.log_metrics(final_metrics_test)

    # --- Log the model
    mlflow.xgboost.log_model(
        xgb_model=best_model,
        name="xgboost_finetuned_smote_optimized_model",
        input_example=X_test_processed.iloc[:5],
    )

    # --- Confusion matrix
    plt.figure()
    sns.heatmap(cm, annot=True, fmt="d", cmap="mako", cbar=False)
    plt.title("Confusion Matrix - XGBoost Finetuned (SMOTE + Optimized)")
    plt.xlabel("Pr√©dictions")
    plt.ylabel("V√©rit√©s terrain")
    plt.tight_layout()
    cm_path = f"{FIG_DIR}/xgb_finetuned_smote_optimized_confusion_matrix.png"
    plt.savefig(cm_path)
    mlflow.log_artifact(cm_path)
    plt.close()

    # --- ROC curve
    RocCurveDisplay.from_estimator(best_model, X_test_processed, y_test)
    plt.title("ROC Curve - XGBoost Finetuned (SMOTE + Optimized)")
    plt.tight_layout()
    roc_path = f"{FIG_DIR}/xgb_finetuned_smote_optimized_roc_curve.png"
    plt.savefig(roc_path)
    mlflow.log_artifact(roc_path)
    plt.close()

    # --- Precision-Recall curve
    PrecisionRecallDisplay.from_estimator(best_model, X_test_processed, y_test)
    plt.title("Precision-Recall Curve - XGBoost Finetuned (SMOTE + Optimized)")
    plt.tight_layout()
    pr_path = f"{FIG_DIR}/xgb_finetuned_smote_optimized_pr_curve.png"
    plt.savefig(pr_path)
    mlflow.log_artifact(pr_path)
    plt.close()

    # --- Feature importances
    # The model was trained on the preprocessed frame, whose column order
    # (numeric columns first, then binary ones) differs from X.columns, so the
    # importances must be labelled with the processed column names.
    importances = pd.Series(
        best_model.feature_importances_,
        index=X_train_processed.columns,
    ).sort_values(ascending=False)
    plt.figure()
    top_importances = importances.head(10)
    imp_df = pd.DataFrame({'importance': top_importances.values, 'feature': top_importances.index})
    sns.barplot(data=imp_df, x='importance', y='feature', palette="rocket", hue='feature', legend=False)
    plt.title("Top 10 Feature Importances - XGBoost Finetuned (SMOTE + Optimized)")
    plt.tight_layout()
    feat_imp_path = f"{FIG_DIR}/xgb_finetuned_smote_optimized_feature_importances.png"
    plt.savefig(feat_imp_path)
    mlflow.log_artifact(feat_imp_path)
    plt.close()

    print("[bold cyan]‚úÖ XGBoost Finetuned (SMOTE + Optimized) logu√© dans MLflow avec succ√®s[/bold cyan]")

    # ========== "SHAP" analysis - logged in the same run ==========
    # NOTE(review): despite the section and artifact names, no SHAP values are
    # computed here. Every plot below is built from the model's native
    # feature_importances_ (global importances), including the
    # "first prediction" figure, which is therefore not a per-sample explanation.
    print("\n[bold magenta]üîç SHAP Analysis - Model Interpretability[/bold magenta]")

    # Approximation based on the model's native feature importance.
    print("\n[cyan]Computing feature importance (Fast method)...[/cyan]")

    importances_shap = pd.Series(
        best_model.feature_importances_,
        index=X_test_processed.columns
    ).sort_values(ascending=False)

    # 1) Summary bar plot - top features by importance
    print("\n[cyan]1. Summary Bar Plot - Top 10 Features[/cyan]")
    plt.figure()
    top_n = 10
    importances_shap.head(top_n).plot(kind='barh', color='steelblue')
    plt.xlabel('Feature Importance')
    plt.ylabel('Features')
    plt.title("Feature Importance - XGBoost Finetuned (SMOTE + Optimized)")
    plt.gca().invert_yaxis()  # most important feature at the top
    plt.tight_layout()
    shap_bar_path = f"{FIG_DIR}/xgb_finetuned_smote_optimized_shap_bar_plot.png"
    plt.savefig(shap_bar_path)
    plt.close()
    print(f"   ‚úÖ Saved: {shap_bar_path}")
    mlflow.log_artifact(shap_bar_path)

    # 2) Importance of every feature
    print("\n[cyan]2. Feature Importance Distribution[/cyan]")
    plt.figure()
    importances_shap.plot(kind='bar', color='coral', alpha=0.7)
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.title("All Features Importance Distribution")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    shap_dot_path = f"{FIG_DIR}/xgb_finetuned_smote_optimized_shap_dot_plot.png"
    plt.savefig(shap_dot_path)
    plt.close()
    print(f"   ‚úÖ Saved: {shap_dot_path}")
    mlflow.log_artifact(shap_dot_path)

    # 3) First test sample: its prediction, probability and true label are
    #    shown in the title; the bars are the global top-10 importances.
    print("\n[cyan]3. Prediction Explanation (First Test Sample)[/cyan]")
    first_pred = y_pred[0]
    first_proba = y_proba[0]
    actual_label = y_test.iloc[0]

    fig, ax = plt.subplots()
    top_features = importances_shap.head(10).index
    top_importance = importances_shap.head(10).values
    colors = ['green' if x > 0 else 'red' for x in top_importance]

    ax.barh(range(len(top_features)), top_importance, color=colors, alpha=0.7)
    ax.set_yticks(range(len(top_features)))
    ax.set_yticklabels(top_features)
    ax.set_xlabel('Feature Importance', fontsize=12)
    ax.set_title(f"Prediction {0}: {first_pred} (Confidence: {first_proba:.2%}) | Actual: {actual_label}", 
                 fontsize=14, fontweight='bold')
    ax.invert_yaxis()
    plt.tight_layout()
    shap_force_path = f"{FIG_DIR}/xgb_finetuned_smote_optimized_shap_force_first_prediction.png"
    plt.savefig(shap_force_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"   ‚úÖ Saved: {shap_force_path}")
    mlflow.log_artifact(shap_force_path)

    # 4) Textual summary of the five most important features
    print("\n[cyan]4. Feature Importance Summary[/cyan]")
    print("\nTop 5 Most Important Features:")
    for idx, (feature, importance) in enumerate(importances_shap.head(5).items(), 1):
        print(f"   {idx}. {feature:30s}: {importance:.6f}")

    print("\n[green]‚úÖ SHAP artifacts logged to MLflow in same run![/green]")
    print("\n[bold green]‚ú® SHAP Analysis Complete! (‚ö° Fast version)[/bold green]")

# Local summary of the metrics returned by compute_train_test_metrics
print(
    f"""
üìä R√©sum√© des m√©triques - XGBoost Finetuned (SMOTE + Optimized) :
  Train Accuracy: {final_metrics_test['train_accuracy']:.3f}
  Test Accuracy:  {final_metrics_test['final_test_accuracy']:.3f}
  Overfitting Gap (Accuracy): {final_metrics_test['overfit_gap_accuracy']:.3f}
  ---
  Train F1 classe 0: {final_metrics_test['train_f1_0']:.3f}
  Test F1 classe 0:  {final_metrics_test['final_test_f1_0']:.3f}
  Overfitting Gap (F1_0): {final_metrics_test['overfit_gap_f1_0']:.3f}
  ---
  Accuracy:  {final_metrics_test['final_test_accuracy']:.3f}
  ROC-AUC:   {final_metrics_test['final_test_auc']:.3f}
  Precision classe 0: {final_metrics_test['final_test_precision_0']:.3f}
  Recall classe 0:    {final_metrics_test['final_test_recall_0']:.3f}
  F1-score classe 0:  {final_metrics_test['final_test_f1_0']:.3f}
  Precision classe 1: {final_metrics_test['final_test_precision_1']:.3f}
  Recall classe 1:    {final_metrics_test['final_test_recall_1']:.3f}
  F1-score classe 1:  {final_metrics_test['final_test_f1_1']:.3f}
  FPR:       {fp_rate:.3f}
  FNR:       {fn_rate:.3f}
"""
)



Fitting 5 folds for each of 16 candidates, totalling 80 fits


  self.get_booster().save_model(fname)
  self.get_booster().load_model(fname)
  self.get_booster().load_model(fname)
