## 0. Introduction
Objectif : comparer plusieurs modèles de classification (baseline, LogReg, RF, SVM, XGB).

Traçabilité et reproductibilité via MLflow.


## 1. Import & configuration

In [23]:
import sys, os
import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import (
    RocCurveDisplay,
    PrecisionRecallDisplay,
)
import seaborn as sns
from matplotlib import pyplot as plt
from xgboost import XGBClassifier
import pandas as pd
from rich import print

# Dataset + constantes
sys.path.insert(0, "..")
from config import PROCESSED_DATA_PATH, TARGET_COL, SEED, FIG_DIR, GLOBAL_SCORING, CV_STRATEGY
from utils import compute_final_metrics, compute_overfitting_metrics, compute_train_test_metrics, extract_cv_metrics

# --- Load the processed dataset and separate the features from the target.
df = pd.read_csv(PROCESSED_DATA_PATH)
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# --- Split the float64/int64 features into "binary" and "numeric" groups.
# A column is treated as binary when it holds exactly two distinct values.
# (The former extra test "int64 column whose values are {0, 1}" was already
# implied by nunique() == 2, so it has been dropped without changing results.)
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
binary_cols = [col for col in numeric_cols if X[col].nunique() == 2]
numeric_cols = [col for col in numeric_cols if col not in binary_cols]

NUMERIC_COLS = numeric_cols
BINARY_COLS = binary_cols

print("[bold cyan]üìä Colonnes d√©tect√©es par feature engineering:[/bold cyan]")
print(f"   - Num√©riques: {NUMERIC_COLS}")
print(f"   - Binaires: {BINARY_COLS}")

# --- Stratified 80/20 hold-out split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)

# --- MLflow configuration.
os.environ["MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"] = "false"
mlflow.autolog(disable=True)  # avoid runs being created implicitly by autologging
mlflow.end_run()  # close any run left open by a previous execution
mlflow.set_tracking_uri("file:../mlruns")
mlflow.set_experiment("Rugby Kicks Classification - Benchmark")
# FIX: mlflow.set_tag() outside of a run implicitly starts a new run, which
# then had to be closed by the guard below and left an empty "phantom" run
# holding these tags. Attaching them to the experiment records the same
# metadata without creating any run.
mlflow.set_experiment_tags({"author": "Xavier", "project": "OC P6"})

# --- Register the train/test splits (features + target) as MLflow datasets
# so that each model run can reference them through mlflow.log_input().
X_trainfull = pd.concat([X_train, y_train], axis=1)
X_testfull = pd.concat([X_test, y_test], axis=1)

train_dataset = mlflow.data.from_pandas(X_trainfull, source=PROCESSED_DATA_PATH, name="Training_set")
test_dataset = mlflow.data.from_pandas(X_testfull, source=PROCESSED_DATA_PATH, name="Test_set")

# Defensive guard: make sure no run is active before the model cells start.
if mlflow.active_run():
    print("‚ö†Ô∏è Run actif trouv√© :", mlflow.active_run().info.run_id)
    mlflow.end_run()

print("[bold green]‚úÖ Configuration MLflow pr√™te[/bold green]")

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(


In [24]:
# ============================================================
# üîß PREPROCESSING & FEATURE SCALING (Pipeline)
# ============================================================
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Cr√©er la pipeline de preprocessing
# - StandardScaler pour colonnes num√©riques
# - Passthrough pour colonnes binaires (d√©j√† 0/1)
# Column-wise preprocessing: standardize the numeric columns, forward the
# binary columns untouched, and drop every other column.
preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ("scaler", StandardScaler(), NUMERIC_COLS),
        ("passthrough", "passthrough", BINARY_COLS),
    ],
    remainder="drop",
)

# Output column order follows the transformer order declared above.
feature_names = [*NUMERIC_COLS, *BINARY_COLS]

# The scaler statistics are learned on the training split only; the test
# split is merely transformed. Results are wrapped back into DataFrames that
# keep the original row index.
X_train_processed = pd.DataFrame(
    preprocessing_pipeline.fit_transform(X_train),
    columns=feature_names,
    index=X_train.index,
)
X_test_processed = pd.DataFrame(
    preprocessing_pipeline.transform(X_test),
    columns=feature_names,
    index=X_test.index,
)

# Short report of what was done.
for line in (
    "[bold green]‚úÖ Pipeline de preprocessing cr√©√©e et appliqu√©e[/bold green]",
    f"   - Colonnes num√©riques standardis√©es: {NUMERIC_COLS}",
    f"   - Colonnes binaires conserv√©es: {BINARY_COLS}",
    f"   - X_train shape: {X_train_processed.shape}",
    f"   - X_test shape: {X_test_processed.shape}",
):
    print(line)

## 2. Baseline Dummy Classifier

In [25]:
with mlflow.start_run(
    run_name="Dummy_Baseline",
    description="Baseline avec strat√©gie 'most_frequent'",
):
    # --- Run inputs and model-type tag
    mlflow.log_input(train_dataset, context="training")
    mlflow.log_input(test_dataset, context="testing")
    mlflow.set_tag("model_type", "dummy")

    # --- Hyperparameters: logged first, then used to build the estimator
    model_params = dict(strategy="most_frequent", random_state=SEED)
    mlflow.log_params(model_params)
    model = DummyClassifier(**model_params)

    # --- Cross-validation on the preprocessed training split
    cv_results = cross_validate(
        estimator=model,
        X=X_train_processed,
        y=y_train,
        scoring=GLOBAL_SCORING,
        cv=CV_STRATEGY,
        return_train_score=True,
    )
    cv_mean, cv_std, train_mean = extract_cv_metrics(cv_results)

    # --- Final fit on the whole training split
    model.fit(X_train_processed, y_train)

    # --- Hard predictions and probability of class 1, for train and test
    y_pred_train = model.predict(X_train_processed)
    y_proba_train = np.asarray(model.predict_proba(X_train_processed))[:, 1]

    y_pred = model.predict(X_test_processed)
    y_proba = np.asarray(model.predict_proba(X_test_processed))[:, 1]

    # --- Train/test metrics, confusion matrix and FP/FN rates
    final_metrics_test, cm, (fp_rate, fn_rate) = compute_train_test_metrics(
        y_train, y_pred_train, y_proba_train,
        y_test, y_pred, y_proba,
    )

    # --- Log every metric group, in the same order as before
    for metric_group in (cv_mean, cv_std, train_mean, final_metrics_test):
        mlflow.log_metrics(metric_group)

    mlflow.sklearn.log_model(model, name="dummy_model", input_example=X_test_processed.iloc[:5])

    # --- Figure paths
    cm_path = f"{FIG_DIR}/dummy_confusion_matrix.png"
    roc_path = f"{FIG_DIR}/dummy_roc_curve.png"
    pr_path = f"{FIG_DIR}/dummy_pr_curve.png"

    # --- Confusion matrix
    plt.figure(figsize=(4, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False)
    plt.title("Confusion Matrix - Dummy Baseline")
    plt.xlabel("Pr√©dictions")
    plt.ylabel("V√©rit√©s terrain")
    plt.tight_layout()
    plt.savefig(cm_path)
    mlflow.log_artifact(cm_path)
    plt.close()

    # --- ROC curve, then Precision-Recall curve (same recipe for both)
    for display_cls, curve_title, curve_path in (
        (RocCurveDisplay, "ROC Curve - Dummy Baseline", roc_path),
        (PrecisionRecallDisplay, "Precision-Recall Curve - Dummy Baseline", pr_path),
    ):
        display_cls.from_estimator(model, X_test_processed, y_test)
        plt.title(curve_title)
        plt.tight_layout()
        plt.savefig(curve_path)
        mlflow.log_artifact(curve_path)
        plt.close()

    print("[bold cyan]‚úÖ DummyClassifier logu√© dans MLflow avec succ√®s[/bold cyan]")

# Local summary of the metrics computed above
metrics = final_metrics_test
summary = f"""
üìä R√©sum√© des m√©triques - Dummy Baseline :
  Train Accuracy: {metrics['train_accuracy']:.3f}
  Test Accuracy:  {metrics['final_test_accuracy']:.3f}
  Overfitting Gap (Accuracy): {metrics['overfit_gap_accuracy']:.3f}
  ---
  Train F1 classe 0: {metrics['train_f1_0']:.3f}
  Test F1 classe 0:  {metrics['final_test_f1_0']:.3f}
  Overfitting Gap (F1_0): {metrics['overfit_gap_f1_0']:.3f}
  ---
  Accuracy:  {metrics['final_test_accuracy']:.3f}
  ROC-AUC:   {metrics['final_test_auc']:.3f}
  Precision classe 0: {metrics['final_test_precision_0']:.3f}
  Recall classe 0:    {metrics['final_test_recall_0']:.3f}
  F1-score classe 0:  {metrics['final_test_f1_0']:.3f}
  Precision classe 1: {metrics['final_test_precision_1']:.3f}
  Recall classe 1:    {metrics['final_test_recall_1']:.3f}
  F1-score classe 1:  {metrics['final_test_f1_1']:.3f}
  FPR:       {fp_rate:.3f}
  FNR:       {fn_rate:.3f}
"""
print(summary)



## 3. Logistic Regression Classifier

In [26]:
import mlflow.sklearn as mlflow_sklearn

with mlflow.start_run(
    run_name="LogisticRegression_v1",
    description="R√©gression logistique binaire ‚Äì baseline explicative",
):

    # --- Attach the datasets and the model-type tag to this run
    mlflow.log_input(train_dataset, context="training")
    mlflow.log_input(test_dataset, context="testing")
    mlflow.set_tag("model_type", "logistic_regression")

    # --- Hyperparameters (defined before the model is built)
    model_params = {
        "solver": "liblinear",
        "penalty": "l2",
        "C": 1.0,
        "random_state": SEED,
        "class_weight": "balanced",
    }

    # --- Log the parameters first
    mlflow.log_params(model_params)

    # --- Build the model from the logged parameters
    model = LogisticRegression(**model_params)

    # --- Cross-validation on the preprocessed training split
    cv_results = cross_validate(
        model,
        X_train_processed,
        y_train,
        cv=CV_STRATEGY,
        scoring=GLOBAL_SCORING,
        return_train_score=True,
    )

    cv_mean, cv_std, train_mean = extract_cv_metrics(cv_results)

    # --- Final fit on the whole training split
    model.fit(X_train_processed, y_train)

    # --- Hard predictions and probability of class 1, for train and test
    y_pred_train = model.predict(X_train_processed)
    y_proba_train = model.predict_proba(X_train_processed)[:, 1]

    y_pred = model.predict(X_test_processed)
    y_proba = model.predict_proba(X_test_processed)[:, 1]

    # --- Train/test metrics, confusion matrix and FP/FN rates
    final_metrics_test, cm, (fp_rate, fn_rate) = compute_train_test_metrics(
        y_train, y_pred_train, y_proba_train,
        y_test, y_pred, y_proba
    )

    # --- Log metrics
    mlflow.log_metrics(cv_mean)
    mlflow.log_metrics(cv_std)
    mlflow.log_metrics(train_mean)
    mlflow.log_metrics(final_metrics_test)

    # --- Log the model.
    # FIX: use `name=` like every other model cell of this notebook instead of
    # the deprecated `artifact_path=` keyword.
    mlflow.sklearn.log_model(
        sk_model=model,
        name="logreg_model",
        input_example=X_test_processed.iloc[:5],
    )

    # --- Confusion matrix
    plt.figure(figsize=(4, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Greens", cbar=False)
    plt.title("Confusion Matrix - Logistic Regression")
    plt.xlabel("Pr√©dictions")
    plt.ylabel("V√©rit√©s terrain")
    plt.tight_layout()
    cm_path = f"{FIG_DIR}/logreg_confusion_matrix.png"
    plt.savefig(cm_path)
    mlflow.log_artifact(cm_path)
    plt.close()

    # --- ROC curve
    RocCurveDisplay.from_estimator(model, X_test_processed, y_test)
    plt.title("ROC Curve - Logistic Regression")
    plt.tight_layout()
    roc_path = f"{FIG_DIR}/logreg_roc_curve.png"
    plt.savefig(roc_path)
    mlflow.log_artifact(roc_path)
    plt.close()

    # --- Precision-Recall curve
    PrecisionRecallDisplay.from_estimator(model, X_test_processed, y_test)
    plt.title("Precision-Recall Curve - Logistic Regression")
    plt.tight_layout()
    pr_path = f"{FIG_DIR}/logreg_pr_curve.png"
    plt.savefig(pr_path)
    mlflow.log_artifact(pr_path)
    plt.close()

    print("[bold cyan]‚úÖ LogisticRegression logu√© dans MLflow avec succ√®s[/bold cyan]")

# Local summary of the metrics computed above
print(
    f"""
üìä R√©sum√© des m√©triques - LogisticRegression :
  Train Accuracy: {final_metrics_test['train_accuracy']:.3f}
  Test Accuracy:  {final_metrics_test['final_test_accuracy']:.3f}
  Overfitting Gap (Accuracy): {final_metrics_test['overfit_gap_accuracy']:.3f}
  ---
  Train F1 classe 0: {final_metrics_test['train_f1_0']:.3f}
  Test F1 classe 0:  {final_metrics_test['final_test_f1_0']:.3f}
  Overfitting Gap (F1_0): {final_metrics_test['overfit_gap_f1_0']:.3f}
  ---
  Accuracy:  {final_metrics_test['final_test_accuracy']:.3f}
  ROC-AUC:   {final_metrics_test['final_test_auc']:.3f}
  Precision classe 0: {final_metrics_test['final_test_precision_0']:.3f}
  Recall classe 0:    {final_metrics_test['final_test_recall_0']:.3f}
  F1-score classe 0:  {final_metrics_test['final_test_f1_0']:.3f}
  Precision classe 1: {final_metrics_test['final_test_precision_1']:.3f}
  Recall classe 1:    {final_metrics_test['final_test_recall_1']:.3f}
  F1-score classe 1:  {final_metrics_test['final_test_f1_1']:.3f}
  FPR:       {fp_rate:.3f}
  FNR:       {fn_rate:.3f}
"""
)



## 4. Random Forest Classifier

In [27]:
with mlflow.start_run(
    run_name="RandomForest_v1",
    description="Random Forest Classifier - mod√®le non lin√©aire",
):

    # --- Attach the datasets and the model-type tag to this run
    mlflow.log_input(train_dataset, context="training")
    mlflow.log_input(test_dataset, context="testing")
    mlflow.set_tag("model_type", "random_forest")

    # --- Hyperparameters (defined before the model is built)
    model_params = {
        "n_estimators": 200,
        "max_depth": None,
        "min_samples_split": 2,
        "min_samples_leaf": 1,
        "class_weight": "balanced",
        "random_state": SEED,
        "n_jobs": -1,
    }

    # --- Log the parameters first (stringified so that None is logged too)
    mlflow_params = {k: str(v) for k, v in model_params.items()}
    mlflow.log_params(mlflow_params)

    # --- Build the model from the parameters
    model = RandomForestClassifier(**model_params)

    # --- Cross-validation on the preprocessed training split
    cv_results = cross_validate(
        model,
        X_train_processed,
        y_train,
        cv=CV_STRATEGY,
        scoring=GLOBAL_SCORING,
        return_train_score=True,
    )

    cv_mean, cv_std, train_mean = extract_cv_metrics(cv_results)

    # --- Final fit on the whole training split
    model.fit(X_train_processed, y_train)

    # --- Hard predictions and probability of class 1, for train and test
    y_pred_train = model.predict(X_train_processed)
    y_proba_train = model.predict_proba(X_train_processed)[:, 1]

    y_pred = model.predict(X_test_processed)
    y_proba = model.predict_proba(X_test_processed)[:, 1]

    # --- Train/test metrics, confusion matrix and FP/FN rates
    final_metrics_test, cm, (fp_rate, fn_rate) = compute_train_test_metrics(
        y_train, y_pred_train, y_proba_train,
        y_test, y_pred, y_proba
    )

    # --- Log metrics
    mlflow.log_metrics(cv_mean)
    mlflow.log_metrics(cv_std)
    mlflow.log_metrics(train_mean)
    mlflow.log_metrics(final_metrics_test)

    # --- Log the model
    mlflow.sklearn.log_model(
        sk_model=model,
        name="random_forest_model",
        input_example=X_test_processed.iloc[:5],
    )

    # --- Confusion matrix
    plt.figure(figsize=(4, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Purples", cbar=False)
    plt.title("Confusion Matrix - Random Forest")
    plt.xlabel("Pr√©dictions")
    plt.ylabel("V√©rit√©s terrain")
    plt.tight_layout()
    cm_path = f"{FIG_DIR}/rf_confusion_matrix.png"
    plt.savefig(cm_path)
    mlflow.log_artifact(cm_path)
    plt.close()

    # --- ROC curve
    RocCurveDisplay.from_estimator(model, X_test_processed, y_test)
    plt.title("ROC Curve - Random Forest")
    plt.tight_layout()
    roc_path = f"{FIG_DIR}/rf_roc_curve.png"
    plt.savefig(roc_path)
    mlflow.log_artifact(roc_path)
    plt.close()

    # --- Precision-Recall curve
    PrecisionRecallDisplay.from_estimator(model, X_test_processed, y_test)
    plt.title("Precision-Recall Curve - Random Forest")
    plt.tight_layout()
    pr_path = f"{FIG_DIR}/rf_pr_curve.png"
    plt.savefig(pr_path)
    mlflow.log_artifact(pr_path)
    plt.close()

    # --- Feature importances.
    # FIX: the model was fit on X_train_processed, so feature_importances_ is
    # aligned with *its* columns. Labelling with X.columns (raw column order,
    # possibly including columns the model never saw) attached importances to
    # the wrong feature names or failed on a length mismatch.
    importances = pd.Series(
        model.feature_importances_, index=X_train_processed.columns
    ).sort_values(ascending=False)
    top_importances = importances.head(10)
    plt.figure(figsize=(8, 5))
    imp_df = pd.DataFrame({'importance': top_importances.values, 'feature': top_importances.index})
    sns.barplot(data=imp_df, x='importance', y='feature', palette="viridis", hue='feature', legend=False)
    plt.title("Top 10 Feature Importances - Random Forest")
    plt.tight_layout()
    feat_imp_path = f"{FIG_DIR}/rf_feature_importances.png"
    plt.savefig(feat_imp_path)
    mlflow.log_artifact(feat_imp_path)
    plt.close()

    print("[bold cyan]‚úÖ RandomForest logu√© dans MLflow avec succ√®s[/bold cyan]")

# Local summary of the metrics computed above
print(
    f"""
üìä R√©sum√© des m√©triques - RandomForest :
  Train Accuracy: {final_metrics_test['train_accuracy']:.3f}
  Test Accuracy:  {final_metrics_test['final_test_accuracy']:.3f}
  Overfitting Gap (Accuracy): {final_metrics_test['overfit_gap_accuracy']:.3f}
  ---
  Train F1 classe 0: {final_metrics_test['train_f1_0']:.3f}
  Test F1 classe 0:  {final_metrics_test['final_test_f1_0']:.3f}
  Overfitting Gap (F1_0): {final_metrics_test['overfit_gap_f1_0']:.3f}
  ---
  Accuracy:  {final_metrics_test['final_test_accuracy']:.3f}
  ROC-AUC:   {final_metrics_test['final_test_auc']:.3f}
  Precision classe 0: {final_metrics_test['final_test_precision_0']:.3f}
  Recall classe 0:    {final_metrics_test['final_test_recall_0']:.3f}
  F1-score classe 0:  {final_metrics_test['final_test_f1_0']:.3f}
  Precision classe 1: {final_metrics_test['final_test_precision_1']:.3f}
  Recall classe 1:    {final_metrics_test['final_test_recall_1']:.3f}
  F1-score classe 1:  {final_metrics_test['final_test_f1_1']:.3f}
  FPR:       {fp_rate:.3f}
  FNR:       {fn_rate:.3f}
"""
)



## 5. Support Vector Machine (SVM) Classifier

In [28]:
from sklearn.svm import SVC

# Kernel actually used by the model below. It is defined up front so that the
# run description logged to MLflow can never disagree with the hyperparameter.
# FIX: the description used to announce an RBF kernel although the model was
# trained with kernel="linear".
svm_kernel = "linear"

with mlflow.start_run(
    run_name="SVM_v1",
    description=f"Support Vector Machine Classifier - noyau {svm_kernel} avec class_weight",
):

    # --- Attach the datasets and the model-type tag to this run
    mlflow.log_input(train_dataset, context="training")
    mlflow.log_input(test_dataset, context="testing")
    mlflow.set_tag("model_type", "svm")

    # --- Hyperparameters (defined before the model is built)
    # Class weights: class 0 is weighted by the ratio (#class 1 / #class 0),
    # class 1 keeps a weight of 1.0.
    # NOTE(review): this up-weights class 0 only if it is the minority class,
    # as the original comment assumed - confirm against the data.
    neg = (y_train == 0).sum()  # number of class-0 samples
    pos = (y_train == 1).sum()  # number of class-1 samples
    class_weights = {0: pos / neg, 1: 1.0}

    model_params = {
        "kernel": svm_kernel,  # linear kernel, chosen for faster training
        "C": 0.5,  # reduced from 1.0 for better generalization
        "gamma": "scale",  # has no effect with a linear kernel; kept so the logged params stay unchanged
        "class_weight": class_weights,
        "probability": True,
        "random_state": SEED,
        "max_iter": 5000,  # raised from 2000 to help convergence
    }

    # --- Log the parameters first (the class_weight dict is logged as a string)
    mlflow_params = model_params.copy()
    mlflow_params["class_weight"] = str(class_weights)
    mlflow.log_params(mlflow_params)

    # --- Build the model from the parameters
    model = SVC(**model_params)

    # --- Cross-validation on the preprocessed training split
    cv_results = cross_validate(
        model,
        X_train_processed,
        y_train,
        cv=CV_STRATEGY,
        scoring=GLOBAL_SCORING,
        return_train_score=True,
    )

    cv_mean, cv_std, train_mean = extract_cv_metrics(cv_results)

    # --- Final fit on the whole training split
    model.fit(X_train_processed, y_train)

    # --- Hard predictions and probability of class 1, for train and test
    y_pred_train = model.predict(X_train_processed)
    y_proba_train = model.predict_proba(X_train_processed)[:, 1]

    y_pred = model.predict(X_test_processed)
    y_proba = model.predict_proba(X_test_processed)[:, 1]

    # --- Train/test metrics, confusion matrix and FP/FN rates
    final_metrics_test, cm, (fp_rate, fn_rate) = compute_train_test_metrics(
        y_train, y_pred_train, y_proba_train,
        y_test, y_pred, y_proba
    )

    # --- Log metrics
    mlflow.log_metrics(cv_mean)
    mlflow.log_metrics(cv_std)
    mlflow.log_metrics(train_mean)
    mlflow.log_metrics(final_metrics_test)

    # --- Log the model
    mlflow.sklearn.log_model(
        sk_model=model,
        name="svm_model",
        input_example=X_test_processed.iloc[:5],
    )

    # --- Confusion matrix
    plt.figure(figsize=(4, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Oranges", cbar=False)
    plt.title("Confusion Matrix - SVM")
    plt.xlabel("Pr√©dictions")
    plt.ylabel("V√©rit√©s terrain")
    plt.tight_layout()
    cm_path = f"{FIG_DIR}/svm_confusion_matrix.png"
    plt.savefig(cm_path)
    mlflow.log_artifact(cm_path)
    plt.close()

    # --- ROC curve
    RocCurveDisplay.from_estimator(model, X_test_processed, y_test)
    plt.title("ROC Curve - SVM")
    plt.tight_layout()
    roc_path = f"{FIG_DIR}/svm_roc_curve.png"
    plt.savefig(roc_path)
    mlflow.log_artifact(roc_path)
    plt.close()

    # --- Precision-Recall curve
    PrecisionRecallDisplay.from_estimator(model, X_test_processed, y_test)
    plt.title("Precision-Recall Curve - SVM")
    plt.tight_layout()
    pr_path = f"{FIG_DIR}/svm_pr_curve.png"
    plt.savefig(pr_path)
    mlflow.log_artifact(pr_path)
    plt.close()

    print("[bold cyan]‚úÖ SVM logu√© dans MLflow avec succ√®s[/bold cyan]")

# Local summary of the metrics computed above
print(
    f"""
üìä R√©sum√© des m√©triques - SVM :
  Train Accuracy: {final_metrics_test['train_accuracy']:.3f}
  Test Accuracy:  {final_metrics_test['final_test_accuracy']:.3f}
  Overfitting Gap (Accuracy): {final_metrics_test['overfit_gap_accuracy']:.3f}
  ---
  Train F1 classe 0: {final_metrics_test['train_f1_0']:.3f}
  Test F1 classe 0:  {final_metrics_test['final_test_f1_0']:.3f}
  Overfitting Gap (F1_0): {final_metrics_test['overfit_gap_f1_0']:.3f}
  ---
  Accuracy:  {final_metrics_test['final_test_accuracy']:.3f}
  ROC-AUC:   {final_metrics_test['final_test_auc']:.3f}
  Precision classe 0: {final_metrics_test['final_test_precision_0']:.3f}
  Recall classe 0:    {final_metrics_test['final_test_recall_0']:.3f}
  F1-score classe 0:  {final_metrics_test['final_test_f1_0']:.3f}
  Precision classe 1: {final_metrics_test['final_test_precision_1']:.3f}
  Recall classe 1:    {final_metrics_test['final_test_recall_1']:.3f}
  F1-score classe 1:  {final_metrics_test['final_test_f1_1']:.3f}
  FPR:       {fp_rate:.3f}
  FNR:       {fn_rate:.3f}
"""
)



## 6. XGBoost Classifier

In [29]:
# Class balance: scale_pos_weight is the ratio (#class 0 / #class 1) on the
# training split. The unpacking below requires y_train to contain exactly the
# two integer labels 0 and 1.
neg, pos = np.bincount(y_train)
scale_pos_weight = neg / pos

with mlflow.start_run(
    run_name="XGBoost_v1",
    description="XGBoost Classifier - mod√®le non lin√©aire avec scale_pos_weight",
):

    # --- Attach the datasets and the model-type tag to this run
    mlflow.log_input(train_dataset, context="training")
    mlflow.log_input(test_dataset, context="testing")
    mlflow.set_tag("model_type", "xgboost")

    # --- Hyperparameters (defined before the model is built)
    model_params = {
        "n_estimators": 300,
        "max_depth": 5,
        "min_child_weight": 3,
        "max_delta_step": 1,
        "learning_rate": 0.05,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "scale_pos_weight": scale_pos_weight,
        "random_state": SEED,
        "eval_metric": "logloss",
        "n_jobs": -1,
    }

    # --- Log the parameters first (scale_pos_weight is rounded for readability;
    # the model itself receives the unrounded value)
    mlflow_params = model_params.copy()
    mlflow_params["scale_pos_weight"] = round(scale_pos_weight, 2)
    mlflow.log_params(mlflow_params)

    # --- Build the model from the parameters
    model = XGBClassifier(**model_params)

    # --- Cross-validation on the preprocessed training split
    cv_results = cross_validate(
        model,
        X_train_processed,
        y_train,
        cv=CV_STRATEGY,
        scoring=GLOBAL_SCORING,
        return_train_score=True,
    )

    cv_mean, cv_std, train_mean = extract_cv_metrics(cv_results)

    # --- Final fit on the whole training split
    model.fit(X_train_processed, y_train)

    # --- Hard predictions and probability of class 1, for train and test
    y_pred_train = model.predict(X_train_processed)
    y_proba_train = model.predict_proba(X_train_processed)[:, 1]

    y_pred = model.predict(X_test_processed)
    y_proba = model.predict_proba(X_test_processed)[:, 1]

    # --- Train/test metrics, confusion matrix and FP/FN rates
    final_metrics_test, cm, (fp_rate, fn_rate) = compute_train_test_metrics(
        y_train, y_pred_train, y_proba_train,
        y_test, y_pred, y_proba
    )

    # --- Log metrics
    mlflow.log_metrics(cv_mean)
    mlflow.log_metrics(cv_std)
    mlflow.log_metrics(train_mean)
    mlflow.log_metrics(final_metrics_test)

    # --- Log the model
    mlflow.xgboost.log_model(
        xgb_model=model,
        name="xgboost_model",
        input_example=X_test_processed.iloc[:5],
    )

    # --- Confusion matrix
    plt.figure(figsize=(4, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="mako", cbar=False)
    plt.title("Confusion Matrix - XGBoost")
    plt.xlabel("Pr√©dictions")
    plt.ylabel("V√©rit√©s terrain")
    plt.tight_layout()
    cm_path = f"{FIG_DIR}/xgb_confusion_matrix.png"
    plt.savefig(cm_path)
    mlflow.log_artifact(cm_path)
    plt.close()

    # --- ROC curve
    RocCurveDisplay.from_estimator(model, X_test_processed, y_test)
    plt.title("ROC Curve - XGBoost")
    plt.tight_layout()
    roc_path = f"{FIG_DIR}/xgb_roc_curve.png"
    plt.savefig(roc_path)
    mlflow.log_artifact(roc_path)
    plt.close()

    # --- Precision-Recall curve
    PrecisionRecallDisplay.from_estimator(model, X_test_processed, y_test)
    plt.title("Precision-Recall Curve - XGBoost")
    plt.tight_layout()
    pr_path = f"{FIG_DIR}/xgb_pr_curve.png"
    plt.savefig(pr_path)
    mlflow.log_artifact(pr_path)
    plt.close()

    # --- Feature importances.
    # FIX: the model was fit on X_train_processed, so feature_importances_ is
    # aligned with *its* columns. Labelling with X.columns (raw column order,
    # possibly including columns the model never saw) attached importances to
    # the wrong feature names or failed on a length mismatch.
    importances = pd.Series(
        model.feature_importances_, index=X_train_processed.columns
    ).sort_values(ascending=False)
    top_importances = importances.head(10)
    plt.figure(figsize=(8, 5))
    imp_df = pd.DataFrame({'importance': top_importances.values, 'feature': top_importances.index})
    sns.barplot(data=imp_df, x='importance', y='feature', palette="rocket", hue='feature', legend=False)
    plt.title("Top 10 Feature Importances - XGBoost")
    plt.tight_layout()
    feat_imp_path = f"{FIG_DIR}/xgb_feature_importances.png"
    plt.savefig(feat_imp_path)
    mlflow.log_artifact(feat_imp_path)
    plt.close()

    print("[bold cyan]‚úÖ XGBoost logu√© dans MLflow avec succ√®s[/bold cyan]")

# Local summary of the metrics computed above
print(
    f"""
üìä R√©sum√© des m√©triques - XGBoost :
  Train Accuracy: {final_metrics_test['train_accuracy']:.3f}
  Test Accuracy:  {final_metrics_test['final_test_accuracy']:.3f}
  Overfitting Gap (Accuracy): {final_metrics_test['overfit_gap_accuracy']:.3f}
  ---
  Train F1 classe 0: {final_metrics_test['train_f1_0']:.3f}
  Test F1 classe 0:  {final_metrics_test['final_test_f1_0']:.3f}
  Overfitting Gap (F1_0): {final_metrics_test['overfit_gap_f1_0']:.3f}
  ---
  Accuracy:  {final_metrics_test['final_test_accuracy']:.3f}
  ROC-AUC:   {final_metrics_test['final_test_auc']:.3f}
  Precision classe 0: {final_metrics_test['final_test_precision_0']:.3f}
  Recall classe 0:    {final_metrics_test['final_test_recall_0']:.3f}
  F1-score classe 0:  {final_metrics_test['final_test_f1_0']:.3f}
  Precision classe 1: {final_metrics_test['final_test_precision_1']:.3f}
  Recall classe 1:    {final_metrics_test['final_test_recall_1']:.3f}
  F1-score classe 1:  {final_metrics_test['final_test_f1_1']:.3f}
  FPR:       {fp_rate:.3f}
  FNR:       {fn_rate:.3f}
"""
)

  self.get_booster().save_model(fname)
  self.get_booster().load_model(fname)
  self.get_booster().load_model(fname)
