## 0. Introduction
Objectif : fine-tuner le modèle XGBoost en explorant différents hyperparamètres.

Traçabilité et reproductibilité via MLflow.

## 1. Import & configuration

In [None]:
import sys, os
import mlflow
import mlflow.xgboost
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    RocCurveDisplay,
    PrecisionRecallDisplay,
)
import seaborn as sns
from matplotlib import pyplot as plt
from xgboost import XGBClassifier
import pandas as pd
from rich import print

# Dataset + constantes
sys.path.insert(0, "..")
from config import PROCESSED_DATA_PATH, TARGET_COL, SEED, FIG_DIR, GLOBAL_SCORING, CV_STRATEGY
from utils import compute_train_test_metrics

# --- Load the processed dataset and separate the features from the target
df = pd.read_csv(PROCESSED_DATA_PATH)
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# --- Identify numeric and binary columns
# A numeric column is treated as binary when it holds exactly two distinct
# non-null values. An int64 column whose values are exactly {0, 1} always
# satisfies that rule, so no separate check is needed.
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
binary_cols = [col for col in numeric_cols if X[col].nunique() == 2]
_binary_lookup = set(binary_cols)
numeric_cols = [col for col in numeric_cols if col not in _binary_lookup]

NUMERIC_COLS = numeric_cols
BINARY_COLS = binary_cols

print("[bold cyan]üìä Colonnes d√©tect√©es par feature engineering:[/bold cyan]")
print(f"   - Num√©riques: {NUMERIC_COLS}")
print(f"   - Binaires: {BINARY_COLS}")

# --- Stratified 80/20 hold-out split, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)

# --- MLflow configuration
os.environ["MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"] = "false"
mlflow.autolog(disable=True)
mlflow.end_run()  # close any run left open by a previous execution of this cell
mlflow.set_tracking_uri("file:../mlruns")
mlflow.set_experiment("Rugby Kicks Classification - XGBoost Finetuning")
# Record author/project metadata on the experiment. The fluent mlflow.set_tag()
# starts a new run when none is active, which would leave an empty stray run
# behind in the experiment.
mlflow.set_experiment_tags({"author": "Xavier", "project": "OC P6"})

# --- Rebuild full (features + target) frames so MLflow can track both splits
X_trainfull = pd.concat([X_train, y_train], axis=1)
X_testfull = pd.concat([X_test, y_test], axis=1)

train_dataset = mlflow.data.from_pandas(X_trainfull, source=PROCESSED_DATA_PATH, name="Training_set")
test_dataset = mlflow.data.from_pandas(X_testfull, source=PROCESSED_DATA_PATH, name="Test_set")

# Safety net: make sure no run is active before the training cell opens its own
_leftover_run = mlflow.active_run()
if _leftover_run:
    print("‚ö†Ô∏è Run actif trouv√© :", _leftover_run.info.run_id)
    mlflow.end_run()

print("[bold green]‚úÖ Configuration MLflow pr√™te[/bold green]")

✅ Config initialisée depuis : /Users/xaviercoulon/Documents/OC/OC_P6_Rugby_MLOps


2025/11/13 15:59:13 INFO mlflow.tracking.fluent: Experiment with name 'Rugby Kicks Classification - XGBoost Finetuning' does not exist. Creating a new experiment.
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(


## 2. Preprocessing & Feature Scaling

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Build the preprocessing step: standardise the numeric columns, pass the
# binary columns through untouched, and drop every other column.
_column_steps = [
    ('scaler', StandardScaler(), NUMERIC_COLS),
    ('passthrough', 'passthrough', BINARY_COLS),
]
preprocessing_pipeline = ColumnTransformer(_column_steps, remainder='drop')

# Output columns follow the transformer order: numeric first, then binary.
feature_names = NUMERIC_COLS + BINARY_COLS


def _as_feature_frame(values, like):
    """Wrap transformed values in a DataFrame that keeps the row index of `like`."""
    return pd.DataFrame(values, columns=feature_names, index=like.index)


# Fit on the training split ONLY so that no test statistics leak into the scaler.
X_train_processed = _as_feature_frame(
    preprocessing_pipeline.fit_transform(X_train), X_train
)
X_test_processed = _as_feature_frame(
    preprocessing_pipeline.transform(X_test), X_test
)

for _line in (
    "[bold green]‚úÖ Pipeline de preprocessing cr√©√©e et appliqu√©e[/bold green]",
    f"   - Colonnes num√©riques standardis√©es: {NUMERIC_COLS}",
    f"   - Colonnes binaires conserv√©es: {BINARY_COLS}",
    f"   - X_train shape: {X_train_processed.shape}",
    f"   - X_test shape: {X_test_processed.shape}",
):
    print(_line)

## 3. XGBoost Fine-tuning with GridSearchCV

In [3]:
# Class-imbalance weight: count of label 0 divided by count of label 1 in the
# training split (the two-value unpacking assumes exactly these two labels).
_class_counts = np.bincount(y_train)
neg, pos = _class_counts
scale_pos_weight = neg / pos

# --- Fixed hyperparameters shared by every candidate of the grid
# NOTE(review): n_jobs=-1 here combined with n_jobs=-1 on GridSearchCV may
# oversubscribe the CPU — confirm this is intended.
base_params = dict(
    scale_pos_weight=scale_pos_weight,
    random_state=SEED,
    eval_metric="logloss",
    n_jobs=-1,
)

# --- Hyperparameter grid to explore exhaustively
param_grid = dict(
    n_estimators=[200, 300, 400],
    max_depth=[4, 5, 6],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.7, 0.8, 0.9],
    colsample_bytree=[0.7, 0.8, 0.9],
    min_child_weight=[2, 3, 5],
)

print("[bold cyan]üîç Grille de param√®tres pour GridSearchCV:[/bold cyan]")
for param in param_grid:
    values = param_grid[param]
    print(f"   - {param}: {values}")
# The grid size is the product of the number of values tried for each parameter.
print(f"\n   Total combinations: {np.prod([len(v) for v in param_grid.values()])} (will use CV={CV_STRATEGY.get_n_splits()})")

In [4]:
# Make sure the figure directory exists before anything is saved into it.
os.makedirs(FIG_DIR, exist_ok=True)


def _save_and_log_current_figure(filename):
    """Save the active matplotlib figure under FIG_DIR, log it to MLflow, close it.

    Must be called while an MLflow run is active. Returns the path the figure
    was written to.
    """
    plt.tight_layout()
    path = f"{FIG_DIR}/{filename}"
    plt.savefig(path)
    mlflow.log_artifact(path)
    plt.close()
    return path


with mlflow.start_run(
    run_name="XGBoost_GridSearchCV",
    description="Fine-tuning XGBoost avec GridSearchCV - exhaustive search",
):

    mlflow.log_input(train_dataset, context="training")
    mlflow.log_input(test_dataset, context="testing")
    mlflow.set_tag("model_type", "xgboost_finetuned")
    mlflow.set_tag("optimization_method", "gridsearchcv")

    # --- Log the fixed base parameters (class weight rounded for readability)
    base_params_log = base_params.copy()
    base_params_log["scale_pos_weight"] = round(scale_pos_weight, 2)
    mlflow.log_params({f"base_{k}": str(v) for k, v in base_params_log.items()})

    # --- Log the search grid, one parameter per entry
    for param, values in param_grid.items():
        mlflow.log_param(f"grid_{param}", str(values))

    # --- Exhaustive grid search
    # NOTE(review): both the estimator (base_params) and GridSearchCV request
    # n_jobs=-1, which may oversubscribe the CPU — confirm this is intended.
    xgb_base = XGBClassifier(**base_params)
    grid_search = GridSearchCV(
        estimator=xgb_base,
        param_grid=param_grid,
        cv=CV_STRATEGY,
        scoring="f1_weighted",  # weighted F1 to account for class imbalance
        n_jobs=-1,
        verbose=1
    )

    print("[bold cyan]üîÑ Lancement de GridSearchCV... (cela peut prendre du temps)[/bold cyan]")
    grid_search.fit(X_train_processed, y_train)

    # --- Retrieve the best candidate (refitted on the whole training split)
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_cv_score = grid_search.best_score_

    print("[bold green]‚úÖ GridSearchCV termin√©[/bold green]")
    print(f"   - Best CV Score (F1): {best_cv_score:.4f}")
    print("[bold yellow]üèÜ Best Parameters found:[/bold yellow]")
    for param, value in best_params.items():
        print(f"   - {param}: {value}")

    # --- Log the winning parameters and their cross-validated score
    mlflow.log_params({f"best_{k}": str(v) for k, v in best_params.items()})
    mlflow.log_metric("best_cv_score_f1_weighted", best_cv_score)

    # --- Final predictions on both splits
    y_pred_train = best_model.predict(X_train_processed)
    y_proba_train = best_model.predict_proba(X_train_processed)[:, 1]

    y_pred = best_model.predict(X_test_processed)
    y_proba = best_model.predict_proba(X_test_processed)[:, 1]

    # --- Train/test metrics plus overfitting gaps (project helper)
    final_metrics_test, cm, (fp_rate, fn_rate) = compute_train_test_metrics(
        y_train, y_pred_train, y_proba_train,
        y_test, y_pred, y_proba
    )

    # --- Log metrics
    mlflow.log_metrics(final_metrics_test)

    # --- Log the model
    mlflow.xgboost.log_model(
        xgb_model=best_model,
        name="xgboost_finetuned_model",
        input_example=X_test_processed.iloc[:5],
    )

    # --- Confusion matrix
    plt.figure(figsize=(4, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="mako", cbar=False)
    plt.title("Confusion Matrix - XGBoost Finetuned")
    plt.xlabel("Pr√©dictions")
    plt.ylabel("V√©rit√©s terrain")
    cm_path = _save_and_log_current_figure("xgb_finetuned_confusion_matrix.png")

    # --- ROC curve
    RocCurveDisplay.from_estimator(best_model, X_test_processed, y_test)
    plt.title("ROC Curve - XGBoost Finetuned")
    roc_path = _save_and_log_current_figure("xgb_finetuned_roc_curve.png")

    # --- Precision-Recall curve
    PrecisionRecallDisplay.from_estimator(best_model, X_test_processed, y_test)
    plt.title("Precision-Recall Curve - XGBoost Finetuned")
    pr_path = _save_and_log_current_figure("xgb_finetuned_pr_curve.png")

    # --- Feature importances
    # The model was fitted on X_train_processed, whose columns are
    # NUMERIC_COLS + BINARY_COLS. Indexing with the raw X.columns would pair
    # importances with the wrong feature names (different order) or fail on a
    # length mismatch when non-numeric columns were dropped.
    importances = pd.Series(
        best_model.feature_importances_, index=X_train_processed.columns
    ).sort_values(ascending=False)
    plt.figure(figsize=(8, 5))
    imp_df = pd.DataFrame({'importance': importances[:10].values, 'feature': importances[:10].index})
    sns.barplot(data=imp_df, x='importance', y='feature', palette="rocket", hue='feature', legend=False)
    plt.title("Top 10 Feature Importances - XGBoost Finetuned")
    feat_imp_path = _save_and_log_current_figure("xgb_finetuned_feature_importances.png")

    print("[bold cyan]‚úÖ XGBoost Finetuned logu√© dans MLflow avec succ√®s[/bold cyan]")

# Local summary of the logged metrics
print(
    f"""
üìä R√©sum√© des m√©triques - XGBoost Finetuned :
  Train Accuracy: {final_metrics_test['train_accuracy']:.3f}
  Test Accuracy:  {final_metrics_test['final_test_accuracy']:.3f}
  Overfitting Gap (Accuracy): {final_metrics_test['overfit_gap_accuracy']:.3f}
  ---
  Train F1 classe 0: {final_metrics_test['train_f1_0']:.3f}
  Test F1 classe 0:  {final_metrics_test['final_test_f1_0']:.3f}
  Overfitting Gap (F1_0): {final_metrics_test['overfit_gap_f1_0']:.3f}
  ---
  Accuracy:  {final_metrics_test['final_test_accuracy']:.3f}
  ROC-AUC:   {final_metrics_test['final_test_auc']:.3f}
  Precision classe 0: {final_metrics_test['final_test_precision_0']:.3f}
  Recall classe 0:    {final_metrics_test['final_test_recall_0']:.3f}
  F1-score classe 0:  {final_metrics_test['final_test_f1_0']:.3f}
  Precision classe 1: {final_metrics_test['final_test_precision_1']:.3f}
  Recall classe 1:    {final_metrics_test['final_test_recall_1']:.3f}
  F1-score classe 1:  {final_metrics_test['final_test_f1_1']:.3f}
  FPR:       {fp_rate:.3f}
  FNR:       {fn_rate:.3f}
"""
)



Fitting 5 folds for each of 729 candidates, totalling 3645 fits


  self.get_booster().save_model(fname)
  self.get_booster().load_model(fname)
