## 0. Introduction
Objectif: Fine-tuner le mod√®le Logistic Regression en explorant diff√©rents hyperparam√®tres.

Tra√ßabilit√© et reproductibilit√© via MLflow.

## 1. Import & configuration

In [1]:
import sys, os
import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    RocCurveDisplay,
    PrecisionRecallDisplay,
)
from imblearn.over_sampling import SMOTE
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd
from rich import print
import shap

# Dataset + constantes
sys.path.insert(0, "..")
from config import PROCESSED_DATA_PATH, TARGET_COL, SEED, FIG_DIR, CV_STRATEGY
from utils import compute_train_test_metrics

df = pd.read_csv(PROCESSED_DATA_PATH)
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# --- Identifier les colonnes num√©riques et binaires
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
binary_cols = [col for col in numeric_cols if X[col].nunique() == 2 or (X[col].dtype == 'int64' and set(X[col].unique()) == {0, 1})]
numeric_cols = [col for col in numeric_cols if col not in binary_cols]

NUMERIC_COLS = numeric_cols
BINARY_COLS = binary_cols

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)

# Configuration MLflow
mlflow.set_tracking_uri("file:../mlruns")
mlflow.set_experiment("Rugby Kicks - LogisticRegression Finetuning")
mlflow.set_experiment_tag("author", "Xavier")

# Fin de tout run actif avant de commencer
if mlflow.active_run():
    mlflow.end_run()

# Pr√©parer les datasets
X_trainfull = pd.concat([X_train, y_train], axis=1)
X_testfull = pd.concat([X_test, y_test], axis=1)

train_dataset = mlflow.data.from_pandas(X_trainfull, source=PROCESSED_DATA_PATH, name="Training_set")
test_dataset = mlflow.data.from_pandas(X_testfull, source=PROCESSED_DATA_PATH, name="Test_set")

print("[bold green]‚úÖ Configuration pr√™te[/bold green]")

‚úÖ Config initialis√©e depuis : /Users/xaviercoulon/Documents/OC/OC_P6_Rugby_MLOps


  from .autonotebook import tqdm as notebook_tqdm
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(


## 2. Preprocessing & Feature Scaling

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Cr√©er la pipeline de preprocessing
preprocessing_pipeline = ColumnTransformer([
    ('scaler', StandardScaler(), NUMERIC_COLS),
    ('passthrough', 'passthrough', BINARY_COLS)
], remainder='drop')

# FIT sur X_train UNIQUEMENT (prevent data leakage!)
X_train_processed = preprocessing_pipeline.fit_transform(X_train)
X_test_processed = preprocessing_pipeline.transform(X_test)

# Convertir en DataFrame pour maintenabilit√©
feature_names = NUMERIC_COLS + BINARY_COLS
X_train_processed = pd.DataFrame(X_train_processed, columns=feature_names, index=X_train.index)
X_test_processed = pd.DataFrame(X_test_processed, columns=feature_names, index=X_test.index)

print("[bold green]‚úÖ Pipeline de preprocessing cr√©√©e et appliqu√©e[/bold green]")
print(f"   - Colonnes num√©riques standardis√©es: {NUMERIC_COLS}")
print(f"   - Colonnes binaires conserv√©es: {BINARY_COLS}")
print(f"   - X_train shape: {X_train_processed.shape}")
print(f"   - X_test shape: {X_test_processed.shape}")

## 2.5 SMOTE - Synthetic Minority Over-sampling

In [3]:
# --- Appliquer SMOTE pour √©quilibrer les classes
print("[bold cyan]üîÑ Application de SMOTE (Synthetic Minority Over-sampling)...[/bold cyan]")

# Afficher la distribution AVANT SMOTE
print(f"\n[bold]Avant SMOTE:[/bold]")
print(f"  - Classe 0: {(y_train == 0).sum()} samples")
print(f"  - Classe 1: {(y_train == 1).sum()} samples")
print(f"  - Ratio: {(y_train == 0).sum() / (y_train == 1).sum():.2f}:1")

# Appliquer SMOTE
smote = SMOTE(random_state=SEED)
X_train_smote, y_train_smote = smote.fit_resample(X_train_processed, y_train)

# Afficher la distribution APR√àS SMOTE
print(f"\n[bold]Apr√®s SMOTE:[/bold]")
print(f"  - Classe 0: {(y_train_smote == 0).sum()} samples")
print(f"  - Classe 1: {(y_train_smote == 1).sum()} samples")
print(f"  - Ratio: {(y_train_smote == 0).sum() / (y_train_smote == 1).sum():.2f}:1")

print(f"\n[bold green]‚úÖ SMOTE appliqu√© avec succ√®s[/bold green]")
print(f"  X_train_smote shape: {X_train_smote.shape}")
print(f"  y_train_smote shape: {y_train_smote.shape}")

## 3. Logistic Regression Fine-tuning with GridSearchCV

In [4]:
# --- Grille de param√®tres √† explorer (hyperparam√®tres cl√©s pour LogisticRegression)
# ‚ö†Ô∏è  IMPORTANT: √©viter les combinaisons invalides
# - lbfgs: supporte UNIQUEMENT l2 ou None
# - liblinear: supporte l1 ou l2
from sklearn.model_selection import ParameterGrid

param_grid = [
    # Solver: lbfgs (uniquement avec penalty l2 ou None)
    {
        "solver": ["lbfgs"],
        "C": [0.1, 0.5, 1.0, 5.0],
        "penalty": ["l2"],
        "max_iter": [1000, 5000],
    },
    # Solver: liblinear (supporte l1 et l2)
    {
        "solver": ["liblinear"],
        "C": [0.1, 0.5, 1.0, 5.0],
        "penalty": ["l1", "l2"],
        "max_iter": [1000, 5000],
    },
]

# Calculer le nombre total de combinaisons
total_combos = sum(len(list(ParameterGrid(grid))) for grid in param_grid)

print("[bold cyan]üîç Grille de param√®tres pour GridSearchCV:[/bold cyan]")
print(f"   Solver lbfgs (penalty=l2): 4 C √ó 1 penalty √ó 2 max_iter = 8 combos")
print(f"   Solver liblinear (penalty=l1,l2): 4 C √ó 2 penalty √ó 2 max_iter = 16 combos")
print(f"   Total valid combinations: {total_combos}") 

## 4. GridSearchCV Execution

In [5]:
with mlflow.start_run(
    run_name="LogisticRegression_GridSearchCV_SMOTE_0.7",
    description="Fine-tuning LogisticRegression avec GridSearchCV + SMOTE + class_weight 0.7",
):

    # --- Hyperparam√®tres de base
    base_params = {
    "random_state": SEED,
    "class_weight": {0:1, 1:0.7},
    }
    
    mlflow.log_input(train_dataset, context="training")
    mlflow.log_input(test_dataset, context="testing")
    mlflow.set_tag("model_type", "logistic_regression_finetuned")
    mlflow.set_tag("optimization_method", "gridsearchcv")
    mlflow.set_tag("resampling", "SMOTE")

    # --- Logger les param√®tres de base
    mlflow.log_params({f"base_{k}": str(v) for k, v in base_params.items()})
    
    # --- Logger la grille de param√®tres (param_grid est une liste de dicts)
    mlflow.log_param("grid_config", f"2 parameter groups: lbfgs(l2) + liblinear(l1,l2)")

    # --- Cr√©er le mod√®le de base
    base_model = LogisticRegression(**base_params)

    # --- Cr√©er GridSearchCV
    grid_search = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        cv=CV_STRATEGY,
        scoring="f1_weighted",  # M√™me m√©trique que XGBoost
        n_jobs=-1,
        verbose=2,
    )

    # --- Entra√Æner avec GridSearchCV sur les donn√©es SMOTE
    print("[bold cyan]üöÄ Lancement du GridSearchCV sur donn√©es SMOTE...[/bold cyan]")
    grid_search.fit(X_train_smote, y_train_smote)
    
    # --- R√©sultats
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_cv_score = grid_search.best_score_
    
    print(f"[bold green]‚úÖ GridSearchCV termin√©[/bold green]")
    print(f"\n[bold cyan]üèÜ Meilleurs param√®tres (F1_weighted sur donn√©es SMOTE):[/bold cyan]")
    for param, value in best_params.items():
        print(f"   - {param}: {value}")
    print(f"\n   Meilleur score CV (F1_weighted): {best_cv_score:.4f}")

    # --- Logger les meilleurs param√®tres
    mlflow.log_params(best_params)
    mlflow.log_metric("best_cv_score_f1_weighted_smote", best_cv_score)

    # --- Pr√©dictions finales sur le test set avec le meilleur mod√®le
    # ‚ö†Ô∏è  IMPORTANT: Le test set n'est PAS r√©sampl√©, on l'√©value tel quel
    y_pred_train = best_model.predict(X_train_processed)
    y_proba_train = best_model.predict_proba(X_train_processed)[:, 1]
    
    y_pred = best_model.predict(X_test_processed)
    y_proba = best_model.predict_proba(X_test_processed)[:, 1]

    # --- M√©triques train/test + overfitting gaps
    final_metrics_test, cm, (fp_rate, fn_rate) = compute_train_test_metrics(
        y_train, y_pred_train, y_proba_train,
        y_test, y_pred, y_proba
    )
    
    # --- Log metrics
    mlflow.log_metrics(final_metrics_test)

    # --- Log du mod√®le
    mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="logreg_finetuned_smote_model",
        input_example=X_test_processed.iloc[:5],
    )

    # --- Matrice de confusion
    plt.figure()
    sns.heatmap(cm, annot=True, fmt="d", cmap="Greens", cbar=False)
    plt.title("Confusion Matrix - LogisticRegression Finetuned (SMOTE)")
    plt.xlabel("Pr√©dictions")
    plt.ylabel("V√©rit√©s terrain")
    plt.tight_layout()
    cm_path = f"{FIG_DIR}/logreg_finetuned_smote_confusion_matrix.png"
    plt.savefig(cm_path)
    mlflow.log_artifact(cm_path)
    plt.close()

    # --- Courbe ROC
    RocCurveDisplay.from_estimator(best_model, X_test_processed, y_test)
    plt.title("ROC Curve - LogisticRegression Fine-tuned (SMOTE)")
    plt.tight_layout()
    roc_path = f"{FIG_DIR}/logreg_finetuned_smote_roc_curve.png"
    plt.savefig(roc_path)
    mlflow.log_artifact(roc_path)
    plt.close()

    # --- Courbe Precision-Recall
    PrecisionRecallDisplay.from_estimator(best_model, X_test_processed, y_test)
    plt.title("Precision-Recall Curve - LogisticRegression Fine-tuned (SMOTE)")
    plt.tight_layout()
    pr_path = f"{FIG_DIR}/logreg_finetuned_smote_pr_curve.png"
    plt.savefig(pr_path)
    mlflow.log_artifact(pr_path)
    plt.close()

    # --- Feature Importance (Coefficients)
    coef = best_model.coef_[0]
    importances = pd.Series(np.abs(coef), index=X_train_processed.columns).sort_values(ascending=False)
    plt.figure()
    imp_df = pd.DataFrame({'importance': importances[:10].values, 'feature': importances[:10].index})
    sns.barplot(data=imp_df, x='importance', y='feature', palette="coolwarm", hue='feature', legend=False)
    plt.title("Top 10 Feature Importances - LogisticRegression Finetuned (SMOTE)")
    plt.tight_layout()
    feat_imp_path = f"{FIG_DIR}/logreg_finetuned_smote_feature_importances.png"
    plt.savefig(feat_imp_path)
    mlflow.log_artifact(feat_imp_path)
    plt.close()

    print("[bold cyan]‚úÖ LogisticRegression fine-tuned (SMOTE) logu√© dans MLflow avec succ√®s[/bold cyan]")

    # ========== SHAP Analysis - DANS LE M√äME RUN ==========
    print("\n[bold magenta]üîç SHAP Analysis - Model Interpretability[/bold magenta]")

    # LinearExplainer pour LogisticRegression (optimis√© pour mod√®les lin√©aires)
    print("\n[cyan]Computing SHAP values (LinearExplainer for LogisticRegression)...[/cyan]")
    
    try:
        explainer = shap.LinearExplainer(best_model, X_train_processed)
        shap_values = explainer.shap_values(X_test_processed)
        
        # Pour classification binaire, shap_values est une liste [class0, class1]
        # On prend class 1 (le cas d'int√©r√™t)
        if isinstance(shap_values, list):
            shap_values_class1 = shap_values[1]
        else:
            shap_values_class1 = shap_values

        # 1Ô∏è‚É£ SUMMARY PLOT - Global Feature Importance
        print("\n[cyan]1. Summary Plot - Feature Importance[/cyan]")
        plt.figure()
        shap.summary_plot(shap_values_class1, X_test_processed, plot_type="bar", show=False)
        plt.title("SHAP Summary Plot (Bar) - LogisticRegression Finetuned (SMOTE)")
        plt.tight_layout()
        shap_summary_path = f"{FIG_DIR}/logreg_finetuned_smote_shap_summary_bar.png"
        plt.savefig(shap_summary_path)
        plt.close()
        print(f"   ‚úÖ Saved: {shap_summary_path}")
        mlflow.log_artifact(shap_summary_path)

        # 2Ô∏è‚É£ SUMMARY PLOT (Bee swarm)
        print("\n[cyan]2. Summary Plot (Bee Swarm)[/cyan]")
        plt.figure()
        shap.summary_plot(shap_values_class1, X_test_processed, show=False)
        plt.title("SHAP Summary Plot (Bee Swarm) - LogisticRegression Finetuned (SMOTE)")
        plt.tight_layout()
        shap_bee_path = f"{FIG_DIR}/logreg_finetuned_smote_shap_summary_bee.png"
        plt.savefig(shap_bee_path)
        plt.close()
        print(f"   ‚úÖ Saved: {shap_bee_path}")
        mlflow.log_artifact(shap_bee_path)

        # 3Ô∏è‚É£ DEPENDENCE PLOT - Top feature
        print("\n[cyan]3. Dependence Plot - Top Feature[/cyan]")
        top_feature_idx = np.abs(shap_values_class1).mean(axis=0).argsort()[-1]
        top_feature_name = X_test_processed.columns[top_feature_idx]
        
        plt.figure()
        shap.dependence_plot(top_feature_idx, shap_values_class1, X_test_processed, show=False)
        plt.title(f"SHAP Dependence Plot - {top_feature_name}")
        plt.tight_layout()
        shap_dep_path = f"{FIG_DIR}/logreg_finetuned_smote_shap_dependence_top_feature.png"
        plt.savefig(shap_dep_path)
        plt.close()
        print(f"   ‚úÖ Saved: {shap_dep_path}")
        mlflow.log_artifact(shap_dep_path)

        # 4Ô∏è‚É£ FORCE PLOT - First sample
        print("\n[cyan]4. Force Plot - First Test Sample[/cyan]")
        first_idx = 0
        plt.figure()
        shap.force_plot(explainer.expected_value, shap_values_class1[first_idx], 
                       X_test_processed.iloc[first_idx], matplotlib=True, show=False)
        plt.title(f"SHAP Force Plot - Sample {first_idx} (Actual: {y_test.iloc[first_idx]}, Pred: {y_pred[first_idx]})")
        plt.tight_layout()
        shap_force_path = f"{FIG_DIR}/logreg_finetuned_smote_shap_force_first_sample.png"
        plt.savefig(shap_force_path)
        plt.close()
        print(f"   ‚úÖ Saved: {shap_force_path}")
        mlflow.log_artifact(shap_force_path)

        # 5Ô∏è‚É£ WATERFALL PLOT - First sample
        print("\n[cyan]5. Waterfall Plot - First Test Sample[/cyan]")
        plt.figure()
        shap.waterfall_plot(shap.Explanation(values=shap_values_class1[first_idx], 
                                            base_values=explainer.expected_value,
                                            data=X_test_processed.iloc[first_idx],
                                            feature_names=X_test_processed.columns),
                           show=False)
        plt.title(f"SHAP Waterfall Plot - Sample {first_idx}")
        plt.tight_layout()
        shap_waterfall_path = f"{FIG_DIR}/logreg_finetuned_smote_shap_waterfall_first_sample.png"
        plt.savefig(shap_waterfall_path)
        plt.close()
        print(f"   ‚úÖ Saved: {shap_waterfall_path}")
        mlflow.log_artifact(shap_waterfall_path)

        # 6Ô∏è‚É£ FEATURE IMPORTANCE SUMMARY
        print("\n[cyan]6. SHAP Feature Importance Summary[/cyan]")
        mean_abs_shap = np.abs(shap_values_class1).mean(axis=0)
        top_features_shap = pd.Series(mean_abs_shap, index=X_test_processed.columns).sort_values(ascending=False)
        
        print("\nTop 5 Features by SHAP Impact:")
        for idx, (feature, impact) in enumerate(top_features_shap.head(5).items(), 1):
            print(f"   {idx}. {feature:30s}: {impact:.6f}")

        print("\n[green]‚úÖ SHAP artifacts logged to MLflow in same run![/green]")
        print("\n[bold green]‚ú® SHAP Analysis Complete![/bold green]")

    except Exception as e:
        print(f"[bold yellow]‚ö†Ô∏è  SHAP Analysis skipped: {str(e)}[/bold yellow]")
        print("[cyan]This can happen with certain model configurations. Continuing without SHAP analysis.[/cyan]")

# üîé R√©sum√© local
print(
    f"""
üìä R√©sum√© des m√©triques - LogisticRegression Fine-tuned (SMOTE) :
  Train Accuracy: {final_metrics_test['train_accuracy']:.3f}
  Test Accuracy:  {final_metrics_test['final_test_accuracy']:.3f}
  Overfitting Gap (Accuracy): {final_metrics_test['overfit_gap_accuracy']:.3f}
  ---
  Train F1 classe 0: {final_metrics_test['train_f1_0']:.3f}
  Test F1 classe 0:  {final_metrics_test['final_test_f1_0']:.3f}
  Overfitting Gap (F1_0): {final_metrics_test['overfit_gap_f1_0']:.3f}
  ---
  Accuracy:  {final_metrics_test['final_test_accuracy']:.3f}
  ROC-AUC:   {final_metrics_test['final_test_auc']:.3f}
  Precision classe 0: {final_metrics_test['final_test_precision_0']:.3f}
  Recall classe 0:    {final_metrics_test['final_test_recall_0']:.3f}
  F1-score classe 0:  {final_metrics_test['final_test_f1_0']:.3f}
  Precision classe 1: {final_metrics_test['final_test_precision_1']:.3f}
  Recall classe 1:    {final_metrics_test['final_test_recall_1']:.3f}
  F1-score classe 1:  {final_metrics_test['final_test_f1_1']:.3f}
  FPR:       {fp_rate:.3f}
  FNR:       {fn_rate:.3f}
"""
)



Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END .....C=0.1, max_iter=5000, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END .....C=0.1, max_iter=1000, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END .....C=0.1, max_iter=1000, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END .....C=0.1, max_iter=1000, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END .....C=0.1, max_iter=1000, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END .....C=0.1, max_iter=5000, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END .....C=0.1, max_iter=1000, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END .....C=0.1, max_iter=5000, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END .....C=0.5, max_iter=1000, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END .....C=0.5, max_iter=1000, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END .....C=0.1, max_iter=5000, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END .....C=0.5, max_iter=1000, penalty=l2,



<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>