In [2]:
# ============================================
# 1) IMPORTS + CONFIGURATION GPU & MLFLOW
# ============================================

import os
import mlflow
from mlflow import MlflowClient
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.utils import class_weight

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# ton EDA perso
from app.features_optiweb import apply_eda


# --------------------------------------------
# EXPÉRIENCE MLFLOW
# --------------------------------------------
EXPERIMENT_NAME = "optiweb_pipeline"
mlflow.set_experiment(EXPERIMENT_NAME)

print(f"MLflow experiment initialisé : {EXPERIMENT_NAME}")


2025/11/13 12:28:27 INFO mlflow.tracking.fluent: Experiment with name 'optiweb_pipeline' does not exist. Creating a new experiment.


MLflow experiment initialisé : optiweb_pipeline


Cellule 2 — Paramètres globaux + Fonction utilitaires

In [None]:
# ============================================
# 2) PARAMÈTRES, CV & UTILITAIRES
# ============================================

N_ROWS = 200_000                
N_SPLITS = 3                    U
RANDOM_STATE = 42

cv = StratifiedKFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=RANDOM_STATE
)

# fonction utilitaire pour importance features
def plot_feature_importance(importances, feature_names, top=20, title="Feature Importance"):
    idx = np.argsort(importances)[::-1][:top]
    
    plt.figure(figsize=(10,6))
    plt.title(title)
    plt.barh(np.array(feature_names)[idx][::-1], importances[idx][::-1])
    plt.tight_layout()
    return plt


Cellule 3 — Chargement EDA (200k lignes)

In [4]:
# ============================================
# 3) CHARGEMENT & EDA
# ============================================

print("Chargement EDA sur 200k lignes...")

X_train, y_train, X_test, test_ids = apply_eda(
    nrows=N_ROWS,
    nan_as_category=False
)

print(f"Shape X_train = {X_train.shape}")
print(f"Shape X_test  = {X_test.shape}")
print(f"% class 1 = {np.mean(y_train):.4f}")


Chargement EDA sur 200k lignes...
Shape X_train = (199996, 722)
Shape X_test  = (48744, 722)
% class 1 = 0.0812


Cellule 4 — Définition des modèles GPU only


In [9]:
# ============================================
# 4) MODÈLES GPU ONLY
# ============================================

models_gpu = {
    "xgboost_gpu": XGBClassifier(
        tree_method="hist",
        device="cuda",
        eval_metric="auc",
        n_estimators=300,
        max_depth=6,
        learning_rate=0.07,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=RANDOM_STATE
    ),
    "lightgbm_gpu": LGBMClassifier(
        device="gpu",
        n_estimators=300,
        learning_rate=0.07,
        subsample=0.8,
        colsample_bytree=0.8,
        num_leaves=40,
        random_state=RANDOM_STATE,
        verbose = -1
    ),
    "catboost_gpu": CatBoostClassifier(
        task_type="GPU",
        devices="0",
        depth=6,
        iterations=300,
        learning_rate=0.07,
        eval_metric="AUC",
        verbose=False,
        random_seed=RANDOM_STATE
    )
}

print("Modèles GPU prêts.")


Modèles GPU prêts.


Cellule 5 — Run principal MLflow (nested) + entraînements modèles

In [10]:
# ============================================
# 5) RUN PRINCIPAL MLFLOW + NESTED CV
# ============================================

main_run_name = "gpu_baseline_comparison"

results = []

with mlflow.start_run(run_name=main_run_name) as parent_run:

    # 1) log features utilisées
    mlflow.log_dict({"features_used": list(X_train.columns)}, "features_used.json")

    # 2) pour chaque modèle GPU
    for name, model in models_gpu.items():

        with mlflow.start_run(run_name=f"cv_{name}", nested=True) as child_run:

            print(f"\n=== Entraînement {name} ===")

            # Cross-validated AUC
            scores = cross_val_score(
                model,
                X_train,
                y_train,
                cv=cv,
                scoring="roc_auc",
                n_jobs=1  # GPU = 1 job
            )
            mean_auc = float(np.mean(scores))
            std_auc = float(np.std(scores))

            mlflow.log_metric("roc_auc_mean", mean_auc)
            mlflow.log_metric("roc_auc_std", std_auc)

            # Fit complet pour sauvegarder modèle
            model.fit(X_train, y_train)

            # Log hyperparamètres
            mlflow.log_params(model.get_params())

            # Log modèle MLflow
            model_info = mlflow.sklearn.log_model(
                sk_model=model,
                name=name,
                input_example=X_train.iloc[:5]
            )

            # Log feature importance (si dispo)
            if name == "catboost_gpu":
                importances = model.get_feature_importance()
            else:
                importances = model.feature_importances_

            plt = plot_feature_importance(importances, X_train.columns)
            plt.savefig("feature_importance.png")
            mlflow.log_artifact("feature_importance.png", artifact_path=f"{name}/plots")
            plt.close()

            # Log config hyperparams
            config = {"model_name": name, "params": model.get_params()}
            with open("config.yaml", "w") as f:
                import yaml
                yaml.dump(config, f)
            mlflow.log_artifact("config.yaml", artifact_path=name)

            # Stocker résultat pour plus tard
            results.append({
                "name": name,
                "auc": mean_auc,
                "run_id": child_run.info.run_id,
                "model_uri": model_info.model_uri
            })

print("Training terminé.")
results



=== Entraînement xgboost_gpu ===


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1410.87it/s]



=== Entraînement lightgbm_gpu ===


Downloading artifacts: 100%|██████████| 7/7 [00:00<?, ?it/s]



=== Entraînement catboost_gpu ===


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 982.63it/s] 


Training terminé.


[{'name': 'xgboost_gpu',
  'auc': 0.7600237324765727,
  'run_id': '9af9ee08a3014bd3aea7fa81f6553614',
  'model_uri': 'models:/m-45bbc065d64a4462a407dd69e63e4de8'},
 {'name': 'lightgbm_gpu',
  'auc': 0.7571940637511031,
  'run_id': 'd889b831e76047f0bbcda62e255c5181',
  'model_uri': 'models:/m-1f06cc46c16f409197209c26f8dad4aa'},
 {'name': 'catboost_gpu',
  'auc': 0.7586995872985325,
  'run_id': '1c6bcb364c6449d6ab75af999c2b0533',
  'model_uri': 'models:/m-f9615446951c43dfac9de7cfd383520b'}]

Cellule 6 — Sélection du meilleur modèle + enregistrement registry

In [11]:
# ============================================
# 6) SÉLECTION & ENREGISTREMENT MLFLOW MODEL REGISTRY
# ============================================

BEST_MODEL_NAME = "model_opti_web_test"

best = max(results, key=lambda r: r["auc"])

client = MlflowClient()

try:
    client.get_registered_model(BEST_MODEL_NAME)
except:
    client.create_registered_model(BEST_MODEL_NAME)

# Création d'une version dans le Model Registry
mv = client.create_model_version(
    name=BEST_MODEL_NAME,
    source=best["model_uri"],
    run_id=best["run_id"]
)

# Optionnel : mettre en STAGING automatiquement
client.transition_model_version_stage(
    name=BEST_MODEL_NAME,
    version=mv.version,
    stage="Staging"
)

print(f"⭐ Meilleur modèle = {best['name']} | AUC = {best['auc']:.4f}")
print(f"Enregistré dans MLflow Registry : {BEST_MODEL_NAME} (version {mv.version})")


⭐ Meilleur modèle = xgboost_gpu | AUC = 0.7600
Enregistré dans MLflow Registry : model_opti_web_test (version 1)


  client.transition_model_version_stage(


Cellule 7 — Visualisation Résultats Synthétiques

In [12]:
# ============================================
# 7) SYNTHÈSE DES RÉSULTATS
# ============================================

df_results = pd.DataFrame(results)
df_results


Unnamed: 0,name,auc,run_id,model_uri
0,xgboost_gpu,0.760024,9af9ee08a3014bd3aea7fa81f6553614,models:/m-45bbc065d64a4462a407dd69e63e4de8
1,lightgbm_gpu,0.757194,d889b831e76047f0bbcda62e255c5181,models:/m-1f06cc46c16f409197209c26f8dad4aa
2,catboost_gpu,0.7587,1c6bcb364c6449d6ab75af999c2b0533,models:/m-f9615446951c43dfac9de7cfd383520b
