In [5]:
# ============================================
# 1) IMPORTS + CONFIGURATION GPU & MLFLOW
# ============================================



import mlflow
from mlflow import MlflowClient
import mlflow.sklearn

import pandas as pd
import numpy as np
import json
import yaml
import time
from datetime import datetime

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score

# GPU models
from xgboost import XGBClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier

import optuna

import warnings
warnings.filterwarnings("ignore")

# MLflow experiment configuration
EXPERIMENT_NAME = "optiweb_params"
mlflow.set_experiment(EXPERIMENT_NAME)

SEED = 42
np.random.seed(SEED)

print("MLflow experiment:", EXPERIMENT_NAME)



MLflow experiment: optiweb_params


 CELLULE 2 — Chargement data complète (OptiWeb 100%)

In [6]:
### CELLULE 2 — Chargement data complète (OptiWeb 100%)

from app.features_optiweb import apply_eda

print("Chargement du dataset complet OptiWeb…")


X_train, y_train, X_test, test_ids = apply_eda()

print("Train shape:", X_train.shape)
print("Test  shape:", X_test.shape)

# Sauvegarde des features
features_used = list(X_train.columns)

with open("features_used.json", "w") as f:
    json.dump(features_used, f, indent=2)

print("Features logged:", len(features_used))


Chargement du dataset complet OptiWeb…
Train shape: (307507, 765)
Test  shape: (48744, 765)
Features logged: 765


Cellule 3 — Définition des modèles GPU

In [7]:
# ============================================
# 3) Cellule 3 — Définition des modèles GPU
# ============================================

# Calcul automatique du poids
n_pos = y_train.sum()
n_neg = len(y_train) - n_pos
scale_pos = n_neg / n_pos
print("scale_pos_weight =", scale_pos)

models_gpu = {
    "xgboost_gpu": XGBClassifier(
        tree_method="hist",
        device="cuda",
        n_estimators=400,
        max_depth=7,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="auc",
        scale_pos_weight=scale_pos,   
        random_state=SEED,
        n_jobs=-1
    ),

    "lightgbm_gpu": lgb.LGBMClassifier(
        device_type="gpu",
        boosting_type="gbdt",
        n_estimators=300,
        num_leaves=40,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=scale_pos,   
        random_state=SEED
    ),

    "catboost_gpu": CatBoostClassifier(
        task_type="GPU",
        devices='0',
        iterations=500,
        depth=8,
        learning_rate=0.05,
        loss_function="Logloss",
        scale_pos_weight=scale_pos,     
        verbose=False,
        random_state=SEED
    )
}

print("Modèles GPU chargés :", list(models_gpu.keys()))


scale_pos_weight = 11.386988922457201
Modèles GPU chargés : ['xgboost_gpu', 'lightgbm_gpu', 'catboost_gpu']


Cellule 4 — Cross-validation GPU + MLflow logging


In [None]:
# ============================================
# 4) Cellule 4 — Cross-validation GPU + MLflow logging
# ============================================


from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)


results = []

with mlflow.start_run(run_name="comparison_gpu_models") as parent_run:

    # Log des features
    mlflow.log_dict({"features_used": features_used}, "features_used.json")

    print("=== Début CV GPU ===")

    for name, model in models_gpu.items():
        print(f"\n=== CV {name} ===")

        with mlflow.start_run(run_name=f"cv_{name}", nested=True):

            scores = cross_val_score(
                model,
                X_train,
                y_train,
                cv=cv,
                scoring="roc_auc",
                n_jobs=11
            )

            mean_auc = float(np.mean(scores))
            std_auc = float(np.std(scores))

            mlflow.log_metric("roc_auc_mean", mean_auc)
            mlflow.log_metric("roc_auc_std", std_auc)

            print(f"{name} → mean AUC={mean_auc:.4f} std={std_auc:.4f}")

            results.append({
                "name": name,
                "auc": mean_auc
            })

print("\nRésultats :", results)



=== Début CV GPU ===

=== CV xgboost_gpu ===
xgboost_gpu → mean AUC=0.7812 std=0.0033

=== CV lightgbm_gpu ===
lightgbm_gpu → mean AUC=0.7860 std=0.0041

=== CV catboost_gpu ===
catboost_gpu → mean AUC=0.7835 std=0.0035

Résultats : [{'name': 'xgboost_gpu', 'auc': 0.7812497664360525}, {'name': 'lightgbm_gpu', 'auc': 0.786015698485207}, {'name': 'catboost_gpu', 'auc': 0.7834986175806}]


Cellule 5 — Baseline AUC full features (référence)

In [None]:
# ============================================
# 5) Cellule 5 — Sélection du meilleur modèle
# ============================================

best = max(results, key=lambda r: r["auc"])
best_family = best["name"]

print("BEST MODEL:", best_family, "AUC:", best["auc"])


BEST MODEL: lightgbm_gpu AUC: 0.786015698485207


Cellule 6 — OPTUNA (30 min max)

In [None]:
### CELLULE 6 — OPTUNA : Optimisation du meilleur modèle (30min max)  
# ==> peut absorber tout le monde mais normalement cest lgbm :)

def build_model(family, params):
    """Construit un modèle GPU selon la famille gagnante."""
    if family == "xgboost_gpu":
        return XGBClassifier(
            tree_method="hist",
            device="cuda",
            eval_metric="auc",
            scale_pos_weight=scale_pos,
            random_state=SEED,
            n_jobs=1,
            **params
        )
    elif family == "lightgbm_gpu":
        return lgb.LGBMClassifier(
            device_type="gpu",
            scale_pos_weight=scale_pos,
            verbosity=-1,
            random_state=SEED,
            **params
        )
    else:  # catboost
        return CatBoostClassifier(
            task_type="GPU",
            devices="0",
            scale_pos_weight=scale_pos,
            random_state=SEED,
            verbose=False,
            **params
        )


def suggest_params(trial, family):
    """Hyperparameter search space selon modèle."""
    if family == "xgboost_gpu":
        return {
            "n_estimators": trial.suggest_int("n_estimators", 300, 1000),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        }

    if family == "lightgbm_gpu":
        return {
            "n_estimators": trial.suggest_int("n_estimators", 300, 1000),
            "num_leaves": trial.suggest_int("num_leaves", 20, 200),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        }

    return {  # catboost
        "iterations": trial.suggest_int("iterations", 300, 1000),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3)
    }


def objective(trial):
    params = suggest_params(trial, best_family)
    model = build_model(best_family, params)

    scores = cross_val_score(
        model, X_train, y_train, cv=cv, scoring="roc_auc", n_jobs=-1
    )

    return float(np.mean(scores))


study = optuna.create_study(direction="maximize")
study.optimize(objective, timeout=1800)   # <=== 30 minutes max

best_params = study.best_params
best_auc_optuna = study.best_value
print("BEST AUC (Optuna CV):", best_auc_optuna)
print("BEST PARAMS:", best_params)


[I 2025-11-13 14:12:50,930] A new study created in memory with name: no-name-3f242d13-9771-4501-ab6f-21862fd7894a
[I 2025-11-13 14:13:48,800] Trial 0 finished with value: 0.7805691793141006 and parameters: {'n_estimators': 547, 'num_leaves': 20, 'learning_rate': 0.11584583329083413, 'subsample': 0.6402478279673515, 'colsample_bytree': 0.867329489950565}. Best is trial 0 with value: 0.7805691793141006.
[I 2025-11-13 14:17:18,728] Trial 1 finished with value: 0.7699641491917149 and parameters: {'n_estimators': 511, 'num_leaves': 148, 'learning_rate': 0.06243369160916571, 'subsample': 0.9290548986040734, 'colsample_bytree': 0.8792398879521577}. Best is trial 0 with value: 0.7805691793141006.
[I 2025-11-13 14:23:05,859] Trial 2 finished with value: 0.7720678388918845 and parameters: {'n_estimators': 746, 'num_leaves': 179, 'learning_rate': 0.04116626075537672, 'subsample': 0.6629506631020313, 'colsample_bytree': 0.835662235142895}. Best is trial 0 with value: 0.7805691793141006.
[I 2025-11

BEST PARAMS: {'n_estimators': 875, 'num_leaves': 23, 'learning_rate': 0.02533714281950245, 'subsample': 0.8798790451127203, 'colsample_bytree': 0.9082740591241196}


javais lancé avec job=-1 (pas optimal pour cpu : voici le result pour pouvoir comparer avec le bon :) 
edit = en fait vu que ma carte a tenu (difficilement) -1 était plus rapide : il fait tout d'un coup, mais il cré du cache qui fume mon disk ... 

Par soucis de temps je vais consservé la valeur avec job=1 mais si je run optuna sans timeout, il faudrait faire un -1 ( et laisser le pc vivre ça vie aha)

[I 2025-11-13 13:31:32,689] A new study created in memory with name: no-name-459a1743-7b94-4f66-8091-12af1b015fd5
[I 2025-11-13 13:34:18,256] Trial 0 finished with value: 0.7509510403387881 and parameters: {'n_estimators': 691, 'num_leaves': 131, 'learning_rate': 0.10326228130759496, 'subsample': 0.7042807499704653, 'colsample_bytree': 0.7068775132139175}. Best is trial 0 with value: 0.7509510403387881.
[I 2025-11-13 13:36:01,012] Trial 1 finished with value: 0.7758625111784033 and parameters: {'n_estimators': 459, 'num_leaves': 108, 'learning_rate': 0.06286525284068775, 'subsample': 0.6852783165671008, 'colsample_bytree': 0.6904180360324381}. Best is trial 1 with value: 0.7758625111784033.
[I 2025-11-13 13:37:07,227] Trial 2 finished with value: 0.7748524488031616 and parameters: {'n_estimators': 484, 'num_leaves': 49, 'learning_rate': 0.09883850630579995, 'subsample': 0.891699274515704, 'colsample_bytree': 0.7647692528508342}. Best is trial 1 with value: 0.7758625111784033.
[I 2025-11-13 13:39:13,451] Trial 3 finished with value: 0.7847320777833844 and parameters: {'n_estimators': 376, 'num_leaves': 140, 'learning_rate': 0.03552715539936053, 'subsample': 0.6891832979237428, 'colsample_bytree': 0.7065525510765979}. Best is trial 3 with value: 0.7847320777833844.
[I 2025-11-13 13:41:03,892] Trial 4 finished with value: 0.73285447582984 and parameters: {'n_estimators': 814, 'num_leaves': 69, 'learning_rate': 0.19521903806869692, 'subsample': 0.6853986622953963, 'colsample_bytree': 0.8015465331552765}. Best is trial 3 with value: 0.7847320777833844.
[I 2025-11-13 13:45:04,979] Trial 5 finished with value: 0.7496802637001826 and parameters: {'n_estimators': 734, 'num_leaves': 192, 'learning_rate': 0.12304428289479118, 'subsample': 0.8556453377352304, 'colsample_bytree': 0.9669232273493538}. Best is trial 3 with value: 0.7847320777833844.
[I 2025-11-13 13:47:39,097] Trial 6 finished with value: 0.7523722318055741 and parameters: {'n_estimators': 616, 'num_leaves': 151, 'learning_rate': 0.10191033484332025, 'subsample': 0.9442774167574826, 'colsample_bytree': 0.7588455432033429}. Best is trial 3 with value: 0.7847320777833844.
[I 2025-11-13 13:49:53,093] Trial 7 finished with value: 0.7498672455114246 and parameters: {'n_estimators': 867, 'num_leaves': 82, 'learning_rate': 0.10943230452379701, 'subsample': 0.9388844293126615, 'colsample_bytree': 0.749876167155267}. Best is trial 3 with value: 0.7847320777833844.
[I 2025-11-13 13:52:15,692] Trial 8 finished with value: 0.7730559514345131 and parameters: {'n_estimators': 830, 'num_leaves': 77, 'learning_rate': 0.057595722009926326, 'subsample': 0.7565971542243666, 'colsample_bytree': 0.8195053671816284}. Best is trial 3 with value: 0.7847320777833844.
[I 2025-11-13 13:53:41,491] Trial 9 finished with value: 0.746271025903615 and parameters: {'n_estimators': 315, 'num_leaves': 149, 'learning_rate': 0.15470785920400332, 'subsample': 0.9278315306035411, 'colsample_bytree': 0.8059105563490823}. Best is trial 3 with value: 0.7847320777833844.
[I 2025-11-13 13:59:16,749] Trial 10 finished with value: 0.7792609445167787 and parameters: {'n_estimators': 996, 'num_leaves': 197, 'learning_rate': 0.023116679185742305, 'subsample': 0.6016680503322742, 'colsample_bytree': 0.6129559216273155}. Best is trial 3 with value: 0.7847320777833844.
[I 2025-11-13 14:05:09,378] Trial 11 finished with value: 0.7853946908952195 and parameters: {'n_estimators': 992, 'num_leaves': 198, 'learning_rate': 0.014666006284901015, 'subsample': 0.6099907428072658, 'colsample_bytree': 0.6322315875944823}. Best is trial 11 with value: 0.7853946908952195.
BEST PARAMS: {'n_estimators': 992, 'num_leaves': 198, 'learning_rate': 0.014666006284901015, 'subsample': 0.6099907428072658, 'colsample_bytree': 0.6322315875944823}


Cellule 7 — Entrainement final + MLflow Registry

In [11]:
### CELLULE 7 — Final training + MLflow Registry (avec AUC baseline)

from sklearn.model_selection import cross_val_score
from mlflow import MlflowClient
import pandas as pd
import yaml

BEST_MODEL_NAME = "optiweb_params"

# 1) Réentraînement final sur tout le train
final_model = build_model(best_family, best_params)
final_model.fit(X_train, y_train)

# 2) Feature importance → valable pour xgb/lgb/cat
try:
    importance = final_model.feature_importances_
    fi = pd.DataFrame({"feature": features_used, "importance": importance})
    fi.to_csv("feature_importance.csv", index=False)
except Exception as e:
    print("Impossible de calculer les feature_importances_ :", e)
    fi = None

# 3) AUC baseline du modèle final (full features)
scores_final = cross_val_score(
    final_model,
    X_train,
    y_train,
    cv=cv,                  # même StratifiedKFold que plus haut
    scoring="roc_auc",
    n_jobs=1
)
baseline_auc = float(scores_final.mean())
baseline_std = float(scores_final.std())
print(f"Baseline AUC (final model) = {baseline_auc:.4f} ± {baseline_std:.4f}")

# 4) Save params YAML
with open("best_params.yaml", "w") as f:
    yaml.dump(best_params, f)

# 5) ----- MLflow Logging -----
with mlflow.start_run(run_name="final_optiweb_model") as run:

    # features + hyperparams (en artefacts)
    mlflow.log_dict({"features_used": features_used}, "features_used.json")
    mlflow.log_artifact("best_params.yaml")
    if fi is not None:
        mlflow.log_artifact("feature_importance.csv")

    # métriques de perf du modèle final (baseline pour ton futur top-K)
    mlflow.log_metric("roc_auc_mean_final_cv", baseline_auc)
    mlflow.log_metric("roc_auc_std_final_cv", baseline_std)
    mlflow.log_param("best_family", best_family)  # pratique pour filtrer

    # Log du modèle dans l’artefact store
    model_info = mlflow.sklearn.log_model(
        sk_model=final_model,
        name="optiweb_params",
        input_example=X_train.iloc[:5],
    )

# 6) Model Registry
client = MlflowClient()

try:
    client.create_registered_model(BEST_MODEL_NAME)
except Exception:
    # déjà créé, on ignore
    pass

mv = client.create_model_version(
    name=BEST_MODEL_NAME,
    source=model_info.model_uri.replace("runs:/", "mlflow-artifacts:/"),
    run_id=run.info.run_id
)

client.transition_model_version_stage(
    name=BEST_MODEL_NAME,
    version=mv.version,
    stage="Staging"
)

print("Modèle final publié :", BEST_MODEL_NAME, "version", mv.version)
print("AUC baseline stockée dans MLflow : roc_auc_mean_final_cv")


Baseline AUC (final model) = 0.7872 ± 0.0039


Downloading artifacts: 100%|██████████| 7/7 [00:00<?, ?it/s]


Modèle final publié : optiweb_params version 2
AUC baseline stockée dans MLflow : roc_auc_mean_final_cv
