Cellule 1 — Imports & config MLflow

In [1]:
# CELL 1 — Imports & configuration

import json
import tempfile
from collections import Counter
from pathlib import Path

import yaml
import numpy as np
import pandas as pd
import mlflow
from mlflow import MlflowClient
from mlflow.sklearn import load_model as load_sklearn_model

from sklearn.model_selection import StratifiedKFold, cross_val_score

import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from app.features_optiweb import apply_eda

# --- Tunable constants --------------------------------------------------
SEED = 42
EXPERIMENT_NAME = "optiweb_topk"
MODEL_NAME = "optiweb_params"
MODEL_VERSION = 2   # registry version of the tuned model used as template

# --- Global side effects ------------------------------------------------
# Seed NumPy's legacy global RNG and select the MLflow experiment that
# receives every run started later in this notebook.
np.random.seed(SEED)
mlflow.set_experiment(EXPERIMENT_NAME)

print(f"Experiment: {EXPERIMENT_NAME}")
print(f"Using model: {MODEL_NAME} version {MODEL_VERSION}")


Experiment: optiweb_topk
Using model: optiweb_params version 2


Cellule 2 — Charger le modèle + les artifacts depuis le MLflow Model Registry

In [2]:
# CELL 2 — Load the model and its artifacts from the MLflow Model Registry

client = MlflowClient()

# Resolve the registry entry to the run that produced it: the companion
# artifacts (feature list, importances, hyper-parameters) are downloaded
# from that run below.
mv = client.get_model_version(MODEL_NAME, str(MODEL_VERSION))
run_id = mv.run_id
print("ModelVersion run_id:", run_id)

model_uri = f"models:/{MODEL_NAME}/{MODEL_VERSION}"

# === Load the estimator through the MLflow sklearn flavor ===
# `load_sklearn_model` is imported in the top-level import cell.
base_model = load_sklearn_model(model_uri)

print("Loaded estimator type:", type(base_model))

# === Download the run artifacts ===
features_path = mlflow.artifacts.download_artifacts(
    run_id=run_id,
    artifact_path="features_used.json"
)
fi_path = mlflow.artifacts.download_artifacts(
    run_id=run_id,
    artifact_path="feature_importance.csv"
)
params_path = mlflow.artifacts.download_artifacts(
    run_id=run_id,
    artifact_path="best_params.yaml"
)

# Explicit UTF-8 so the result does not depend on the platform's default
# text encoding.
with open(features_path, "r", encoding="utf-8") as f:
    features_payload = json.load(f)

# The JSON is accepted either as {"features_used": [...]} or as a bare list.
if isinstance(features_payload, dict) and "features_used" in features_payload:
    features_used = features_payload["features_used"]
else:
    features_used = features_payload

fi = pd.read_csv(fi_path)
with open(params_path, "r", encoding="utf-8") as f:
    best_params = yaml.safe_load(f)

# Ranking used by every top-K selection further down: most important first.
fi = fi.sort_values("importance", ascending=False)
features_ranked = fi["feature"].tolist()

# X_train is later restricted to `features_used` and then indexed with
# slices of `features_ranked`; a ranked feature unknown to the model would
# only surface as a KeyError in the middle of a sweep, so check it here.
unknown_ranked = sorted(set(features_ranked) - set(features_used))
if unknown_ranked:
    raise ValueError(
        f"{len(unknown_ranked)} feature(s) in feature_importance.csv are not "
        f"listed in features_used.json, e.g. {unknown_ranked[:5]}"
    )

print("Nb features used:", len(features_used))
print("Top 10 features:", features_ranked[:10])


  from .autonotebook import tqdm as notebook_tqdm


ModelVersion run_id: 6afd1e84d8f449919dca1b36c3103286
Loaded estimator type: <class 'lightgbm.sklearn.LGBMClassifier'>


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 1001.03it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 479.73it/s]

Nb features used: 765
Top 10 features: ['PAYMENT_RATE', 'EXT_SOURCE_3', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'DAYS_BIRTH', 'AMT_ANNUITY', 'APPROVED_CNT_PAYMENT_MEAN', 'DAYS_ID_PUBLISH', 'INSTAL_DPD_MEAN', 'AMT_CREDIT']





Cellule 3 — Recharger le dataset avec apply_eda + déséquilibre des classes

In [3]:
# CELL 3 — Reload the dataset and align it with the model's features

# NROWS = None is meant to load everything; an integer is meant to give a
# smaller, temporary subsample for quicker iterations
# (NOTE(review): exact semantics live in apply_eda — confirm).
NROWS = None

X_train, y_train, X_test, test_ids = apply_eda(nrows=NROWS)
print("Raw X_train shape:", X_train.shape)

# Restrict X_train to the features the model was trained on, in the same
# order. Report missing columns explicitly instead of relying on the
# pandas KeyError raised by the selection itself.
available_columns = set(X_train.columns)
missing_features = [col for col in features_used if col not in available_columns]
if missing_features:
    raise KeyError(
        f"{len(missing_features)} model feature(s) are absent from the "
        f"apply_eda output, e.g. {missing_features[:5]}"
    )
X_train = X_train[features_used]
print("Aligned X_train shape:", X_train.shape)

# Class balance -> scale_pos_weight (negatives per positive).
# `Counter` is imported in the top-level import cell.
cnt = Counter(y_train)
pos = cnt[1]
neg = cnt[0]
if pos == 0:
    # Can happen with a very small NROWS subsample; the ratio is undefined.
    raise ValueError(
        "y_train contains no positive sample (label 1): cannot compute "
        "scale_pos_weight. Increase NROWS or load the full dataset."
    )
scale_pos = neg / pos
print("Target distribution:", cnt, " -> scale_pos_weight =", scale_pos)

# Shared CV splitter so every experiment is scored on the same folds
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)


Raw X_train shape: (307507, 765)
Aligned X_train shape: (307507, 765)
Target distribution: Counter({0: 282682, 1: 24825})  -> scale_pos_weight = 11.386988922457201


Cellule 4 — Helper build_lgbm_clone pour garder la même famille / les mêmes params

On va utiliser les mêmes hyperparamètres que le modèle du registry.


In [8]:
# CELL 4 — Rebuild a fresh LightGBM estimator from the registry model

# 1) Guard: everything below assumes an LGBMClassifier.
if not isinstance(base_model, lgb.LGBMClassifier):
    raise TypeError(
        "Le mod√®le charg√© n'est pas un LightGBMClassifier. "
        "Ce notebook top-K est con√ßu pour LightGBM uniquement."
    )

# 2) + 3) Start from the registry model's hyper-parameters and override the
#         run-specific ones (GPU device, quiet logging, seed, class weight
#         recomputed in Cell 3).
params = {
    **base_model.get_params(),
    "device_type": "gpu",
    "verbosity": -1,
    "random_state": SEED,
    "scale_pos_weight": scale_pos,
}


# 4) Factory returning a NEW, unfitted estimator on every call
def build_lgbm_clone():
    """Return a new, unfitted LGBMClassifier built from the module-level ``params``."""
    fresh_estimator = lgb.LGBMClassifier(**params)
    return fresh_estimator


baseline_model = build_lgbm_clone()

print("\n=== Mod√®le LightGBM reconstruit ===")
print("Type:", type(baseline_model))
print("Params cl√©s:")
for _shown_key in (
    "n_estimators",
    "num_leaves",
    "learning_rate",
    "scale_pos_weight",
    "device_type",
):
    print(f" - {_shown_key}:", params[_shown_key])
print("\nbuild_lgbm_clone() pr√™t pour le top-K !")



=== Mod√®le LightGBM reconstruit ===
Type: <class 'lightgbm.sklearn.LGBMClassifier'>
Params cl√©s:
 - n_estimators: 875
 - num_leaves: 23
 - learning_rate: 0.02533714281950245
 - scale_pos_weight: 11.386988922457201
 - device_type: gpu

build_lgbm_clone() pr√™t pour le top-K !


Cellule 5 — Baseline AUC full features (référence)

In [9]:
# CELL 5 — Reference ROC-AUC of the rebuilt model on the full feature set

scores_full = cross_val_score(
    estimator=baseline_model,
    X=X_train,
    y=y_train,
    scoring="roc_auc",
    cv=cv,
    n_jobs=1,
)

# Mean and standard deviation of the per-fold scores; the top-K sweeps
# report their delta against this mean.
baseline_auc = float(np.mean(scores_full))
baseline_std = float(np.std(scores_full))

print(f"Baseline (full features) AUC = {baseline_auc:.4f} ¬± {baseline_std:.4f}")


Baseline (full features) AUC = 0.7871 ¬± 0.0039


Cellule 6 — Expérience top-K (réutilise le modèle du registry comme gabarit)

In [11]:
# CELL 6 — Sweep over K (top-K features), tracked in MLflow, with readable prints

K_GRID = [30, 60, 100, 300, len(features_ranked)]

# Fail fast, before any MLflow run is opened: an out-of-range K would be
# silently truncated by the slice below, and a ranked feature missing from
# X_train would abort the sweep half-way through, inside an open run.
invalid_k = [k for k in K_GRID if not 0 < k <= len(features_ranked)]
if invalid_k:
    raise ValueError(
        f"K values {invalid_k} are outside 1..{len(features_ranked)} "
        "(number of ranked features)."
    )
known_columns = set(X_train.columns)
absent_features = [
    name for name in features_ranked[:max(K_GRID)] if name not in known_columns
]
if absent_features:
    raise KeyError(
        f"{len(absent_features)} ranked feature(s) are missing from X_train, "
        f"e.g. {absent_features[:5]}"
    )

results_topk = []

print("\n==============================================")
print("     üîé D√âBUT DU TOP-K SWEEP (LightGBM) ")
print("==============================================\n")
print(f"AUC baseline (full model) = {baseline_auc:.5f} ¬± {baseline_std:.5f}\n")


# One parent run for the whole sweep, one nested child run per K.
with mlflow.start_run(run_name="topk_sweep_from_registry") as parent_run:

    mlflow.log_param("source_model_name", MODEL_NAME)
    mlflow.log_param("source_model_version", MODEL_VERSION)
    # Several sweeps share the same run name; the grid tells them apart.
    mlflow.log_param("k_grid", ",".join(str(k) for k in K_GRID))
    mlflow.log_metric("baseline_auc_full", baseline_auc)
    mlflow.log_metric("baseline_auc_std_full", baseline_std)

    for K in K_GRID:

        topk_features = features_ranked[:K]
        Xk = X_train[topk_features]

        print(f"\n=== üîµ K = {K} features ===")
        print("Top features:", topk_features[:5], "...")

        with mlflow.start_run(run_name=f"topK_{K}", nested=True):

            # Fresh, unfitted estimator for every K
            model_k = build_lgbm_clone()

            scores_k = cross_val_score(
                model_k,
                Xk,
                y_train,
                cv=cv,
                scoring="roc_auc",
                n_jobs=1
            )

            mean_k = float(scores_k.mean())
            std_k  = float(scores_k.std())
            delta  = mean_k - baseline_auc

            # MLflow logs
            mlflow.log_param("K", K)
            mlflow.log_metric("roc_auc_mean", mean_k)
            mlflow.log_metric("roc_auc_std", std_k)
            mlflow.log_metric("delta_auc_vs_full", delta)

            # Log the feature list straight to the artifact store: same
            # artifact name as before, but no scratch file left in (and
            # overwritten at every iteration in) the working directory.
            mlflow.log_text("\n".join(topk_features), "topk_features.txt")

            # Row of the final summary table
            results_topk.append(
                {"K": K, "auc": mean_k, "std": std_k, "delta": delta}
            )

            # Readable per-K summary
            print(f"AUC = {mean_k:.5f} ¬± {std_k:.5f}   Œî vs full = {delta:+.5f}")


# Final results as a table
results_topk_df = pd.DataFrame(results_topk).sort_values("K")

print("\n\n==============================================")
print(" üî• R√©sultats finaux du TOP-K sweep")
print("==============================================\n")
print(results_topk_df.to_string(index=False))

results_topk_df



     üîé D√âBUT DU TOP-K SWEEP (LightGBM) 

AUC baseline (full model) = 0.78715 ¬± 0.00393


=== üîµ K = 30 features ===
Top features: ['PAYMENT_RATE', 'EXT_SOURCE_3', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'DAYS_BIRTH'] ...
AUC = 0.77467 ¬± 0.00371   Œî vs full = -0.01248

=== üîµ K = 60 features ===
Top features: ['PAYMENT_RATE', 'EXT_SOURCE_3', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'DAYS_BIRTH'] ...
AUC = 0.78350 ¬± 0.00378   Œî vs full = -0.00365

=== üîµ K = 100 features ===
Top features: ['PAYMENT_RATE', 'EXT_SOURCE_3', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'DAYS_BIRTH'] ...
AUC = 0.78662 ¬± 0.00395   Œî vs full = -0.00053

=== üîµ K = 300 features ===
Top features: ['PAYMENT_RATE', 'EXT_SOURCE_3', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'DAYS_BIRTH'] ...
AUC = 0.78742 ¬± 0.00388   Œî vs full = +0.00027

=== üîµ K = 765 features ===
Top features: ['PAYMENT_RATE', 'EXT_SOURCE_3', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'DAYS_BIRTH'] ...
AUC = 0.78706 ¬± 0.00377   Œî vs full = -0.00009


 üî• R√©sultats fina

Unnamed: 0,K,auc,std,delta
0,30,0.774669,0.003711,-0.01248
1,60,0.783498,0.003784,-0.003651
2,100,0.786615,0.003955,-0.000534
3,300,0.787418,0.003876,0.000269
4,765,0.78706,0.003767,-8.9e-05


Au vu de ces résultats, je vais affiner un peu plus avec très peu de features !

In [12]:
# CELL 6 (refined) — Second sweep focused on small K values, tracked in MLflow
# NOTE: this cell rebinds `results_topk` / `results_topk_df`, so the table of
# the previous sweep is no longer available in the kernel afterwards.

K_GRID = [5, 15, 20, 40]

# Fail fast, before any MLflow run is opened: an out-of-range K would be
# silently truncated by the slice below, and a ranked feature missing from
# X_train would abort the sweep half-way through, inside an open run.
invalid_k = [k for k in K_GRID if not 0 < k <= len(features_ranked)]
if invalid_k:
    raise ValueError(
        f"K values {invalid_k} are outside 1..{len(features_ranked)} "
        "(number of ranked features)."
    )
known_columns = set(X_train.columns)
absent_features = [
    name for name in features_ranked[:max(K_GRID)] if name not in known_columns
]
if absent_features:
    raise KeyError(
        f"{len(absent_features)} ranked feature(s) are missing from X_train, "
        f"e.g. {absent_features[:5]}"
    )

results_topk = []

print("\n==============================================")
print("     üîé D√âBUT DU TOP-K SWEEP (LightGBM) ")
print("==============================================\n")
print(f"AUC baseline (full model) = {baseline_auc:.5f} ¬± {baseline_std:.5f}\n")


# One parent run for the whole sweep, one nested child run per K.
with mlflow.start_run(run_name="topk_sweep_from_registry") as parent_run:

    mlflow.log_param("source_model_name", MODEL_NAME)
    mlflow.log_param("source_model_version", MODEL_VERSION)
    # Several sweeps share the same run name; the grid tells them apart.
    mlflow.log_param("k_grid", ",".join(str(k) for k in K_GRID))
    mlflow.log_metric("baseline_auc_full", baseline_auc)
    mlflow.log_metric("baseline_auc_std_full", baseline_std)

    for K in K_GRID:

        topk_features = features_ranked[:K]
        Xk = X_train[topk_features]

        print(f"\n=== üîµ K = {K} features ===")
        print("Top features:", topk_features[:5], "...")

        with mlflow.start_run(run_name=f"topK_{K}", nested=True):

            # Fresh, unfitted estimator for every K
            model_k = build_lgbm_clone()

            scores_k = cross_val_score(
                model_k,
                Xk,
                y_train,
                cv=cv,
                scoring="roc_auc",
                n_jobs=1
            )

            mean_k = float(scores_k.mean())
            std_k  = float(scores_k.std())
            delta  = mean_k - baseline_auc

            # MLflow logs
            mlflow.log_param("K", K)
            mlflow.log_metric("roc_auc_mean", mean_k)
            mlflow.log_metric("roc_auc_std", std_k)
            mlflow.log_metric("delta_auc_vs_full", delta)

            # Log the feature list straight to the artifact store: same
            # artifact name as before, but no scratch file left in (and
            # overwritten at every iteration in) the working directory.
            mlflow.log_text("\n".join(topk_features), "topk_features.txt")

            # Row of the final summary table
            results_topk.append(
                {"K": K, "auc": mean_k, "std": std_k, "delta": delta}
            )

            # Readable per-K summary
            print(f"AUC = {mean_k:.5f} ¬± {std_k:.5f}   Œî vs full = {delta:+.5f}")


# Final results as a table
results_topk_df = pd.DataFrame(results_topk).sort_values("K")

print("\n\n==============================================")
print(" üî• R√©sultats finaux du TOP-K sweep")
print("==============================================\n")
print(results_topk_df.to_string(index=False))

results_topk_df



     üîé D√âBUT DU TOP-K SWEEP (LightGBM) 

AUC baseline (full model) = 0.78715 ¬± 0.00393


=== üîµ K = 5 features ===
Top features: ['PAYMENT_RATE', 'EXT_SOURCE_3', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'DAYS_BIRTH'] ...
AUC = 0.75230 ¬± 0.00478   Œî vs full = -0.03485

=== üîµ K = 15 features ===
Top features: ['PAYMENT_RATE', 'EXT_SOURCE_3', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'DAYS_BIRTH'] ...
AUC = 0.76565 ¬± 0.00341   Œî vs full = -0.02150

=== üîµ K = 20 features ===
Top features: ['PAYMENT_RATE', 'EXT_SOURCE_3', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'DAYS_BIRTH'] ...
AUC = 0.77045 ¬± 0.00419   Œî vs full = -0.01669

=== üîµ K = 40 features ===
Top features: ['PAYMENT_RATE', 'EXT_SOURCE_3', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'DAYS_BIRTH'] ...
AUC = 0.78062 ¬± 0.00426   Œî vs full = -0.00653


 üî• R√©sultats finaux du TOP-K sweep

 K      auc      std     delta
 5 0.752300 0.004776 -0.034849
15 0.765653 0.003414 -0.021496
20 0.770454 0.004186 -0.016695
40 0.780620 0.004258 -0.006529


Unnamed: 0,K,auc,std,delta
0,5,0.7523,0.004776,-0.034849
1,15,0.765653,0.003414,-0.021496
2,20,0.770454,0.004186,-0.016695
3,40,0.78062,0.004258,-0.006529


OK ! On va sauvegarder le top 20 et le top 40 comme versions « light » et « med » ; la version full sera développée si le projet est vendu ✌️🤣

Cellule 8 — Export des modèles Top-20 & Top-40 dans le registry

In [13]:
# ================================================
# CELL 8 — Register optiweb_top20 and optiweb_top40
# ================================================

# MlflowClient, tempfile and Path come from the top-level import cell.
client = MlflowClient()

def save_topk_model(K, model_name):
    """
    Train the LightGBM template on the top-K features and publish it.

    Steps: select the K best-ranked features, fit a fresh estimator from
    ``build_lgbm_clone()`` on the full training set, log the companion
    artifacts and the model in a new MLflow run, register the model under
    ``model_name`` and move the new version to the "Staging" stage.

    Parameters
    ----------
    K : int
        Number of leading entries of ``features_ranked`` to keep.
    model_name : str
        Registered-model name; also used as the logged model's name and in
        the run name (``export_<model_name>``).

    Returns
    -------
    None
        The function works by side effect (MLflow run, registry version,
        console output).

    Raises
    ------
    mlflow.exceptions.MlflowException
        Propagated from MLflow when logging or registration fails.
    """

    print(f"\n=== üöÄ Publication mod√®le {model_name} (K={K}) ===")

    # --- 1) Select top-K features
    topk_features = features_ranked[:K]
    Xk = X_train[topk_features]

    # --- 2) Rebuild LightGBM and fit on the reduced matrix
    model_k = build_lgbm_clone()
    model_k.fit(Xk, y_train)

    # --- 3) Feature importances of this sub-model (same schema as the
    #        feature_importance.csv read in Cell 2)
    fi_k = pd.DataFrame({
        "feature": topk_features,
        "importance": model_k.feature_importances_
    })

    # --- 4) MLflow logging
    with mlflow.start_run(run_name=f"export_{model_name}") as run:

        # Stage the companion files in a throw-away directory: artifact
        # names are unchanged, but nothing is left behind in (or
        # overwritten between calls in) the working directory.
        with tempfile.TemporaryDirectory() as staging_dir:
            staging = Path(staging_dir)

            fi_file = staging / "feature_importance.csv"
            fi_k.to_csv(fi_file, index=False)

            features_file = staging / "features_used.json"
            with open(features_file, "w", encoding="utf-8") as f:
                json.dump({"features_used": topk_features}, f, indent=2)

            # These are the parameters loaded from the source run in Cell 2,
            # not `model_k.get_params()`; the overrides applied in Cell 4
            # (device, seed, scale_pos_weight) are therefore not included.
            params_file = staging / "best_params.yaml"
            with open(params_file, "w", encoding="utf-8") as f:
                yaml.dump(best_params, f)

            for artifact_file in (features_file, fi_file, params_file):
                mlflow.log_artifact(str(artifact_file))

        # Log model
        model_info = mlflow.lightgbm.log_model(
            lgb_model=model_k,
            name=model_name,
            input_example=Xk.iloc[:5]
        )

    # --- 5) Register in the MLflow Model Registry
    # `mlflow.register_model` creates the registered model when it does not
    # exist yet and resolves the model URI itself. It replaces both the
    # former bare `except: pass` around `create_registered_model` (which hid
    # every error, not only "already exists") and the manual rewrite of the
    # URI scheme passed as `source`.
    mv = mlflow.register_model(model_uri=model_info.model_uri, name=model_name)

    # NOTE(review): model stages are deprecated in recent MLflow releases
    # (this call emits a warning); kept so that consumers relying on the
    # "Staging" stage keep working. Consider registry aliases instead.
    client.transition_model_version_stage(
        name=model_name,
        version=mv.version,
        stage="Staging"
    )

    print(f"üéâ Mod√®le {model_name} publi√© ‚Äî version {mv.version}")


# ----- Register the Top-20, then the Top-40 variant -----
for top_k, registry_name in ((20, "optiweb_top20"), (40, "optiweb_top40")):
    save_topk_model(top_k, registry_name)



=== üöÄ Publication mod√®le optiweb_top20 (K=20) ===


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 685.12it/s]
  client.transition_model_version_stage(


üéâ Mod√®le optiweb_top20 publi√© ‚Äî version 1

=== üöÄ Publication mod√®le optiweb_top40 (K=40) ===


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 432.01it/s]

üéâ Mod√®le optiweb_top40 publi√© ‚Äî version 1



  client.transition_model_version_stage(
