In [4]:
import numpy as np
import pandas as pd
import shap
import os
import matplotlib
matplotlib.use("Agg")               
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, cross_validate, cross_val_predict
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.inspection import permutation_importance
from shap import TreeExplainer
from scipy import sparse
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import FunctionTransformer
from sklearn.calibration import CalibratedClassifierCV

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, 
    classification_report, confusion_matrix,
    confusion_matrix, roc_auc_score, 
    average_precision_score, precision_recall_curve,
    precision_recall_fscore_support, auc
)

import importlib
import fonctions.fonctions_features as ff
importlib.reload(ff)
from fonctions.fonctions_features import FeatureEngineer, ColumnDropper

import fonctions.fonctions_CV as funcCV
importlib.reload(funcCV)
from fonctions.fonctions_CV import perm_importance_cv, get_transformed_feature_names_and_source_map

import fonctions.fonctions_grid as fg
importlib.reload(fg)
from fonctions.fonctions_grid import get_positive_scores, evaluate_at_threshold, pick_threshold, pick_threshold_oof

### Import du jeu de données 
 Dans cette partie, j'importe les données et je paramètre les éléments qui sont utilisés dans plusieurs parties du code, comme la liste des colonnes à supprimer.


In [5]:
# Load the dataset from a CSV file; the path is relative to the current working directory.
data = pd.read_csv("data/jeu_donnee_RH_complet_transforme.csv")

In [6]:
# Shared configuration reused by the modelling cells below.

# Column names handed to ColumnDropper(columns=...) in the pipelines.
cols_a_supprimer = [
    "hors_entreprise_majoritaire",
    "perf_degrade_flag",
    "perf_degrade_niv",
    "a_connu_mvmnt_interne",
    "genre",
    "heure_supplementaires",
    "pee_participation_flag",
    "pee_participation_2plus",
    "revenu_mensuel",
    "niveau_education",
    "niveau_hierarchique_poste",
    "annees_dans_le_poste_actuel",
    "poste",
    "annees_dans_l_entreprise",
    "age",
    "annee_experience_totale",
    "annes_sous_responsable_actuel",
]

# Decision threshold applied to the positive-class probability in the test reports.
thr = 0.3

# Resamplers shared by the imbalanced-learn pipelines.
# A fixed random_state makes the synthetic SMOTE samples and the under-sampling
# draw reproducible, so metrics do not change between two runs of the notebook.
over = SMOTE(sampling_strategy=0.2, k_neighbors=5, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=42)

# Test des différents modèles sur une version basique pour sélectionner le modèle

Cette partie du code lance différents modèles avec la même méthodologie pour tenter de choisir le modèle qui aura les performances les plus prometteuses. 
Le modèle Dummy sert d'étalon. 

In [7]:
# DUMMY baseline: always predicts the majority class, used as the reference score.

# --- 2) Input data preparation
data_pd = data.copy()

X = data_pd.drop(columns=["a_quitte_l_entreprise"])
y = data_pd["a_quitte_l_entreprise"].map({"Non": 0, "Oui": 1}).astype(int)

# --- 3) Full pipeline: feature engineering + standard preprocessing + DummyClassifier ---
numeric_sel = selector(dtype_include=["number", "bool"])   # booleans are treated as numeric
categorical_sel = selector(dtype_exclude=["number", "bool"])

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())   # centre and scale the numeric columns
        ]), numeric_sel),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore")),
        ]), categorical_sel),
    ],
    remainder="drop",
)

pipe_dummy = Pipeline(steps=[
    # Feature engineering is a pipeline step, so it is re-fitted on each training split.
    ("fe", FeatureEngineer()),
    ("prep", preprocess),
    ("clf", DummyClassifier(strategy="most_frequent", random_state=42)),
])

# --- 4A) Plain train / test hold-out ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

pipe_dummy.fit(X_train, y_train)

y_pred_dummy = pipe_dummy.predict(X_test)

# --- 4B) Stratified cross-validation ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Single source of truth for the metric names: used by cross_validate and by the print loop.
scoring = ("precision", "recall", "roc_auc")
scores = cross_validate(
    pipe_dummy, X, y, cv=cv,
    scoring=scoring,
    return_train_score=True
)

print("\n=== Dummy ===")

# NOTE: CV precision/recall are computed from predict() (default decision rule),
# whereas the hold-out report below applies the custom threshold `thr`.
for m in scoring:
    tr_mean, tr_std = scores[f"train_{m}"].mean(), scores[f"train_{m}"].std()
    te_mean, te_std = scores[f"test_{m}"].mean(), scores[f"test_{m}"].std()
    print(f"[{m}] train={tr_mean:.3f}±{tr_std:.3f} | test={te_mean:.3f}±{te_std:.3f}")

proba_te = pipe_dummy.predict_proba(X_test)[:, 1]
# Historical variable name: these are the predictions at `thr`, not at 0.6.
y_pred_06 = (proba_te >= thr).astype(int)

print(f"\n-- RAPPORT (seuil à {thr}) --")
print(classification_report(y_test, y_pred_06, digits=3, zero_division=0))

print("-- MATRICE DE CONFUSION --")
print(confusion_matrix(y_test, y_pred_06))  # [[tn, fp], [fn, tp]]

print("-- AUCs (seuil-indep.) --")
print(f"ROC AUC (test) = {roc_auc_score(y_test, proba_te):.3f}")
print(f"PR AUC  (test) = {average_precision_score(y_test, proba_te):.3f}")


=== Dummy ===
[precision] train=0.000±0.000 | test=0.000±0.000
[recall] train=0.000±0.000 | test=0.000±0.000
[roc_auc] train=0.500±0.000 | test=0.500±0.000

-- RAPPORT (seuil à 0.3) --
              precision    recall  f1-score   support

           0      0.840     1.000     0.913       247
           1      0.000     0.000     0.000        47

    accuracy                          0.840       294
   macro avg      0.420     0.500     0.457       294
weighted avg      0.706     0.840     0.767       294

-- MATRICE DE CONFUSION --
[[247   0]
 [ 47   0]]
-- AUCs (seuil-indep.) --
ROC AUC (test) = 0.500
PR AUC  (test) = 0.160


In [8]:
# Logistic regression

# --- 2) Input data preparation
data_pd = data.copy()

X = data_pd.drop(columns=["a_quitte_l_entreprise"])
y = data_pd["a_quitte_l_entreprise"].map({"Non": 0, "Oui": 1}).astype(int)

# --- 3) Full pipeline: feature engineering + preprocessing + resampling + LogisticRegression ---
numeric_sel = selector(dtype_include=["number", "bool"])   # booleans are treated as numeric
categorical_sel = selector(dtype_exclude=["number", "bool"])

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())   # centre and scale the numeric columns
        ]), numeric_sel),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore")),
        ]), categorical_sel),
    ],
    remainder="drop",
)

# imblearn's Pipeline is required here because it is the one that accepts resampler steps.
pipe_logit = ImbPipeline(steps=[
    # Feature engineering is a pipeline step, so it is re-fitted on each training split.
    ("fe", FeatureEngineer()),
    ("drop", ColumnDropper(columns=cols_a_supprimer)),
    ("prep", preprocess),
    ("over", over),
    ("under", under),
    # NOTE(review): class_weight="balanced" is applied on top of SMOTE + under-sampling.
    ("clf", LogisticRegression(class_weight="balanced", max_iter=2000)),
])

# --- 4A) Plain train / test hold-out ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

pipe_logit.fit(X_train, y_train)
yp_logit_proba = pipe_logit.predict_proba(X_test)[:, 1]
yp_logit = (yp_logit_proba >= thr).astype(int)

# --- 4B) Stratified cross-validation ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Single source of truth for the metric names: used by cross_validate and by the print loop.
scoring = ("precision", "recall", "roc_auc")
scores = cross_validate(
    pipe_logit, X, y, cv=cv,
    scoring=scoring,
    return_train_score=True
)

print("Rapport RegressionLogistique")

# NOTE: CV precision/recall are computed from predict() (default decision rule),
# whereas the hold-out report below applies the custom threshold `thr`.
for m in scoring:
    tr_mean, tr_std = scores[f"train_{m}"].mean(), scores[f"train_{m}"].std()
    te_mean, te_std = scores[f"test_{m}"].mean(), scores[f"test_{m}"].std()
    print(f"[{m}] train={tr_mean:.3f}±{tr_std:.3f} | test={te_mean:.3f}±{te_std:.3f}")

# cross_validate works on clones, so pipe_logit is still the model fitted on X_train:
# reuse the probabilities computed above instead of scoring the test set a second time.
proba_te = yp_logit_proba
y_pred_thr = (proba_te >= thr).astype(int)

print(f"\n-- RAPPORT (seuil à {thr}) --")
print(classification_report(y_test, y_pred_thr, digits=3, zero_division=0))

print("-- MATRICE DE CONFUSION --")
print(confusion_matrix(y_test, y_pred_thr))  # [[tn, fp], [fn, tp]]

print("-- AUCs (seuil-indep.) --")
print(f"ROC AUC (test) = {roc_auc_score(y_test, proba_te):.3f}")
print(f"PR AUC  (test) = {average_precision_score(y_test, proba_te):.3f}")

Rapport RegressionLogistique
[precision] train=0.408±0.010 | test=0.380±0.016
[recall] train=0.801±0.012 | test=0.772±0.033
[roc_auc] train=0.867±0.004 | test=0.832±0.014

-- RAPPORT (seuil à 0.3) --
              precision    recall  f1-score   support

           0      0.958     0.559     0.706       247
           1      0.273     0.872     0.416        47

    accuracy                          0.609       294
   macro avg      0.616     0.716     0.561       294
weighted avg      0.849     0.609     0.660       294

-- MATRICE DE CONFUSION --
[[138 109]
 [  6  41]]
-- AUCs (seuil-indep.) --
ROC AUC (test) = 0.826
PR AUC  (test) = 0.548


In [9]:
# Random forest

# --- 2) Input data preparation
data_pd = data.copy()

X = data_pd.drop(columns=["a_quitte_l_entreprise"])
y = data_pd["a_quitte_l_entreprise"].map({"Non": 0, "Oui": 1}).astype(int)

# --- 3) Full pipeline: feature engineering + preprocessing + resampling + RandomForest ---
numeric_sel = selector(dtype_include=["number", "bool"])   # booleans are treated as numeric
categorical_sel = selector(dtype_exclude=["number", "bool"])

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())   # centre and scale the numeric columns
        ]), numeric_sel),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore")),
        ]), categorical_sel),
    ],
    remainder="drop",
)

# imblearn's Pipeline is required here because it is the one that accepts resampler steps.
pipe_random = ImbPipeline(steps=[
    # Feature engineering is a pipeline step, so it is re-fitted on each training split.
    ("fe", FeatureEngineer()),
    ("drop", ColumnDropper(columns=cols_a_supprimer)),
    ("prep", preprocess),
    ("over", over),
    ("under", under),
    ("rf", RandomForestClassifier(
        n_estimators=400,
        max_depth=None,
        min_samples_leaf=2,
        class_weight="balanced_subsample",
        n_jobs=-1,
        random_state=42
    )),
])

# --- 4A) Plain train / test hold-out ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

pipe_random.fit(X_train, y_train)
yp_random = pipe_random.predict(X_test)
yp_random_proba = pipe_random.predict_proba(X_test)[:, 1]

# --- 4B) Stratified cross-validation ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Single source of truth for the metric names: used by cross_validate and by the print loop.
scoring = ("precision", "recall", "roc_auc")
scores = cross_validate(
    pipe_random, X, y, cv=cv,
    scoring=scoring,
    return_train_score=True
)

print("Rapport ForetAleatoire")

# NOTE: CV precision/recall are computed from predict() (default decision rule),
# whereas the hold-out report below applies the custom threshold `thr`.
for m in scoring:
    tr_mean, tr_std = scores[f"train_{m}"].mean(), scores[f"train_{m}"].std()
    te_mean, te_std = scores[f"test_{m}"].mean(), scores[f"test_{m}"].std()
    print(f"[{m}] train={tr_mean:.3f}±{tr_std:.3f} | test={te_mean:.3f}±{te_std:.3f}")

# cross_validate works on clones, so pipe_random is still the model fitted on X_train:
# reuse the probabilities computed above instead of scoring the test set a second time.
proba_te = yp_random_proba
y_pred_thr = (proba_te >= thr).astype(int)

print(f"\n-- RAPPORT (seuil à {thr}) --")
print(classification_report(y_test, y_pred_thr, digits=3, zero_division=0))

print("-- MATRICE DE CONFUSION --")
print(confusion_matrix(y_test, y_pred_thr))  # [[tn, fp], [fn, tp]]

print("-- AUCs (seuil-indep.) --")
print(f"ROC AUC (test) = {roc_auc_score(y_test, proba_te):.3f}")
print(f"PR AUC  (test) = {average_precision_score(y_test, proba_te):.3f}")

Rapport ForetAleatoire
[precision] train=0.779±0.023 | test=0.504±0.040
[recall] train=1.000±0.000 | test=0.473±0.027
[roc_auc] train=0.997±0.001 | test=0.809±0.024

-- RAPPORT (seuil à 0.3) --
              precision    recall  f1-score   support

           0      0.953     0.575     0.717       247
           1      0.276     0.851     0.417        47

    accuracy                          0.619       294
   macro avg      0.614     0.713     0.567       294
weighted avg      0.845     0.619     0.669       294

-- MATRICE DE CONFUSION --
[[142 105]
 [  7  40]]
-- AUCs (seuil-indep.) --
ROC AUC (test) = 0.810
PR AUC  (test) = 0.518


In [10]:
# HistGradientBoosting

# --- 2) Input data preparation
data_pd = data.copy()

X = data_pd.drop(columns=["a_quitte_l_entreprise"])
y = data_pd["a_quitte_l_entreprise"].map({"Non": 0, "Oui": 1}).astype(int)

# --- 3) Full pipeline: feature engineering + preprocessing + resampling + HistGradientBoosting ---
numeric_sel = selector(dtype_include=["number", "bool"])   # booleans are treated as numeric
categorical_sel = selector(dtype_exclude=["number", "bool"])

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())   # centre and scale the numeric columns
        ]), numeric_sel),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore")),
        ]), categorical_sel),
    ],
    remainder="drop",
)

# imblearn's Pipeline is required here because it is the one that accepts resampler steps.
pipe_hgb = ImbPipeline(steps=[
    # Feature engineering is a pipeline step, so it is re-fitted on each training split.
    ("fe", FeatureEngineer()),
    ("drop", ColumnDropper(columns=cols_a_supprimer)),
    ("prep", preprocess),
    ("over", over),
    ("under", under),
    ("hgb", HistGradientBoostingClassifier(
        max_depth=None,
        learning_rate=0.1,
        max_iter=300,
        random_state=42,
        early_stopping=True,
        validation_fraction=0.1
        # (no class_weight here; imbalance is handled through the chosen metric and later tuning)
    )),
])

# --- 4A) Plain train / test hold-out ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

pipe_hgb.fit(X_train, y_train)
yp_hgb = pipe_hgb.predict(X_test)
yp_hgb_proba = pipe_hgb.predict_proba(X_test)[:, 1]

# --- 4B) Stratified cross-validation ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Single source of truth for the metric names: used by cross_validate and by the print loop.
scoring = ("precision", "recall", "roc_auc")
scores = cross_validate(
    pipe_hgb, X, y, cv=cv,
    scoring=scoring,
    return_train_score=True
)
print("Rapport HistGradientBoosting")

# NOTE: CV precision/recall are computed from predict() (default decision rule),
# whereas the hold-out report below applies the custom threshold `thr`.
for m in scoring:
    tr_mean, tr_std = scores[f"train_{m}"].mean(), scores[f"train_{m}"].std()
    te_mean, te_std = scores[f"test_{m}"].mean(), scores[f"test_{m}"].std()
    print(f"[{m}] train={tr_mean:.3f}±{tr_std:.3f} | test={te_mean:.3f}±{te_std:.3f}")

# cross_validate works on clones, so pipe_hgb is still the model fitted on X_train:
# reuse the probabilities computed above instead of scoring the test set a second time.
proba_te = yp_hgb_proba
y_pred_thr = (proba_te >= thr).astype(int)

print(f"\n-- RAPPORT (seuil à {thr}) --")
print(classification_report(y_test, y_pred_thr, digits=3, zero_division=0))

print("-- MATRICE DE CONFUSION --")
print(confusion_matrix(y_test, y_pred_thr))  # [[tn, fp], [fn, tp]]

print("-- AUCs (seuil-indep.) --")
print(f"ROC AUC (test) = {roc_auc_score(y_test, proba_te):.3f}")
print(f"PR AUC  (test) = {average_precision_score(y_test, proba_te):.3f}")

Rapport HistGradientBoosting
[precision] train=0.704±0.027 | test=0.465±0.039
[recall] train=0.848±0.052 | test=0.515±0.068
[roc_auc] train=0.952±0.009 | test=0.793±0.017

-- RAPPORT (seuil à 0.3) --
              precision    recall  f1-score   support

           0      0.941     0.717     0.814       247
           1      0.340     0.766     0.471        47

    accuracy                          0.724       294
   macro avg      0.641     0.741     0.642       294
weighted avg      0.845     0.724     0.759       294

-- MATRICE DE CONFUSION --
[[177  70]
 [ 11  36]]
-- AUCs (seuil-indep.) --
ROC AUC (test) = 0.802
PR AUC  (test) = 0.571


In [11]:
# XGBOOST
# --- 2) Input data preparation (pandas) ---
data_pd = data.copy()

X = data_pd.drop(columns=["a_quitte_l_entreprise"])
y = data_pd["a_quitte_l_entreprise"].map({"Non": 0, "Oui": 1}).astype(int)

# --- 3) Full pipeline: feature engineering + preprocessing + resampling + XGBoost ---
# Booleans get their own branch here instead of being scaled with the numeric columns.
numeric_cont_sel = selector(dtype_include=["number"], dtype_exclude=["bool"])
bool_sel         = selector(dtype_include=["bool"])
categorical_sel  = selector(dtype_exclude=["number", "bool"])

num_poly_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    #("poly", PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)),
    ("scaler", StandardScaler()),
])

bool_pipe = Pipeline([
    ("to_int", FunctionTransformer(lambda X: X.astype(np.int8))),   # bool -> 0/1
    ("imputer", SimpleImputer(strategy="most_frequent"))
])

cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))  # dense output
])

preprocess = ColumnTransformer(
    transformers=[
        ("num",  num_poly_pipe, numeric_cont_sel),
        ("bool", bool_pipe, bool_sel),
        ("cat",  cat_pipe, categorical_sel),
    ],
    remainder="drop",
)

# imblearn's Pipeline is required here because it is the one that accepts resampler steps.
pipe_XG = ImbPipeline(steps=[
    # Feature engineering is a pipeline step, so it is re-fitted on each training split.
    ("fe", FeatureEngineer()),
    ("drop", ColumnDropper(columns=cols_a_supprimer)),
    ("prep", preprocess),
    ("over", over),
    ("under", under),
    ("xgb", XGBClassifier(
        random_state=42,
        eval_metric="logloss",
        n_estimators=300,
        learning_rate=0.1,
        subsample=0.9,
        colsample_bytree=0.9,
        n_jobs=-1
    )),
])

# --- 4A) Plain train / test hold-out ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

pipe_XG.fit(X_train, y_train)

# --- 4B) Stratified cross-validation ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Single source of truth for the metric names: used by cross_validate and by the print loop.
scoring = ("precision", "recall", "roc_auc")
scores = cross_validate(
    pipe_XG, X, y, cv=cv,
    scoring=scoring,
    return_train_score=True
)

print("\n=== XgBoost ===")

# NOTE: CV precision/recall are computed from predict() (default decision rule),
# whereas the hold-out report below applies the custom threshold `thr`.
for m in scoring:
    tr_mean, tr_std = scores[f"train_{m}"].mean(), scores[f"train_{m}"].std()
    te_mean, te_std = scores[f"test_{m}"].mean(), scores[f"test_{m}"].std()
    print(f"[{m}] train={tr_mean:.3f}±{tr_std:.3f} | test={te_mean:.3f}±{te_std:.3f}")

proba_te = pipe_XG.predict_proba(X_test)[:, 1]
# Historical variable name: these are the predictions at `thr`, not at 0.6.
y_pred_06 = (proba_te >= thr).astype(int)

# Print the threshold actually applied rather than a hard-coded literal, so the
# header stays truthful when `thr` is changed in the configuration cell.
print(f"\n-- RAPPORT (seuil à {thr}) --")
print(classification_report(y_test, y_pred_06, digits=3, zero_division=0))

print("-- MATRICE DE CONFUSION --")
print(confusion_matrix(y_test, y_pred_06))  # [[tn, fp], [fn, tp]]

print("-- AUCs (seuil-indep.) --")
print(f"ROC AUC (test) = {roc_auc_score(y_test, proba_te):.3f}")
print(f"PR AUC  (test) = {average_precision_score(y_test, proba_te):.3f}")


=== XgBoost ===
[precision] train=0.730±0.013 | test=0.459±0.031
[recall] train=1.000±0.000 | test=0.510±0.050
[roc_auc] train=0.996±0.001 | test=0.803±0.012

-- RAPPORT (seuil à 0.3) --
              precision    recall  f1-score   support

           0      0.925     0.802     0.859       247
           1      0.388     0.660     0.488        47

    accuracy                          0.779       294
   macro avg      0.656     0.731     0.674       294
weighted avg      0.839     0.779     0.800       294

-- MATRICE DE CONFUSION --
[[198  49]
 [ 16  31]]
-- AUCs (seuil-indep.) --
ROC AUC (test) = 0.813
PR AUC  (test) = 0.485


# Calibrage des hyperparamètres 

Cette partie du code calibre les hyperparamètres. 
Deux solutions pour le rééquilibrage des données sont testées (SMOTE et scale_pos_weight) dans deux codes similaires.
Un des deux codes sera à retirer selon les résultats.

In [12]:
# ===============================================================
# XGBoost + (optional) SMOTE/Under + GridSearch + PR threshold + (optional) calibration
# ===============================================================

# ===============================================================
# 0) General parameters to tune
# ===============================================================
RANDOM_STATE = 42

# Class-balancing strategy
USE_SMOTE = True          # True => SMOTE + under-sampler ; False => no resampling (scale_pos_weight instead)
SMOTE_RATIO = 0.20        # target ratio (minority / majority) for SMOTE
UNDER_RATIO = 0.50        # target ratio (minority / majority) after under-sampling

# Probability calibration (isotonic or sigmoid). Set to True to calibrate.
USE_CALIBRATION = False
CALIB_METHOD = "isotonic"  # "isotonic" (more flexible) or "sigmoid" (Platt)

# Business objective (threshold)
USE_TARGET_RECALL = True   # True => pick the threshold that reaches a target recall
RECALL_TARGET = 0.75       # target recall
# If False => the threshold maximising F1 (on OOF scores) is used


# ===============================================================
# 1) Data & split
# ===============================================================
# Starts from the DataFrame `data` with the target 'a_quitte_l_entreprise' ("Non"/"Oui")
data_pd = data.copy()
X = data_pd.drop(columns=["a_quitte_l_entreprise"])
y = data_pd["a_quitte_l_entreprise"].map({"Non": 0, "Oui": 1}).astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=RANDOM_STATE
)

# ===============================================================
# 2) Preprocessing (impute + scale numeric, impute + one-hot categorical)
#     NB: one-hot encoding is kept even for XGBoost to stay simple and SMOTE-compatible
# ===============================================================
numeric_cont_sel = selector(dtype_include=["number"], dtype_exclude=["bool"])
bool_sel         = selector(dtype_include=["bool"])
categorical_sel  = selector(dtype_exclude=["number", "bool"])

num_poly_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    #("poly", PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)),
    ("scaler", StandardScaler()),
])

bool_pipe = Pipeline([
    ("to_int", FunctionTransformer(lambda X: X.astype(np.int8),
                                  feature_names_out="one-to-one")),   # bool -> 0/1
    ("imputer", SimpleImputer(strategy="most_frequent"))
])

cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))  # dense output
])

preprocess = ColumnTransformer(
    transformers=[
        ("num",  num_poly_pipe, numeric_cont_sel),
        ("bool", bool_pipe, bool_sel),
        ("cat",  cat_pipe, categorical_sel),
    ],
    remainder="drop",
)

# ===============================================================
# 3) XGBClassifier + (optional) SMOTE/Under + GridSearchCV
# ===============================================================
def compute_scale_pos_weight(y):
    """Return the negative/positive count ratio of `y`, or 1.0 when `y` has no positive.

    Used as XGBoost's `scale_pos_weight` when resampling is disabled.
    """
    pos = (y == 1).sum()
    neg = (y == 0).sum()
    return float(neg) / float(pos) if pos > 0 else 1.0

base_xgb_params = dict(
    objective="binary:logistic",
    eval_metric="aucpr",          # relevant with imbalanced classes
    tree_method="hist",           # fast
    random_state=RANDOM_STATE,
    n_jobs=-1
)

if not USE_SMOTE:
    # Alternative to SMOTE: weight the positive class by (#neg / #pos).
    base_xgb_params["scale_pos_weight"] = compute_scale_pos_weight(y_train)

xgb_clf = XGBClassifier(**base_xgb_params)

if USE_SMOTE:
    # imblearn's Pipeline is required to chain resamplers.
    pipe = ImbPipeline(steps=[
        ("fe", FeatureEngineer()),
        ("drop", ColumnDropper(columns=cols_a_supprimer)),
        ("prep", preprocess),
        ("over", SMOTE(sampling_strategy=SMOTE_RATIO, k_neighbors=5, random_state=RANDOM_STATE)),
        ("under", RandomUnderSampler(sampling_strategy=UNDER_RATIO, random_state=RANDOM_STATE)),
        ("xgb", xgb_clf),
    ])
else:
    pipe = Pipeline(steps=[
        ("fe", FeatureEngineer()),
        ("drop", ColumnDropper(columns=cols_a_supprimer)),
        ("prep", preprocess),
        ("xgb", xgb_clf),
    ])

# XGB grid (2 * 3^6 * 2 * ... = 2916 candidates, i.e. 14580 fits with 5 folds);
# the best candidate is selected and refitted on average precision (PR-AUC).
param_grid = {
    "xgb__n_estimators": [400, 800],
    "xgb__learning_rate": [0.03, 0.08, 0.15],
    "xgb__max_depth": [3, 4, 5],
    "xgb__min_child_weight": [1, 3, 5],
    "xgb__subsample": [0.7, 0.9, 1.0],
    "xgb__colsample_bytree": [0.7, 0.9, 1.0],
    "xgb__reg_alpha": [0.0, 0.1],
    "xgb__reg_lambda": [1.0, 2.0, 5.0],
    # "xgb__gamma": [0.0, 0.1],  # enable to make splits more conservative
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
scoring = {"ap": "average_precision", "roc_auc": "roc_auc"}

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=scoring,
    refit="ap",            # refit the candidate with the best AP (PR-AUC)
    cv=cv,
    n_jobs=-1,
    verbose=1
)

gs.fit(X_train, y_train)
print("\n=== Meilleurs hyperparamètres (refit=AP) ===")
print(gs.best_params_)
print(f"Meilleure AP (CV) : {gs.best_score_:.4f}")

best_model = gs.best_estimator_

# ===============================================================
# 4) (optional) Probability calibration
#     Ideally calibration is done on a separate validation split.
#     Here cv=5 is used to limit leakage.
# ===============================================================
if USE_CALIBRATION:
    # `estimator=` replaces the removed `base_estimator=` keyword; it is available in
    # every scikit-learn release that also supports OneHotEncoder(sparse_output=...).
    calibrator = CalibratedClassifierCV(
        estimator=best_model,
        method=CALIB_METHOD,
        cv=5
    )
    calibrator.fit(X_train, y_train)
    final_model = calibrator
else:
    final_model = best_model

# ===============================================================
# 5) Threshold selection WITHOUT looking at the test set (OOF on TRAIN)
# ===============================================================
mode = "target_recall" if USE_TARGET_RECALL else "max_f1"
thr_oof, thr_summary = pick_threshold_oof(
    final_model, X_train, y_train,
    mode=mode, recall_target=RECALL_TARGET,
    n_splits=5, random_state=RANDOM_STATE
)
print("\n=== Seuil choisi (OOF) ===")
print(thr_summary)

# ===============================================================
# 6) Final evaluation on TEST
# ===============================================================
# GridSearchCV(refit="ap") has already refitted the best pipeline on the whole
# training set; this explicit fit is kept as a safeguard.
final_model.fit(X_train, y_train)

# Threshold-independent scores
proba_test = get_positive_scores(final_model, X_test)
print("\n=== AUC/AP sur test (seuil-indépendant) ===")
print("ROC AUC (test) :", f"{roc_auc_score(y_test, proba_test):.3f}")
print("Avg Precision (test) :", f"{average_precision_score(y_test, proba_test):.3f}")

# Report at the OOF threshold
res = evaluate_at_threshold(y_test, proba_test, thr_oof)
print(f"\n=== Rapport @ seuil OOF ({mode}) ===")
print(f"Seuil: {res['threshold']:.4f}")
print(res["report"])
print("Confusion matrix [[tn, fp],[fn, tp]]:\n", res["confusion_matrix"])

# (Diagnostic only) F1-max point on the TEST set; do NOT freeze this threshold for production.
# The threshold array gets its own name so the notebook-level scalar `thr` used by the
# model-comparison cells is not overwritten by an array.
prec, rec, thr_curve = precision_recall_curve(y_test, proba_test)
P, R, T = prec[:-1], rec[:-1], thr_curve
F1 = (2 * P * R) / (P + R + 1e-12)
ix = int(np.nanargmax(F1))
print("\n[INFO] Test F1-max (diagnostic) : "
      f"thr={T[ix]:.4f} | P={P[ix]:.3f} R={R[ix]:.3f} F1={F1[ix]:.3f}")


Fitting 5 folds for each of 2916 candidates, totalling 14580 fits

=== Meilleurs hyperparamètres (refit=AP) ===
{'xgb__colsample_bytree': 0.7, 'xgb__learning_rate': 0.03, 'xgb__max_depth': 3, 'xgb__min_child_weight': 3, 'xgb__n_estimators': 400, 'xgb__reg_alpha': 0.1, 'xgb__reg_lambda': 1.0, 'xgb__subsample': 0.7}
Meilleure AP (CV) : 0.5788

=== Seuil choisi (OOF) ===
{'chosen_mode': 'target_recall', 'recall_target': 0.75, 'threshold': 0.2880828082561493, 'precision': 0.35049019607843135, 'recall': 0.7526315789473684, 'f1': 0.4782608695652174, 'avg_precision_pr': 0.5631597784050519, 'roc_auc': 0.8189708551297107}

=== AUC/AP sur test (seuil-indépendant) ===
ROC AUC (test) : 0.795
Avg Precision (test) : 0.536

=== Rapport @ seuil OOF (target_recall) ===
Seuil: 0.2881
              precision    recall  f1-score   support

           0      0.948     0.733     0.826       247
           1      0.359     0.787     0.493        47

    accuracy                          0.741       294
   ma

In [13]:
# ===============================================================
# XGBoost + (optional) SMOTE/Under + GridSearch + PR threshold + (optional) calibration
# ===============================================================

# ===============================================================
# 0) General parameters to tune
# ===============================================================
RANDOM_STATE = 42

# Class-balancing strategy
USE_SMOTE = False          # True => SMOTE + under-sampler ; False => no resampling
SMOTE_RATIO = 0.20        # target ratio (minority / majority) for SMOTE
UNDER_RATIO = 0.50        # target ratio (minority / majority) after under-sampling

# Probability calibration (isotonic or sigmoid). Set to True to calibrate.
USE_CALIBRATION = True
CALIB_METHOD = "isotonic"  # "isotonic" (more flexible) or "sigmoid" (Platt)

# Business objective (threshold)
USE_TARGET_RECALL = True   # True => pick the threshold that reaches a target recall
RECALL_TARGET = 0.75       # target recall
# If False => the threshold maximising F1 (on OOF scores) is used

# ===============================================================
# 1) Utilitaires (seuils & évaluations)
# ===============================================================
def get_positive_scores(model, X):
    """Return one score per sample for the positive class (label 1).

    `predict_proba(X)[:, 1]` is used when the model exposes `predict_proba`;
    otherwise `decision_function(X)` is used. A 1-D decision output is returned
    as is; for a 2-D output the column matching label 1 in `model.classes_` is
    returned, falling back to the last column when that label cannot be located.

    Raises:
        AttributeError: if the model has neither `predict_proba` nor
            `decision_function`.
    """
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)[:, 1]

    if not hasattr(model, "decision_function"):
        raise AttributeError("Le modèle ne fournit ni predict_proba ni decision_function.")

    raw = model.decision_function(X)
    if raw.ndim == 1:
        return raw

    # 2-D output: look for the column of class 1, default to the last column.
    column = -1
    if hasattr(model, "classes_"):
        hits = np.where(np.array(model.classes_) == 1)[0]
        if len(hits):
            column = int(hits[0])
    return raw[:, column]

def evaluate_at_threshold(y_true, scores, thr):
    """Binarise `scores` at `thr` and compute the usual classification metrics.

    Args:
        y_true: true binary labels.
        scores: positive-class scores (array, Series or plain list).
        thr: a sample is predicted positive when its score is >= `thr`.

    Returns:
        dict with keys "threshold", "precision", "recall", "f1", "accuracy",
        "balanced_accuracy", "avg_precision_pr", "roc_auc" (floats),
        "confusion_matrix" ([[tn, fp], [fn, tp]]) and "report" (text report).
    """
    # Coerce to an ndarray so that plain Python lists support the `>=` comparison.
    scores = np.asarray(scores)
    y_pred = (scores >= thr).astype(int)
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="binary", zero_division=0
    )
    acc = accuracy_score(y_true, y_pred)
    bacc = balanced_accuracy_score(y_true, y_pred)
    # AP and ROC AUC are threshold-independent: they use the raw scores.
    ap = average_precision_score(y_true, scores)
    roc = roc_auc_score(y_true, scores)
    cm = confusion_matrix(y_true, y_pred)  # [[tn, fp],[fn, tp]]
    rep = classification_report(y_true, y_pred, digits=3, zero_division=0)
    return {
        "threshold": float(thr),
        "precision": float(prec),
        "recall": float(rec),
        "f1": float(f1),
        "accuracy": float(acc),
        "balanced_accuracy": float(bacc),
        "avg_precision_pr": float(ap),
        "roc_auc": float(roc),
        "confusion_matrix": cm,
        "report": rep
    }

def pick_threshold(y_true, scores, mode="max_f1", recall_target=0.75):
    """Select a decision threshold from the precision-recall curve.

    * ``mode="max_f1"``: threshold with the highest F1.
    * ``mode="target_recall"``: among thresholds whose recall is at least
      ``recall_target``, the one with the highest F1; if none qualifies, the
      global F1 maximum is used.

    Returns ``(threshold, summary)`` where ``summary`` holds the chosen mode,
    the recall target (``None`` outside target_recall mode), the threshold,
    precision / recall / F1 at that point, average precision and ROC AUC.

    Raises ``ValueError`` for any other ``mode``.
    """
    curve_precision, curve_recall, curve_thresholds = precision_recall_curve(y_true, scores)
    # precision/recall hold one more point than thresholds: drop the trailing one.
    prec_pts = curve_precision[:-1]
    rec_pts = curve_recall[:-1]
    f1_pts = np.where((prec_pts + rec_pts) > 0, 2 * prec_pts * rec_pts / (prec_pts + rec_pts), 0.0)

    if mode == "max_f1":
        best = int(np.nanargmax(f1_pts))
    elif mode == "target_recall":
        eligible = np.where(rec_pts >= recall_target)[0]
        if len(eligible):
            best = int(eligible[np.nanargmax(f1_pts[eligible])])
        else:
            best = int(np.nanargmax(f1_pts))
    else:
        raise ValueError("mode doit être 'max_f1' ou 'target_recall'")

    chosen = float(curve_thresholds[best])
    summary = {
        "chosen_mode": mode,
        "recall_target": recall_target if mode == "target_recall" else None,
        "threshold": chosen,
        "precision": float(prec_pts[best]),
        "recall": float(rec_pts[best]),
        "f1": float(f1_pts[best]),
        "avg_precision_pr": float(average_precision_score(y_true, scores)),
        "roc_auc": float(roc_auc_score(y_true, scores)),
    }
    return chosen, summary

def pick_threshold_oof(model, X_train, y_train, mode="max_f1", recall_target=0.75, n_splits=5, random_state=42):
    """Pick the threshold on out-of-fold (OOF) train scores, so the test set is never consulted.

    OOF probabilities come from ``cross_val_predict`` (``predict_proba``,
    second column) over a shuffled ``StratifiedKFold``; the threshold itself
    is then chosen by ``pick_threshold``.

    Returns ``(threshold, summary)`` as produced by ``pick_threshold``.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof_proba = cross_val_predict(model, X_train, y_train, cv=splitter, method="predict_proba")
    chosen_thr, details = pick_threshold(
        y_train, oof_proba[:, 1], mode=mode, recall_target=recall_target
    )
    return float(chosen_thr), details

# ===============================================================
# 3) Données & split
# ===============================================================
# Start from a DataFrame `data` whose target 'a_quitte_l_entreprise' holds "Non"/"Oui".
data_pd = data.copy()
X = data_pd.drop(columns=["a_quitte_l_entreprise"])

# Encode the target as 0/1. Any label outside the mapping becomes NaN; fail
# with an explicit message instead of the opaque NaN-to-int cast error that
# .astype(int) would raise.
y = data_pd["a_quitte_l_entreprise"].map({"Non": 0, "Oui": 1})
if y.isna().any():
    unexpected_labels = data_pd.loc[y.isna(), "a_quitte_l_entreprise"].unique().tolist()
    raise ValueError(
        "Valeurs inattendues dans 'a_quitte_l_entreprise' (attendu : 'Non'/'Oui') : "
        f"{unexpected_labels}"
    )
y = y.astype(int)

# Stratified 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=RANDOM_STATE
)

# ===============================================================
# 4) Préprocessing (imputer+scale num, imputer+OHE cat)
#     NB: on laisse OHE même pour XGBoost pour rester simple & compatible SMOTE
# ===============================================================
# Column selectors by dtype.
numeric_cont_sel = selector(dtype_include=["number"], dtype_exclude=["bool"])
bool_sel         = selector(dtype_include=["bool"])
categorical_sel  = selector(dtype_exclude=["number", "bool"])

# Numeric columns: median imputation, then standardisation.
num_poly_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    #("poly", PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)),
    ("scaler", StandardScaler()),
])


def _bool_to_int8(X):
    """Cast boolean columns to 0/1 integers (int8)."""
    return X.astype(np.int8)


# Boolean columns: cast to 0/1 with a named function rather than a lambda,
# because a FunctionTransformer wrapping a lambda cannot be pickled, which
# prevents persisting the fitted pipeline.
bool_pipe = Pipeline([
    ("to_int", FunctionTransformer(_bool_to_int8,
                                  feature_names_out="one-to-one")),   # bool -> 0/1
    ("imputer", SimpleImputer(strategy="most_frequent"))
])

# Categorical columns: most-frequent imputation, then dense one-hot encoding.
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))  # dense
])

# Columns matched by none of the selectors are dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("num",  num_poly_pipe, numeric_cont_sel),
        ("bool", bool_pipe, bool_sel),
        ("cat",  cat_pipe, categorical_sel),
    ],
    remainder="drop",
)

# ===============================================================
# 5) XGBClassifier + (option) SMOTE/Under + GridSearchCV
# ===============================================================
# Option alternative au SMOTE: scale_pos_weight = (#neg/#pos) recommandé par XGBoost
def compute_scale_pos_weight(y):
    """Return the negative/positive count ratio (#y==0 / #y==1) as a float.

    Falls back to 1.0 when ``y`` contains no positive sample.
    """
    n_pos = (y == 1).sum()
    if not n_pos > 0:
        return 1.0
    n_neg = (y == 0).sum()
    return float(n_neg) / float(n_pos)

base_xgb_params = dict(
    objective="binary:logistic",
    eval_metric="aucpr",          # relevant when classes are imbalanced
    tree_method="hist",           # fast
    random_state=RANDOM_STATE,
    # One thread per booster: the GridSearchCV below already parallelises over
    # candidates/folds with n_jobs=-1, and nesting two n_jobs=-1 levels
    # oversubscribes the CPU (thread contention slows both layers down).
    n_jobs=1,
)

# Without resampling, compensate the class imbalance with scale_pos_weight (#neg / #pos).
if not USE_SMOTE:
    base_xgb_params["scale_pos_weight"] = compute_scale_pos_weight(y_train)

xgb_clf = XGBClassifier(**base_xgb_params)

# Steps common to both variants: feature engineering, column drop, preprocessing.
shared_steps = [
    ("fe", FeatureEngineer()),
    ("drop", ColumnDropper(columns=cols_a_supprimer)),
    ("prep", preprocess),
]

if USE_SMOTE:
    # IMPORTANT: resamplers can only be chained inside an imblearn Pipeline.
    resampling_steps = [
        ("over", SMOTE(sampling_strategy=SMOTE_RATIO, k_neighbors=5, random_state=RANDOM_STATE)),
        ("under", RandomUnderSampler(sampling_strategy=UNDER_RATIO, random_state=RANDOM_STATE)),
    ]
    pipe = ImbPipeline(steps=shared_steps + resampling_steps + [("xgb", xgb_clf)])
else:
    pipe = Pipeline(steps=shared_steps + [("xgb", xgb_clf)])

# "Reasonable" (not too large) XGB grid; the best candidate is refit on AP (PR-AUC).
xgb_search_space = {
    "n_estimators": [400, 800],
    "learning_rate": [0.03, 0.08, 0.15],
    "max_depth": [3, 4, 5],
    "min_child_weight": [1, 3, 5],
    "subsample": [0.7, 0.9, 1.0],
    "colsample_bytree": [0.7, 0.9, 1.0],
    "reg_alpha": [0.0, 0.1],
    "reg_lambda": [1.0, 2.0, 5.0],
    # "gamma": [0.0, 0.1],  # enable to make splitting stricter
}
# Prefix every hyper-parameter with the name of the pipeline step it targets.
param_grid = {f"xgb__{hyperparam}": values for hyperparam, values in xgb_search_space.items()}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
scoring = {"ap": "average_precision", "roc_auc": "roc_auc"}

# refit="ap": the final estimator is the candidate with the best average precision.
gs = GridSearchCV(pipe, param_grid, scoring=scoring, refit="ap", cv=cv, n_jobs=-1, verbose=1)
gs.fit(X_train, y_train)

print(
    "\n=== Meilleurs hyperparamètres (refit=AP) ===",
    gs.best_params_,
    f"Meilleure AP (CV) : {gs.best_score_:.4f}",
    sep="\n",
)

best_model = gs.best_estimator_

# ===============================================================
# 6) (option) Calibration des probabilités
#     ⚠️ Idéalement faire la calibration sur un split de validation séparé.
#     Ici on illustre avec cv=5 pour limiter la fuite.
# ===============================================================
if not USE_CALIBRATION:
    final_model = best_model
else:
    calibrator = CalibratedClassifierCV(
        estimator=best_model,     # keyword is 'estimator', no longer 'base_estimator'
        method=CALIB_METHOD,      # "isotonic" or "sigmoid"
        cv=5,                     # internal CV used for calibration
        # n_jobs=-1               # availability depends on the sklearn version
    )
    # fit() returns the fitted calibrator itself.
    final_model = calibrator.fit(X_train, y_train)

# ===============================================================
# 7) Choix de seuil SANS regarder le test (OOF sur le TRAIN)
# ===============================================================
# Threshold strategy follows the business switch defined in the configuration.
mode = "max_f1" if not USE_TARGET_RECALL else "target_recall"

thr_oof, thr_summary = pick_threshold_oof(
    final_model,
    X_train,
    y_train,
    mode=mode,
    recall_target=RECALL_TARGET,
    n_splits=5,
    random_state=RANDOM_STATE,
)
print("\n=== Seuil choisi (OOF) ===", thr_summary, sep="\n")

# ===============================================================
# 8) Évaluation finale sur TEST
# ===============================================================
# Final refit on the full TRAIN set (GridSearchCV already fitted on the folds; refit once more to be safe).
final_model.fit(X_train, y_train)

# Threshold-independent scores
proba_test = get_positive_scores(final_model, X_test)
print("\n=== AUC/AP sur test (seuil-indépendant) ===")
print(f"ROC AUC (test) : {roc_auc_score(y_test, proba_test):.3f}")
print(f"Avg Precision (test) : {average_precision_score(y_test, proba_test):.3f}")

# Report at the OOF-selected threshold
res = evaluate_at_threshold(y_test, proba_test, thr_oof)
print(f"\n=== Rapport @ seuil OOF ({mode}) ===")
print(f"Seuil: {res['threshold']:.4f}")
print(res["report"])
print("Confusion matrix [[tn, fp],[fn, tp]]:\n", res["confusion_matrix"])

# (Diagnostic only) F1-max point on TEST: do NOT freeze this threshold for production.
prec, rec, thr = precision_recall_curve(y_test, proba_test)
# Drop the trailing precision/recall point, which has no matching threshold.
P = prec[:-1]
R = rec[:-1]
T = thr
F1 = (2 * P * R) / (P + R + 1e-12)  # epsilon guards against 0/0
ix = int(np.nanargmax(F1))
print(f"\n[INFO] Test F1-max (diagnostic) : thr={T[ix]:.4f} | P={P[ix]:.3f} R={R[ix]:.3f} F1={F1[ix]:.3f}")

Fitting 5 folds for each of 2916 candidates, totalling 14580 fits

=== Meilleurs hyperparamètres (refit=AP) ===
{'xgb__colsample_bytree': 0.9, 'xgb__learning_rate': 0.03, 'xgb__max_depth': 3, 'xgb__min_child_weight': 5, 'xgb__n_estimators': 400, 'xgb__reg_alpha': 0.0, 'xgb__reg_lambda': 5.0, 'xgb__subsample': 0.7}
Meilleure AP (CV) : 0.5894

=== Seuil choisi (OOF) ===
{'chosen_mode': 'target_recall', 'recall_target': 0.75, 'threshold': 0.1479661077260971, 'precision': 0.34375, 'recall': 0.7526315789473684, 'f1': 0.471947194719472, 'avg_precision_pr': 0.5781744669838718, 'roc_auc': 0.8145857798654852}

=== AUC/AP sur test (seuil-indépendant) ===
ROC AUC (test) : 0.819
Avg Precision (test) : 0.564

=== Rapport @ seuil OOF (target_recall) ===
Seuil: 0.1480
              precision    recall  f1-score   support

           0      0.953     0.737     0.831       247
           1      0.369     0.809     0.507        47

    accuracy                          0.748       294
   macro avg      

# Courbes Precision–Recall

Ici, trois tracés de courbes précision–rappel :
- courbe de base sur le jeu de test ;
- courbe sur le jeu de test, avec le point correspondant au seuil métier (choisi sur les scores OOF) et le point F1-max ;
- courbe de comparaison train (OOF) / test, avec ces mêmes points (seuil métier OOF et F1-max).

In [None]:
# Baseline precision-recall curve on the test set (`rec` / `prec` were computed in the evaluation cell).
base_fig, base_ax = plt.subplots()
base_ax.plot(rec, prec, label="PR (test)")
base_ax.set_xlabel("Recall")
base_ax.set_ylabel("Precision")
base_ax.set_title("Precision–Recall (Test)")
base_ax.legend()
base_fig.tight_layout()
base_fig.savefig("pr_curve_test_base.png", dpi=200)
plt.show()

# Area under the PR curve (trapezoidal rule over recall).
pr_auc_test = auc(rec, prec)
print("PR AUC :", pr_auc_test)

PR AUC : 0.5602192757077782


In [None]:
# --- PR data ---
prec, rec, thr = precision_recall_curve(y_test, proba_test)
ap = average_precision_score(y_test, proba_test)
pr_auc = auc(rec, prec)

# --- Figure ---
fig, ax = plt.subplots()
ax.plot(rec, prec, label=f"PR (AP={ap:.3f}, AUC={pr_auc:.3f})")

# Mark the operating point at the OOF threshold when `res` (summary at the
# OOF threshold) has already been computed.
try:
    ax.scatter(res["recall"], res["precision"], s=60, marker="o",
               label=f"Seuil OOF={res['threshold']:.3f}")
except NameError:
    # `res` is not defined in this session: skip the marker.
    # Any other failure (bad key, plotting error) is a real bug and must surface.
    pass

# F1-max point on the test set (diagnostic only)
P, R, T = prec[:-1], rec[:-1], thr
F1 = (2*P*R)/(P+R+1e-12)  # epsilon guards against 0/0
ix = int(np.nanargmax(F1))
ax.scatter(R[ix], P[ix], s=80, marker="x", label=f"F1-max @ {T[ix]:.3f}")

ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision–Recall (Test)")
ax.legend()
fig.tight_layout()

# Save, then display
fig.savefig("pr_curve_test.png", dpi=200)
plt.show()

Matplotlib backend: Agg
Courbe enregistrée : pr_curve_test.png


In [23]:
# === Switch: show/hide the F1-max markers on the comparison plot ===
SHOW_F1_MAX = True  

def _prec_rec_at_thr(y_true, scores, thr):
    """Return (precision, recall) of the positive class when predicting 1 for scores >= ``thr``."""
    hard_labels = (scores >= thr).astype(int)
    binary_metrics = precision_recall_fscore_support(
        y_true, hard_labels, average="binary", zero_division=0
    )
    precision_at_thr, recall_at_thr = binary_metrics[0], binary_metrics[1]
    return float(precision_at_thr), float(recall_at_thr)

def _f1_max_point(y_true, scores):
    """Return (recall, precision, threshold, F1) at the PR-curve point of maximal F1.

    Returns ``None`` in the degenerate case where the curve has no threshold.
    """
    curve_precision, curve_recall, curve_thresholds = precision_recall_curve(y_true, scores)
    if curve_thresholds.size == 0:
        return None  # degenerate case

    # Drop the trailing precision/recall point, which has no matching threshold.
    prec_pts = curve_precision[:-1]
    rec_pts = curve_recall[:-1]
    f1_pts = np.where((prec_pts + rec_pts) > 0, 2 * prec_pts * rec_pts / (prec_pts + rec_pts), 0.0)
    best = int(np.nanargmax(f1_pts))
    return (
        float(rec_pts[best]),
        float(prec_pts[best]),
        float(curve_thresholds[best]),
        float(f1_pts[best]),
    )

# --- PR curve on the test set ---
prec_te, rec_te, thr_te = precision_recall_curve(y_test, proba_test)
ap_te = average_precision_score(y_test, proba_test)
auc_te = auc(rec_te, prec_te)

# --- PR curve on out-of-fold (OOF) train predictions ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
proba_tr_oof = cross_val_predict(final_model, X_train, y_train, cv=cv, method="predict_proba")[:, 1]
prec_tr, rec_tr, thr_tr = precision_recall_curve(y_train, proba_tr_oof)
ap_tr = average_precision_score(y_train, proba_tr_oof)
auc_tr = auc(rec_tr, prec_tr)

# --- Comparison plot ---
fig, ax = plt.subplots()
ax.plot(rec_te, prec_te, label=f"Test (AP={ap_te:.3f}, AUC={auc_te:.3f})")
ax.plot(rec_tr, prec_tr, linestyle="--", label=f"Train OOF (AP={ap_tr:.3f}, AUC={auc_tr:.3f})")

# One row per compared set: (name, labels, scores, marker @ OOF threshold, marker @ F1-max)
compared_sets = (
    ("test", y_test, proba_test, "o", "x"),
    ("train-OOF", y_train, proba_tr_oof, "s", "D"),
)

# Operating points at the OOF-selected threshold
for set_name, y_ref, scores_ref, thr_marker, _f1_marker in compared_sets:
    p_at_thr, r_at_thr = _prec_rec_at_thr(y_ref, scores_ref, thr_oof)
    ax.scatter(r_at_thr, p_at_thr, s=60, marker=thr_marker,
               label=f"Seuil OOF {set_name} @ {thr_oof:.3f}")

# F1-max markers (drawn only when requested)
if SHOW_F1_MAX:
    for set_name, y_ref, scores_ref, _thr_marker, f1_marker in compared_sets:
        f1_point = _f1_max_point(y_ref, scores_ref)
        if f1_point is None:
            continue
        r_best, p_best, t_best, f1_best = f1_point
        ax.scatter(r_best, p_best, s=90, marker=f1_marker,
                   label=f"F1-max {set_name} @ {t_best:.3f} (F1={f1_best:.3f})")

ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision–Recall : Test vs Train (OOF)")
ax.legend()
fig.tight_layout()
fig.savefig("pr_curve_train_test_compare.png", dpi=200)
plt.show()

# Test de l'importance des features 

Cette partie teste différentes techniques pour mettre en évidence l'importance des features dans le modèle, et ainsi répondre à la question du terrain.
Sont testées : la permutation importance, la feature importance du modèle et SHAP.

In [17]:
# Permutation importance scored on PR AUC (more sensitive to rare positives)
perm = permutation_importance(
    gs,
    X_test,
    y_test,
    scoring="average_precision",
    n_repeats=20,
    random_state=42,
    n_jobs=-1,
)

# One row per raw input column, sorted by mean importance.
perm_columns = {
    "feature": X_test.columns,
    "importance_mean": perm.importances_mean,
    "importance_std": perm.importances_std,
}
perm_imp = pd.DataFrame(perm_columns).sort_values("importance_mean", ascending=False)

print("\nPermutation importance (PR AUC) — top 50 :")
print(perm_imp.head(50).to_string(index=False))

# Fitted pipeline selected by the grid search, its preprocessing step and its
# final estimator (adapt "xgb" if the step is named differently).
best_pipe = gs.best_estimator_
prep = best_pipe.named_steps["prep"]
xgb = best_pipe.named_steps["xgb"]

# Built-in XGBoost importances: one value per column AFTER preprocessing.
importances = xgb.feature_importances_

# Names of the transformed features, taken from the preprocessing step.
feat_names = prep.get_feature_names_out(prep.feature_names_in_)

import pandas as pd
fi = pd.DataFrame({"feature": feat_names, "importance": importances}).sort_values(
    "importance", ascending=False
)
print(fi.head(100).to_string(index=False))

booster = xgb.get_booster()
gain_dict = booster.get_score(importance_type="gain")  # dict: feature name -> score
# Note: names are of the form 'f0','f1',... in the order of the transformed columns.


Permutation importance (PR AUC) — top 50 :
                                  feature  importance_mean  importance_std
                    heure_supplementaires         0.191201        0.030905
     satisfaction_employee_nature_travail         0.047262        0.016885
                           revenu_mensuel         0.038071        0.040469
                 nombre_participation_pee         0.038060        0.023016
                              departement         0.026003        0.021578
                    frequence_deplacement         0.021086        0.014806
               note_evaluation_precedente         0.020374        0.012579
      satisfaction_employee_environnement         0.018286        0.023224
                distance_domicile_travail         0.013308        0.016521
                niveau_hierarchique_poste         0.011865        0.005408
                           statut_marital         0.010730        0.006777
             satisfaction_employee_equipe         0.0103

In [18]:
# ===============================================================
# SHAP sur un Pipeline sklearn (fe -> drop -> prep -> xgb)
# ===============================================================

# 0) Assumption: `best_pipe` is ALREADY fitted, and X_train / X_test exist.

# 1) Grab the fitted steps: feature engineering, column dropper,
#    ColumnTransformer and final estimator (XGBClassifier / HistGB / RF ...).
fe, drop, prep, xgb = (
    best_pipe.named_steps[step_name] for step_name in ("fe", "drop", "prep", "xgb")
)

# 2) Fonction utilitaire: projeter X brut -> espace features du modèle
def to_model_space(X):
    """Project raw rows into the feature space fed to the final estimator.

    Applies the fitted ``fe`` step and then the fitted ``prep`` transformer,
    densifies the result if it is sparse, and returns a DataFrame whose
    columns are the post-encoding feature names.

    NOTE(review): the pipeline's ``drop`` step is not applied here (the call
    is commented out); presumably ``prep`` selects its columns by name so the
    extra columns are ignored. Confirm.
    """
    engineered = fe.transform(X)              # same engineered features as at fit time
    #engineered = drop.transform(engineered)  # same columns removed as at fit time
    encoded = prep.transform(engineered)      # identical encoding + scaling
    # Many SHAP plotters prefer dense input.
    dense = encoded.toarray() if sparse.issparse(encoded) else encoded
    return pd.DataFrame(dense, columns=prep.get_feature_names_out())

# 3) Transformed train/test matrices, as "seen" by the model
Xtr_df, Xte_df = (to_model_space(raw_frame) for raw_frame in (X_train, X_test))

# 4) SHAP explainer for tree models
#    - background = small sample of the transformed train set (faster / more stable)
background = shap.sample(Xtr_df, 200, random_state=42)
tree_explainer_options = dict(
    data=background,
    feature_perturbation="interventional",   # robust for trees
    model_output="probability",              # SHAP on the probability scale (otherwise log-odds)
)
explainer = shap.TreeExplainer(xgb, **tree_explainer_options)

# 5) SHAP values on TEST (check_additivity=False avoids an XGB warning)
shap_values = explainer(Xte_df, check_additivity=False)

# 6) Global/local plots (adjust max_display as needed)
import os
import matplotlib
matplotlib.use("Agg")               # non-interactive backend (makes exports safe)
import matplotlib.pyplot as plt
import numpy as np
import shap

os.makedirs("visualisations", exist_ok=True)


def _export_current_figure(path):
    """Tighten the current figure's layout, save it as a 300-dpi PNG and close it."""
    plt.tight_layout()
    plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
    plt.close()


# 1) BEESWARM ("dot") and 2) BAR (mean |SHAP| importance): both global views go
#    through the legacy summary_plot, which exports reliably.
shap_matrix = shap_values.values if hasattr(shap_values, "values") else shap_values
global_plots = (
    ("dot", "visualisations/beeswarm.png"),
    ("bar", "visualisations/summary_bar.png"),
)
for plot_type, target_path in global_plots:
    plt.figure(figsize=(8, 6))
    shap.summary_plot(
        shap_matrix,
        Xte_df,
        feature_names=getattr(Xte_df, "columns", None),
        plot_type=plot_type,
        max_display=20,
        show=False
    )
    _export_current_figure(target_path)

# 3) WATERFALL (local): new API first, legacy fallback if it fails
i = 0  # index of the individual to plot
try:
    # new API (if available in the installed shap version)
    shap.plots.waterfall(shap_values[i], max_display=20, show=False)
    _export_current_figure("visualisations/waterfall.png")
except Exception:
    # Legacy fallback, pure matplotlib
    if hasattr(shap_values, "base_values"):
        base_value = shap_values.base_values[i]
    elif np.isscalar(explainer.expected_value):
        base_value = explainer.expected_value
    else:
        base_value = explainer.expected_value[1]
    vals = shap_values.values[i] if hasattr(shap_values, "values") else shap_values[i]
    shap.plots._waterfall.waterfall_legacy(
        base_value,
        vals,
        feature_names=getattr(Xte_df, "columns", None),
        features=Xte_df.iloc[i] if hasattr(Xte_df, "iloc") else None,
        max_display=20,
        show=False
    )
    _export_current_figure("visualisations/waterfall.png")

# 7) (Optional) Mean absolute SHAP value per transformed feature, largest first
mean_abs_shap = np.abs(shap_values.values).mean(axis=0)
imp_df = pd.DataFrame(
    {"feature": Xte_df.columns, "mean_abs_shap": mean_abs_shap}
).sort_values("mean_abs_shap", ascending=False)

print("\nTop-25 features (|SHAP| moyen) :")
print(imp_df.head(25).to_string(index=False))

# 8) (Option) Agréger par variable source (avant OHE) si tu utilises un schéma "num__" / "cat__"
#    Regroupe toutes les dummies d’une même variable d’origine (somme des |SHAP|)
def source_from_feature_name(name: str, source_columns=None) -> str:
    """Map a transformed feature name back to its source variable.

    Handles the ``<transformer>__<feature>`` naming of the ``num``, ``bool``
    and ``cat`` transformers:

    * ``"num__age"`` -> ``"age"`` and ``"num__sat_mean"`` -> ``"sat_mean"``:
      the whole remainder is kept, since these features are not one-hot
      encoded and their names may contain underscores.
    * ``"bool__flag"`` -> ``"flag"``.
    * ``"cat__poste_Manager"`` -> ``"poste"``: one-hot names are
      ``<column>_<category>``.

    Parameters
    ----------
    name : str
        Transformed feature name.
    source_columns : iterable of str, optional
        Known source column names. When given, a ``cat__`` feature is mapped
        to the longest known column that prefixes it, which stays exact even
        when category values contain underscores. Without it, the name is
        split at the LAST underscore, so underscores in column names are
        preserved.

    Returns
    -------
    str
        The source variable name; ``name`` unchanged when it carries none of
        the handled prefixes.
    """
    prefix, separator, base = name.partition("__")
    if not separator or prefix not in ("num", "bool", "cat"):
        return name

    if prefix != "cat":
        # num / bool features are not one-hot encoded: the remainder is the source column.
        return base

    if source_columns is not None:
        candidates = [
            col for col in map(str, source_columns)
            if base == col or base.startswith(col + "_")
        ]
        if candidates:
            # Longest match wins, e.g. "frequence_deplacement" over "frequence".
            return max(candidates, key=len)

    # Fallback heuristic: everything before the last underscore is the column name.
    column, underscore, _category = base.rpartition("_")
    return column if underscore else base

# Source variable of every transformed feature, then total mean |SHAP| per source.
feature_sources = imp_df["feature"].map(source_from_feature_name)
shap_by_source = (
    imp_df.assign(source=feature_sources)
    .groupby("source", as_index=False)["mean_abs_shap"]
    .sum()
)
agg_df = shap_by_source.sort_values("mean_abs_shap", ascending=False)

print("\nTop-20 variables sources (agrégation des dummies par somme |SHAP|) :")
print(agg_df.head(20).to_string(index=False))



Top-25 features (|SHAP| moyen) :
                                   feature  mean_abs_shap
                      num__heure_supp_flag       0.090710
                   num__revenu_vs_dept_med       0.081649
             num__nombre_participation_pee       0.056102
            num__distance_domicile_travail       0.042282
       num__nombre_experiences_precedentes       0.039504
                             num__sat_mean       0.037430
            num__tenure_ratio_current_post       0.036677
               cat__departement_Consulting       0.034115
            num__nb_annnee_hors_entreprise       0.031935
                    num__formations_par_an       0.031616
                              num__sat_min       0.030945
       cat__frequence_deplacement_Frequent       0.029818
                              num__sat_std       0.027890
 num__satisfaction_employee_nature_travail       0.023095
             num__Ecart_nb_annee_sur_poste       0.020789
                     num__augmentation

In [19]:
import shap
import matplotlib.pyplot as plt

# --- Helper: dependence plot robuste sur un nom de feature transformée ---
def shap_dependence(name, color=None, max_display=None, save_path=None):
    """Draw a SHAP dependence (scatter) plot for one transformed feature.

    Parameters
    ----------
    name : str
        Column of ``Xte_df`` to plot. An ``AssertionError`` is raised when it
        is not a column of ``Xte_df``.
    color : str, optional
        Column of ``Xte_df`` whose ``shap_values`` slice is passed as the
        ``color`` argument of the scatter plot. Ignored when it is not a
        column of ``Xte_df``.
    max_display : optional
        Unused; kept so existing calls keep working.
    save_path : str, optional
        When given, the figure is also saved to this path. This is useful
        with a non-interactive backend, where ``plt.show()`` displays nothing.
    """
    assert name in Xte_df.columns, f"{name} absent de Xte_df.columns"
    if color is not None and color not in Xte_df.columns:
        color = None  # unknown colouring feature: plot without colour
    shap.plots.scatter(
        shap_values[:, name],
        color=shap_values[:, color] if color else None,
        show=False
    )
    plt.title(f"SHAP dependence — {name}" + (f" (color: {color})" if color else ""))
    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path, dpi=200, bbox_inches="tight")
    plt.show()
    # Release the figure so repeated calls (e.g. in a loop) do not pile up open figures.
    plt.close()

# (feature to plot, feature used for colouring) pairs
top6 = [
    ("num__revenu_vs_dept_med", "num__revenu_vs_niveau_med"),            # relative pay effect
    ("num__heure_supp_flag", "num__sat_mean"),                           # overload vs satisfaction
    ("num__distance_domicile_travail", "cat__frequence_deplacement_Frequent"),
    ("num__nombre_experiences_precedentes", "num__formations_par_an"),
    ("num__tenure_ratio_current_post", "num__Ecart_nb_annee_sur_poste"),
    ("num__sat_mean", "num__sat_std"),
]

for plotted_feature, colouring_feature in top6:
    shap_dependence(plotted_feature, color=colouring_feature)
