In [89]:
import numpy as np
import pandas as pd

from fonctions.fonctions_features import FeatureEngineer
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_validate

from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, 
    classification_report, confusion_matrix,
    confusion_matrix, roc_auc_score, 
    average_precision_score, precision_recall_curve,
    precision_recall_fscore_support,
)

from sklearn.inspection import permutation_importance

# Development convenience: reload the local feature-engineering module so that
# edits made to its source since it was first imported are picked up, then
# re-import FeatureEngineer so the name is bound to the reloaded class.
import importlib
import fonctions.fonctions_features as ff
importlib.reload(ff)
from fonctions.fonctions_features import FeatureEngineer


In [90]:
# Load the transformed HR dataset; the model cells start from `data.copy()`.
data = pd.read_csv(filepath_or_buffer="data/jeu_donnee_RH_complet_transforme.csv")

In [91]:
# Baseline: DummyClassifier (always predicts the most frequent class)

# --- 2) Prepare model inputs ---
data_pd = data.copy()

X = data_pd.drop(columns=["a_quitte_l_entreprise"])
# Encode the target column: "Non" -> 0, "Oui" -> 1.
y = data_pd["a_quitte_l_entreprise"].map({"Non": 0, "Oui": 1}).astype(int)

# --- 3) Full pipeline: feature engineering + preprocessing + dummy classifier ---
numeric_sel = selector(dtype_include=["number", "bool"])   # booleans are handled as numeric
categorical_sel = selector(dtype_exclude=["number", "bool"])

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())   # centre and scale the numeric columns
        ]), numeric_sel),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore")),
        ]), categorical_sel),
    ],
    remainder="drop",
)

pipe_dummy = Pipeline(steps=[
    ("fe", FeatureEngineer()),   # project feature engineering, fitted as a pipeline step
    ("prep", preprocess),
    ("clf", DummyClassifier(strategy="most_frequent", random_state=42)),
])

# --- 4A) Hold-out evaluation: every pipeline step is fitted on the training split only ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

pipe_dummy.fit(X_train, y_train)

y_pred_dummy = pipe_dummy.predict(X_test)

print("Accuracy           :", accuracy_score(y_test, y_pred_dummy))
print("Balanced accuracy  :", balanced_accuracy_score(y_test, y_pred_dummy))
# The baseline never predicts class 1, so its precision is undefined there.
# zero_division=0 reports it as 0 explicitly instead of emitting a warning per average.
print(
    "\nClassification report:\n",
    classification_report(y_test, y_pred_dummy, zero_division=0),
)
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred_dummy))

# (Optional) sanity check: share of the majority class over the whole dataset
print("\nBaseline (majorité dans tout le set) :", y.value_counts(normalize=True).max())

Accuracy           : 0.8401360544217688
Balanced accuracy  : 0.5

Classification report:
               precision    recall  f1-score   support

           0       0.84      1.00      0.91       247
           1       0.00      0.00      0.00        47

    accuracy                           0.84       294
   macro avg       0.42      0.50      0.46       294
weighted avg       0.71      0.84      0.77       294


Confusion matrix:
 [[247   0]
 [ 47   0]]

Baseline (majorité dans tout le set) : 0.8387755102040816


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [92]:
# Logistic regression (class-weighted)

# --- 2) Prepare model inputs ---
data_pd = data.copy()

X = data_pd.drop(columns=["a_quitte_l_entreprise"])
# Encode the target column: "Non" -> 0, "Oui" -> 1.
y = data_pd["a_quitte_l_entreprise"].map({"Non": 0, "Oui": 1}).astype(int)

# --- 3) Full pipeline: feature engineering + preprocessing + logistic regression ---
numeric_sel = selector(dtype_include=["number", "bool"])   # booleans are handled as numeric
categorical_sel = selector(dtype_exclude=["number", "bool"])

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())   # centre and scale the numeric columns
        ]), numeric_sel),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore")),
        ]), categorical_sel),
    ],
    remainder="drop",
)

pipe_logit = Pipeline(steps=[
    ("fe", FeatureEngineer()),   # project feature engineering, fitted as a pipeline step
    ("prep", preprocess),
    ("clf", LogisticRegression(class_weight="balanced", max_iter=2000)),
])

# --- 4A) Hold-out evaluation: every pipeline step is fitted on the training split only ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

pipe_logit.fit(X_train, y_train)
yp_logit_proba = pipe_logit.predict_proba(X_test)[:, 1]
thr = 0.35  # decision threshold applied to the predicted probability of class 1
yp_logit = (yp_logit_proba >= thr).astype(int)


print("\n=== LogisticRegression (balanced) ===")
print("Accuracy           :", accuracy_score(y_test, yp_logit))
print("Balanced accuracy  :", balanced_accuracy_score(y_test, yp_logit))
print("Avg Precision (PR) :", average_precision_score(y_test, yp_logit_proba))
print("ROC AUC            :", roc_auc_score(y_test, yp_logit_proba))
print("Report:\n", classification_report(y_test, yp_logit))
print("Confusion:\n", confusion_matrix(y_test, yp_logit))

# --- 4B) Stratified 5-fold cross-validation on the full dataset ---
# NOTE: the "precision" and "recall" scorers call predict(), i.e. they use the
# estimator's default decision rule, not the custom threshold `thr` above.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ("precision", "recall", "roc_auc")
scores = cross_validate(
    pipe_logit, X, y, cv=cv,
    scoring=scoring,   # single source of truth, also drives the loop below
    return_train_score=True
)

for m in scoring:
    tr_mean, tr_std = scores[f"train_{m}"].mean(), scores[f"train_{m}"].std()
    te_mean, te_std = scores[f"test_{m}"].mean(), scores[f"test_{m}"].std()
    print(f"[{m}] train={tr_mean:.3f}±{tr_std:.3f} | test={te_mean:.3f}±{te_std:.3f}")

# Detailed hold-out report at the same threshold. The probabilities and labels
# computed above are reused instead of predicting a second time; the previous
# names are kept as aliases.
proba_te = yp_logit_proba
y_pred_06 = yp_logit

# The title now reports the threshold actually applied (it used to say 0.6).
print(f"\n-- RAPPORT (seuil {thr}) --")
print(classification_report(y_test, y_pred_06, digits=3, zero_division=0))

print("-- MATRICE DE CONFUSION --")
print(confusion_matrix(y_test, y_pred_06))  # [[tn, fp], [fn, tp]]

print("-- AUCs (seuil-indep.) --")
print(f"ROC AUC (test) = {roc_auc_score(y_test, proba_te):.3f}")
print(f"PR AUC  (test) = {average_precision_score(y_test, proba_te):.3f}")


=== LogisticRegression (balanced) ===
Accuracy           : 0.6462585034013606
Balanced accuracy  : 0.7205616332156086
Avg Precision (PR) : 0.589014756777982
ROC AUC            : 0.8204841071582393
Report:
               precision    recall  f1-score   support

           0       0.95      0.61      0.74       247
           1       0.29      0.83      0.43        47

    accuracy                           0.65       294
   macro avg       0.62      0.72      0.59       294
weighted avg       0.84      0.65      0.69       294

Confusion:
 [[151  96]
 [  8  39]]
[precision] train=0.411±0.007 | test=0.362±0.014
[recall] train=0.794±0.010 | test=0.730±0.032
[roc_auc] train=0.873±0.004 | test=0.827±0.010

-- RAPPORT (seuil 0.6) --
              precision    recall  f1-score   support

           0      0.950     0.611     0.744       247
           1      0.289     0.830     0.429        47

    accuracy                          0.646       294
   macro avg      0.619     0.721     0.586 

In [93]:
# Random forest

# --- 2) Prepare model inputs ---
data_pd = data.copy()

# Encode the target column ("Non" -> 0, "Oui" -> 1); all other columns are features.
y = data_pd["a_quitte_l_entreprise"].map({"Non": 0, "Oui": 1}).astype(int)
X = data_pd.drop(columns=["a_quitte_l_entreprise"])

# --- 3) Full pipeline: feature engineering + preprocessing + random forest ---
# Numbers and booleans go through the numeric branch, everything else through
# the categorical branch.
numeric_sel = selector(dtype_include=["number", "bool"])
categorical_sel = selector(dtype_exclude=["number", "bool"])

numeric_branch = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),   # centre and scale
])
categorical_branch = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer(
    [
        ("num", numeric_branch, numeric_sel),
        ("cat", categorical_branch, categorical_sel),
    ],
    remainder="drop",
)

forest = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=2,
    class_weight="balanced_subsample",
    n_jobs=-1,
    random_state=42,
)

pipe_random = Pipeline([
    ("fe", FeatureEngineer()),   # project feature engineering, fitted as a pipeline step
    ("prep", preprocess),
    ("rf", forest),
])

# --- 4A) Hold-out evaluation: every pipeline step is fitted on the training split only ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

pipe_random.fit(X_train, y_train)
yp_random = pipe_random.predict(X_test)
yp_random_proba = pipe_random.predict_proba(X_test)[:, 1]

print("\n=== ForetAleatoire ===")
for label, value in [
    ("Accuracy           :", accuracy_score(y_test, yp_random)),
    ("Balanced accuracy  :", balanced_accuracy_score(y_test, yp_random)),
    ("Avg Precision (PR) :", average_precision_score(y_test, yp_random_proba)),
    ("ROC AUC            :", roc_auc_score(y_test, yp_random_proba)),
    ("Report:\n", classification_report(y_test, yp_random)),
    ("Confusion:\n", confusion_matrix(y_test, yp_random)),
]:
    print(label, value)


=== ForetAleatoire ===
Accuracy           : 0.826530612244898
Balanced accuracy  : 0.5522008786286502
Avg Precision (PR) : 0.40853023868132476
ROC AUC            : 0.7931777069515031
Report:
               precision    recall  f1-score   support

           0       0.86      0.96      0.90       247
           1       0.39      0.15      0.22        47

    accuracy                           0.83       294
   macro avg       0.62      0.55      0.56       294
weighted avg       0.78      0.83      0.79       294

Confusion:
 [[236  11]
 [ 40   7]]


In [94]:
# Histogram-based gradient boosting

# --- 2) Prepare model inputs ---
data_pd = data.copy()

# Encode the target column ("Non" -> 0, "Oui" -> 1); all other columns are features.
y = data_pd["a_quitte_l_entreprise"].map({"Non": 0, "Oui": 1}).astype(int)
X = data_pd.drop(columns=["a_quitte_l_entreprise"])

# --- 3) Full pipeline: feature engineering + preprocessing + HistGradientBoosting ---
numeric_sel = selector(dtype_include=["number", "bool"])   # booleans are handled as numeric
categorical_sel = selector(dtype_exclude=["number", "bool"])

preprocess = ColumnTransformer(
    [
        (
            "num",
            Pipeline(steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),   # centre and scale
            ]),
            numeric_sel,
        ),
        (
            "cat",
            Pipeline(steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("ohe", OneHotEncoder(handle_unknown="ignore")),
            ]),
            categorical_sel,
        ),
    ],
    remainder="drop",
)

# No class_weight is set here; class imbalance is meant to be handled through
# the chosen metric and, later, through tuning.
hgb_params = dict(
    max_depth=None,
    learning_rate=0.1,
    max_iter=300,
    random_state=42,
    early_stopping=True,
    validation_fraction=0.1,
)

pipe_hgb = Pipeline([
    ("fe", FeatureEngineer()),   # project feature engineering, fitted as a pipeline step
    ("prep", preprocess),
    ("hgb", HistGradientBoostingClassifier(**hgb_params)),
])

# --- 4A) Hold-out evaluation: every pipeline step is fitted on the training split only ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

pipe_hgb.fit(X_train, y_train)
yp_hgb = pipe_hgb.predict(X_test)
yp_hgb_proba = pipe_hgb.predict_proba(X_test)[:, 1]

print("\n=== HistGradientBoosting ===")
for label, value in [
    ("Accuracy           :", accuracy_score(y_test, yp_hgb)),
    ("Balanced accuracy  :", balanced_accuracy_score(y_test, yp_hgb)),
    ("Avg Precision (PR) :", average_precision_score(y_test, yp_hgb_proba)),
    ("ROC AUC            :", roc_auc_score(y_test, yp_hgb_proba)),
    ("Report:\n", classification_report(y_test, yp_hgb)),
    ("Confusion:\n", confusion_matrix(y_test, yp_hgb)),
]:
    print(label, value)


=== HistGradientBoosting ===
Accuracy           : 0.8503401360544217
Balanced accuracy  : 0.6180549573606684
Avg Precision (PR) : 0.506492658480869
ROC AUC            : 0.8271168920665002
Report:
               precision    recall  f1-score   support

           0       0.87      0.96      0.92       247
           1       0.57      0.28      0.37        47

    accuracy                           0.85       294
   macro avg       0.72      0.62      0.64       294
weighted avg       0.83      0.85      0.83       294

Confusion:
 [[237  10]
 [ 34  13]]


In [95]:
# XGBoost

# --- 2) Prepare model inputs (pandas) ---
data_pd = data.copy()

# Encode the target column ("Non" -> 0, "Oui" -> 1); all other columns are features.
y = data_pd["a_quitte_l_entreprise"].map({"Non": 0, "Oui": 1}).astype(int)
X = data_pd.drop(columns=["a_quitte_l_entreprise"])

# --- 3) Full pipeline: feature engineering + preprocessing + XGBoost ---
numeric_sel = selector(dtype_include=["number", "bool"])   # booleans are handled as numeric
categorical_sel = selector(dtype_exclude=["number", "bool"])

numeric_branch = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),   # centre and scale
])
categorical_branch = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer(
    [
        ("num", numeric_branch, numeric_sel),
        ("cat", categorical_branch, categorical_sel),
    ],
    remainder="drop",
)

xgb_params = dict(
    random_state=42,
    eval_metric="logloss",
    n_estimators=300,
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.9,
    n_jobs=-1,
)

pipe = Pipeline([
    ("fe", FeatureEngineer()),   # project feature engineering, fitted as a pipeline step
    ("prep", preprocess),
    ("xgb", XGBClassifier(**xgb_params)),
])

# --- 4A) Hold-out split: the pipeline is fitted on the training split only ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

pipe.fit(X_train, y_train)

# --- 4B) Stratified 5-fold cross-validation on the full dataset ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ("precision", "recall", "roc_auc")
scores = cross_validate(pipe, X, y, cv=cv, scoring=scoring, return_train_score=True)

for metric in scoring:
    train_scores = scores[f"train_{metric}"]
    test_scores = scores[f"test_{metric}"]
    print(
        f"[{metric}] train={train_scores.mean():.3f}±{train_scores.std():.3f}"
        f" | test={test_scores.mean():.3f}±{test_scores.std():.3f}"
    )

# Hold-out report using a 0.6 cut-off on the predicted probability of class 1.
proba_te = pipe.predict_proba(X_test)[:, 1]
y_pred_06 = np.greater_equal(proba_te, 0.6).astype(int)

print("\n-- RAPPORT (seuil 0.6) --")
print(classification_report(y_test, y_pred_06, digits=3, zero_division=0))

print("-- MATRICE DE CONFUSION --")
print(confusion_matrix(y_test, y_pred_06))  # [[tn, fp], [fn, tp]]

print("-- AUCs (seuil-indep.) --")
roc_auc_test = roc_auc_score(y_test, proba_te)
pr_auc_test = average_precision_score(y_test, proba_te)
print(f"ROC AUC (test) = {roc_auc_test:.3f}")
print(f"PR AUC  (test) = {pr_auc_test:.3f}")

[precision] train=1.000±0.000 | test=0.634±0.062
[recall] train=1.000±0.000 | test=0.350±0.067
[roc_auc] train=1.000±0.000 | test=0.820±0.025

-- RAPPORT (seuil 0.6) --
              precision    recall  f1-score   support

           0      0.873     0.976     0.922       247
           1      0.667     0.255     0.369        47

    accuracy                          0.861       294
   macro avg      0.770     0.616     0.645       294
weighted avg      0.840     0.861     0.833       294

-- MATRICE DE CONFUSION --
[[241   6]
 [ 35  12]]
-- AUCs (seuil-indep.) --
ROC AUC (test) = 0.817
PR AUC  (test) = 0.493


In [96]:
# Permutation importance of the raw input columns, scored with PR AUC
# (average precision), which is more sensitive to the rare positive class.
# The whole fitted pipeline is passed in, so each column of X_test is shuffled
# before feature engineering and preprocessing.
top_n = 50  # rows displayed; also used in the title so the two cannot drift apart

perm = permutation_importance(
    pipe_logit, X_test, y_test,
    scoring="average_precision",
    n_repeats=20, random_state=42, n_jobs=-1
)

perm_imp = (
    pd.DataFrame({
        "feature": X_test.columns,
        "importance_mean": perm.importances_mean,
        "importance_std": perm.importances_std
    })
    .sort_values("importance_mean", ascending=False)
)

# The title used to say "top 30" while 50 rows were printed.
print(f"\nPermutation importance (PR AUC) — top {top_n} :")
print(perm_imp.head(top_n).to_string(index=False))



Permutation importance (PR AUC) — top 30 :
                                  feature  importance_mean  importance_std
                    heure_supplementaires         0.228049        0.030934
              annees_dans_le_poste_actuel         0.090910        0.038253
           nombre_experiences_precedentes         0.090527        0.022038
                           revenu_mensuel         0.081878        0.039725
      satisfaction_employee_environnement         0.062482        0.026376
      annees_depuis_la_derniere_promotion         0.061184        0.022201
     satisfaction_employee_nature_travail         0.058355        0.019396
                           statut_marital         0.052569        0.016940
             satisfaction_employee_equipe         0.035524        0.018496
               note_evaluation_precedente         0.028812        0.021726
                  annee_experience_totale         0.022268        0.024387
                distance_domicile_travail         0.0212

In [97]:
# 1) Split the fitted pipeline: the transformers ("fe" + "prep") and the final model
fe_prep = pipe_logit[:-1]
clf = pipe_logit[-1]       # last step: the fitted "clf" estimator

# 2) Project X_test into the model's feature space (after FE + preprocessing)
Xt_test = fe_prep.transform(X_test)

# The output may be a sparse matrix (one-hot encoding); use a dense copy for
# the permutation step in that case.
Xt_test_dense = Xt_test.toarray() if hasattr(Xt_test, "toarray") else Xt_test

# 3) Names of all features produced by the preprocessing step (numeric + one-hot)
feat_names = pipe_logit["prep"].get_feature_names_out()

# 4) Permutation importance at the TRANSFORMED level: columns of Xt_test_dense
#    are shuffled and scored with the final model only.
perm = permutation_importance(
    estimator=clf,
    X=Xt_test_dense,
    y=y_test,
    scoring="average_precision",   # PR AUC
    n_repeats=20,
    random_state=42,
    n_jobs=-1,
)

importance_table = pd.DataFrame({
    "feature": feat_names,
    "importance_mean": perm.importances_mean,
    "importance_std": perm.importances_std,
})
imp_df = importance_table.sort_values("importance_mean", ascending=False)

print("\nPermutation importance (PR AUC) — top 50 features transformées :")
print(imp_df.head(50).to_string(index=False))

# 5) (Optional) Aggregate importances per *source* variable (before one-hot encoding)
def source_from_feature(transformed_name: str, source_columns=None) -> str:
    """Map a ColumnTransformer output name back to the column it came from.

    Parameters
    ----------
    transformed_name : str
        Feature name as returned by ``get_feature_names_out()``, e.g.
        ``"num__revenu_mensuel"`` or ``"cat__statut_marital_<category>"``.
    source_columns : iterable of str, optional
        Names of the columns that went through the ``"cat"`` branch. When
        given, the one-hot name is matched against these names (longest match
        wins), which stays correct when column names contain underscores.

    Returns
    -------
    str
        The name with its ``"<branch>__"`` prefix removed; for ``"cat__"``
        features, the source column without the category suffix.
    """
    # ColumnTransformer prefixes every output with "<transformer name>__".
    base = transformed_name.split("__", 1)[-1]
    if not transformed_name.startswith("cat__"):
        return base

    # One-hot names look like "<column>_<category>". Match on the known column
    # names rather than guessing where the column name ends.
    if source_columns:
        matches = [
            col for col in source_columns
            if base == col or base.startswith(f"{col}_")
        ]
        if matches:
            return max(matches, key=len)

    # Fallback heuristic when the source columns are unknown: cut at the LAST
    # underscore, because the column names themselves contain underscores
    # (cutting at the first one would turn "statut_marital_..." into "statut").
    return base.rsplit("_", 1)[0]


# Columns actually routed to the "cat" branch, as resolved when the
# ColumnTransformer was fitted.
cat_source_columns = [
    str(col)
    for name, _, cols in pipe_logit.named_steps["prep"].transformers_
    if name == "cat"
    for col in cols
]

# imp_df is sorted by importance, so the source must be derived from each row's
# own "feature" value. Assigning a list built from feat_names (unsorted order)
# would be positional and attach sources to the wrong rows.
imp_df["source"] = [
    source_from_feature(name, cat_source_columns) for name in imp_df["feature"]
]

agg_df = (
    imp_df.groupby("source", as_index=False)["importance_mean"]
          .sum()
          .sort_values("importance_mean", ascending=False)
)

print("\nPermutation importance agrégée par variable source (somme des OHE) — top 50 :")
# Show 50 rows, matching the title (this used to be head(500)).
print(agg_df.head(50).to_string(index=False))


Permutation importance (PR AUC) — top 50 features transformées :
                                   feature  importance_mean  importance_std
                      num__heure_supp_flag         0.228049        0.030934
       num__nombre_experiences_precedentes         0.090527        0.022038
                       num__revenu_mensuel         0.079729        0.031512
            num__niveau_hierarchique_poste         0.070263        0.023377
  num__annees_depuis_la_derniere_promotion         0.054072        0.021973
                             num__sat_mean         0.042738        0.021162
                num__a_connu_mvmnt_interne         0.033117        0.015949
 num__satisfaction_employee_nature_travail         0.028808        0.011733
            num__distance_domicile_travail         0.028185        0.018641
                num__nb_formations_suivies         0.025324        0.013012
  num__satisfaction_employee_environnement         0.022140        0.015196
                   num