In [None]:
import os
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, recall_score

In [None]:
# Example: the original dataset, before any categorical encoding.
data_path = "../../data/processed/employee_data_merged_clean.csv"

df = pd.read_csv(data_path)

# Target variable
TARGET_VARIABLE = "a_quitte_l_entreprise"

# Convert the target Oui/Non -> 1/0.
# `map` + `astype(int)` is used instead of `replace`: `replace` relies on pandas'
# deprecated silent downcasting (FutureWarning since pandas 2.2) and would leave an
# object-dtype target once that behaviour is removed. With `map`, any value other
# than "Oui"/"Non" becomes NaN and `astype(int)` then fails loudly instead of
# letting an unexpected label slip into training.
df[TARGET_VARIABLE] = df[TARGET_VARIABLE].map({"Oui": 1, "Non": 0}).astype(int)

# Parse the previous salary raise from a percentage string into a float
# (trailing spaces and '%' characters are stripped), then fix the column-name typo.
df["augementation_salaire_precedente"] = (
    df["augementation_salaire_precedente"].str.rstrip(" %").astype(float)
)
df = df.rename(
    columns={"augementation_salaire_precedente": "augmentation_salaire_precedente"}
)

# Basic feature engineering (adapt to the dataset).
# epsilon keeps the denominators strictly positive when the year counts are 0.
epsilon = 1e-6

# Share of the company tenure spent in the current position.
# `annees_dans_le_poste_actuel` is kept as a feature alongside this ratio.
df["mobilite_interne_ratio"] = df["annees_dans_le_poste_actuel"] / (
    df["annees_dans_l_entreprise"] + epsilon
)

# Seniority ratio: company tenure relative to tenure plus number of previous experiences.
df["ratio_anciennete"] = df["annees_dans_l_entreprise"] / (
    df["annees_dans_l_entreprise"] + df["nombre_experiences_precedentes"] + epsilon
)

# Change in evaluation score; the previous score is dropped once the delta exists.
df["delta_evaluation"] = df["note_evaluation_actuelle"] - df["note_evaluation_precedente"]
df = df.drop(columns=["note_evaluation_precedente"])

# Separate features and target
X = df.drop(columns=[TARGET_VARIABLE])
y = df[TARGET_VARIABLE]

# Stratified train/test split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(f"✅ X: {X.shape}, y: {y.shape}")

In [None]:
# Nominal categorical columns -> one-hot encoding
categorical_features = [
    "departement",
    "statut_marital",
    "poste",
    "domaine_etude",
    "genre",
    "heure_supplementaires",
]

# Binary columns -> ordinal encoder (M/F, Oui/Non); no column is routed here at the moment
binary_mapping = [["M", "F"], ["Non", "Oui"]]
binary_features = []

# Ordinal columns -> ordinal encoder, categories listed in their intended order
ordinal_mapping = [["Aucun", "Occasionnel", "Frequent"]]
ordinal_features = ["frequence_deplacement"]

# Numeric columns -> passthrough.
# Kept as two groups for readability; the concatenation order below is the
# column order handed to the numeric transformer.
raw_numeric_features = [
    "age",
    "revenu_mensuel",
    "augmentation_salaire_precedente",
    "nombre_experiences_precedentes",
    "annee_experience_totale",
    "annees_dans_l_entreprise",
    "annees_dans_le_poste_actuel",
    "satisfaction_employee_environnement",
    "niveau_hierarchique_poste",
    "satisfaction_employee_nature_travail",
    "satisfaction_employee_equipe",
    "satisfaction_employee_equilibre_pro_perso",
    "note_evaluation_actuelle",
    "nombre_participation_pee",
    "nb_formations_suivies",
    "distance_domicile_travail",
    "niveau_education",
    "annees_depuis_la_derniere_promotion",
    "annes_sous_responsable_actuel",
]
engineered_features = [
    "mobilite_interne_ratio",
    "ratio_anciennete",
    "delta_evaluation",
]
numeric_features = [*raw_numeric_features, *engineered_features]

In [None]:
# Encoders.
# Ordinal columns are encoded using the explicit category order in `ordinal_mapping`.
ordinal_encoder = OrdinalEncoder(categories=ordinal_mapping)
# Nominal columns are one-hot encoded with the first level of each column dropped,
# returned as a dense integer array.
# NOTE(review): neither encoder sets `handle_unknown`, so a category not seen during
# fit presumably raises at transform time; confirm this is the intended behaviour.
one_hot = OneHotEncoder(dtype=int, sparse_output=False, drop="first")

# Global preprocessor: one (name, transformer, columns) entry per column group.
# There is no dedicated transformer for binary columns, and no `remainder` is set,
# so the library default applies to columns not listed here.
preprocessor = ColumnTransformer(
    transformers=[
        ("one_hot", one_hot, categorical_features),
        ("ordinal", ordinal_encoder, ordinal_features),
        ("numeric", "passthrough", numeric_features),
    ]
)

In [None]:
# Full modelling pipeline, built with the imblearn `Pipeline` so that a sampler
# step can sit between preprocessing and the classifier:
# preprocessing -> SMOTE oversampling -> random forest.
smote_step = ("smote", SMOTE(random_state=42))
rf_step = ("rf", RandomForestClassifier(random_state=42, n_jobs=-1))

pipeline = Pipeline(steps=[("preprocessor", preprocessor), smote_step, rf_step])

In [None]:
# 5-fold stratified cross-validation, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Hyper-parameter grid for the random forest step; the `rf__` prefix routes each
# parameter to the pipeline step named "rf".
# 3 * 3 * 3 * 3 * 2 * 3 = 486 combinations, each evaluated on the 5 folds.
param_grid = dict(
    rf__n_estimators=[200, 300, 400],
    rf__max_depth=[6, 7, 8],
    rf__min_samples_split=[4, 8, 12],
    rf__min_samples_leaf=[2, 4, 6],
    rf__max_features=["sqrt", "log2"],
    rf__class_weight=[None, "balanced", "balanced_subsample"],
)

# Exhaustive search over the grid, selecting on recall.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring="recall",
    n_jobs=-1,
    cv=cv,
    verbose=1,
)

In [None]:
print("\n🚀 Lancement du GridSearchCV...")
# Tune and refit on the training split only. Fitting on the full (X, y) would let
# the model see the rows of X_test, so any metric later computed on X_test would
# be optimistically biased (data leakage).
grid_search.fit(X_train, y_train)

# Best hyper-parameters and their mean cross-validated recall.
print("\n✅ Meilleurs paramètres trouvés :")
print(grid_search.best_params_)
print(f"Meilleur Recall (CV): {grid_search.best_score_:.4f}")

In [None]:
# Take the refitted best pipeline from the grid search and score it on X_test / y_test.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Recall of the positive class, followed by the full per-class report.
test_recall = recall_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("\n📊 Résultats sur le jeu de test :")
print(f"Recall (test): {test_recall:.4f}")
print(test_report)

In [None]:
# Where the fitted pipeline is persisted.
MODEL_DIR = "../../models"
model_path = os.path.join(MODEL_DIR, "random_forest_pipeline.pkl")

# Create the directory if needed, then serialise the whole pipeline with joblib.
os.makedirs(MODEL_DIR, exist_ok=True)
joblib.dump(best_model, model_path)

print(f"💾 Modèle sauvegardé : {model_path}")

In [None]:
# Quick look at the `genre` values present in the data and their frequencies.
print(X["genre"].unique())
print(X["genre"].value_counts())


# Reload the persisted pipeline from disk.
# joblib files are pickle-based: only load artifacts from a trusted location.
model = joblib.load(model_path)

# One hand-built employee record containing every column the preprocessor selects,
# including the engineered features, which are supplied directly here.
sample = pd.DataFrame(
    [
        {
            "departement": "Consulting",
            "augmentation_salaire_precedente": 5,
            "statut_marital": "Marié(e)",
            "poste": "Manager",
            "domaine_etude": "Transformation Digitale",
            "genre": "M",
            "heure_supplementaires": "Oui",
            "frequence_deplacement": "Occasionnel",
            "age": 35,
            "revenu_mensuel": 100,
            "nombre_experiences_precedentes": 2,
            "annee_experience_totale": 1,
            "annees_dans_l_entreprise": 1,
            "annees_dans_le_poste_actuel": 1,
            "satisfaction_employee_environnement": 1,
            "niveau_hierarchique_poste": 3,
            "satisfaction_employee_nature_travail": 1,
            "satisfaction_employee_equipe": 1,
            "satisfaction_employee_equilibre_pro_perso": 1,
            "note_evaluation_actuelle": 1,
            "nombre_participation_pee": 1,
            "nb_formations_suivies": 3,
            "distance_domicile_travail": 120.5,
            "niveau_education": 3,
            "annees_depuis_la_derniere_promotion": 2,
            "annes_sous_responsable_actuel": 1,
            "mobilite_interne_ratio": 0.5,
            "ratio_anciennete": 0.7,
            "delta_evaluation": 0.2,
        }
    ]
)

# Probability of belonging to class 1, computed for the sample record.
# (Scoring `df` here would instead predict on the whole dataset and report the
# prediction for its first row, leaving `sample` unused.)
y_proba = model.predict_proba(sample)[:, 1]

# Decision threshold to apply (e.g. the one determined earlier)
THRESHOLD = 0.35  # adjust according to the business trade-off

# Convert the probability into a binary prediction using the threshold
y_pred = (y_proba >= THRESHOLD).astype(int)

print(
    f"🔮 Prédiction: {y_pred[0]}, Probabilité: {y_proba[0]:.3f}, Seuil utilisé: {THRESHOLD}"
)