# Model Training and Evaluation



## 1. Setup and Data Loading

In [None]:
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# Machine learning libraries
from typing import Literal, cast
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier


# Utils functions
import os
import sys

# Register the project's src directory (two levels up from the notebook)
# on the import path, once, so the local utils module can be imported.
src_path = os.path.abspath("../../src")
if sys.path.count(src_path) == 0:
    sys.path.append(src_path)

# Import feature engineering utilities from our custom module
from project_utils import confusion_matrix_analysis, analyze_feature_scaling, get_classification_report_table  # type: ignore

# Metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)


# Plot styling: matplotlib's default style with seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

# Pandas display: lift the limits on column count, total width and cell width
for display_option in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)

print("üìö Libraries imported successfully!")
print(f"üìç Working directory: {os.getcwd()}")

In [None]:
# Load the preprocessed feature matrix and target vector from disk, validate
# their shapes, and print a short summary of the dataset.
print("LOADING PREPROCESSED DATA:")
print("="*50)

# Define data paths
data_path = "../../data/processed"
X_file = os.path.join(data_path, "X_features.csv")
y_file = os.path.join(data_path, "y_target.csv")

# Load features and target
X = pd.read_csv(X_file)
# Squeeze the column axis only: a bare squeeze() would also collapse a
# one-row file to a scalar and break every Series operation below.
y = pd.read_csv(y_file).squeeze("columns")

# Fail early with a clear message instead of an obscure error further down.
if not isinstance(y, pd.Series):
    raise ValueError(
        f"Expected a single target column in {y_file}, got shape {y.shape}"
    )
if len(X) != len(y):
    raise ValueError(
        f"Features and target have different lengths: {len(X)} vs {len(y)}"
    )

print(f"‚úÖ Features loaded: {X.shape}")
print(f"‚úÖ Target loaded: {y.shape}")

# Distinct target values, sorted once and reused for the summary below
target_classes = sorted(y.unique())

print(f"\nüìä Dataset Summary:")
print(f"   Total samples: {len(X)}")
print(f"   Total features: {X.shape[1]}")
print(f"   Target variable: {y.name}")
print(f"   Target classes: {target_classes}")
print(f"   Class distribution:")
for class_val in target_classes:
    count = (y == class_val).sum()
    percentage = (count / len(y)) * 100
    print(f"     Class {class_val}: {count} ({percentage:.1f}%)")


# Averaging mode shared by every precision / recall / F1 computation below
AVERAGE_TYPE = 'weighted'  # Options: 'micro', 'macro', 'weighted', 'samples'
print(f"\nüéØ Using average type: {AVERAGE_TYPE}")


print(f"\nüéØ Data ready for modeling!")

## 2. Analyze feature scaling

In [None]:
# Run the project helper on the full feature matrix (before the train/test split).
# NOTE(review): presumably it reports per-feature scale statistics — confirm in project_utils.
analyze_feature_scaling(X)

## 3. Train/Test Split

In [None]:
# Hold out 20% of the samples as a test set. The split is stratified on the
# target so both subsets keep the same class proportions, and seeded so it
# is reproducible.
print("TRAIN/TEST SPLIT:")
print("="*50)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"‚úÖ Train set: {X_train.shape}")
print(f"‚úÖ Test set: {X_test.shape}")

# Report the class balance of each subset
print(f"\nüìä Class distribution after split:")
for subset_title, subset_target in (("Training set", y_train), ("Test set", y_test)):
    print(f"\n{subset_title}:")
    n_subset = len(subset_target)
    for label in sorted(subset_target.unique()):
        n_label = (subset_target == label).sum()
        share = (n_label / n_subset) * 100
        print(f"  Class {label}: {n_label} ({share:.1f}%)")

print(f"\nüéØ Stratified split completed!")

## 4. Baseline Model - DummyClassifier

In [None]:
# Baseline: evaluate several DummyClassifier strategies (5-fold CV on the
# training set plus a full-fit train/test evaluation), keep the one with the
# highest test F1 and expose its predictions and metrics to the later cells.
print("BASELINE MODEL - DUMMYCLASSIFIER WITH FULL CV METRICS:")
print("=" * 60)


def _dummy_scores(y_true, y_pred):
    """Return accuracy, precision, recall and F1 for one set of predictions.

    Precision, recall and F1 are averaged according to ``AVERAGE_TYPE``.
    ``zero_division=0`` is passed to all three because some dummy strategies
    never predict one of the classes; the undefined ratio then counts as 0
    without emitting a warning.
    """
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(
            y_true, y_pred, average=AVERAGE_TYPE, zero_division=0
        ),
        "recall": recall_score(
            y_true, y_pred, average=AVERAGE_TYPE, zero_division=0
        ),
        "f1": f1_score(y_true, y_pred, average=AVERAGE_TYPE, zero_division=0),
    }


# --- CONFIG ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
dummy_strategies = ["most_frequent", "stratified", "uniform"]

# --- CV scoring ---
scoring_metrics = {
    "accuracy": "accuracy",
    "precision": f"precision_{AVERAGE_TYPE}",
    "recall": f"recall_{AVERAGE_TYPE}",
    "f1": f"f1_{AVERAGE_TYPE}",
}

# strategy name -> {"cv": {metric: (mean, std)}, "train": {...}, "test": {...}}
dummy_results = {}

# --- TEST all Dummy strategies ---
for strategy in dummy_strategies:
    print(f"\nü§ñ Testing strategy: {strategy}")

    # cast() only narrows the type for static checkers; no runtime effect
    strategy_literal = cast(
        Literal["most_frequent", "prior", "stratified", "uniform", "constant"], strategy
    )
    dummy = DummyClassifier(strategy=strategy_literal, random_state=42)

    # --- Cross-validation ---
    cv_scores = {}
    for metric_name, scoring in scoring_metrics.items():
        scores = cross_val_score(dummy, X_train, y_train, cv=cv, scoring=scoring)
        cv_scores[metric_name] = (scores.mean(), scores.std())

    # --- Fit full train / evaluate test ---
    dummy.fit(X_train, y_train)
    y_pred_train = dummy.predict(X_train)
    y_pred_test = dummy.predict(X_test)

    train_scores = _dummy_scores(y_train, y_pred_train)
    test_scores = _dummy_scores(y_test, y_pred_test)

    # --- Store results ---
    dummy_results[strategy] = {
        "cv": cv_scores,
        "train": train_scores,
        "test": test_scores,
    }

    # --- Display partial summary ---
    print("   üìä Cross-validation metrics:")
    for metric, (mean, std) in cv_scores.items():
        print(f"      {metric}: {mean:.4f} ¬± {std:.4f}")

    print("   üìä Test set metrics:")
    for metric, value in test_scores.items():
        print(f"      {metric}: {value:.4f}")

# --- Identify best Dummy strategy ---
# NOTE: selection uses test F1, so this is the strongest (most conservative)
# baseline the real models have to beat.
best_strategy = max(dummy_results.keys(), key=lambda k: dummy_results[k]["test"]["f1"])
best_f1 = dummy_results[best_strategy]["test"]["f1"]

print(f"\nüèÜ Best baseline strategy: {best_strategy}")
print(f"    Test F1: {best_f1:.4f}")

# --- Retrain the best Dummy model ---
dummy_best = DummyClassifier(strategy=best_strategy, random_state=42)
dummy_best.fit(X_train, y_train)

# Predictions + Probabilities
y_pred_train_dummy = dummy_best.predict(X_train)
y_pred_test_dummy = dummy_best.predict(X_test)
y_pred_proba_dummy = dummy_best.predict_proba(X_test)

# --- Metrics summary ---
# Read the scores of the *best* strategy from the results table. The loop
# variables train_scores / test_scores still hold the last strategy tried.
best_train_scores = dummy_results[best_strategy]["train"]
best_test_scores = dummy_results[best_strategy]["test"]

accuracy_train_dummy = best_train_scores["accuracy"]
accuracy_test_dummy = best_test_scores["accuracy"]
precision_train_dummy = best_train_scores["precision"]
precision_test_dummy = best_test_scores["precision"]
recall_train_dummy = best_train_scores["recall"]
recall_test_dummy = best_test_scores["recall"]
f1_train_dummy = best_train_scores["f1"]
f1_test_dummy = best_test_scores["f1"]

print(f"\n   Training Set (Full Fit):")
for metric, val in best_train_scores.items():
    print(f"      {metric.capitalize():<10}: {val:.4f}")

print(f"\n   Test Set:")
for metric, val in best_test_scores.items():
    print(f"      {metric.capitalize():<10}: {val:.4f}")

# --- Overfitting check ---
f1_diff_dummy = f1_train_dummy - f1_test_dummy
if abs(f1_diff_dummy) < 0.01:
    status = "‚úÖ Excellent generalization"
elif f1_diff_dummy > 0.05:
    status = "‚ö†Ô∏è Possible overfitting"
elif f1_diff_dummy > 0.02:
    status = "‚ö° Minor overfitting"
else:
    status = "‚ÑπÔ∏è Normal variance"

print(f"\n   üîç Overfitting Analysis:")
print(f"      Train-Test F1 diff: {f1_diff_dummy:+.4f}")
print(f"      Status: {status}")

# --- Classification Report ---
print(f"\nüìã Classification Report - DummyClassifier (Best Strategy):")
display(
    get_classification_report_table(
        y_test, y_pred_test_dummy, f"DummyClassifier ({best_strategy})"
    )
)

print(f"\nüéØ Baseline established! Any real model should beat test F1={best_f1:.4f}")

In [None]:
# Pass the best dummy strategy's test-set predictions to the project's confusion-matrix helper.
# NOTE(review): the helper's output (plot and/or printed table) is defined in project_utils — confirm there.
confusion_matrix_analysis(y_test, y_pred_test_dummy, model_name="Dummy Classifier")

In [None]:
# --- ROC curve for DummyClassifier (placed after Dummy model, before Logistic Regression)
#
# DummyClassifier implements predict_proba() for every strategy, so no
# capability check is needed. For 'most_frequent' the probabilities are a
# constant one-hot vector, which yields the trivial diagonal curve.
# Only the score extraction and ROC computation are guarded: they fail with
# IndexError / ValueError when the target is not binary. Plotting errors are
# real bugs and are left to propagate.
try:
    # Use probability for positive class (assumes binary labels 0/1)
    y_score_dummy = dummy_best.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_score_dummy)
except (ValueError, IndexError) as e:
    print(f'‚ö†Ô∏è Could not compute ROC for Dummy classifier: {e}')
else:
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, label=f'Dummy ({best_strategy}) ROC (AUC = {roc_auc:.3f})')
    # Chance-level reference line
    plt.plot([0, 1], [0, 1], 'k--', linewidth=0.8)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve - Dummy Classifier')
    plt.legend(loc='lower right')
    plt.grid(alpha=0.3)
    plt.show()

## 5. Logistic Regression Model (with StandardScaler Pipeline)

In [None]:
# ----------------------------------------
# LOGISTIC REGRESSION MODEL WITH CROSS-VALIDATION
# ----------------------------------------
# Scales the features and fits a logistic regression inside one Pipeline,
# so the scaler is re-fitted on the training part of every CV fold.
# Reports CV / train / test metrics, compares against the dummy baseline,
# and lists the largest coefficients.
print("LOGISTIC REGRESSION MODEL:")
print("="*50)

# --- Create Pipeline (but don't fit yet) ---
lr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

print("ü§ñ Pipeline created with StandardScaler + LogisticRegression")

# --- STEP 1: Cross-validation BEFORE fitting ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring_metrics = {
    'accuracy': 'accuracy',
    'precision': f'precision_{AVERAGE_TYPE}',
    'recall': f'recall_{AVERAGE_TYPE}',
    'f1': f'f1_{AVERAGE_TYPE}',
}

print("\nüîÑ Performing Cross-Validation (fresh models)...")
# metric name -> (mean, std) over the folds
cv_results = {}
for metric_name, scoring in scoring_metrics.items():
    scores = cross_val_score(lr_pipeline, X_train, y_train, cv=cv, scoring=scoring)
    cv_results[metric_name] = (scores.mean(), scores.std())

# --- STEP 2: Fit on full training set ---
print("üèãÔ∏è Fitting on full training set...")
lr_pipeline.fit(X_train, y_train)

# --- STEP 3: Predictions ---
y_pred_train_lr = lr_pipeline.predict(X_train)
y_pred_test_lr = lr_pipeline.predict(X_test)
y_pred_proba_lr = lr_pipeline.predict_proba(X_test)

# --- STEP 4: Calculate metrics ---
# zero_division=0 is passed to every averaged metric so an undefined ratio
# counts as 0 without emitting a warning.
# Training metrics
accuracy_train_lr = accuracy_score(y_train, y_pred_train_lr)
precision_train_lr = precision_score(y_train, y_pred_train_lr, average=AVERAGE_TYPE, zero_division=0)
recall_train_lr = recall_score(y_train, y_pred_train_lr, average=AVERAGE_TYPE, zero_division=0)
f1_train_lr = f1_score(y_train, y_pred_train_lr, average=AVERAGE_TYPE, zero_division=0)

# Test metrics
accuracy_test_lr = accuracy_score(y_test, y_pred_test_lr)
precision_test_lr = precision_score(y_test, y_pred_test_lr, average=AVERAGE_TYPE, zero_division=0)
recall_test_lr = recall_score(y_test, y_pred_test_lr, average=AVERAGE_TYPE, zero_division=0)
f1_test_lr = f1_score(y_test, y_pred_test_lr, average=AVERAGE_TYPE, zero_division=0)

# --- Display metrics ---
print("\nüìä Logistic Regression Performance:")

print(f"\n   üîÑ Cross-Validation Results ({cv.n_splits} folds):")
for metric, (mean, std) in cv_results.items():
    print(f"      {metric}: {mean:.4f} ¬± {std:.4f}")

print(f"\n   Training Set (Full Fit):")
print(f"      Accuracy:  {accuracy_train_lr:.4f}")
print(f"      Precision: {precision_train_lr:.4f}")
print(f"      Recall:    {recall_train_lr:.4f}")
print(f"      F1-Score:  {f1_train_lr:.4f}")

print(f"\n   Test Set:")
print(f"      Accuracy:  {accuracy_test_lr:.4f}")
print(f"      Precision: {precision_test_lr:.4f}")
print(f"      Recall:    {recall_test_lr:.4f}")
print(f"      F1-Score:  {f1_test_lr:.4f}")

# --- Overfitting check ---
f1_diff_lr = f1_train_lr - f1_test_lr
if abs(f1_diff_lr) < 0.01:
    overfitting_status_lr = "‚úÖ Excellent generalization"
elif f1_diff_lr > 0.05:
    overfitting_status_lr = "‚ö†Ô∏è Possible overfitting"
elif f1_diff_lr > 0.02:
    overfitting_status_lr = "‚ö° Minor overfitting"
else:
    overfitting_status_lr = "‚ÑπÔ∏è Normal variance"

print(f"\n   üîç Overfitting Analysis:")
print(f"      Train-Test F1 diff: {f1_diff_lr:+.4f}")
print(f"      Status: {overfitting_status_lr}")

# --- Comparison with baseline ---
print(f"\nüÜö Comparison with best baseline (Test F1={f1_test_dummy:.4f}):")
improvement = f1_test_lr - f1_test_dummy
if improvement > 0:
    # Relative gain is undefined for a zero baseline; report the absolute gain only.
    if f1_test_dummy > 0:
        print(f"   ‚úÖ Improvement: +{improvement:.4f} ({(improvement/f1_test_dummy)*100:.1f}%)")
    else:
        print(f"   ‚úÖ Improvement: +{improvement:.4f} (baseline was 0.0000)")
else:
    print(f"   ‚ùå Worse than baseline: {improvement:.4f}")

# --- Classification report ---
print(f"\nüìã Classification Report - Logistic Regression:")
display(get_classification_report_table(y_test, y_pred_test_lr, "Logistic Regression"))

# Importance of features

print("\nüìà Feature Importances (Logistic Regression Coefficients):")

# Retrieve the fitted scaler and model from the pipeline
scaler = lr_pipeline.named_steps["scaler"]
model = lr_pipeline.named_steps["classifier"]

# Feature names: the pipeline does not add or drop columns, so the
# training frame's column names line up with the coefficients
feature_names = X_train.columns

# Coefficients of the first (only, for a binary target) coefficient row
coefs = pd.Series(model.coef_[0], index=feature_names)

# Absolute magnitude, largest first
importance_abs = coefs.abs().sort_values(ascending=False)

# Top 15 most influential features
top_features = importance_abs.head(15)
print("\nTop 15 Features (by absolute coefficient value):")
print(top_features)

# Keep the sign alongside the magnitude for directional interpretation
coef_summary = pd.DataFrame(
    {
        "Feature": coefs.index,
        "Coefficient": coefs.values,
        "Abs_Importance": coefs.abs().values,
    }
).sort_values(by="Abs_Importance", ascending=False)

print("\nüîç Full coefficient summary (sorted):")
display(coef_summary.head(20))


# --- Save pipeline & model ---
lr_model_pipeline = lr_pipeline
lr_model = lr_pipeline.named_steps['classifier']

In [None]:
# Short aliases for the test-set results; the original variable names are
# kept so cells that expect them continue to work.
y_pred_lr, accuracy_lr = y_pred_test_lr, accuracy_test_lr
precision_lr, recall_lr, f1_lr = precision_test_lr, recall_test_lr, f1_test_lr

print(f"\n‚úÖ Logistic Regression Pipeline completed!")

In [None]:
# Pass the logistic regression test-set predictions to the project's confusion-matrix helper.
# NOTE(review): the helper's output (plot and/or printed table) is defined in project_utils — confirm there.
confusion_matrix_analysis(y_test, y_pred_lr, model_name="Logistic Regression")

## 5. Random Forest Model

In [None]:
# ----------------------------------------
# RANDOM FOREST MODEL WITH CROSS-VALIDATION
# ----------------------------------------
# Cross-validates a depth-limited, class-balanced random forest on the
# training set, then fits it on the full training set, reports train/test
# metrics, compares against the dummy baseline and lists the top features.
print("RANDOM FOREST MODEL:")
print("=" * 50)

# --- Create Random Forest pipeline ---
rf_pipeline = Pipeline(steps=[("classifier", RandomForestClassifier(
    n_estimators=200,
    max_depth=6,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features="sqrt",
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
))])

# --- Cross-validation BEFORE fitting (methodologically correct) ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring_metrics = {
    "accuracy": "accuracy",
    "precision": f"precision_{AVERAGE_TYPE}",
    "recall": f"recall_{AVERAGE_TYPE}",
    "f1": f"f1_{AVERAGE_TYPE}",
}

# metric name -> (mean, std) over the folds
cv_results_rf = {}
for metric_name, scoring in scoring_metrics.items():
    scores = cross_val_score(
        rf_pipeline, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1
    )
    cv_results_rf[metric_name] = (scores.mean(), scores.std())

# --- Now fit the model for train/test evaluation ---
# Pipeline.fit returns the pipeline itself, so rf_model is rf_pipeline
rf_model = rf_pipeline.fit(X_train, y_train)

# --- Predictions ---
y_pred_train_rf = rf_model.predict(X_train)
y_pred_test_rf = rf_model.predict(X_test)
y_pred_proba_rf = rf_model.predict_proba(X_test)

# zero_division=0 is passed to every averaged metric so an undefined ratio
# counts as 0 without emitting a warning.
# --- Metrics on train set ---
accuracy_train_rf = accuracy_score(y_train, y_pred_train_rf)
precision_train_rf = precision_score(
    y_train, y_pred_train_rf, average=AVERAGE_TYPE, zero_division=0
)
recall_train_rf = recall_score(
    y_train, y_pred_train_rf, average=AVERAGE_TYPE, zero_division=0
)
f1_train_rf = f1_score(
    y_train, y_pred_train_rf, average=AVERAGE_TYPE, zero_division=0
)

# --- Metrics on test set ---
accuracy_test_rf = accuracy_score(y_test, y_pred_test_rf)
precision_test_rf = precision_score(
    y_test, y_pred_test_rf, average=AVERAGE_TYPE, zero_division=0
)
recall_test_rf = recall_score(
    y_test, y_pred_test_rf, average=AVERAGE_TYPE, zero_division=0
)
f1_test_rf = f1_score(
    y_test, y_pred_test_rf, average=AVERAGE_TYPE, zero_division=0
)

# --- Display metrics ---
print(f"\nüìä Random Forest Performance:")

# The fold count comes from the splitter so the label cannot drift from the config
print(f"\n   üîÑ Cross-Validation ({cv.n_splits} folds, train set):")
for metric, (mean, std) in cv_results_rf.items():
    print(f"      {metric}: {mean:.4f} ¬± {std:.4f}")

print(f"\n   Training Set:")
print(f"      Accuracy:  {accuracy_train_rf:.4f}")
print(f"      Precision: {precision_train_rf:.4f}")
print(f"      Recall:    {recall_train_rf:.4f}")
print(f"      F1-Score:  {f1_train_rf:.4f}")

print(f"\n   Test Set:")
print(f"      Accuracy:  {accuracy_test_rf:.4f}")
print(f"      Precision: {precision_test_rf:.4f}")
print(f"      Recall:    {recall_test_rf:.4f}")
print(f"      F1-Score:  {f1_test_rf:.4f}")

# --- Overfitting check ---
f1_diff_rf = f1_train_rf - f1_test_rf
if abs(f1_diff_rf) < 0.01:
    overfitting_status_rf = "‚úÖ Excellent generalization"
elif f1_diff_rf > 0.10:
    overfitting_status_rf = "üö® Significant overfitting"
elif f1_diff_rf > 0.05:
    overfitting_status_rf = "‚ö†Ô∏è Moderate overfitting"
elif f1_diff_rf > 0.02:
    overfitting_status_rf = "‚ö° Minor overfitting"
else:
    overfitting_status_rf = "‚ÑπÔ∏è Normal variance"

print(f"\n   üîç Overfitting Analysis:")
print(f"      Train-Test F1 diff: {f1_diff_rf:+.4f}")
print(f"      Status: {overfitting_status_rf}")

if f1_diff_rf > 0.05:
    print(f"      üí° Consider: Reduce max_depth, increase min_samples_split/leaf")

# --- Comparison with baseline ---
print(f"\nüÜö Comparison with best baseline (Test F1={best_f1:.4f}):")
improvement_rf = f1_test_rf - best_f1
if improvement_rf > 0:
    # Relative gain is undefined for a zero baseline; report the absolute gain only.
    if best_f1 > 0:
        print(
            f"   ‚úÖ Improvement: +{improvement_rf:.4f} ({(improvement_rf/best_f1)*100:.1f}%)"
        )
    else:
        print(f"   ‚úÖ Improvement: +{improvement_rf:.4f} (baseline was 0.0000)")
else:
    print(f"   ‚ùå Worse than baseline: {improvement_rf:.4f}")

# --- Classification report ---
print(f"\nüìã Classification Report - Random Forest:")
display(get_classification_report_table(y_test, y_pred_test_rf, "Random Forest"))

# --- Feature importance ---
print(f"\nüîç Top 10 Most Important Features:")
# Column names are taken from the frame the model was fitted on
feature_importance = pd.DataFrame(
    {
        "feature": X_train.columns,
        "importance": rf_model.named_steps["classifier"].feature_importances_,
    }
).sort_values("importance", ascending=False)

for i, (_, row) in enumerate(feature_importance.head(10).iterrows()):
    print(f"   {i+1:2d}. {row['feature']:<25} : {row['importance']:.4f}")

print(f"\nüå≤ Random Forest training completed!")

# --- Save model ---
rf_model_pipeline = rf_model  # Alias kept for future compatibility

In [None]:
# Pass the random forest test-set predictions to the project's confusion-matrix helper.
# NOTE(review): the helper's output (plot and/or printed table) is defined in project_utils — confirm there.
confusion_matrix_analysis(y_test, y_pred_test_rf, model_name="Random Forest")

In [None]:
# Short aliases for the test-set results; the original variable names are
# kept so cells that expect them continue to work.
y_pred_rf, accuracy_rf = y_pred_test_rf, accuracy_test_rf
precision_rf, recall_rf, f1_rf = precision_test_rf, recall_test_rf, f1_test_rf

print(f"\n‚úÖ Random Forest Pipeline completed!")

## 6. XGBoost



In [None]:

# ----------------------------------------
# XGBOOST MODEL WITH CROSS-VALIDATION
# ----------------------------------------
# Cross-validates an XGBoost classifier on the training set, then fits it on
# the full training set, reports train/test metrics, compares against the
# dummy baseline and lists the top features.
print("XGBOOST MODEL:")
print("=" * 50)


# --- Create XGBoost pipeline ---
xgb_pipeline = Pipeline(steps=[("classifier", XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=4,
    scale_pos_weight=1,
    random_state=42,
    eval_metric="logloss",
    n_jobs=-1,
))])

# --- Cross-validation BEFORE fitting ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Scorer names follow AVERAGE_TYPE so the CV metrics use the same averaging
# as the train/test metrics computed below.
scoring_metrics = {
    "accuracy": "accuracy",
    "precision": f"precision_{AVERAGE_TYPE}",
    "recall": f"recall_{AVERAGE_TYPE}",
    "f1": f"f1_{AVERAGE_TYPE}",
}

# metric name -> (mean, std) over the folds
cv_results_xgb = {}
for metric_name, scoring in scoring_metrics.items():
    scores = cross_val_score(
        xgb_pipeline, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1
    )
    cv_results_xgb[metric_name] = (scores.mean(), scores.std())

# --- Fit the model for train/test evaluation ---
# Pipeline.fit returns the pipeline itself, so xgb_model is xgb_pipeline
xgb_model = xgb_pipeline.fit(X_train, y_train)

# --- Predictions ---
y_pred_train_xgb = xgb_model.predict(X_train)
y_pred_test_xgb = xgb_model.predict(X_test)
y_pred_proba_xgb = xgb_model.predict_proba(X_test)

# zero_division=0 is passed to every averaged metric so an undefined ratio
# counts as 0 without emitting a warning.
# --- Metrics on train set ---
accuracy_train_xgb = accuracy_score(y_train, y_pred_train_xgb)
precision_train_xgb = precision_score(
    y_train, y_pred_train_xgb, average=AVERAGE_TYPE, zero_division=0
)
recall_train_xgb = recall_score(
    y_train, y_pred_train_xgb, average=AVERAGE_TYPE, zero_division=0
)
f1_train_xgb = f1_score(
    y_train, y_pred_train_xgb, average=AVERAGE_TYPE, zero_division=0
)

# --- Metrics on test set ---
accuracy_test_xgb = accuracy_score(y_test, y_pred_test_xgb)
precision_test_xgb = precision_score(
    y_test, y_pred_test_xgb, average=AVERAGE_TYPE, zero_division=0
)
recall_test_xgb = recall_score(
    y_test, y_pred_test_xgb, average=AVERAGE_TYPE, zero_division=0
)
f1_test_xgb = f1_score(
    y_test, y_pred_test_xgb, average=AVERAGE_TYPE, zero_division=0
)

# --- Display metrics ---
print(f"\nüìä XGBoost Performance:")

# The fold count comes from the splitter so the label cannot drift from the config
print(f"\n   üîÑ Cross-Validation ({cv.n_splits} folds, train set):")
for metric, (mean, std) in cv_results_xgb.items():
    print(f"      {metric}: {mean:.4f} ¬± {std:.4f}")

print(f"\n   Training Set:")
print(f"      Accuracy:  {accuracy_train_xgb:.4f}")
print(f"      Precision: {precision_train_xgb:.4f}")
print(f"      Recall:    {recall_train_xgb:.4f}")
print(f"      F1-Score:  {f1_train_xgb:.4f}")

print(f"\n   Test Set:")
print(f"      Accuracy:  {accuracy_test_xgb:.4f}")
print(f"      Precision: {precision_test_xgb:.4f}")
print(f"      Recall:    {recall_test_xgb:.4f}")
print(f"      F1-Score:  {f1_test_xgb:.4f}")

# --- Overfitting check ---
f1_diff_xgb = f1_train_xgb - f1_test_xgb
if abs(f1_diff_xgb) < 0.01:
    overfitting_status_xgb = "‚úÖ Excellent generalization"
elif f1_diff_xgb > 0.10:
    overfitting_status_xgb = "üö® Significant overfitting"
elif f1_diff_xgb > 0.05:
    overfitting_status_xgb = "‚ö†Ô∏è Moderate overfitting"
elif f1_diff_xgb > 0.02:
    overfitting_status_xgb = "‚ö° Minor overfitting"
else:
    overfitting_status_xgb = "‚ÑπÔ∏è Normal variance"

print(f"\n   üîç Overfitting Analysis:")
print(f"      Train-Test F1 diff: {f1_diff_xgb:+.4f}")
print(f"      Status: {overfitting_status_xgb}")

if f1_diff_xgb > 0.05:
    print(f"      üí° Consider: Reduce max_depth, increase min_child_weight or regularization")

# --- Comparison with baseline ---
print(f"\nüÜö Comparison with best baseline (Test F1={best_f1:.4f}):")
improvement_xgb = f1_test_xgb - best_f1
if improvement_xgb > 0:
    # Relative gain is undefined for a zero baseline; report the absolute gain only.
    if best_f1 > 0:
        print(
            f"   ‚úÖ Improvement: +{improvement_xgb:.4f} ({(improvement_xgb/best_f1)*100:.1f}%)"
        )
    else:
        print(f"   ‚úÖ Improvement: +{improvement_xgb:.4f} (baseline was 0.0000)")
else:
    print(f"   ‚ùå Worse than baseline: {improvement_xgb:.4f}")

# --- Classification report ---
print(f"\nüìã Classification Report - XGBoost:")
display(get_classification_report_table(y_test, y_pred_test_xgb, "XGBoost"))

# --- Feature importance ---
print(f"\nüîç Top 10 Most Important Features (XGBoost):")
# Column names are taken from the frame the model was fitted on
feature_importance_xgb = pd.DataFrame(
    {
        "feature": X_train.columns,
        "importance": xgb_model.named_steps["classifier"].feature_importances_,
    }
).sort_values("importance", ascending=False)

for i, (_, row) in enumerate(feature_importance_xgb.head(10).iterrows()):
    print(f"   {i+1:2d}. {row['feature']:<25} : {row['importance']:.4f}")

print(f"\nüåü XGBoost training completed!")

# --- Save model ---
xgb_model_pipeline = xgb_model

# Store variables for later comparison (keeping original names for compatibility)
y_pred_xgb = y_pred_test_xgb
accuracy_xgb = accuracy_test_xgb
precision_xgb = precision_test_xgb
recall_xgb = recall_test_xgb
f1_xgb = f1_test_xgb

## 6. Model Comparison and Selection

In [None]:
# ================================================
# COMPREHENSIVE MODEL COMPARISON WITH CV RESULTS
# ================================================
# Gathers the CV and test metrics of every model trained above, prints a
# comparison table, ranks the models by test F1 and by CV F1, selects the
# best one by test F1 and compares each model's CV F1 with its test F1.
print("MODEL COMPARISON AND SELECTION:")
print("="*50)

# Prepare comprehensive comparison data
# model name -> {"cv": {metric: (mean, std)}, "test": {metric: value}}
models_summary = {
    "DummyClassifier (Baseline)": {
        "cv": dummy_results[best_strategy]["cv"],
        "test": {
            "accuracy": accuracy_test_dummy,
            "precision": precision_test_dummy,
            "recall": recall_test_dummy,
            "f1": f1_test_dummy,
        },
        "strategy": best_strategy,
    },
    "Logistic Regression": {
        "cv": cv_results,
        "test": {
            "accuracy": accuracy_test_lr,
            "precision": precision_test_lr,
            "recall": recall_test_lr,
            "f1": f1_test_lr,
        },
    },
    "Random Forest": {
        "cv": cv_results_rf,
        "test": {
            "accuracy": accuracy_test_rf,
            "precision": precision_test_rf,
            "recall": recall_test_rf,
            "f1": f1_test_rf,
        },
    },
    "XGBoost": {
        "cv": cv_results_xgb,
        "test": {
            "accuracy": accuracy_test_xgb,
            "precision": precision_test_xgb,
            "recall": recall_test_xgb,
            "f1": f1_test_xgb,
        },
    },
}

# ================================================
# DETAILED COMPARISON TABLE
# ================================================
print("\nüìä DETAILED MODEL COMPARISON:")
print("-" * 80)
print(f"{'Model':<20} {'CV F1':<15}  {'Test F1':<16}  {'CV Stability':<12}")
print("-" * 80)

model_rankings = []

for model_name, results in models_summary.items():
    # Every "cv" entry is a (mean, std) tuple; the std across folds is used
    # as the stability measure.
    cv_f1_mean, cv_stability = results['cv'].get('f1', (0, 0))
    test_f1 = results['test'].get('f1', 0)

    # Stability assessment
    if cv_stability < 0.01:
        stability_emoji = "‚úÖ"
    elif cv_stability < 0.02:
        stability_emoji = "üëç"
    elif cv_stability < 0.05:
        stability_emoji = "‚ö†Ô∏è"
    else:
        stability_emoji = "üö®"

    print(f"{model_name:<20} "
          f"{cv_f1_mean:<15.4f} "
          f"{test_f1:<16.4f} "
          f"{stability_emoji} {cv_stability:<10.4f}")

    # Store for ranking
    model_rankings.append({
        'name': model_name,
        'cv_f1': cv_f1_mean,
        'test_f1': test_f1,
        'stability': cv_stability
    })

# ================================================
# MODEL RANKING AND SELECTION
# ================================================
print(f"\nüèÜ MODEL RANKING:")
print("-" * 50)

# Rank by test F1-weighted (primary metric)
model_rankings.sort(key=lambda x: x['test_f1'], reverse=True)

print("üìà By Test F1 Score:")
for i, model in enumerate(model_rankings, 1):
    medal = "ü•á" if i == 1 else "ü•à" if i == 2 else "ü•â" if i == 3 else f"{i}."
    print(f"   {medal} {model['name']:<25}: {model['test_f1']:.4f}")

# Rank by CV F1-weighted (reliability check)
model_rankings_cv = sorted(model_rankings, key=lambda x: x['cv_f1'], reverse=True)
print("\nüîÑ By CV F1 Score (Training Reliability):")
for i, model in enumerate(model_rankings_cv, 1):
    medal = "ü•á" if i == 1 else "ü•à" if i == 2 else "ü•â" if i == 3 else f"{i}."
    print(f"   {medal} {model['name']:<25}: {model['cv_f1']:.4f}")

# ================================================
# BEST MODEL SELECTION
# ================================================
best_model = model_rankings[0]  # Best by test F1-weighted
best_model_name = best_model['name']

print(f"\nüéØ FINAL MODEL SELECTION:")
print(f"   Best Model: {best_model_name}")
print(f"   Test F1: {best_model['test_f1']:.4f}")
print(f"   CV F1: {best_model['cv_f1']:.4f}")
print(f"   CV Stability (std): {best_model['stability']:.4f}")

# Performance improvement over baseline
baseline_perf = [m for m in model_rankings if 'Dummy' in m['name']][0]['test_f1']
improvement = best_model['test_f1'] - baseline_perf

if best_model['test_f1'] > baseline_perf:
    if baseline_perf > 0:
        improvement_pct = (improvement / baseline_perf) * 100
        print(f"   üìà Improvement over baseline: +{improvement:.4f} ({improvement_pct:.1f}%)")
    else:
        print(f"   üìà Improvement over baseline: +{improvement:.4f} (baseline was 0.0000)")
elif improvement == 0:
    # Reached when the baseline itself ranks first (or ties the best model)
    print(f"   üü∞ Same performance as baseline: {best_model['test_f1']:.4f}")
else:
    print(f"   ‚ö†Ô∏è Performance issue: Model performs worse than baseline by {abs(improvement):.4f}!")

# ================================================
# OVERFITTING ANALYSIS
# ================================================
print(f"\nüîç OVERFITTING ANALYSIS:")
print("-" * 30)

for model in model_rankings:
    # Positive diff: the model scored higher in CV than on the held-out test set
    cv_test_diff = model['cv_f1'] - model['test_f1']

    if abs(cv_test_diff) < 0.01:
        status = "‚úÖ Excellent generalization"
    elif cv_test_diff > 0.05:
        status = "‚ö†Ô∏è Possible overfitting"
    elif cv_test_diff > 0.02:
        status = "‚ö° Minor overfitting"
    elif cv_test_diff < -0.02:
        status = "üìâ Underfitting"
    else:
        status = "üëç Good generalization"

    print(f"{model['name']:<25}: CV-Test diff = {cv_test_diff:+.4f} ‚Üí {status}")

print(f"\n‚úÖ Model comparison completed!")
print(f"üéØ Recommended model: {best_model_name}")

# Store results for potential further use
comparison_results = {
    'best_model': best_model_name,
    'rankings': model_rankings,
    'summary': models_summary
}

## 7. Detailed Performance Analysis by Class



In [None]:
# ================================================
# DETAILED PERFORMANCE ANALYSIS BY CLASS
# ================================================
print("DETAILED PERFORMANCE ANALYSIS BY CLASS:")
print("="*60)

# Collect predictions from all models
model_predictions = {
    'DummyClassifier (Baseline)': y_pred_test_dummy,
    'Logistic Regression': y_pred_test_lr,
    'Random Forest': y_pred_test_rf,
    'XGBoost': y_pred_test_xgb
}

# Class labels mapping
class_labels = {0: "Stayed (Class 0)", 1: "Left (Class 1)"}

print(f"\nüìä CLASS DISTRIBUTION IN TEST SET:")
print("-" * 40)
test_class_counts = y_test.value_counts().sort_index()
for class_val, count in test_class_counts.items():
    percentage = (count / len(y_test)) * 100
    print(f"   {class_labels[class_val]:<20}: {count:3d} samples ({percentage:.1f}%)")

# ================================================
# PER-CLASS METRICS FOR EACH MODEL
# ================================================
print(f"\nüìà PER-CLASS PERFORMANCE METRICS:")
print("="*60)

for model_name, y_pred in model_predictions.items():
    print(f"\nüî∏ {model_name}:")
    print("-" * 50)
    
    # Calculate per-class metrics
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    
    # Display metrics for each class
    for class_val in [0, 1]:
        class_str = str(class_val)
        if class_str in report:
            metrics = report[class_str]
            print(f"\n   {class_labels[class_val]}:")
            print(f"      Precision: {metrics['precision']:.4f}")
            print(f"      Recall:    {metrics['recall']:.4f}")
            print(f"      F1-Score:  {metrics['f1-score']:.4f}")
            print(f"      Support:   {metrics['support']:.0f} samples")
    
    # Overall metrics
    print(f"\n   üìä Overall Performance:")
    print(f"      Accuracy:     {report['accuracy']:.4f}")
    print(f"      Macro Avg F1: {report['macro avg']['f1-score']:.4f}")
    print(f"      Weighted F1:  {report['weighted avg']['f1-score']:.4f}")

# ================================================
# CLASS-SPECIFIC COMPARISON TABLE
# ================================================
print(f"\nüîç CLASS-SPECIFIC COMPARISON ACROSS MODELS:")
print("="*70)

# A model's report does not depend on which class is being displayed, so
# compute it once per model here instead of once per (class, model) pair.
reports_by_model = {
    model_name: classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    for model_name, y_pred in model_predictions.items()
}

# Create comparison table for each class
for class_val in [0, 1]:
    class_str = str(class_val)  # per-class report entries are keyed by str(label)

    print(f"\nüìä {class_labels[class_val]} Performance:")
    print("-" * 55)
    print(f"{'Model':<25} {'Precision':<10} {'Recall':<10} {'F1-Score':<10}")
    print("-" * 55)

    # One row per model whose report contains this class.
    class_comparisons = [
        {
            'model': model_name,
            'precision': report[class_str]['precision'],
            'recall': report[class_str]['recall'],
            'f1': report[class_str]['f1-score'],
        }
        for model_name, report in reports_by_model.items()
        if class_str in report
    ]

    for row in class_comparisons:
        print(f"{row['model']:<25} {row['precision']:<10.4f} {row['recall']:<10.4f} {row['f1']:<10.4f}")

    # Find best model for this class (max() keeps the first model on ties)
    if class_comparisons:
        best_f1_model = max(class_comparisons, key=lambda x: x['f1'])
        best_recall_model = max(class_comparisons, key=lambda x: x['recall'])

        print(f"\n   üèÜ Best F1-Score:  {best_f1_model['model']} ({best_f1_model['f1']:.4f})")
        print(f"   üéØ Best Recall:    {best_recall_model['model']} ({best_recall_model['recall']:.4f})")

# ================================================
# CONFUSION MATRIX COMPARISON
# ================================================
print(f"\nüîç CONFUSION MATRIX COMPARISON:")
print("="*50)

for model_name, y_pred in model_predictions.items():
    print(f"\nüî∏ {model_name}:")

    # Pin the label order so the matrix is always 2x2 (rows = actual,
    # columns = predicted, order [0, 1]). Without `labels`, a split in which
    # only one class occurs yields a 1x1 matrix and the cell lookups below
    # would raise IndexError before any shape check could run.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    print("   Confusion Matrix:")
    print(f"               Predicted")
    print(f"             Stayed  Left")
    print(f"   Actual Stayed  {tn:3d}   {fp:3d}")
    print(f"          Left    {fn:3d}   {tp:3d}")

    # Derived rates; each falls back to 0 when its denominator is empty.
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0  # True Positive Rate (Recall for Class 1)
    tnr = tn / (tn + fp) if (tn + fp) > 0 else 0  # True Negative Rate (Recall for Class 0)
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0  # False Positive Rate
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0  # False Negative Rate

    print(f"\n   üìä Key Rates:")
    print(f"      True Positive Rate (Sensitivity):  {tpr:.4f}")
    print(f"      True Negative Rate (Specificity):  {tnr:.4f}")
    print(f"      False Positive Rate:               {fpr:.4f}")
    print(f"      False Negative Rate:               {fnr:.4f}")

# ================================================
# BUSINESS IMPACT ANALYSIS
# ================================================
print(f"\nüíº BUSINESS IMPACT ANALYSIS:")
print("="*50)

print(f"\nüìà Key Business Metrics:")
print("-" * 30)


def _percent_of(part, whole):
    """Return ``part`` as a percentage of ``whole``, or 0.0 when ``whole`` is zero."""
    return (part / whole) * 100 if whole > 0 else 0.0


for model_name, y_pred in model_predictions.items():
    # Fixed label order guarantees a 2x2 matrix, so the unpacking is always valid.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    # Business interpretations (class 1 = left, class 0 = stayed)
    correctly_identified_leavers = tp
    missed_leavers = fn
    false_alarms = fp
    correctly_identified_stayers = tn

    total_actual_leavers = tp + fn
    total_actual_stayers = tn + fp

    # Percentages go through _percent_of so an empty class reports 0.0%
    # instead of dividing by zero.
    print(f"\nüî∏ {model_name}:")
    print(f"   Correctly identified employees who left:  {correctly_identified_leavers}/{total_actual_leavers} ({_percent_of(correctly_identified_leavers, total_actual_leavers):.1f}%)")
    print(f"   Missed employees who left:                {missed_leavers}/{total_actual_leavers} ({_percent_of(missed_leavers, total_actual_leavers):.1f}%)")
    print(f"   False alarms (predicted left but stayed): {false_alarms}/{total_actual_stayers} ({_percent_of(false_alarms, total_actual_stayers):.1f}%)")

    # Cost-benefit consideration: share of actual leavers the model flagged
    if total_actual_leavers > 0:
        prevention_success_rate = correctly_identified_leavers / total_actual_leavers
        print(f"   Potential intervention success rate:       {prevention_success_rate:.1%}")

print(f"\n‚úÖ Detailed class-specific analysis completed!")
print(f"üí° Use these insights to understand which employees your model identifies best.")

### Precision-Recall curves for the baseline (DummyClassifier), Logistic Regression, Random Forest, and XGBoost

In [None]:

# Predicted probabilities of each model, keyed by the name used in the legend.
models_proba = {
    "DummyClassifier (Baseline)": y_pred_proba_dummy,
    "Logistic Regression": y_pred_proba_lr,
    "Random Forest": y_pred_proba_rf,
    "XGBoost": y_pred_proba_xgb,
}

# Line colour per model; a model missing from this mapping gets color=None.
model_colors = {
    "DummyClassifier (Baseline)": "gray",
    "Logistic Regression": "blue",
    "Random Forest": "green",
    "XGBoost": "orange",
}

plt.figure(figsize=(8, 6))

for model_name, y_pred_proba in models_proba.items():
    # A 2-D array contributes its second column (index 1) as the score;
    # a 1-D array is used unchanged.
    if y_pred_proba.ndim > 1:
        y_scores = y_pred_proba[:, 1]
    else:
        y_scores = y_pred_proba

    precision, recall, _ = precision_recall_curve(y_test, y_scores)
    average_precision = average_precision_score(y_test, y_scores)

    curve_label = f"{model_name} (AP = {average_precision:.2f})"
    plt.plot(recall, precision, label=curve_label, color=model_colors.get(model_name))

# Axis labels, title, legend and a light grid.
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve Comparison")
plt.legend()
plt.grid(alpha=0.3)
plt.show()

In [None]:


# === 1. Source data ===
# Test-set recall of class 1 for each model, in the same order as "Model".
# recall_score with pos_label=1 gives the per-class recall directly, so there
# is no need to build a full classification report per model and look the
# value up under the string key "1".
recall_class1_test = [
    recall_score(y_test, y_pred, pos_label=1, zero_division=0)
    for y_pred in (
        y_pred_test_dummy,
        y_pred_test_lr,
        y_pred_test_rf,
        y_pred_test_xgb,
    )
]

recap_data = {
    "Model": [
        "DummyClassifier (Baseline)",
        "Logistic Regression",
        "Random Forest",
        "XGBoost",
    ],
    "Accuracy Train": [
        accuracy_train_dummy,
        accuracy_train_lr,
        accuracy_train_rf,
        accuracy_train_xgb,
    ],
    "Accuracy Test": [
        accuracy_test_dummy,
        accuracy_test_lr,
        accuracy_test_rf,
        accuracy_test_xgb,
    ],
    "Recall Train": [
        recall_train_dummy,
        recall_train_lr,
        recall_train_rf,
        recall_train_xgb,
    ],
    "Recall Test": [recall_test_dummy, recall_test_lr, recall_test_rf, recall_test_xgb],
    "F1 Train": [f1_train_dummy, f1_train_lr, f1_train_rf, f1_train_xgb],
    "F1 Test": [f1_test_dummy, f1_test_lr, f1_test_rf, f1_test_xgb],
    # Train-minus-test F1 gap; left empty (None) for the baseline.
    "Overfitting (Train-Test F1)": [
        None,
        f1_train_lr - f1_test_lr,
        f1_train_rf - f1_test_rf,
        f1_train_xgb - f1_test_xgb,
    ],
    "Recall Class 1 (Test)": recall_class1_test,
}

# === 2. Build the DataFrame ===
recap_df = pd.DataFrame(recap_data)

# === 3. Numeric columns to visualise ===
metrics_cols = [
    "Accuracy Train",
    "Accuracy Test",
    "Recall Train",
    "Recall Test",
    "F1 Train",
    "F1 Test",
    "Overfitting (Train-Test F1)",
    "Recall Class 1 (Test)",
]

# === 4. Heatmap input: one row per model, every metric cast to float ===
heat_df = recap_df.set_index("Model").loc[:, metrics_cols].astype(float)

# === 5. Render with Seaborn ===
heatmap_options = {
    "annot": True,  # write the value inside each cell
    "fmt": ".2f",  # two decimals
    "cmap": "YlGnBu",  # yellow-green-blue palette
    "linewidths": 0.5,  # thin separators between cells
    "cbar": True,  # show the colour bar
}

plt.figure(figsize=(10, 4))
sns.heatmap(heat_df, **heatmap_options)
plt.title("Model Performance Comparison", fontsize=13, pad=12)
plt.tight_layout()
plt.show()