In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system manangement
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import os
import pandas as pd

# Changer le répertoire courant
os.chdir(r"C:\Users\yacin\Desktop\DATA SCIENCE OPENCLASSROOMS\Projet 8 MLOPS2\datasets")

# Afficher les fichiers pour vérifier
print(os.listdir())

['test_final.csv', 'train_final.csv']


In [3]:
train_final = pd.read_csv("train_final.csv")

In [4]:
test_final = pd.read_csv("test_final.csv")

In [5]:
train_final.shape

(307511, 598)

In [6]:
test_final.shape

(48744, 597)

## Fusion des données 

In [7]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

Pourquoi un seuil à 0.1 : Par défaut, le seuil est 0.5.
Mais dans ton projet, accorder un crédit à un mauvais client (FN) est très coûteux.

Donc, en abaissant le seuil à 0.1, tu dis :

    "Même si j’ai seulement 10% de certitude qu’un client est mauvais, je préfère le classer comme tel pour éviter une perte potentielle."

In [8]:
#XGBOOST

In [9]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.metrics import (
    precision_score, recall_score, f1_score, roc_auc_score,
    accuracy_score, confusion_matrix, classification_report
)
from xgboost import XGBClassifier
import optuna
import numpy as np
import matplotlib.pyplot as plt

# --------- Split des données ---------
X = train_final.drop(columns=["TARGET"])
y = train_final["TARGET"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --------- Fonctions utiles ---------
def compute_business_cost(y_true, y_pred, fn_cost=10, fp_cost=1):
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    return fn * fn_cost + fp * fp_cost

def find_best_threshold(y_true, y_probas, thresholds=np.linspace(0.1, 0.9, 81), fn_cost=10, fp_cost=1):
    best_threshold = 0.5
    lowest_cost = float("inf")
    cost_per_threshold = []

    for threshold in thresholds:
        y_pred = (y_probas >= threshold).astype(int)
        cost = compute_business_cost(y_true, y_pred, fn_cost, fp_cost)
        cost_per_threshold.append((threshold, cost))
        if cost < lowest_cost:
            best_threshold = threshold
            lowest_cost = cost

    return best_threshold, lowest_cost, cost_per_threshold

# --------- Définition de l'espace des hyperparamètres XGBoost ---------
param_space_xgb = {
    "n_estimators": ("int", 50, 150),
    "max_depth": ("int", 3, 6),
    "learning_rate": ("float", 0.01, 0.3),
    "subsample": ("float", 0.6, 1.0),
    "colsample_bytree": ("float", 0.6, 1.0),
    "gamma": ("float", 0.0, 5.0),
    "reg_lambda": ("float", 0.001, 10.0),
    "reg_alpha": ("float", 0.001, 10.0),
}

# --------- Optimisation avec Optuna ---------
def objective(trial):
    params = {
        key: (
            trial.suggest_int(key, *space[1:]) if space[0] == "int"
            else trial.suggest_float(key, *space[1:]) if space[0] == "float"
            else trial.suggest_categorical(key, space)
        )
        for key, space in param_space_xgb.items()
    }

    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', **params)
    y_val_proba = cross_val_predict(
        model,
        X_train,
        y_train,
        cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
        method='predict_proba',
        n_jobs=-1
    )[:, 1]

    y_val_pred = (y_val_proba >= 0.5).astype(int)
    return compute_business_cost(y_train, y_val_pred)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)
best_params = study.best_params

# --------- Entraînement et évaluation finale ---------
best_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', **best_params)
best_model.fit(X_train, y_train)

y_test_proba = best_model.predict_proba(X_test)[:, 1]
best_threshold, cost, cost_curve = find_best_threshold(y_test, y_test_proba)
y_test_pred = (y_test_proba >= best_threshold).astype(int)

# --------- Résultats ---------
print(f"Meilleur seuil trouvé : {best_threshold:.2f}")
print(f"Coût métier total : {cost}")
print("\nRapport de classification :\n")
print(classification_report(y_test, y_test_pred, digits=3))


[I 2025-08-20 15:40:49,459] A new study created in memory with name: no-name-756a701a-9cc3-4207-ba49-f341b7007783
[I 2025-08-20 15:41:08,766] Trial 0 finished with value: 198760.0 and parameters: {'n_estimators': 59, 'max_depth': 4, 'learning_rate': 0.03362092976141023, 'subsample': 0.602338704052726, 'colsample_bytree': 0.8693855634419014, 'gamma': 3.922610949101106, 'reg_lambda': 5.899250157807242, 'reg_alpha': 5.215501890680464}. Best is trial 0 with value: 198760.0.
[I 2025-08-20 15:41:34,321] Trial 1 finished with value: 190372.0 and parameters: {'n_estimators': 107, 'max_depth': 4, 'learning_rate': 0.24804846059426322, 'subsample': 0.7633391440043886, 'colsample_bytree': 0.9644068618572104, 'gamma': 3.2662701047649922, 'reg_lambda': 1.7547098069883518, 'reg_alpha': 8.543591329611194}. Best is trial 1 with value: 190372.0.
[I 2025-08-20 15:42:05,862] Trial 2 finished with value: 197196.0 and parameters: {'n_estimators': 136, 'max_depth': 4, 'learning_rate': 0.05115666524367053, 's

Meilleur seuil trouvé : 0.10
Coût métier total : 32124

Rapport de classification :

              precision    recall  f1-score   support

           0      0.956     0.793     0.867     56554
           1      0.199     0.588     0.297      4949

    accuracy                          0.776     61503
   macro avg      0.578     0.690     0.582     61503
weighted avg      0.896     0.776     0.821     61503

