In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system manangement
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# === Reproductibilité ===
import os, random, numpy as np
os.environ["PYTHONHASHSEED"] = "42"
random.seed(42)
np.random.seed(42)

# XGBoost / Sklearn: on passera random_state=42 partout où c'est possible
# Optuna: on fige aussi son sampler


In [2]:
import os
import pandas as pd

# Changer le répertoire courant
os.chdir(r"C:\Users\yacin\Desktop\DATA SCIENCE OPENCLASSROOMS\Projet 8 MLOPS2\datasets")

# Afficher les fichiers pour vérifier
print(os.listdir())

['test_final.csv', 'train_final.csv']


In [3]:
train_final = pd.read_csv("train_final.csv")

In [4]:
test_final = pd.read_csv("test_final.csv")

In [5]:
train_final.shape

(307511, 598)

In [6]:
test_final.shape

(48744, 597)

## Fusion des données 

In [7]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

Pourquoi un seuil à 0.1 : Par défaut, le seuil est 0.5.
Mais dans ton projet, accorder un crédit à un mauvais client (FN) est très coûteux.

Donc, en abaissant le seuil à 0.1, tu dis :

    "Même si j’ai seulement 10% de certitude qu’un client est mauvais, je préfère le classer comme tel pour éviter une perte potentielle."

In [8]:
#XGBOOST

In [10]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.metrics import (
    precision_score, recall_score, f1_score, roc_auc_score,
    accuracy_score, confusion_matrix, classification_report
)
from xgboost import XGBClassifier
import optuna
import numpy as np
import matplotlib.pyplot as plt

# --------- Split des données ---------
X = train_final.drop(columns=["TARGET"])
y = train_final["TARGET"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --------- Fonctions utiles ---------
def compute_business_cost(y_true, y_pred, fn_cost=10, fp_cost=1):
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    return fn * fn_cost + fp * fp_cost

def find_best_threshold(y_true, y_probas, thresholds=np.linspace(0.1, 0.9, 81), fn_cost=10, fp_cost=1):
    best_threshold = 0.5
    lowest_cost = float("inf")
    cost_per_threshold = []

    for threshold in thresholds:
        y_pred = (y_probas >= threshold).astype(int)
        cost = compute_business_cost(y_true, y_pred, fn_cost, fp_cost)
        cost_per_threshold.append((threshold, cost))
        if cost < lowest_cost:
            best_threshold = threshold
            lowest_cost = cost

    return best_threshold, lowest_cost, cost_per_threshold

# --------- Définition de l'espace des hyperparamètres XGBoost ---------
param_space_xgb = {
    "n_estimators": ("int", 50, 150),
    "max_depth": ("int", 3, 6),
    "learning_rate": ("float", 0.01, 0.3),
    "subsample": ("float", 0.6, 1.0),
    "colsample_bytree": ("float", 0.6, 1.0),
    "gamma": ("float", 0.0, 5.0),
    "reg_lambda": ("float", 0.001, 10.0),
    "reg_alpha": ("float", 0.001, 10.0),
}

# --------- Optimisation avec Optuna ---------
def objective(trial):
    params = {
        key: (
            trial.suggest_int(key, *space[1:]) if space[0] == "int"
            else trial.suggest_float(key, *space[1:]) if space[0] == "float"
            else trial.suggest_categorical(key, space)
        )
        for key, space in param_space_xgb.items()
    }

    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', **params)
    y_val_proba = cross_val_predict(
        model,
        X_train,
        y_train,
        cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
        method='predict_proba',
        n_jobs=-1
    )[:, 1]

    y_val_pred = (y_val_proba >= 0.5).astype(int)
    return compute_business_cost(y_train, y_val_pred)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)
best_params = study.best_params

# --------- Entraînement et évaluation finale ---------
best_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', **best_params)
best_model.fit(X_train, y_train)

y_test_proba = best_model.predict_proba(X_test)[:, 1]
best_threshold, cost, cost_curve = find_best_threshold(y_test, y_test_proba)
y_test_pred = (y_test_proba >= best_threshold).astype(int)

# --------- Résultats ---------
print(f"Meilleur seuil trouvé : {best_threshold:.2f}")
print(f"Coût métier total : {cost}")
print("\nRapport de classification :\n")
print(classification_report(y_test, y_test_pred, digits=3))


[I 2025-08-20 16:00:44,362] A new study created in memory with name: no-name-baa5187b-dd5f-48d9-9824-01c62a2dc111
[I 2025-08-20 16:01:13,046] Trial 0 finished with value: 192050.0 and parameters: {'n_estimators': 145, 'max_depth': 3, 'learning_rate': 0.1688201642023821, 'subsample': 0.6160060578874623, 'colsample_bytree': 0.9725447203285285, 'gamma': 4.626428741194067, 'reg_lambda': 0.7797463468279101, 'reg_alpha': 4.750607910478342}. Best is trial 0 with value: 192050.0.
[I 2025-08-20 16:01:30,990] Trial 1 finished with value: 190485.0 and parameters: {'n_estimators': 51, 'max_depth': 5, 'learning_rate': 0.2963890349644768, 'subsample': 0.6573998590458774, 'colsample_bytree': 0.6110355449054009, 'gamma': 0.4653729121712702, 'reg_lambda': 4.157649868674675, 'reg_alpha': 5.451013090905095}. Best is trial 1 with value: 190485.0.
[I 2025-08-20 16:01:53,052] Trial 2 finished with value: 198658.0 and parameters: {'n_estimators': 95, 'max_depth': 3, 'learning_rate': 0.04807493774404145, 'sub

Meilleur seuil trouvé : 0.10
Coût métier total : 31502

Rapport de classification :

              precision    recall  f1-score   support

           0      0.958     0.790     0.866     56554
           1      0.201     0.603     0.302      4949

    accuracy                          0.775     61503
   macro avg      0.579     0.697     0.584     61503
weighted avg      0.897     0.775     0.821     61503



In [13]:
# On part du modèle déjà entraîné ci-dessus: best_model
# et on réutilise le meilleur seuil trouvé: best_threshold

# 1) Charger le jeu d’inférence
X_prod = test_final.copy()                # (48744, 597)


# ⚠️ Assure-toi que X_prod a les MÊMES colonnes que X_train (même ordre si nécessaire)
# Si tu as gardé la liste d'entraînement :
# X_prod = X_prod[X_train.columns]

# 2) Prédire
proba_prod = best_model.predict_proba(X_prod)[:, 1]
pred_prod  = (proba_prod >= best_threshold).astype(int)

# 3) Sauvegarder
preds_df = X_prod.copy()
preds_df["score"] = proba_prod
preds_df["prediction"] = pred_prod


print("✅ Fichier créé: datasets/predictions_test_final.csv")
print(preds_df)

✅ Fichier créé: datasets/predictions_test_final.csv
       SK_ID_CURR  NAME_CONTRACT_TYPE  FLAG_OWN_CAR  FLAG_OWN_REALTY  \
0       -0.000003                 0.0           0.0              1.0   
1        0.000008                 0.0           0.0              1.0   
2        0.000031                 0.0           1.0              1.0   
3        0.000073                 0.0           0.0              1.0   
4        0.000101                 0.0           1.0              0.0   
...           ...                 ...           ...              ...   
48739    0.999905                 0.0           0.0              1.0   
48740    0.999907                 0.0           0.0              0.0   
48741    0.999910                 0.0           1.0              1.0   
48742    0.999913                 0.0           0.0              0.0   
48743    0.999986                 0.0           1.0              0.0   

       CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
0          0.0000

In [15]:
import os
import joblib

# Création du dossier si nécessaire
os.makedirs("models", exist_ok=True)

# Sauvegarde du modèle
joblib.dump(best_model, "models/best_model.pkl")
print("✅ Modèle sauvegardé dans models/best_model.pkl")

✅ Modèle sauvegardé dans models/best_model.pkl
