<b><font color="SteelBlue" size="+3">Implémentez un modèle de scoring</font></b>

# Introduction

Ce notebook fait suite à l'EDA : il présente le prétraitement des données, le feature engineering et la modélisation.

In [1]:
# Chargement des librairies

# Built-in
import os
import re
import time
# Affichage
from IPython.display import Image

# Manipulation et analyse des données
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning - Suivi et Enregistrement
import mlflow
import mlflow.sklearn

# Machine Learning - Prétraitement
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.utils.class_weight import compute_class_weight

# Machine Learning - Algorithmes
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool

# Machine Learning - Équilibrage des données
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Machine Learning - Évaluation des performances
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, make_scorer
from sklearn.metrics import f1_score

# Features importance
import shap

# Transformateur
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin

# Data drift
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
import joblib

# Custom Pipeline
from feature_pipeline import FeatureEngineeringPipeline, remove_highly_correlated_features
from models import PipelineWithDriftDetection, prepare_pip_data, ThresholdClassifier

  from .autonotebook import tqdm as notebook_tqdm


# Data préparation

## Chargement des données

In [2]:
os.listdir("data/Sources/")

['application_test.csv',
 'application_train.csv',
 'bureau.csv',
 'bureau_balance.csv',
 'credit_card_balance.csv',
 'HomeCredit_columns_description.csv',
 'installments_payments.csv',
 'param',
 'POS_CASH_balance.csv',
 'previous_application.csv',
 'Projet+Mise+en+prod+-+home-credit-default-risk.zip',
 'sample_submission.csv']

In [3]:
PATH = "./data/Sources/"

In [4]:
# Load the raw application tables.
# os.path.join avoids the doubled separator that PATH + "/..." produced (PATH already ends with "/").
application_train = pd.read_csv(os.path.join(PATH, "application_train.csv"))
application_test = pd.read_csv(os.path.join(PATH, "application_test.csv"))

In [5]:
application_train.shape

(307511, 122)

## Split du jeu de données

In [6]:
# Division des données en ensembles d'entraînement et de test
train, test = train_test_split(application_train, test_size=0.2, random_state=42, stratify=application_train['TARGET'])

In [7]:
train.shape

(246008, 122)

In [8]:
train.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
181648,310536,0,Cash loans,F,N,N,2,90000.0,227520.0,13189.5,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1.0
229245,365516,0,Cash loans,M,Y,Y,0,90000.0,161730.0,13095.0,...,0,0,0,0,,,,,,
122525,242055,1,Cash loans,M,N,Y,0,135000.0,728847.0,26307.0,...,0,0,0,0,0.0,0.0,2.0,0.0,0.0,2.0
306311,454894,1,Cash loans,M,N,N,0,135000.0,474183.0,34636.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
300658,448321,0,Cash loans,F,N,Y,0,180000.0,254700.0,27558.0,...,0,0,0,0,,,,,,


## Sans imputation

In [9]:
# Initialiser le pipeline
pipeline = FeatureEngineeringPipeline(data_directory=PATH, impute=False)

In [None]:
train_transformed = pipeline.fit(train)

feature_engineering - done
Bureau and bureau_balance data - done in 55s
previous_application - done in 52s
previous applications balances - done in 461s
Colonnes supprimées (Constantes): 0
Variables avec + de 50% de valeurs manquantes: 247
Colonnes supprimées (hautement corrélées): 125


In [None]:
train_transformed.shape

In [None]:
train_transformed.head(5)

In [None]:
test_transformed = pipeline.transform(test)

In [None]:
test_transformed.shape

In [None]:
test_transformed.head(5)

In [None]:
# Séparer les features et la cible sur le jeu sans imputation
X_train = train_transformed
y_train = train['TARGET']
X_test = test_transformed
y_test = test['TARGET']

## Avec imputation

In [None]:
# Initialiser le pipeline
pipeline_imp = FeatureEngineeringPipeline(data_directory=PATH, impute=True)

In [None]:
train_imputed_transformed = pipeline_imp.fit(train)

In [None]:
train_imputed_transformed.shape

In [None]:
test_imputed_transformed = pipeline_imp.transform(test)

In [None]:
test_imputed_transformed.shape

In [None]:
# Séparer les features et la cible sur le jeu imputé
X_train_imputed = train_imputed_transformed
y_train_imputed = train['TARGET']
X_test_imputed = test_imputed_transformed
y_test_imputed = test['TARGET']

## Sauvegarde / Chargement

In [None]:
# Fonction pour sauvegarder un DataFrame
def save_dataframe(df, filename, output_dir):
    """Write ``df`` as a CSV file (index not written) to ``output_dir/filename``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    filename : str
        Name of the CSV file to create inside ``output_dir``.
    output_dir : str
        Destination directory. It is created (including parents) when missing.
        An empty string means "current working directory".

    Returns
    -------
    None
        The destination path is printed.
    """
    # exist_ok=True removes the check-then-create race of the previous version;
    # the guard is needed because os.makedirs("") raises FileNotFoundError.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, filename)
    df.to_csv(output_path, index=False)
    print(f"DataFrame sauvegardé sous : {output_path}")

In [None]:
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)
train_imp_df = pd.concat([X_train_imputed, y_train_imputed], axis=1)
test_imp_df = pd.concat([X_test_imputed, y_test_imputed], axis=1)

In [None]:
# Persist the prepared train/test sets (with and without imputation).
# The directory is relative to the notebook, like PATH and like the reload cell below;
# the previous value "/data/Cleaned/" pointed at the filesystem root.
output_dir = "data/Cleaned/"
save_dataframe(train_df, "train_df.csv", output_dir)
save_dataframe(test_df, "test_df.csv", output_dir)
save_dataframe(train_imp_df, "train_imp_df.csv", output_dir)
save_dataframe(test_imp_df, "test_imp_df.csv", output_dir)

In [None]:
# Reload the prepared datasets from disk when they are not already in memory
# (e.g. fresh kernel where the feature-engineering cells above were not executed).
if globals().get("train_df") is None:
    # Read from the directory defined right here: the previous version set `output_dir`
    # but read from `output_feat_dir`, which is not defined at this point of the notebook.
    output_dir = "data/Cleaned/"
    train_df = pd.read_csv(os.path.join(output_dir, "train_df.csv"))
    test_df = pd.read_csv(os.path.join(output_dir, "test_df.csv"))
    train_imp_df = pd.read_csv(os.path.join(output_dir, "train_imp_df.csv"))
    test_imp_df = pd.read_csv(os.path.join(output_dir, "test_imp_df.csv"))
    print("Dataframes chargés avec succès.")

    # Split features / target for the non-imputed dataset
    X_train = train_df.drop(columns=['TARGET'])
    y_train = train_df['TARGET']
    X_test = test_df.drop(columns=['TARGET'])
    y_test = test_df['TARGET']

    # Split features / target for the imputed dataset
    X_train_imputed = train_imp_df.drop(columns=['TARGET'])
    y_train_imputed = train_imp_df['TARGET']
    X_test_imputed = test_imp_df.drop(columns=['TARGET'])
    y_test_imputed = test_imp_df['TARGET']


## Standardisation

In [None]:
# Normaliser les données
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

## Ensemble de validation

In [None]:
# Division des données d'entraînement en sous-ensembles d'entraînement et de validation sur jeu non imputé
X_train_part, X_val_part, y_train_part, y_val_part = train_test_split(X_train, y_train,
                                                                      test_size=0.2, random_state=42,
                                                                      stratify=y_train)

In [None]:
# Division des données d'entraînement en sous-ensembles d'entraînement et de validation sur jeu imputé
X_train_imp_part, X_val_imp_part, y_train_imp_part, y_val_imp_part = train_test_split(X_train_imputed, y_train_imputed,
                                                                                      test_size=0.2, random_state=42,
                                                                                      stratify=y_train_imputed)

### Sur le df train

In [None]:
# Initialiser le pipeline
pipeline = FeatureEngineeringPipeline(data_directory=PATH)

In [None]:
# Fit the feature-engineering pipeline on the training data.
# NOTE(review): `dfs` is not defined by any earlier cell visible in this notebook (later cells
# only bind it as a loop variable holding a tuple) — this cell looks like a leftover from a
# previous version and would fail on "Restart & Run All"; confirm and remove or define `dfs`.
train_transformed = pipeline.fit(dfs['Train'])

In [None]:
# Sauvegarder les paramètres de transformation
pipeline.save(os.path.join(output_dir, "param"))

In [None]:
train_transformed.shape

In [None]:
train_transformed.head(5)

In [None]:
# NOTE(review): `check_missing_values` is neither defined nor imported in the cells visible
# above — confirm where it comes from (possibly the EDA notebook) or drop this cell.
check_missing_values(train_transformed)

### Sur le df test

In [None]:
# Charger les paramètres de transformation
pipeline.load(os.path.join(output_dir, "param"))

# Transformer les données de test
test_transformed = pipeline.transform(dfs['Test'])

In [None]:
test_transformed.shape

## Sur jeu imputé

### Sur le df train

In [None]:
# Create the pipeline for the imputed dataset.
# NOTE(review): `output_dir_imputed` is not defined in any earlier visible cell, and this
# rebinds `pipeline_imp` created in the "Avec imputation" section — confirm whether this
# whole section is still needed.
pipeline_imp = FeatureEngineeringPipeline(data_directory=output_dir_imputed + '/')

In [None]:
# Entraîner le pipeline
train_imputed_transformed = pipeline_imp.fit(dfs_imputed['Train'])

In [None]:
# Sauvegarder les paramètres de transformation
pipeline_imp.save(os.path.join(output_dir_imputed, "param"))

In [None]:
train_imputed_transformed.shape

In [None]:
train_imputed_transformed.head(5)

In [None]:
check_missing_values(train_imputed_transformed)

### Sur le df test

In [None]:
# Charger les paramètres de transformation
pipeline_imp.load(os.path.join(output_dir_imputed, "param"))

# Transformer les données de test
test_imputed_transformed = pipeline_imp.transform(dfs_imputed['Test'])

In [None]:
test_imputed_transformed.shape

In [None]:
# Vérification des colonnes avec des valeurs manquantes
missing_values = test_imputed_transformed.isnull().sum()
missing_columns = missing_values[missing_values > 0].index.tolist()
print(f"Colonnes avec des valeurs manquantes : {missing_columns}")

## Sauvegarde / Chargement des données

In [None]:
output_feat_dir = "data/Featured"

# DataFrame.to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(output_feat_dir, exist_ok=True)

# Persist the transformed DataFrames as CSV files
train_transformed.to_csv(os.path.join(output_feat_dir, "train_transformed.csv"), index=False)
test_transformed.to_csv(os.path.join(output_feat_dir, "test_transformed.csv"), index=False)
train_imputed_transformed.to_csv(os.path.join(output_feat_dir, "train_imputed_transformed.csv"), index=False)
test_imputed_transformed.to_csv(os.path.join(output_feat_dir, "test_imputed_transformed.csv"), index=False)

print("DataFrames enregistrés avec succès.")

In [None]:
# Vérification de l'existence de la variable output_feat_dir
try:
    output_feat_dir
except NameError:
    output_feat_dir = None

if output_feat_dir is None:
    output_feat_dir = "data/Featured"

    # Charger les DataFrames à partir des fichiers CSV
    train_transformed = pd.read_csv(os.path.join(output_feat_dir, "train_transformed.csv"))
    test_transformed = pd.read_csv(os.path.join(output_feat_dir, "test_transformed.csv"))
    train_imputed_transformed = pd.read_csv(os.path.join(output_feat_dir, "train_imputed_transformed.csv"))
    test_imputed_transformed = pd.read_csv(os.path.join(output_feat_dir, "test_imputed_transformed.csv"))

    print("DataFrames chargés avec succès.")

In [None]:
print(f"train_transformed : {train_transformed.shape},\n"
      f"test_transformed : {test_transformed.shape},\n"
      f"train_imputed_transformed : {train_imputed_transformed.shape},\n"
      f"test_imputed_transformed : {test_imputed_transformed.shape}")

In [None]:
# Column-name clean-up
def clean_column_names(df):
    """Strip every character other than ASCII letters, digits and ``_`` from column names.

    The frame is modified in place (its ``columns`` attribute is reassigned) and the
    same object is returned, so any other variable referencing the frame sees the
    cleaned names as well.
    """
    forbidden = re.compile(r'[^A-Za-z0-9_]+')
    cleaned_names = []
    for name in df.columns:
        cleaned_names.append(forbidden.sub('', name))
    df.columns = cleaned_names
    return df

train_transformed = clean_column_names(train_transformed)
test_transformed = clean_column_names(test_transformed)
train_imputed_transformed = clean_column_names(train_imputed_transformed)
test_imputed_transformed = clean_column_names(test_imputed_transformed)

# Définition du Score Métier

In [None]:
def normalized_cost_function(y_true, y_pred, fp_cost=1, fn_cost=10):
    """Business cost per sample: weighted count of false positives and false negatives.

    Parameters
    ----------
    y_true : array-like of 0/1
        Ground-truth labels.
    y_pred : array-like of 0/1
        Predicted labels (already thresholded).
    fp_cost : float, default 1
        Cost of one false positive (true 0 predicted 1).
    fn_cost : float, default 10
        Cost of one false negative (true 1 predicted 0).

    Returns
    -------
    float
        ``(fp_cost * FP + fn_cost * FN) / n_samples`` — lower is better.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not contain the same number of elements.
    """
    # Work on flat numpy arrays: plain lists would make `y_true == 0` a scalar False
    # (cost silently 0), pandas objects would be aligned on their index instead of
    # compared by position, and a column vector would broadcast against a 1-D vector.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.size} and {y_pred.size}"
        )
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    cost = (fp_cost * fp + fn_cost * fn) / len(y_true)
    return cost

In [None]:
# Wrap the business cost function as a scikit-learn scorer.
# greater_is_better=False makes the scorer return the negated cost, so that tools which
# maximise a score (cross-validation, hyper-parameter search) effectively minimise the cost.
custom_scorer = make_scorer(normalized_cost_function, greater_is_better=False)

## Tests de différents modèles

In [None]:
# Candidate models to compare
dummy_model = DummyClassifier(strategy='most_frequent')
logistic_model = LogisticRegression(random_state=42, max_iter=2000, n_jobs=-1)

# Negative/positive ratio used as scale_pos_weight by the weighted XGBoost and LightGBM models.
# Series.sum() is vectorised; the builtin sum() iterated over every row in Python.
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
xgb_model_weight = XGBClassifier(scale_pos_weight=scale_pos_weight, n_jobs=-1, use_label_encoder=False,
                                 eval_metric='logloss', random_state=42)
xgb_model = XGBClassifier(n_jobs=-1, use_label_encoder=False, eval_metric='logloss', random_state=42)


lgbm_model = LGBMClassifier(verbose=0, n_jobs=-1, is_unbalance=False, random_state=42)
# NOTE(review): both `verbose` and `verbosity` are passed to this instance — confirm which one is intended.
lgbm_model_ub = LGBMClassifier(verbose=0, n_jobs=-1, is_unbalance=True, verbosity=-1, random_state=42)
lgbm_model_weight = LGBMClassifier(verbose=0, n_jobs=-1, is_unbalance=False, scale_pos_weight=scale_pos_weight, random_state=42)

# "balanced" class weights for CatBoost, passed as a plain list ordered like np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_catboost = class_weights.tolist()
catboost_model_weight = CatBoostClassifier(verbose=0, class_weights=class_weights_catboost, thread_count=-1, task_type="GPU", random_seed=42)

catboost_model = CatBoostClassifier(verbose=0, thread_count=-1, task_type="GPU", random_seed=42)

In [None]:
# Fonction pour préparer les données avec différentes techniques de rééquilibrage
def prepare_data(need_imputation, need_scaling, need_validation, technique=None):
    """Pick the train / validation / test sets matching the flags and optionally rebalance the train set.

    The datasets are read from the notebook's global variables.

    Parameters
    ----------
    need_imputation : bool
        Use the imputed datasets instead of the raw (non-imputed) ones.
    need_scaling : bool
        Use the standardised arrays. Only taken into account when ``need_imputation``
        is true and ``need_validation`` is false; ignored otherwise.
    need_validation : bool
        Use the train/validation partition of the training set instead of the full training set.
    technique : {"SMOTE", "Undersampling"} or None
        Resampling applied to the selected training set. Any other value means no resampling.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``; ``X_val`` and ``y_val``
        are ``None`` when ``need_validation`` is false.
    """
    # No validation split unless one of the branches below provides it
    X_val, y_val = None, None

    if need_imputation and need_validation:
        X_train_loc, y_train_loc = X_train_imp_part, y_train_imp_part
        X_val, y_val = X_val_imp_part, y_val_imp_part
        X_test_loc, y_test_loc = X_test_imputed, y_test_imputed
    elif need_imputation and need_scaling:
        X_train_loc, y_train_loc = X_train_scaled, y_train_imputed
        X_test_loc, y_test_loc = X_test_scaled, y_test_imputed
    elif need_imputation:
        X_train_loc, y_train_loc = X_train_imputed, y_train_imputed
        X_test_loc, y_test_loc = X_test_imputed, y_test_imputed
    elif need_validation:
        X_train_loc, y_train_loc = X_train_part, y_train_part
        X_val, y_val = X_val_part, y_val_part
        X_test_loc, y_test_loc = X_test, y_test
    else:
        X_train_loc, y_train_loc = X_train, y_train
        X_test_loc, y_test_loc = X_test, y_test

    # Choose the resampler (if any); only the training set is ever resampled
    if technique == "SMOTE":
        sampler = SMOTE(random_state=42)
    elif technique == "Undersampling":
        sampler = RandomUnderSampler(random_state=42)
    else:
        sampler = None

    if sampler is None:
        return X_train_loc, y_train_loc, X_val, y_val, X_test_loc, y_test_loc

    X_res, y_res = sampler.fit_resample(X_train_loc, y_train_loc)
    return X_res, y_res, X_val, y_val, X_test_loc, y_test_loc

### Ensemble de validation fixe

In [None]:
# Fonction pour calculer le taux de FN et FP
def calculate_fn_fp_rates(y_true, y_pred):
    """Return ``(fn_rate, fp_rate)`` for binary 0/1 labels.

    ``fn_rate`` is FN / (FN + TP) and ``fp_rate`` is FP / (FP + TN); a rate is 0
    when its denominator is 0 (no positive, respectively no negative, in ``y_true``).
    """
    # labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the inputs
    # (e.g. a dummy model on a single-class fold); without it the matrix is 1x1 and the
    # 4-way unpacking below raises.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    fn_rate = fn / (fn + tp) if (fn + tp) > 0 else 0
    fp_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
    return fn_rate, fp_rate

In [None]:
results=[]

# Fonction pour entraîner et enregistrer les modèles avec MLFlow
def train_and_log_model(model, model_name, need_imputation=True, need_scaling=False, need_validation=False, technique=None, best_params=None):
    """Train ``model``, evaluate it on the test set and log everything to MLflow.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit``, ``predict_proba`` and ``set_params``. It is fitted in place.
    model_name : str
        Used as MLflow run name, logged parameter, artifact path and result label.
    need_imputation, need_scaling, need_validation, technique
        Forwarded to ``prepare_data`` to select (and optionally rebalance) the datasets.
    best_params : dict or None
        Hyper-parameters applied with ``model.set_params`` before fitting.

    Side effects
    ------------
    Opens an MLflow run (parameters, metrics, model artifact), prints a summary line and
    appends a result row to the module-level ``results`` list.
    """
    with mlflow.start_run(run_name=f"{model_name}"):
        # Training timer (also covers data selection / resampling, as before)
        start_time = time.time()

        # Data preparation
        X_train_loc, y_train_loc, X_val, y_val, X_test_loc, y_test_loc = prepare_data(need_imputation, need_scaling,
                                                                                      need_validation, technique)

        # Apply tuned hyper-parameters, if any
        if best_params:
            model.set_params(**best_params)

        # Early stopping needs a validation set. Dispatch on the estimator type rather than on
        # a substring of the run name (a LightGBM/CatBoost model logged under another name
        # silently skipped early stopping), and fall back to a plain fit when no validation
        # set was requested instead of failing on `X_val is None`.
        has_validation = X_val is not None and y_val is not None
        if has_validation and isinstance(model, LGBMClassifier):
            model.fit(X_train_loc, y_train_loc, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)])
        elif has_validation and isinstance(model, CatBoostClassifier):
            eval_dataset = Pool(X_val, y_val)
            model.fit(X_train_loc, y_train_loc, eval_set=eval_dataset, early_stopping_rounds=50, verbose=False)
        else:
            model.fit(X_train_loc, y_train_loc)

        training_time = time.time() - start_time

        # Prediction timer
        start_time = time.time()

        y_pred_proba = model.predict_proba(X_test_loc)[:, 1]
        y_pred = (y_pred_proba >= 0.5).astype(int)

        prediction_time = time.time() - start_time

        auc_score = roc_auc_score(y_test_loc, y_pred_proba)
        accuracy = accuracy_score(y_test_loc, y_pred)
        # Negated so that, like the other metrics, higher is better
        custom_score = -normalized_cost_function(y_test_loc, y_pred)

        # FN / FP rates against the labels of the test set actually selected for this run
        # (the global `y_test` was used before, which need not be the selected one).
        fn_rate, fp_rate = calculate_fn_fp_rates(y_test_loc, y_pred)

        mlflow.log_param("model", model_name)
        mlflow.log_param("balancing_technique", technique)
        mlflow.log_metric("ROC AUC", auc_score)
        mlflow.log_metric("Accuracy", accuracy)
        mlflow.log_metric("Custom Score", custom_score)
        mlflow.log_metric("FN Rate", fn_rate)
        mlflow.log_metric("FP Rate", fp_rate)
        mlflow.log_metric("Training Time", training_time)
        mlflow.log_metric("Prediction Time", prediction_time)

        mlflow.sklearn.log_model(model, f"{model_name}")

        print(f"{model_name} - ROC AUC: {auc_score}, Accuracy: {accuracy}, Custom Score: {custom_score}")

    # Keep a row for the summary table built after all runs
    results.append({
        "Model": model_name,
        "ROC AUC": auc_score,
        "Accuracy": accuracy,
        "Custom Score": custom_score,
        "FN Rate": fn_rate,
        "FP Rate": fp_rate,
        "Training Time": training_time,
        "Prediction Time": prediction_time
    })

In [None]:
# Models to evaluate on the fixed validation/test split.
# Each value is a tuple passed positionally to train_and_log_model by the loop below:
#   (model, need_imputation, need_scaling, need_validation, technique, best_params)
# Note that the three "Logistic Regression" entries share the same estimator instance,
# which is therefore re-fitted by each run.
model_datasets = {
    "Dummy Classifier": (dummy_model, True, False, False, None, None),
    "Logistic Regression": (logistic_model, True, True, False, None, None),
    "Logistic Regression SMOTE": (logistic_model, True, True, False, "SMOTE", None),
    "Logistic Regression Undersampling": (logistic_model, True, True, False, "Undersampling", None),
    "XGBoost": (xgb_model, True, False, False, None, None),
    "XGBoost with weight": (xgb_model_weight, True, False, False, None, None),
    "LightGBM": (lgbm_model, False, False, True, None, None),
    "LightGBM with weight": (lgbm_model_weight, False, False, True, None, None),
    "LightGBM with unbalanced": (lgbm_model_ub, False, False, True, None, None),
    "CatBoost": (catboost_model, False, False, True, None, None),
    "CatBoost with weight": (catboost_model_weight, False, False, True, None, None),
}

In [None]:
# Train and evaluate every configured model
for model_name, config in model_datasets.items():
    estimator, with_imputation, with_scaling, with_validation, balancing, tuned_params = config
    print(f"Evaluating {model_name}")
    train_and_log_model(estimator, model_name, with_imputation, with_scaling, with_validation, balancing, tuned_params)

In [None]:
# Afficher les résultats
results_df = pd.DataFrame(results)
results_df.sort_values(by="ROC AUC", ascending=False)

### Validation croisée

In [None]:
# Models evaluated with cross-validation.
# Tuple layout: (model, need_imputation, need_scaling, need_validation, technique)
model_datasets_cv = {
    "Dummy Classifier": (dummy_model, True, False, False, None),
    "Logistic Regression": (logistic_model, True, True, False, None),
    "Logistic Regression SMOTE": (logistic_model, True, True, False, "SMOTE"),
    "Logistic Regression Undersampling": (logistic_model, True, True, False, "Undersampling"),
    "XGBoost": (xgb_model, True, False, False, None),
    "XGBoost with weight": (xgb_model_weight, True, False, False, None),
    "LightGBM": (lgbm_model, False, False, False, None),
    # Was mapped to `lgbm_model`, i.e. the unweighted model was evaluated twice
    "LightGBM with weight": (lgbm_model_weight, False, False, False, None),
    "LightGBM with unbalanced": (lgbm_model_ub, False, False, False, None),
    "CatBoost": (catboost_model, False, False, False, None),
    "CatBoost with weight": (catboost_model_weight, False, False, False, None),
}

In [None]:
# Fonction pour effectuer une validation croisée et évaluer les modèles
def cross_validate_model(model, X, y, scoring=custom_scorer):
    """Evaluate ``model`` with a stratified, shuffled 5-fold cross-validation.

    Parameters
    ----------
    model : estimator
        Classifier to evaluate; scikit-learn fits clones of it on each fold.
    X, y : array-like
        Features and binary target.
    scoring : scorer or str, default ``custom_scorer``
        Scorer used for the "custom" metric.

    Returns
    -------
    dict
        Mean and standard deviation over the folds for ROC AUC, accuracy and the
        custom score, under the keys ``auc_mean``, ``auc_std``, ``accuracy_mean``,
        ``accuracy_std``, ``custom_mean`` and ``custom_std``.
    """
    # Local import: cross_validate is not part of the notebook's import cell
    from sklearn.model_selection import cross_validate

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # One pass computes the three metrics on the same fitted folds (the model was fitted
    # 3 x 5 times before). The built-in "roc_auc" scorer ranks with predicted
    # probabilities / decision values; make_scorer(roc_auc_score) scored hard 0/1
    # predictions, which does not measure ranking quality.
    cv_results = cross_validate(
        model, X, y, cv=skf,
        scoring={"auc": "roc_auc", "accuracy": "accuracy", "custom": scoring},
    )
    auc_scores = cv_results["test_auc"]
    accuracy_scores = cv_results["test_accuracy"]
    scores = cv_results["test_custom"]

    return {
        "auc_mean": np.mean(auc_scores),
        "auc_std": np.std(auc_scores),
        "accuracy_mean": np.mean(accuracy_scores),
        "accuracy_std": np.std(accuracy_scores),
        "custom_mean": np.mean(scores),
        "custom_std": np.std(scores)
    }

In [None]:
# List collecting one result row per model.
# NOTE: this rebinds the module-level `results` that train_and_log_model appends to, so any
# later call to train_and_log_model adds its (differently shaped) row to this list.
results = []

# Cross-validate every selected model
for model_name, dfs in model_datasets_cv.items():
    print(f"Evaluating {model_name}")
    # Data selection. prepare_data applies the resampling technique (dfs[4]) to the whole
    # training set before the folds are built.
    # NOTE(review): with "SMOTE"/"Undersampling" the validation folds therefore contain
    # resampled rows, which presumably biases the CV scores — consider resampling inside each fold.
    X_train_loc, y_train_loc, X_val_loc, y_val_loc, X_test_loc, y_test_loc = prepare_data(dfs[1], dfs[2], dfs[3], dfs[4])

    # Cross-validation
    scores = cross_validate_model(dfs[0], X_train_loc, y_train_loc, custom_scorer)
    
    # Log the results to MLflow
    with mlflow.start_run(run_name=f"{model_name}_CV"):
        mlflow.log_param("model", model_name)
        mlflow.log_metric("ROC AUC", scores["auc_mean"])
        mlflow.log_metric("Accuracy", scores["accuracy_mean"])
        mlflow.log_metric("Custom Score", scores["custom_mean"])
        mlflow.log_metric("ROC AUC STD", scores["auc_std"])
        mlflow.log_metric("Accuracy STD", scores["accuracy_std"])
        mlflow.log_metric("Custom Score STD", scores["custom_std"])
        # NOTE(review): scikit-learn cross-validation fits clones, so `dfs[0]` is not fitted by
        # this loop; the artifact logged here is whatever state the instance had before — confirm intent.
        mlflow.sklearn.log_model(dfs[0], f"{model_name}")

    print(f"{model_name} - ROC AUC: {scores['auc_mean']} (STD: {scores['auc_std']}), "
          f"Accuracy: {scores['accuracy_mean']} (STD: {scores['accuracy_std']}), "
          f"Custom Score: {scores['custom_mean']} (STD: {scores['custom_std']})")

    # Append the row used to build the summary DataFrame
    results.append({
        "Model": model_name,
        "ROC AUC": scores["auc_mean"],
        "ROC AUC STD": scores["auc_std"],
        "Accuracy": scores["accuracy_mean"],
        "Accuracy STD": scores["accuracy_std"],
        "Custom Score": scores["custom_mean"],
        "Custom Score STD": scores["custom_std"]
    })

In [None]:
# Afficher les résultats
results_df = pd.DataFrame(results)
results_df.sort_values(by="Custom Score", ascending=False)

## Recherche des hyperparamètres

In [None]:
# Sélection des modèles les plus prometteurs
model_datasets_hp = {
    #"LightGBM with unbalanced": (lgbm_model_ub, False, False, False, None),
    "CatBoost with weight": (catboost_model_weight, False, False, False, None),
}

### RandomizedSearchCV

In [None]:
# Grilles d'hyperparamètres pour chaque modèle
param_grids = {
    "LightGBM with unbalanced": {
        'n_estimators': [1200, 1300],
        'max_depth': [40, 50, 60],
        'learning_rate': [0.01, 0.1],
        'num_leaves': [50, 60, 70],
        'reg_alpha': [0.3, 0.4],
        'reg_lambda': [0.1, 0.3],
        'min_child_samples': [5, 10]
    },
    "CatBoost with weight": {
        'iterations': [500, 1000, 1500],
        'depth': [4, 6, 8, 10],
        'learning_rate': [0.03, 0.05, 0.1],
        'l2_leaf_reg': [1, 3, 5, 7]
    }
}

In [None]:
# Randomised hyper-parameter search for each selected model
best_params = {}
best_scores = {}

for model_name, (model, need_imputation, need_scaling, need_validation, technique) in model_datasets_hp.items():
    print(f"Processing {model_name}...")
    X_train_loc, y_train_loc, X_val, y_val, X_test_loc, y_test_loc = prepare_data(need_imputation, need_scaling,
                                                                                  need_validation, technique)

    # scoring="roc_auc" ranks candidates with predicted probabilities; make_scorer(roc_auc_score)
    # scored hard 0/1 predictions. random_state makes the sampled candidates reproducible.
    random_search = RandomizedSearchCV(model, param_grids[model_name], n_iter=10, cv=5,
                                       scoring="roc_auc", random_state=42)
    random_search.fit(X_train_loc, y_train_loc)

    best_params[model_name] = random_search.best_params_
    best_scores[model_name] = random_search.best_score_

    # Log the search outcome to MLflow
    with mlflow.start_run(run_name=f"{model_name}_HP_Rand"):
        mlflow.log_param("model", model_name)
        mlflow.log_metric("ROC AUC", best_scores[model_name])
        mlflow.log_params(best_params[model_name])
        # Log the refitted best estimator
        mlflow.sklearn.log_model(random_search.best_estimator_, model_name)


# Show the best hyper-parameters and score of each model
for model_name in best_params:
    print(f"{model_name} - Best Parameters: {best_params[model_name]}, Best ROC AUC: {best_scores[model_name]:.4f}")


### GridSearchCV

In [None]:
# Grilles d'hyperparamètres pour chaque modèle
param_grids = {
    "LightGBM with unbalanced": {
        'n_estimators': [1100, 1200],
        'max_depth': [40, 50],
        'learning_rate': [0.01, 0.1],
        'num_leaves': [60, 70],
        'reg_alpha': [0.3, 0.5],
        'reg_lambda': [0, 0.1, 0.3],
        'min_child_samples': [10, 20]
    },
    "CatBoost with weight": {
        'iterations': [1000, 1100],
        'depth': [5, 6],
        'learning_rate': [0.02, 0.03],
        'l2_leaf_reg': [1, 2]
    }
}

In [None]:
# Exhaustive hyper-parameter search for each selected model
best_params = {}
best_scores = {}

for model_name, (model, need_imputation, need_scaling, need_validation, technique) in model_datasets_hp.items():
    print(f"Processing {model_name}...")
    X_train_loc, y_train_loc, X_val, y_val, X_test_loc, y_test_loc = prepare_data(need_imputation, need_scaling,
                                                                                  need_validation, technique)

    # scoring="roc_auc" ranks candidates with predicted probabilities; make_scorer(roc_auc_score)
    # scored hard 0/1 predictions.
    grid_search = GridSearchCV(model, param_grids[model_name], cv=5, scoring="roc_auc")
    grid_search.fit(X_train_loc, y_train_loc)

    best_params[model_name] = grid_search.best_params_
    best_scores[model_name] = grid_search.best_score_

    # Log the search outcome to MLflow
    with mlflow.start_run(run_name=f"{model_name}_HP_Grid"):
        mlflow.log_param("model", model_name)
        mlflow.log_metric("ROC AUC", best_scores[model_name])
        mlflow.log_params(best_params[model_name])
        # Log the refitted best estimator
        mlflow.sklearn.log_model(grid_search.best_estimator_, model_name)

In [None]:
# Afficher les meilleurs hyperparamètres et scores pour chaque modèle
for model_name in best_params:
    print(f"{model_name} - Best Parameters: {best_params[model_name]}, Best ROC AUC: {best_scores[model_name]:.4f}")

## Résultats sur le jeu de test

In [None]:
# Fresh, unfitted instances of the tuned models.
# Seeds are set as in the initial model definitions so that the final run is reproducible.
lgbm_model_ub = LGBMClassifier(verbose=0, n_jobs=-1, is_unbalance=True, verbosity=-1, random_state=42)
catboost_model_weight = CatBoostClassifier(verbose=0, class_weights=class_weights_catboost, thread_count=-1, task_type="GPU", random_seed=42)

In [None]:
train_and_log_model(catboost_model_weight, "best model", False, False, True, None, best_params=best_params["CatBoost with weight"])

## Features importance

In [None]:
# Build the SHAP explainer for the fitted CatBoost model
explainer = shap.Explainer(catboost_model_weight)

# SHAP values on the test set. `X_test_selected` is not defined anywhere in this notebook;
# the model was trained on the non-imputed features, whose test counterpart is `X_test`.
shap_values = explainer(X_test)

# Global feature importance (mean absolute SHAP value)
shap.summary_plot(shap_values, X_test, plot_type="bar")

# Distribution of the SHAP values over all test instances
shap.summary_plot(shap_values, X_test)

## Optimisation du seuil

Calculez les probabilités de prédiction pour l'ensemble de validation ou de test.
Évaluez le coût de différentes valeurs de seuil (par exemple, de 0 à 1 avec des pas de 0.01) et sélectionnez le seuil qui minimise votre fonction de coût.

In [None]:
# Final model: weighted CatBoost with the tuned hyper-parameters.
# random_seed matches the initial CatBoost definitions so the fit is reproducible.
best_model = CatBoostClassifier(verbose=0, class_weights=class_weights_catboost, thread_count=-1, task_type="GPU",
                                random_seed=42, **best_params["CatBoost with weight"])

In [None]:
# Entraînez le modèle avec les meilleurs hyperparamètres
best_model.fit(X_train_part, y_train_part)

# Proba de prédiction
y_proba = best_model.predict_proba(X_val_part)[:, 1]

def find_best_threshold(y_true, y_proba, cost_function):
    """Scan decision thresholds and return the one minimising ``cost_function``.

    Thresholds 0.00, 0.01, ..., 0.99 are evaluated; a sample is predicted positive
    when its probability is >= the threshold. The cost and F1 curves are plotted.

    Parameters
    ----------
    y_true : array-like of 0/1
        Ground-truth labels.
    y_proba : array-like of float
        Predicted probability of the positive class.
    cost_function : callable
        ``cost_function(y_true, y_pred) -> float``, lower is better.

    Returns
    -------
    tuple
        ``(best_threshold, best_cost, thresholds, costs, f1_scores)``. On ties the
        lowest threshold is kept; ``best_threshold`` stays 0.5 if no cost is finite.
    """
    y_proba = np.asarray(y_proba)
    # arange accumulates float noise (e.g. 0.5700000000000001); rounding gives clean
    # two-decimal thresholds for the returned value and the plot legend.
    thresholds = np.round(np.arange(0.0, 1.0, 0.01), 2)
    best_threshold = 0.5
    best_cost = float('inf')
    costs = []
    f1_scores = []
    for threshold in thresholds:
        y_pred = (y_proba >= threshold).astype(int)
        cost = cost_function(y_true, y_pred)
        # zero_division=0: F1 is reported as 0 (without a warning) when it is undefined,
        # e.g. at high thresholds where no positive is predicted.
        f1 = f1_score(y_true, y_pred, zero_division=0)
        costs.append(cost)
        f1_scores.append(f1)
        if cost < best_cost:
            best_cost = cost
            best_threshold = threshold

    # Plot both curves and mark the selected threshold
    plt.figure(figsize=(12, 6))
    plt.plot(thresholds, costs, label='Cost Function', color='red')
    plt.plot(thresholds, f1_scores, label='F1 Score', color='blue')
    plt.axvline(x=best_threshold, color='green', linestyle='--', label=f'Best Threshold: {best_threshold}')
    plt.xlabel('Threshold')
    plt.ylabel('Value')
    plt.title('Cost Function and F1 Score vs Threshold')
    plt.legend()
    plt.grid(True)
    plt.show()

    return best_threshold, best_cost, thresholds, costs, f1_scores


# Trouver le seuil optimal
best_threshold, best_cost, thresholds, costs, f1_scores = find_best_threshold(y_val_part, y_proba, normalized_cost_function)
print(f"Optimal threshold: {best_threshold}, with cost: {best_cost}")

In [None]:
# Predicted probabilities on the test set. `X_test_selected` is not defined anywhere in this
# notebook; `best_model` was fitted on the non-imputed features, whose test counterpart is `X_test`.
y_test_proba = best_model.predict_proba(X_test)[:, 1]

# Final predictions with the optimised threshold
y_test_pred = (y_test_proba >= best_threshold).astype(int)

# Final custom score (negated cost, higher is better)
final_custom_score = -normalized_cost_function(y_test, y_test_pred)
print(f"Custom Score final: {final_custom_score}")

# Sauvegarde pour deploiement

In [None]:
# Persist the data-preparation pipeline for deployment.
# A fresh pipeline is fitted here on the full `application_train` frame.
# NOTE(review): `best_model` was trained on features produced by a pipeline fitted on the 80%
# `train` split only. If fitting on the full frame changes the fitted parameters (e.g. which
# columns are dropped), the saved pipeline and the saved model may disagree — confirm, or
# save the pipeline that produced the training features instead.
# `appli_train_transformed` is not used afterwards in this notebook.
pipeline_final = FeatureEngineeringPipeline(data_directory=PATH, impute=False)
appli_train_transformed = pipeline_final.fit(application_train)

pipeline_path = 'pipeline.pkl'
joblib.dump(pipeline_final, pipeline_path)
print(f"Pipeline sauvegardé sous {pipeline_path}")

In [None]:
# Sauvegarder le meilleur modèle
model_path = 'best_model.pkl'
joblib.dump(best_model, model_path)
print(f"Modèle sauvegardé sous {model_path}")


In [None]:
# Persist the optimised decision threshold.
# `optimal_threshold` was never defined; the value returned by find_best_threshold is `best_threshold`.
threshold_path = 'optimal_threshold.pkl'
joblib.dump(best_threshold, threshold_path)
print(f"Seuil optimal sauvegardé sous {threshold_path}")