<b><font color="SteelBlue" size="+3">Implémentez un modèle de scoring</font></b>

# Introduction

Ce note book est la suite de l'EDA, il va permettre de présenter le prétraitement des données, le feature engineering et la modélisation.

In [1]:
# Chargement des librairies

# Built-in
import os

# Data Manipulation and Analysis
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
import mlflow
import mlflow.sklearn

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from sklearn.feature_selection import RFE
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
import lightgbm as lgb
from lightgbm import LGBMClassifier
import re
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import RandomizedSearchCV


# Custom Feature Engineering Pipeline
from feature_pipeline import FeatureEngineeringPipeline, remove_highly_correlated_features, remove_low_correlation_features

In [2]:
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
mlflow.set_experiment("Credit_Scoring_Experiment")

<Experiment: artifact_location='mlflow-artifacts:/185199612184583695', creation_time=1715766694823, experiment_id='185199612184583695', last_update_time=1715766694823, lifecycle_stage='active', name='Credit_Scoring_Experiment', tags={}>

# Data préparation

## Chargement des données

In [3]:
os.listdir("data/sources/")

['application_test.csv',
 'application_train.csv',
 'bureau.csv',
 'bureau_balance.csv',
 'credit_card_balance.csv',
 'HomeCredit_columns_description.csv',
 'installments_payments.csv',
 'POS_CASH_balance.csv',
 'previous_application.csv',
 'Projet+Mise+en+prod+-+home-credit-default-risk.zip',
 'sample_submission.csv']

In [4]:
PATH = "./data/sources/"

In [5]:
application_train = pd.read_csv(PATH+"/application_train.csv")
application_test = pd.read_csv(PATH+"/application_test.csv")
bureau = pd.read_csv(PATH+"/bureau.csv")
bureau_balance = pd.read_csv(PATH+"/bureau_balance.csv")
credit_card_balance = pd.read_csv(PATH+"/credit_card_balance.csv")
installments_payments = pd.read_csv(PATH+"/installments_payments.csv")
previous_application = pd.read_csv(PATH+"/previous_application.csv")
POS_CASH_balance = pd.read_csv(PATH+"/POS_CASH_balance.csv")

## Split du jeu de données

In [6]:
# Division des données en ensembles d'entraînement et de test
train, test = train_test_split(application_train, test_size=0.2, random_state=21, stratify=application_train['TARGET'])

In [7]:
train.shape

(246008, 122)

## Gestion des valeurs manquantes et abérrantes

In [8]:
# Supprimer les lignes où TARGET est manquant dans application_train
train.dropna(subset=['TARGET'], inplace=True)
test.dropna(subset=['TARGET'], inplace=True)

In [9]:
# Suppression des lignes où les clés sont absentes
def drop_missing_keys(df, key, reference_df, reference_key):
    valid_keys = reference_df[reference_key].unique()
    return df[df[key].isin(valid_keys)]

# Supprimer les lignes où les clés sont absentes
bureau = drop_missing_keys(bureau, 'SK_ID_CURR', application_train, 'SK_ID_CURR')
bureau_balance = drop_missing_keys(bureau_balance, 'SK_ID_BUREAU', bureau, 'SK_ID_BUREAU')
previous_application = drop_missing_keys(previous_application, 'SK_ID_CURR', application_train, 'SK_ID_CURR')
POS_CASH_balance = drop_missing_keys(POS_CASH_balance, 'SK_ID_PREV', previous_application, 'SK_ID_PREV')
installments_payments = drop_missing_keys(installments_payments, 'SK_ID_PREV', previous_application, 'SK_ID_PREV')
credit_card_balance = drop_missing_keys(credit_card_balance, 'SK_ID_PREV', previous_application, 'SK_ID_PREV')

In [10]:
# Fonction pour supprimer les colonnes avec plus de 80% de valeurs manquantes
def drop_missing_columns(df, threshold=0.8):
    initial_columns = df.shape[1]
    missing_percentage = df.isnull().mean()
    columns_to_drop = missing_percentage[missing_percentage > threshold].index
    df = df.drop(columns=columns_to_drop)
    final_columns = df.shape[1]
    print(f"Colonnes supprimées: {initial_columns - final_columns}")
    print(f"Colonnes restantes: {final_columns}")
    return df

In [11]:
# Fonction pour gérer les valeurs aberrantes
def cap_values(series, threshold=0.2):
    lower_percentile = np.percentile(series, 1)
    upper_percentile = np.percentile(series, 99)
    outliers = (series < lower_percentile) | (series > upper_percentile)

    outlier_pct = outliers.sum() / len(series)

    if outlier_pct > threshold:
        print("Significant outliers detected, not capping values.")
        return series  # Ne pas appliquer le cap si les valeurs aberrantes sont significatives
    else:
        return np.clip(series, lower_percentile, upper_percentile)


In [12]:
def cap_outliers(df):
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if col != 'SK_ID_CURR':  # Exclure la colonne avec la clé
            df[col] = cap_values(df[col])
    return df

In [13]:
# Appliquer le filtre des colonnes manquantes et la gestion des valeurs aberrantes à chaque DataFrame
dfs = {
    "Train": train,
    "Test": test,
    "Application Test": application_test,
    "Bureau": bureau,
    "Bureau Balance": bureau_balance,
    "Credit Card Balance": credit_card_balance,
    "Installments Payments": installments_payments,
    "Previous Application": previous_application,
    "POS CASH Balance": POS_CASH_balance
}

In [14]:
for name, df in dfs.items():
    print(f"\n{name}:")
    df = drop_missing_columns(df)
    df = cap_outliers(df)
    dfs[name] = df


Train:
Colonnes supprimées: 0
Colonnes restantes: 122

Test:
Colonnes supprimées: 0
Colonnes restantes: 122

Application Test:
Colonnes supprimées: 0
Colonnes restantes: 121

Bureau:
Colonnes supprimées: 0
Colonnes restantes: 17

Bureau Balance:
Colonnes supprimées: 0
Colonnes restantes: 3

Credit Card Balance:
Colonnes supprimées: 0
Colonnes restantes: 23

Installments Payments:
Colonnes supprimées: 0
Colonnes restantes: 8

Previous Application:
Colonnes supprimées: 2
Colonnes restantes: 35

POS CASH Balance:
Colonnes supprimées: 0
Colonnes restantes: 8


In [15]:
dfs['Train'].shape

(246008, 122)

## Imputation

In [16]:
dfs_imputed = dfs

In [17]:
# Fonction pour imputer les valeurs manquantes
def impute_missing_values(df):
    # Imputation pour les colonnes numériques
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        df[col].fillna(df[col].median(), inplace=True)
    
    # Imputation pour les colonnes catégorielles
    categorical_cols = df.select_dtypes(include=[object]).columns
    for col in categorical_cols:
        df[col].fillna(df[col].mode()[0], inplace=True)
    
    return df

In [18]:
for name, df in dfs_imputed.items():
    df = impute_missing_values(df)
    dfs_imputed[name] = df

In [19]:
# Fonction pour vérifier les valeurs manquantes dans un DataFrame
def check_missing_values(df):
    missing_values = df.isnull().sum()
    missing_values = missing_values[missing_values > 0]
    if not missing_values.empty:
        print("Colonnes avec des valeurs manquantes :")
        print(missing_values)
    else:
        print("Aucune valeur manquante détectée.")

# Vérification des valeurs manquantes dans chaque DataFrame
for name, df in dfs_imputed.items():
    print(f"\n{name}:")
    check_missing_values(df)


Train:
Aucune valeur manquante détectée.

Test:
Aucune valeur manquante détectée.

Application Test:
Aucune valeur manquante détectée.

Bureau:
Aucune valeur manquante détectée.

Bureau Balance:
Aucune valeur manquante détectée.

Credit Card Balance:
Aucune valeur manquante détectée.

Installments Payments:
Aucune valeur manquante détectée.

Previous Application:
Aucune valeur manquante détectée.

POS CASH Balance:
Aucune valeur manquante détectée.


In [20]:
dfs_imputed['Train'].shape

(246008, 122)

## Sauvegarde des données nettoyées

In [21]:
# Fonction pour sauvegarder un DataFrame
def save_dataframe(df, filename, output_dir):
    output_path = os.path.join(output_dir, filename)
    df.to_csv(output_path, index=False)
    print(f"DataFrame sauvegardé sous : {output_path}")

In [22]:
# Dictionnaire contenant les noms des fichiers
original_names = {
    "Train": "train.csv",
    "Test": "test.csv",
    "Application Test": "application_test.csv",
    "Bureau": "bureau.csv",
    "Bureau Balance": "bureau_balance.csv",
    "Credit Card Balance": "credit_card_balance.csv",
    "Installments Payments": "installments_payments.csv",
    "Previous Application": "previous_application.csv",
    "POS CASH Balance": "POS_CASH_balance.csv"
}

### Sans l'imputation

In [23]:
# Chemin du dossier où sauvegarder les DataFrames nettoyés
output_dir = "data/Cleaned"

In [24]:
# Sauvegarder chaque DataFrame dans le dossier output_dir avec le nom d'origine
for name, df in dfs.items():
    save_dataframe(df, original_names[name], output_dir)

DataFrame sauvegardé sous : data/Cleaned\train.csv
DataFrame sauvegardé sous : data/Cleaned\test.csv
DataFrame sauvegardé sous : data/Cleaned\application_test.csv
DataFrame sauvegardé sous : data/Cleaned\bureau.csv
DataFrame sauvegardé sous : data/Cleaned\bureau_balance.csv
DataFrame sauvegardé sous : data/Cleaned\credit_card_balance.csv
DataFrame sauvegardé sous : data/Cleaned\installments_payments.csv
DataFrame sauvegardé sous : data/Cleaned\previous_application.csv
DataFrame sauvegardé sous : data/Cleaned\POS_CASH_balance.csv


### Avec l'imputation

In [25]:
# Chemin du dossier où sauvegarder les DataFrames nettoyés
output_dir_imputed = "data/Cleaned/Imputed"

In [26]:
# Sauvegarder chaque DataFrame dans le dossier output_dir avec le nom d'origine
for name, df in dfs_imputed.items():
    save_dataframe(df, original_names[name], output_dir_imputed)

DataFrame sauvegardé sous : data/Cleaned/Imputed\train.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\test.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\application_test.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\bureau.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\bureau_balance.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\credit_card_balance.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\installments_payments.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\previous_application.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\POS_CASH_balance.csv


# Feature engineering

## Sur jeu non imputé

### Sur le df train

In [27]:
# Initialiser le pipeline
pipeline = FeatureEngineeringPipeline(data_directory=output_dir + '/')

In [28]:
# Entraîner le pipeline
train_transformed = pipeline.fit()

feature_engineering - done
Bureau and bureau_balance data - done in 36s
previous_application - done in 39s
previous applications balances - done in 488s
Colonnes supprimées: 50, Colonnes restantes: 684
Gestion des outliers - done
Colonnes supprimées (constantes): 29, Colonnes restantes: 655
Initial df memory usage is 1234.27 MB for 773 columns
Final memory usage is: 453.26 MB - decreased by 63.3%


In [29]:
# Sauvegarder les paramètres de transformation
pipeline.save(os.path.join(output_dir, "param"))

In [30]:
train_transformed.shape

(246004, 773)

In [31]:
train_transformed.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,HOUSETYPE_MODE_block of flats,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden
0,183096.0,1,0,0,0,1,0,135000.0,161730.0,8464.5,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,433941.0,0,0,0,1,0,0,225000.0,729792.0,22126.5,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,378379.0,0,0,1,0,1,0,121500.0,911263.5,36270.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,301108.0,0,0,0,0,1,0,180000.0,808650.0,23773.5,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,194730.0,0,0,1,1,1,0,180000.0,355536.0,15192.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [32]:
check_missing_values(train_transformed)

Colonnes avec des valeurs manquantes :
DAYS_EMPLOYED                       44589
DAYS_LAST_PHONE_CHANGE              30095
INCOME_TO_EMPLOYED_RATIO            44591
EMPLOYED_TO_BIRTH_RATIO             44589
CAR_TO_EMPLOYED_RATIO               44591
                                    ...  
CTA_CREDIT_TO_ANNUITY_MEAN_RATIO    13990
DAYS_DECISION_MEAN_TO_BIRTH         13977
DAYS_CREDIT_MEAN_TO_BIRTH           35323
DAYS_DECISION_MEAN_TO_EMPLOYED      56182
DAYS_CREDIT_MEAN_TO_EMPLOYED        73145
Length: 528, dtype: int64


### Sur le df test

In [33]:
test = dfs["Test"]

In [34]:
# Charger les paramètres de transformation
pipeline.load(os.path.join(output_dir, "param"))

# Transformer les données de test
test_transformed = pipeline.transform(test)

Bureau and bureau_balance data - done in 35s
previous_application - done in 37s
Colonnes supprimées: 50, Colonnes restantes: 684
Initial df memory usage is 361.54 MB for 773 columns
Final memory usage is: 105.87 MB - decreased by 70.7%
previous applications balances - done in 492s


In [35]:
test_transformed.shape

(61503, 773)

## Sur jeu imputé

### Sur le df train

In [36]:
# Initialiser le pipeline
pipeline_imp = FeatureEngineeringPipeline(data_directory=output_dir_imputed + '/')

In [37]:
# Entraîner le pipeline
train_imputed_transformed = pipeline_imp.fit()

feature_engineering - done
Bureau and bureau_balance data - done in 36s
previous_application - done in 37s
previous applications balances - done in 491s
Colonnes supprimées: 50, Colonnes restantes: 684
Gestion des outliers - done
Gestion des valeurs manquantes - done
Colonnes supprimées (constantes): 29, Colonnes restantes: 655
Initial df memory usage is 1234.27 MB for 773 columns
Final memory usage is: 453.26 MB - decreased by 63.3%


In [38]:
# Sauvegarder les paramètres de transformation
pipeline_imp.save(os.path.join(output_dir_imputed, "param"))

In [39]:
train_imputed_transformed.shape

(246004, 773)

In [40]:
train_imputed_transformed.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,HOUSETYPE_MODE_block of flats,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden
0,183096.0,1,0,0,0,1,0,135000.0,161730.0,8464.5,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,433941.0,0,0,0,1,0,0,225000.0,729792.0,22126.5,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,378379.0,0,0,1,0,1,0,121500.0,911263.5,36270.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,301108.0,0,0,0,0,1,0,180000.0,808650.0,23773.5,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,194730.0,0,0,1,1,1,0,180000.0,355536.0,15192.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [41]:
check_missing_values(train_imputed_transformed)

Aucune valeur manquante détectée.


### Sur le df test

In [42]:
test_imputed = dfs_imputed["Test"]

In [43]:
# Charger les paramètres de transformation
pipeline_imp.load(os.path.join(output_dir_imputed, "param"))

# Transformer les données de test
test_imputed_transformed = pipeline_imp.transform(test_imputed)

Bureau and bureau_balance data - done in 35s
previous_application - done in 38s
Colonnes supprimées: 50, Colonnes restantes: 684
Gestion des valeurs manquantes - done
Initial df memory usage is 361.54 MB for 773 columns
Final memory usage is: 105.87 MB - decreased by 70.7%
previous applications balances - done in 493s


In [44]:
test_imputed.shape

(61503, 122)

In [45]:
test_imputed_transformed.shape

(61503, 773)

In [46]:
# Vérification des colonnes avec des valeurs manquantes
missing_values = test_imputed_transformed.isnull().sum()
missing_columns = missing_values[missing_values > 0].index.tolist()
print(f"Colonnes avec des valeurs manquantes : {missing_columns}")

Colonnes avec des valeurs manquantes : []


## Sauvegarde / Chargement des données

In [47]:
output_feat_dir = "data/Featured"

# Enregistrer les DataFrames dans des fichiers CSV
train_transformed.to_csv(os.path.join(output_feat_dir, "train_transformed.csv"), index=False)
test_transformed.to_csv(os.path.join(output_feat_dir, "test_transformed.csv"), index=False)
train_imputed_transformed.to_csv(os.path.join(output_feat_dir, "train_imputed_transformed.csv"), index=False)
test_imputed_transformed.to_csv(os.path.join(output_feat_dir, "test_imputed_transformed.csv"), index=False)

print("DataFrames enregistrés avec succès.")

DataFrames enregistrés avec succès.


In [2]:
# Vérification de l'existence de la variable output_feat_dir
try:
    output_feat_dir
except NameError:
    output_feat_dir = None

if output_feat_dir is None:
    output_feat_dir = "data/Featured"

    # Charger les DataFrames à partir des fichiers CSV
    train_transformed = pd.read_csv(os.path.join(output_feat_dir, "train_transformed.csv"))
    test_transformed = pd.read_csv(os.path.join(output_feat_dir, "test_transformed.csv"))
    train_imputed_transformed = pd.read_csv(os.path.join(output_feat_dir, "train_imputed_transformed.csv"))
    test_imputed_transformed = pd.read_csv(os.path.join(output_feat_dir, "test_imputed_transformed.csv"))

    print("DataFrames chargés avec succès.")

DataFrames chargés avec succès.


In [3]:
print(f"train_transformed : {train_transformed.shape},\n"
      f"test_transformed : {test_transformed.shape},\n"
      f"train_imputed_transformed : {train_imputed_transformed.shape},\n"
      f"test_imputed_transformed : {test_imputed_transformed.shape}")

train_transformed : (246004, 773),
test_transformed : (61503, 773),
train_imputed_transformed : (246004, 773),
test_imputed_transformed : (61503, 773)


In [4]:
# nettoyage des noms de colonnes
def clean_column_names(df):
    df.columns = [re.sub(r'[^A-Za-z0-9_]+', '', col) for col in df.columns]
    return df

train_transformed = clean_column_names(train_transformed)
test_transformed = clean_column_names(test_transformed)
train_imputed_transformed = clean_column_names(train_imputed_transformed)
test_imputed_transformed = clean_column_names(test_imputed_transformed)

# Définition du Score Métier

In [51]:
def normalized_cost_function(y_true, y_pred):
    fp_cost = 1
    fn_cost = 10
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    cost = (fp_cost * fp + fn_cost * fn) / len(y_true)
    return cost

In [52]:
# Création du custom score avec la fonction cost
custom_scorer = make_scorer(normalized_cost_function, greater_is_better=False)

# Modèles de machine learning

## Préparation

In [5]:
# Séparer les features et la cible sur le jeu sans imputation
X_train = train_transformed.drop(columns=['TARGET'])
y_train = train_transformed['TARGET']
X_test = test_transformed.drop(columns=['TARGET'])
y_test = test_transformed['TARGET']

In [6]:
# Séparer les features et la cible sur le jeu imputé
X_train_imputed = train_imputed_transformed.drop(columns=['TARGET'])
y_train_imputed = train_imputed_transformed['TARGET']
X_test_imputed = test_imputed_transformed.drop(columns=['TARGET'])
y_test_imputed = test_imputed_transformed['TARGET']

### Séléction des variables

#### Suppression des variables hautement ou trop peu corrélées

In [7]:
train_imputed_cleaned, low_corr_features = remove_low_correlation_features(train_imputed_transformed, 'TARGET', threshold=0.01)
print(f"Variables très peu corrélées supprimées: {len(low_corr_features)}")

Variables très peu corrélées supprimées: 267


In [8]:
train_imputed_cleaned, high_corr_features = remove_highly_correlated_features(train_imputed_transformed, threshold=0.9)
print(f"Variables très corrélées supprimées: {len(high_corr_features)}")

Variables très corrélées supprimées: 192


In [9]:
train_imputed_cleaned.shape

(246004, 581)

In [10]:
# Aligner les colones du jeu de test

train_imputed_cleaned_columns = list(train_imputed_cleaned.columns)
test_imputed_cleaned = test_imputed_transformed.copy()

# Supprimer les colonnes en trop
extra_cols = set(test_imputed_cleaned.columns) - set(train_imputed_cleaned_columns)
if extra_cols:
    test_imputed_cleaned.drop(columns=list(extra_cols), inplace=True)

# Réordonner les colonnes pour correspondre à celles du train
test_imputed_cleaned = test_imputed_cleaned[train_imputed_cleaned_columns]

test_imputed_cleaned.shape

(61503, 581)

In [11]:
# Séparer les features et la cible sur le jeu imputé
X_train_imputed = train_imputed_cleaned.drop(columns=['TARGET'])
y_train_imputed = train_imputed_cleaned['TARGET']
X_test_imputed = test_imputed_cleaned.drop(columns=['TARGET'])
y_test_imputed = test_imputed_cleaned['TARGET']

#### Recherche de la feature importance avec random forest

<b>Random Forest et Boruta

In [12]:
# Normalisation des données
scaler = StandardScaler()
X_train_fi_scaled = scaler.fit_transform(X_train_imputed)
X_test_fi_scaled = scaler.transform(X_test_imputed)

In [13]:
# Rééquilibrage des classes avec Undersampling
undersample = RandomUnderSampler(random_state=42)
X_train_res, y_train_res = undersample.fit_resample(X_train_fi_scaled, y_train_imputed)

In [14]:
# Création du modèle de base
rf = RandomForestClassifier(n_jobs=-1, random_state=42)

In [64]:
# Initialisation de Boruta
boruta_selector = BorutaPy(rf, n_estimators='auto', random_state=42, alpha=0.05)
boruta_selector.fit(X_train_res, y_train_res)

In [65]:
# Sélection des caractéristiques importantes
selected_features_boruta = X_train_imputed.columns[boruta_selector.support_]

# Nombre de caractéristiques sélectionnées
print("Nombre de caractéristiques sélectionnées :", len(selected_features_boruta))

Nombre de caractéristiques sélectionnées : 58


In [66]:
# Créer un DataFrame avec les noms des caractéristiques sélectionnées
selected_features_boruta = pd.DataFrame(selected_features_boruta, columns=["Selected Features"])

In [67]:
selected_features_boruta.shape

(58, 1)

In [68]:
selected_features_boruta_list = selected_features_boruta.squeeze().tolist()

<b>CatBoost et RFE

In [None]:
# Modèle CatBoost
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_catboost = class_weights.tolist()
catboost_model = CatBoostClassifier(verbose=0, class_weights=class_weights_catboost, thread_count=-1, task_type="GPU")

# Sélection Récursive par Élimination avec CatBoost
rfe = RFE(estimator=catboost_model, n_features_to_select=150, step=1)
rfe.fit(X_train, y_train)

In [None]:
# Récupérer les indices des caractéristiques sélectionnées
selected_features = rfe.support_
selected_feature_indices = np.where(selected_features)[0]

# Récupérer les noms des caractéristiques sélectionnées 
selected_feature_list = X_train_res.columns[selected_feature_indices].tolist()

# Transformer en DataFrame
selected_features =  pd.DataFrame(selected_feature_list, columns=["Selected Features"])

In [None]:
selected_features.shape

#### Sauvegarde / Chargement des features

In [69]:
# Sauvegarde des features selectionnées
save_dataframe(selected_features, "features.csv", output_feat_dir)

DataFrame sauvegardé sous : data/Featured\features.csv


In [70]:
# Charger la liste des features selectionnées
# Vérification de l'existence de la variable selected_features
try:
    selected_features
except NameError:
    selected_features = None

if selected_features is None:
    # Définir la variable selected_features ici
    selected_features = pd.read_csv(os.path.join(output_feat_dir, "features.csv"))
    print("Features chargés avec succès.")

    # Séparer les features et la cible sur le jeu imputé
    X_train_imputed = train_imputed_transformed.drop(columns=['TARGET'])
    y_train_imputed = train_imputed_transformed['TARGET']
    X_test_imputed = test_imputed_transformed.drop(columns=['TARGET'])
    y_test_imputed = test_imputed_transformed['TARGET']

    selected_features_list = selected_features.iloc[:, 0].tolist()

In [71]:
# Filtrer les jeux de données pour ne garder que les features sélectionnées
X_train_imp_selected = X_train_imputed[selected_features_list]
X_test_imp_selected = X_test_imputed[selected_features_list]

# idem sur jeu non imputé
X_train_selected = X_train[selected_features_list]
X_test_selected = X_test[selected_features_list]

In [72]:
X_train_imp_selected.shape

(246004, 58)

In [73]:
X_test_imp_selected.shape

(61503, 58)

In [74]:
X_train_selected.shape

(246004, 58)

In [75]:
X_test_selected.shape

(61503, 58)

### Standardisation

In [76]:
# Normaliser les données
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imp_selected)
X_test_scaled = scaler.transform(X_test_imp_selected)

In [77]:
X_train_scaled.shape

(246004, 58)

### Ensemble de validation

In [78]:
# Division des données d'entraînement en sous-ensembles d'entraînement et de validation sur jeu non imputé
X_train_part, X_val_part, y_train_part, y_val_part = train_test_split(X_train_selected, y_train,
                                                                      test_size=0.2, random_state=42,
                                                                      stratify=y_train)

In [79]:
# Division des données d'entraînement en sous-ensembles d'entraînement et de validation sur jeu imputé
X_train_imp_part, X_val_imp_part, y_train_imp_part, y_val_imp_part = train_test_split(X_train_imp_selected, y_train_imputed,
                                                                                      test_size=0.2, random_state=42,
                                                                                      stratify=y_train_imputed)

## Tests de différents modèles

In [80]:
# Liste des modèles à tester
dummy_model = DummyClassifier(strategy='most_frequent')
logistic_model = LogisticRegression(random_state=42, max_iter=5000, n_jobs=-1)

# Calcul de scale_pos_weight pour XGBoost
scale_pos_weight = sum(y_train == 0) / sum(y_train == 1)
xgb_model_weight = XGBClassifier(scale_pos_weight=scale_pos_weight, n_jobs=-1, use_label_encoder=False, eval_metric='logloss')
xgb_model = XGBClassifier(n_jobs=-1, use_label_encoder=False, eval_metric='logloss')

lgbm_model_ub = LGBMClassifier(verbose=0, n_jobs=-1, is_unbalance=True, verbosity=-1)
lgbm_model = LGBMClassifier(verbose=0, n_jobs=-1, is_unbalance=False)

# Calcul des poids de classe pour catboost
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_catboost = class_weights.tolist()
catboost_model_weight = CatBoostClassifier(verbose=0, class_weights=class_weights_catboost, thread_count=-1, task_type="GPU")

catboost_model = CatBoostClassifier(verbose=0, thread_count=-1, task_type="GPU")

In [81]:
# Fonction pour préparer les données avec différentes techniques de rééquilibrage
def prepare_data(need_imputation, need_scaling, need_validation, technique=None):

    if need_imputation:
        if need_validation:
            #print("Avec imputation, sans scaling, avec validation")
            X_train_loc, y_train_loc = X_train_imp_part, y_train_imp_part
            X_val, y_val = X_val_imp_part, y_val_imp_part
            X_test_loc, y_test_loc = X_test_imp_selected, y_test_imputed
        else:
            if need_scaling:
                #print("Avec Imputation, avec scaling, sans validation")
                X_train_loc, y_train_loc = X_train_scaled, y_train_imputed
                X_test_loc, y_test_loc = X_test_scaled, y_test_imputed
            else:
                #print("Avec Imputation, sans scaling, sans validation")
                X_train_loc, y_train_loc = X_train_imp_selected, y_train_imputed
                X_test_loc, y_test_loc = X_test_imp_selected, y_test_imputed
            X_val, y_val = None, None
    else:
        if need_validation:
            #print("Sans imputation, sans scaling, avec validation")
            X_train_loc, y_train_loc = X_train_part, y_train_part
            X_val, y_val = X_val_part, y_val_part
        else:
            #print("Sans imputation, sans scaling, sans validation")
            X_train_loc, y_train_loc = X_train_selected, y_train
            X_val, y_val = None, None
        X_test_loc, y_test_loc = X_test_selected, y_test
        
    
    if technique == "SMOTE":
        smote = SMOTE(random_state=42)
        X_res, y_res = smote.fit_resample(X_train_loc, y_train_loc)
    elif technique == "Undersampling":
        undersample = RandomUnderSampler(random_state=42)
        X_res, y_res = undersample.fit_resample(X_train_loc, y_train_loc)
    else:
        X_res, y_res = X_train_loc, y_train_loc
    #print(f"X_res :{X_res.shape}")
    return X_res, y_res, X_val, y_val, X_test_loc, y_test_loc

### Ensemble de validation fixe

In [82]:
results=[]

# Fonction pour entraîner et enregistrer les modèles avec MLFlow
def train_and_log_model(model, model_name, need_imputation=True, need_scaling=False, need_validation=False, technique=None, best_params=None):
    # préparation des données
    X_train_loc, y_train_loc, X_val, y_val, X_test_loc, y_test_loc = prepare_data(need_imputation, need_scaling,
                                                                                  need_validation, technique)

    # Appliquer les meilleurs hyperparamètres
    if best_params:
        model.set_params(**best_params)
    
    with mlflow.start_run(run_name=f"{model_name}"):
        if "LightGBM" in model_name:
            model.fit(X_train_loc, y_train_loc, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)])
        elif "CatBoost" in model_name:
            eval_dataset = Pool(X_val, y_val)
            model.fit(X_train_loc, y_train_loc, eval_set=eval_dataset, early_stopping_rounds=50, verbose=False)
        else:
            # Entraînement des modèles qui n'utilisent pas d'ensemble de validation
            model.fit(X_train_loc, y_train_loc)
        
        y_pred_proba = model.predict_proba(X_test_loc)[:, 1]
        y_pred = (y_pred_proba >= 0.5).astype(int)
        auc_score = roc_auc_score(y_test_loc, y_pred_proba)       
        accuracy = accuracy_score(y_test_loc, y_pred)
        custom_score = -normalized_cost_function(y_test_loc, y_pred)

        mlflow.log_param("model", model_name)
        mlflow.log_param("balancing_technique", technique)
        mlflow.log_metric("ROC AUC", auc_score)
        mlflow.log_metric("Accuracy", accuracy)
        mlflow.log_metric("Custom Score", custom_score)
        
        mlflow.sklearn.log_model(model, f"{model_name}")

        print(f"{model_name} - ROC AUC: {auc_score}, Accuracy: {accuracy}, Custom Score: {custom_score}")

            # Ajouter les résultats au DataFrame
    results.append({
        "Model": model_name,
        "ROC AUC": auc_score,
        "Accuracy": accuracy,
        "Custom Score": custom_score
    })

In [83]:
# Liste des modèles à tester
model_datasets = {
    "Dummy Classifier": (dummy_model, True, False, False, None, None),
    "Logistic Regression": (logistic_model, True, True, False, None, None),
    "Logistic Regression SMOTE": (logistic_model, True, True, False, "SMOTE", None),
    "Logistic Regression Undersampling": (logistic_model, True, True, False, "Undersampling", None),
    "XGBoost": (xgb_model, True, False, False, None, None),
    "XGBoost with weight": (xgb_model_weight, True, False, False, None, None),
    "LightGBM": (lgbm_model, False, False, True, None, None),
    "LightGBM with unbalanced": (lgbm_model_ub, False, False, True, None, None),
    "CatBoost": (catboost_model, False, False, True, None, None),
    "CatBoost with weight": (catboost_model_weight, False, False, True, None, None),
}

In [84]:
# Entraîner et évaluer chaque modèle
for model_name, dfs in model_datasets.items():
    print(f"Evaluating {model_name}")
    train_and_log_model(dfs[0], model_name, dfs[1], dfs[2], dfs[3], dfs[4], dfs[5])
    

# Afficher les résultats
results_df = pd.DataFrame(results)
results_df.sort_values(by="ROC AUC", ascending=False)

Evaluating Dummy Classifier
Dummy Classifier - ROC AUC: 0.5, Accuracy: 0.9192722306228964, Custom Score: -0.8072776937710355
Evaluating Logistic Regression
Logistic Regression - ROC AUC: 0.7497580484595607, Accuracy: 0.9191746744061265, Custom Score: -0.8009365396809912
Evaluating Logistic Regression SMOTE
Logistic Regression SMOTE - ROC AUC: 0.7461737023147315, Accuracy: 0.6792188998910622, Custom Score: -0.5455506235468188
Evaluating Logistic Regression Undersampling
Logistic Regression Undersampling - ROC AUC: 0.7497552698027656, Accuracy: 0.6859990569565713, Custom Score: -0.5425751589353365
Evaluating XGBoost
XGBoost - ROC AUC: 0.7529197680306059, Accuracy: 0.9193210087312814, Custom Score: -0.783083752012097
Evaluating XGBoost with weight
XGBoost with weight - ROC AUC: 0.734425484386674, Accuracy: 0.7924654081914704, Custom Score: -0.5689803749410598
Evaluating LightGBM
LightGBM - ROC AUC: 0.7663992458867954, Accuracy: 0.9194185649480513, Custom Score: -0.7957172820838008
Evaluat

Unnamed: 0,Model,ROC AUC,Accuracy,Custom Score
8,CatBoost,0.767613,0.919597,-0.792026
9,CatBoost with weight,0.767542,0.744598,-0.518219
6,LightGBM,0.766399,0.919419,-0.795717
4,XGBoost,0.75292,0.919321,-0.783084
1,Logistic Regression,0.749758,0.919175,-0.800937
3,Logistic Regression Undersampling,0.749755,0.685999,-0.542575
2,Logistic Regression SMOTE,0.746174,0.679219,-0.545551
5,XGBoost with weight,0.734425,0.792465,-0.56898
7,LightGBM with unbalanced,0.721371,0.919272,-0.807278
0,Dummy Classifier,0.5,0.919272,-0.807278


### Validation croisée

In [85]:
# selection des modèles prometteurs
model_datasets_cv = {
    "Logistic Regression": (logistic_model, True, True, False, None),
    "Logistic Regression SMOTE": (logistic_model, True, True, False, "SMOTE"),
    "Logistic Regression Undersampling": (logistic_model, True, True, False, "Undersampling"),
    "XGBoost": (xgb_model, True, False, False, None),
    "XGBoost with weight": (xgb_model_weight, True, False, False, None),
    "LightGBM": (lgbm_model, False, False, False, None),
    "LightGBM with unbalanced": (lgbm_model_ub, False, False, False, None),
    "CatBoost": (catboost_model, False, False, False, None),
    "CatBoost with weight": (catboost_model_weight, False, False, False, None),
}

In [86]:
# Fonction pour effectuer une validation croisée et évaluer les modèles
def cross_validate_model(model, X, y, scoring=custom_scorer):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = cross_val_score(model, X, y, cv=skf, scoring=make_scorer(roc_auc_score))
    accuracy_scores = cross_val_score(model, X, y, cv=skf, scoring=make_scorer(accuracy_score))
    scores = cross_val_score(model, X, y, cv=skf, scoring=scoring)
    return np.mean(auc_scores), np.mean(accuracy_scores), np.mean(scores)

In [87]:
# Liste pour stocker les résultats
results = []

# Entraîner et évaluer chaque modèle
for model_name, dfs in model_datasets_cv.items():
    print(f"Evaluating {model_name}")
    # préparation des données
    X_train_loc, y_train_loc, X_val, y_val, X_test_loc, y_test_loc = prepare_data(dfs[1], dfs[2], dfs[3], dfs[4])

    # CV
    auc, accuracy, custom_score = cross_validate_model(dfs[0], X_train_loc, y_train_loc)
    
    # Enregistrer les résultats dans MLflow
    with mlflow.start_run(run_name=f"{model_name}_CV"):
        mlflow.log_param("model", model_name)
        mlflow.log_metric("ROC AUC", auc)
        mlflow.log_metric("Accuracy", accuracy)
        mlflow.log_metric("Custom Score", custom_score)
        mlflow.sklearn.log_model(dfs[0], f"{model_name}")

    print(f"{model_name} - ROC AUC: {auc}, Accuracy: {accuracy}, Custom Score: {custom_score}")

    # Ajouter les résultats au DataFrame
    results.append({
        "Model": model_name,
        "ROC AUC": auc,
        "Accuracy": accuracy,
        "Custom Score": custom_score
    })

Evaluating Logistic Regression
Logistic Regression - ROC AUC: 0.5048351136983508, Accuracy: 0.9193346456042353, Custom Score: -0.799592685091502
Evaluating Logistic Regression SMOTE
Logistic Regression SMOTE - ROC AUC: 0.6944203639698354, Accuracy: 0.6944203658424422, Custom Score: -1.617489346016113
Evaluating Logistic Regression Undersampling
Logistic Regression Undersampling - ROC AUC: 0.6850956696878148, Accuracy: 0.6850956696878148, Custom Score: -1.7306143001007048
Evaluating XGBoost
XGBoost - ROC AUC: 0.5225334661566252, Accuracy: 0.9181801903358762, Custom Score: -0.771552500998476
Evaluating XGBoost with weight
XGBoost with weight - ROC AUC: 0.6840698518467668, Accuracy: 0.7667680227597993, Custom Score: -0.5344344282611745
Evaluating LightGBM
LightGBM - ROC AUC: 0.5141577947478874, Accuracy: 0.9195013077267417, Custom Score: -0.7847189483061271
Evaluating LightGBM with unbalanced
LightGBM with unbalanced - ROC AUC: 0.7059788669439007, Accuracy: 0.7186346552874281, Custom Scor

In [88]:
# Afficher les résultats
results_df = pd.DataFrame(results)
results_df.sort_values(by="ROC AUC", ascending=False)

Unnamed: 0,Model,ROC AUC,Accuracy,Custom Score
8,CatBoost with weight,0.707758,0.729053,-0.501756
6,LightGBM with unbalanced,0.705979,0.718635,-0.505959
1,Logistic Regression SMOTE,0.69442,0.69442,-1.617489
2,Logistic Regression Undersampling,0.685096,0.685096,-1.730614
4,XGBoost with weight,0.68407,0.766768,-0.534434
3,XGBoost,0.522533,0.91818,-0.771553
7,CatBoost,0.51499,0.919668,-0.783438
5,LightGBM,0.514158,0.919501,-0.784719
0,Logistic Regression,0.504835,0.919335,-0.799593


## Recherche des hyperparamètres

In [89]:
# Sélection des modèles les plus prometteurs
model_datasets_hp = {
    "LightGBM": (lgbm_model_ub, False, False, False, None),
    "CatBoost": (catboost_model_weight, False, False, False, None),
}

### RandomizedSearchCV

In [90]:
# Grilles d'hyperparamètres pour chaque modèle
param_grids = {
    "LightGBM": {
        'n_estimators': [200, 400, 600],
        'max_depth': [3, 4, 5],
        'learning_rate': [0.1, 0.14, 0.16],
        'num_leaves': [50, 100, 150],
        'reg_alpha': [0, 0.01, 0.1],
        'reg_lambda': [0, 0.05, 0.1],
        'min_child_samples': [50, 75, 100]
    },
    "CatBoost": {
        'iterations': [500, 1000, 2000],
        'depth': [3, 5, 7],
        'learning_rate': [0.01, 0.05, 0.1, 0.12],
        'l2_leaf_reg': [3, 4, 5]
    }
}

In [91]:
# Recherche des hyperparamètres pour chaque modèle
best_params = {}
best_scores = {}

for model_name, dfs in model_datasets_hp.items(): 
    print(f"Processing {model_name}...")
    X_train_loc, y_train_loc, X_val, y_val, X_test_loc, y_test_loc = prepare_data(dfs[1], dfs[2], dfs[3], dfs[4])
        
    # Utiliser RandomizedSearchCV
    random_search = RandomizedSearchCV(dfs[0], param_grids[model_name], n_iter=10, cv=5, scoring=make_scorer(roc_auc_score),
                                       random_state=42)
    random_search.fit(X_train_loc, y_train_loc)
    
    best_params[model_name] = random_search.best_params_
    best_scores[model_name] = random_search.best_score_

    # Enregistrer les résultats dans MLflow
    with mlflow.start_run(run_name=f"{model_name}_HP_Rand"):
        mlflow.log_param("model", model_name)
        mlflow.log_metric("ROC AUC", best_scores[model_name])
        mlflow.log_params(best_params[model_name])
        # Enregistrer le modèle dans MLflow
        mlflow.sklearn.log_model(random_search.best_estimator_, model_name)


# Afficher les meilleurs hyperparamètres et scores pour chaque modèle
for model_name in best_params:
    print(f"{model_name} - Best Parameters: {best_params[model_name]}, Best ROC AUC Score: {best_scores[model_name]:.4f}")


Processing LightGBM...
Processing CatBoost...
LightGBM - Best Parameters: {'reg_lambda': 0.1, 'reg_alpha': 0.01, 'num_leaves': 150, 'n_estimators': 400, 'min_child_samples': 75, 'max_depth': 3, 'learning_rate': 0.14}, Best ROC AUC Score: 0.7083
CatBoost - Best Parameters: {'learning_rate': 0.05, 'l2_leaf_reg': 5, 'iterations': 500, 'depth': 5}, Best ROC AUC Score: 0.7076


### GridSearchCV

In [92]:
# Grilles d'hyperparamètres pour chaque modèle
param_grids = {
    "LightGBM": {
        'n_estimators': [200, 400, 600],
        'max_depth': [3, 4],
        'learning_rate': [0.12, 0.14],
        'num_leaves': [50, 100],
        'reg_alpha': [0, 0.01],
        'reg_lambda': [0, 0.1],
        'min_child_samples': [50, 75, 100]
    },
    "CatBoost": {
        'iterations': [500, 1000],
        'depth': [3, 5],
        'learning_rate': [0.01, 0.1],
        'l2_leaf_reg': [3, 4]
    }
}

In [93]:
# Recherche des hyperparamètres pour chaque modèle
best_params = {}
best_scores = {}

for model_name, (model, need_imputation, need_scaling, need_validation, technique) in model_datasets_hp.items():
    print(f"Processing {model_name}...")
    X_train_loc, y_train_loc, X_val, y_val, X_test_loc, y_test_loc = prepare_data(need_imputation, need_scaling,
                                                                                  need_validation, technique)
    
    grid_search = GridSearchCV(model, param_grids[model_name], cv=5, scoring=make_scorer(roc_auc_score))
    grid_search.fit(X_train_loc, y_train_loc)
    
    best_params[model_name] = grid_search.best_params_
    best_scores[model_name] = grid_search.best_score_

    # Enregistrer les résultats dans MLflow
    with mlflow.start_run(run_name=f"{model_name}_HP_Grid"):
        mlflow.log_param("model", model_name)
        mlflow.log_metric("ROC AUC", best_scores[model_name])
        mlflow.log_params(best_params[model_name])
        # Enregistrer le modèle dans MLflow
        mlflow.sklearn.log_model(random_search.best_estimator_, model_name)

Processing LightGBM...


KeyboardInterrupt: 

In [None]:
# Afficher les meilleurs hyperparamètres et scores pour chaque modèle
for model_name in best_params:
    print(f"{model_name} - Best Parameters: {best_params[model_name]}, Best roc_auc_score: {best_scores[model_name]:.4f}")

Resultats sur jeu de test

In [None]:
#  nouvelles instance des modeles
lgbm_model_ub = LGBMClassifier(verbose=0, n_jobs=-1, is_unbalance=True, verbosity=-1)
catboost_model_weight = CatBoostClassifier(verbose=0, class_weights=class_weights_catboost, thread_count=-1, task_type="GPU")

# modèles choisis
model_datasets_hp = {
    "LightGBM unbalance": (lgbm_model_ub, False, False, True, None),
    "CatBoost weight": (catboost_model_weight, False, False, True, None),
}

# Meilleurs paramètres trouvés pour chaque modèle
best_params = {
    "LightGBM unbalance": {'learning_rate': 0.14, 'max_depth': 3, 'min_child_samples': 100, 'n_estimators': 400, 'num_leaves': 100},
    "CatBoost weight": {'depth': 3, 'iterations': 1000, 'l2_leaf_reg': 5, 'learning_rate': 0.1}
}

In [None]:
results=[]

# Entraîner et évaluer chaque modèle
for model_name, dfs in model_datasets_hp.items():
    print(f"Evaluating {model_name}")
    train_and_log_model(dfs[0], model_name, dfs[1], dfs[2], dfs[3], dfs[4], best_params=best_params[model_name])

# Afficher les résultats
results_df = pd.DataFrame(results)
results_df.sort_values(by="ROC AUC", ascending=False)

## Optimisation du seuil

Calculez les probabilités de prédiction pour l'ensemble de validation ou de test.
Évaluez le coût de différentes valeurs de seuil (par exemple, de 0 à 1 avec des pas de 0.01) et sélectionnez le seuil qui minimise votre fonction de coût.