<b><font color="SteelBlue" size="+3">Implémentez un modèle de scoring</font></b>

# Introduction

Ce note book est la suite de l'EDA, il va permettre de présenter le prétraitement des données, le feature engineering et la modélisation.

In [109]:
# Chargement des librairies

# Built-in
import os

# Data Manipulation and Analysis
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
import mlflow
import mlflow.sklearn

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
import lightgbm as lgb
from lightgbm import LGBMClassifier
import re
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score

# Custom Feature Engineering Pipeline
from feature_pipeline import FeatureEngineeringPipeline, remove_highly_correlated_features, remove_low_correlation_features

In [2]:
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
mlflow.set_experiment("Credit_Scoring_Experiment")

<Experiment: artifact_location='mlflow-artifacts:/185199612184583695', creation_time=1715766694823, experiment_id='185199612184583695', last_update_time=1715766694823, lifecycle_stage='active', name='Credit_Scoring_Experiment', tags={}>

# Data préparation

## Chargement des données

In [3]:
os.listdir("data/sources/")

['application_test.csv',
 'application_train.csv',
 'bureau.csv',
 'bureau_balance.csv',
 'credit_card_balance.csv',
 'HomeCredit_columns_description.csv',
 'installments_payments.csv',
 'POS_CASH_balance.csv',
 'previous_application.csv',
 'Projet+Mise+en+prod+-+home-credit-default-risk.zip',
 'sample_submission.csv']

In [4]:
PATH = "./data/sources/"

In [5]:
application_train = pd.read_csv(PATH+"/application_train.csv")
application_test = pd.read_csv(PATH+"/application_test.csv")
bureau = pd.read_csv(PATH+"/bureau.csv")
bureau_balance = pd.read_csv(PATH+"/bureau_balance.csv")
credit_card_balance = pd.read_csv(PATH+"/credit_card_balance.csv")
installments_payments = pd.read_csv(PATH+"/installments_payments.csv")
previous_application = pd.read_csv(PATH+"/previous_application.csv")
POS_CASH_balance = pd.read_csv(PATH+"/POS_CASH_balance.csv")

## Split du jeu de données

In [6]:
# Division des données en ensembles d'entraînement et de test
train, test = train_test_split(application_train, test_size=0.2, random_state=42, stratify=application_train['TARGET'])

In [7]:
train.shape

(246008, 122)

## Gestion des valeurs manquantes et abérrantes

In [8]:
# Supprimer les lignes où TARGET est manquant dans application_train
train.dropna(subset=['TARGET'], inplace=True)
test.dropna(subset=['TARGET'], inplace=True)

In [9]:
# Suppression des lignes où les clés sont absentes
def drop_missing_keys(df, key, reference_df, reference_key):
    valid_keys = reference_df[reference_key].unique()
    return df[df[key].isin(valid_keys)]

# Supprimer les lignes où les clés sont absentes
bureau = drop_missing_keys(bureau, 'SK_ID_CURR', application_train, 'SK_ID_CURR')
bureau_balance = drop_missing_keys(bureau_balance, 'SK_ID_BUREAU', bureau, 'SK_ID_BUREAU')
previous_application = drop_missing_keys(previous_application, 'SK_ID_CURR', application_train, 'SK_ID_CURR')
POS_CASH_balance = drop_missing_keys(POS_CASH_balance, 'SK_ID_PREV', previous_application, 'SK_ID_PREV')
installments_payments = drop_missing_keys(installments_payments, 'SK_ID_PREV', previous_application, 'SK_ID_PREV')
credit_card_balance = drop_missing_keys(credit_card_balance, 'SK_ID_PREV', previous_application, 'SK_ID_PREV')

In [10]:
# Fonction pour supprimer les colonnes avec plus de 80% de valeurs manquantes
def drop_missing_columns(df, threshold=0.8):
    initial_columns = df.shape[1]
    missing_percentage = df.isnull().mean()
    columns_to_drop = missing_percentage[missing_percentage > threshold].index
    df = df.drop(columns=columns_to_drop)
    final_columns = df.shape[1]
    print(f"Colonnes supprimées: {initial_columns - final_columns}")
    print(f"Colonnes restantes: {final_columns}")
    return df

In [11]:
# Fonction pour gérer les valeurs aberrantes
def cap_values(series, threshold=0.2):
    lower_percentile = np.percentile(series, 1)
    upper_percentile = np.percentile(series, 99)
    outliers = (series < lower_percentile) | (series > upper_percentile)

    outlier_pct = outliers.sum() / len(series)

    if outlier_pct > threshold:
        print("Significant outliers detected, not capping values.")
        return series  # Ne pas appliquer le cap si les valeurs aberrantes sont significatives
    else:
        return np.clip(series, lower_percentile, upper_percentile)


In [12]:
def cap_outliers(df):
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if col != 'SK_ID_CURR':  # Exclure la colonne avec la clé
            df[col] = cap_values(df[col])
    return df

In [13]:
# Appliquer le filtre des colonnes manquantes et la gestion des valeurs aberrantes à chaque DataFrame
dfs = {
    "Train": train,
    "Test": test,
    "Application Test": application_test,
    "Bureau": bureau,
    "Bureau Balance": bureau_balance,
    "Credit Card Balance": credit_card_balance,
    "Installments Payments": installments_payments,
    "Previous Application": previous_application,
    "POS CASH Balance": POS_CASH_balance
}

In [14]:
for name, df in dfs.items():
    print(f"\n{name}:")
    df = drop_missing_columns(df)
    df = cap_outliers(df)
    dfs[name] = df


Train:
Colonnes supprimées: 0
Colonnes restantes: 122

Test:
Colonnes supprimées: 0
Colonnes restantes: 122

Application Test:
Colonnes supprimées: 0
Colonnes restantes: 121

Bureau:
Colonnes supprimées: 0
Colonnes restantes: 17

Bureau Balance:
Colonnes supprimées: 0
Colonnes restantes: 3

Credit Card Balance:
Colonnes supprimées: 0
Colonnes restantes: 23

Installments Payments:
Colonnes supprimées: 0
Colonnes restantes: 8

Previous Application:
Colonnes supprimées: 2
Colonnes restantes: 35

POS CASH Balance:
Colonnes supprimées: 0
Colonnes restantes: 8


In [15]:
dfs['Train'].shape

(246008, 122)

## Imputation

In [16]:
dfs_imputed = dfs

In [17]:
# Fonction pour imputer les valeurs manquantes
def impute_missing_values(df):
    # Imputation pour les colonnes numériques
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        df[col].fillna(df[col].median(), inplace=True)
    
    # Imputation pour les colonnes catégorielles
    categorical_cols = df.select_dtypes(include=[object]).columns
    for col in categorical_cols:
        df[col].fillna(df[col].mode()[0], inplace=True)
    
    return df

In [18]:
for name, df in dfs_imputed.items():
    df = impute_missing_values(df)
    dfs_imputed[name] = df

In [19]:
# Fonction pour vérifier les valeurs manquantes dans un DataFrame
def check_missing_values(df):
    missing_values = df.isnull().sum()
    missing_values = missing_values[missing_values > 0]
    if not missing_values.empty:
        print("Colonnes avec des valeurs manquantes :")
        print(missing_values)
    else:
        print("Aucune valeur manquante détectée.")

# Vérification des valeurs manquantes dans chaque DataFrame
for name, df in dfs_imputed.items():
    print(f"\n{name}:")
    check_missing_values(df)


Train:
Aucune valeur manquante détectée.

Test:
Aucune valeur manquante détectée.

Application Test:
Aucune valeur manquante détectée.

Bureau:
Aucune valeur manquante détectée.

Bureau Balance:
Aucune valeur manquante détectée.

Credit Card Balance:
Aucune valeur manquante détectée.

Installments Payments:
Aucune valeur manquante détectée.

Previous Application:
Aucune valeur manquante détectée.

POS CASH Balance:
Aucune valeur manquante détectée.


In [20]:
dfs_imputed['Train'].shape

(246008, 122)

## Sauvegarde des données nettoyées

In [21]:
# Fonction pour sauvegarder un DataFrame
def save_dataframe(df, filename, output_dir):
    output_path = os.path.join(output_dir, filename)
    df.to_csv(output_path, index=False)
    print(f"DataFrame sauvegardé sous : {output_path}")

In [22]:
# Dictionnaire contenant les noms des fichiers
original_names = {
    "Train": "train.csv",
    "Test": "test.csv",
    "Application Test": "application_test.csv",
    "Bureau": "bureau.csv",
    "Bureau Balance": "bureau_balance.csv",
    "Credit Card Balance": "credit_card_balance.csv",
    "Installments Payments": "installments_payments.csv",
    "Previous Application": "previous_application.csv",
    "POS CASH Balance": "POS_CASH_balance.csv"
}

### Sans l'imputation

In [23]:
# Chemin du dossier où sauvegarder les DataFrames nettoyés
output_dir = "data/Cleaned"

In [24]:
# Sauvegarder chaque DataFrame dans le dossier output_dir avec le nom d'origine
for name, df in dfs.items():
    save_dataframe(df, original_names[name], output_dir)

DataFrame sauvegardé sous : data/Cleaned\train.csv
DataFrame sauvegardé sous : data/Cleaned\test.csv
DataFrame sauvegardé sous : data/Cleaned\application_test.csv
DataFrame sauvegardé sous : data/Cleaned\bureau.csv
DataFrame sauvegardé sous : data/Cleaned\bureau_balance.csv
DataFrame sauvegardé sous : data/Cleaned\credit_card_balance.csv
DataFrame sauvegardé sous : data/Cleaned\installments_payments.csv
DataFrame sauvegardé sous : data/Cleaned\previous_application.csv
DataFrame sauvegardé sous : data/Cleaned\POS_CASH_balance.csv


### Avec l'imputation

In [25]:
# Chemin du dossier où sauvegarder les DataFrames nettoyés
output_dir_imputed = "data/Cleaned/Imputed"

In [26]:
# Sauvegarder chaque DataFrame dans le dossier output_dir avec le nom d'origine
for name, df in dfs_imputed.items():
    save_dataframe(df, original_names[name], output_dir_imputed)

DataFrame sauvegardé sous : data/Cleaned/Imputed\train.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\test.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\application_test.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\bureau.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\bureau_balance.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\credit_card_balance.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\installments_payments.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\previous_application.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\POS_CASH_balance.csv


# Feature engineering

## Sur jeu non imputé

### Sur le df train

In [27]:
# Initialiser le pipeline
pipeline = FeatureEngineeringPipeline(data_directory=output_dir + '/')

In [28]:
# Entraîner le pipeline
train_transformed = pipeline.fit()

feature_engineering - done
Bureau and bureau_balance data - done in 38s
previous_application - done in 39s
previous applications balances - done in 500s
Colonnes supprimées: 50, Colonnes restantes: 684
Gestion des outliers - done
Colonnes supprimées (constantes): 28, Colonnes restantes: 656
Initial df memory usage is 1236.16 MB for 774 columns
Final memory usage is: 453.03 MB - decreased by 63.4%


In [29]:
# Sauvegarder les paramètres de transformation
pipeline.save(os.path.join(output_dir, "param"))

In [30]:
train_transformed.shape

(246006, 774)

In [31]:
train_transformed.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,HOUSETYPE_MODE_block of flats,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden
0,310536.0,0,0,0,0,0,2,90000.0,227520.0,13189.5,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,365516.0,0,0,1,1,1,0,90000.0,161730.0,13095.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,242055.0,1,0,1,0,1,0,135000.0,728847.0,26307.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,452674.9375,1,0,1,0,0,0,135000.0,474183.0,34636.5,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,448321.0,0,0,0,0,1,0,180000.0,254700.0,27558.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [32]:
check_missing_values(train_transformed)

Colonnes avec des valeurs manquantes :
DAYS_EMPLOYED                       44143
DAYS_LAST_PHONE_CHANGE              30085
INCOME_TO_EMPLOYED_RATIO            44144
EMPLOYED_TO_BIRTH_RATIO             44143
CAR_TO_EMPLOYED_RATIO               44144
                                    ...  
CTA_CREDIT_TO_ANNUITY_MEAN_RATIO    14000
DAYS_DECISION_MEAN_TO_BIRTH         13985
DAYS_CREDIT_MEAN_TO_BIRTH           35244
DAYS_DECISION_MEAN_TO_EMPLOYED      55782
DAYS_CREDIT_MEAN_TO_EMPLOYED        72717
Length: 528, dtype: int64


### Sur le df test

In [33]:
test = dfs["Test"]

In [34]:
# Charger les paramètres de transformation
pipeline.load(os.path.join(output_dir, "param"))

# Transformer les données de test
test_transformed = pipeline.transform(test)

Bureau and bureau_balance data - done in 36s
previous_application - done in 38s
Colonnes supprimées: 50, Colonnes restantes: 684
Initial df memory usage is 362.00 MB for 774 columns
Final memory usage is: 106.04 MB - decreased by 70.7%
previous applications balances - done in 504s


In [35]:
test_transformed.shape

(61501, 774)

## Sur jeu imputé

### Sur le df train

In [36]:
# Initialiser le pipeline
pipeline_imp = FeatureEngineeringPipeline(data_directory=output_dir_imputed + '/')

In [37]:
# Entraîner le pipeline
train_imputed_transformed = pipeline_imp.fit()

feature_engineering - done
Bureau and bureau_balance data - done in 37s
previous_application - done in 39s
previous applications balances - done in 499s
Colonnes supprimées: 50, Colonnes restantes: 684
Gestion des outliers - done
Gestion des valeurs manquantes - done
Colonnes supprimées (constantes): 28, Colonnes restantes: 656
Initial df memory usage is 1236.16 MB for 774 columns
Final memory usage is: 453.03 MB - decreased by 63.4%


In [38]:
# Sauvegarder les paramètres de transformation
pipeline_imp.save(os.path.join(output_dir_imputed, "param"))

In [39]:
train_imputed_transformed.shape

(246006, 774)

In [40]:
train_imputed_transformed.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,HOUSETYPE_MODE_block of flats,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden
0,310536.0,0,0,0,0,0,2,90000.0,227520.0,13189.5,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,365516.0,0,0,1,1,1,0,90000.0,161730.0,13095.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,242055.0,1,0,1,0,1,0,135000.0,728847.0,26307.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,452674.9375,1,0,1,0,0,0,135000.0,474183.0,34636.5,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,448321.0,0,0,0,0,1,0,180000.0,254700.0,27558.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [41]:
check_missing_values(train_imputed_transformed)

Aucune valeur manquante détectée.


### Sur le df test

In [42]:
test_imputed = dfs_imputed["Test"]

In [43]:
# Charger les paramètres de transformation
pipeline_imp.load(os.path.join(output_dir_imputed, "param"))

# Transformer les données de test
test_imputed_transformed = pipeline_imp.transform(test_imputed)

Bureau and bureau_balance data - done in 37s
previous_application - done in 39s
Colonnes supprimées: 50, Colonnes restantes: 684
Gestion des valeurs manquantes - done
Initial df memory usage is 362.00 MB for 774 columns
Final memory usage is: 106.04 MB - decreased by 70.7%
previous applications balances - done in 502s


In [44]:
test_imputed.shape

(61503, 122)

In [45]:
test_imputed_transformed.shape

(61501, 774)

In [46]:
# Vérification des colonnes avec des valeurs manquantes
missing_values = test_imputed_transformed.isnull().sum()
missing_columns = missing_values[missing_values > 0].index.tolist()
print(f"Colonnes avec des valeurs manquantes : {missing_columns}")

Colonnes avec des valeurs manquantes : []


## Sauvegarde / Chargement des données

In [47]:
output_feat_dir = "data/Featured"

# Enregistrer les DataFrames dans des fichiers CSV
train_transformed.to_csv(os.path.join(output_feat_dir, "train_transformed.csv"), index=False)
test_transformed.to_csv(os.path.join(output_feat_dir, "test_transformed.csv"), index=False)
train_imputed_transformed.to_csv(os.path.join(output_feat_dir, "train_imputed_transformed.csv"), index=False)
test_imputed_transformed.to_csv(os.path.join(output_feat_dir, "test_imputed_transformed.csv"), index=False)

print("DataFrames enregistrés avec succès.")

DataFrames enregistrés avec succès.


In [123]:
output_feat_dir = "data/Featured"

# Charger les DataFrames à partir des fichiers CSV
train_transformed = pd.read_csv(os.path.join(output_feat_dir, "train_transformed.csv"))
test_transformed = pd.read_csv(os.path.join(output_feat_dir, "test_transformed.csv"))
train_imputed_transformed = pd.read_csv(os.path.join(output_feat_dir, "train_imputed_transformed.csv"))
test_imputed_transformed = pd.read_csv(os.path.join(output_feat_dir, "test_imputed_transformed.csv"))

print("DataFrames chargés avec succès.")

DataFrames chargés avec succès.


In [124]:
print(f"train_transformed : {train_transformed.shape},\n"
      f"test_transformed : {test_transformed.shape},\n"
      f"train_imputed_transformed : {train_imputed_transformed.shape},\n"
      f"test_imputed_transformed : {test_imputed_transformed.shape}")

train_transformed : (246006, 774),
test_transformed : (61501, 774),
train_imputed_transformed : (246006, 774),
test_imputed_transformed : (61501, 774)


In [125]:
# nettoyage des noms de colonnes
def clean_column_names(df):
    df.columns = [re.sub(r'[^A-Za-z0-9_]+', '', col) for col in df.columns]
    return df

train_transformed = clean_column_names(train_transformed)
test_transformed = clean_column_names(test_transformed)
train_imputed_transformed = clean_column_names(train_imputed_transformed)
test_imputed_transformed = clean_column_names(test_imputed_transformed)

In [126]:
# Déterminer les colonnes présentes dans X_train_imputed mais pas dans X_train
extra_columns = set(train_imputed_transformed.columns) - set(train_transformed.columns)
print(f"Nombre de colonnes présentes dans X_train_imputed mais pas dans X_train: {len(extra_columns)}")

Nombre de colonnes présentes dans X_train_imputed mais pas dans X_train: 0


# Définition du Score Métier

In [127]:
def cost_function(y_true, y_pred):
    fp_cost = 1
    fn_cost = 10
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    return fp_cost * fp + fn_cost * fn

In [128]:
custom_scorer = make_scorer(cost_function, greater_is_better=False)

# Modèles de machine learning

In [129]:
# Séparer les features et la cible sur le jeu sans imputation
X_train = train_transformed.drop(columns=['TARGET'])
y_train = train_transformed['TARGET']
X_test = test_transformed.drop(columns=['TARGET'])
y_test = test_transformed['TARGET']

## Séléction des variables

### Suppression des variables hautement ou trop peu corrélées

In [130]:
train_imputed_cleaned, low_corr_features = remove_low_correlation_features(train_imputed_transformed, 'TARGET', threshold=0.01)
print(f"Variables très peu corrélées supprimées: {len(low_corr_features)}")

Variables très peu corrélées supprimées: 271


In [131]:
train_imputed_cleaned, high_corr_features = remove_highly_correlated_features(train_imputed_transformed, threshold=0.9)
print(f"Variables très corrélées supprimées: {len(high_corr_features)}")

Variables très corrélées supprimées: 190


In [132]:
train_imputed_cleaned.shape

(246006, 584)

In [133]:
# Aligner les colones du jeu de test

train_imputed_cleaned_columns = list(train_imputed_cleaned.columns)
test_imputed_cleaned = test_imputed_transformed.copy()

# Supprimer les colonnes en trop
extra_cols = set(test_imputed_cleaned.columns) - set(train_imputed_cleaned_columns)
if extra_cols:
    test_imputed_cleaned.drop(columns=list(extra_cols), inplace=True)

# Réordonner les colonnes pour correspondre à celles du train
test_imputed_cleaned = test_imputed_cleaned[train_imputed_cleaned_columns]

test_imputed_cleaned.shape

(61501, 584)

In [134]:
# Séparer les features et la cible sur le jeu imputé
X_train_imputed = train_imputed_cleaned.drop(columns=['TARGET'])
y_train_imputed = train_imputed_cleaned['TARGET']
X_test_imputed = test_imputed_cleaned.drop(columns=['TARGET'])
y_test_imputed = test_imputed_cleaned['TARGET']

### Recherche de la feature importance avec randomforest

In [135]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_imputed, y_train_imputed)

In [136]:
# Obtenir les importances des features
importances = rf.feature_importances_

feature_importances = pd.DataFrame({'Feature': X_train_imputed.columns, 'Importance': importances})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

In [137]:
feature_importances.head(20)

Unnamed: 0,Feature,Importance
59,EXT_SOURCES_PROD,0.014869
60,EXT_SOURCES_WEIGHTED,0.011109
61,EXT_SOURCES_MIN,0.009742
62,EXT_SOURCES_MAX,0.00899
29,EXT_SOURCE_2,0.008467
63,EXT_SOURCES_NANMEDIAN,0.007953
30,EXT_SOURCE_3,0.006562
64,EXT_SOURCES_VAR,0.005929
74,EXT_SOURCES_MEAN_MEAN,0.004472
72,CAR_TO_BIRTH_RATIO,0.004356


In [138]:
# Sélectionner les features les plus importantes au-dessus de la moyenne
threshold = feature_importances['Importance'].mean()
selected_features = feature_importances[feature_importances['Importance'] > threshold]['Feature']

#print(f"Selected Features: {selected_features.values}")
print(f"Number of selected features: {len(selected_features)}")

Number of selected features: 268


In [139]:
# Déterminer les colonnes présentes dans X_train_imputed mais pas dans X_train
extra_columns = set(X_train_imputed.columns) - set(X_train.columns)
print(f"Nombre de colonnes présentes dans X_train_imputed mais pas dans X_train: {len(extra_columns)}")
print(f"Colonnes supplémentaires: {extra_columns}")

Nombre de colonnes présentes dans X_train_imputed mais pas dans X_train: 0
Colonnes supplémentaires: set()


In [140]:
# Filtrer les jeux de données pour ne garder que les features sélectionnées
X_train_imp_selected = X_train_imputed[selected_features]
X_test_imp_selected = X_test_imputed[selected_features]

# idem sur jeu non imputé
X_train_slected = X_train[selected_features]
X_test_selected = X_test[selected_features]

In [141]:
X_train_slected.shape

(246006, 268)

## Standardisation

In [142]:
# Normaliser les données
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imp_selected)
X_test_scaled = scaler.transform(X_test_imp_selected)

## Tests de différents modèles

Pour les modèles Dummy Regressor, Régression Logistique et Random Forest, nous devons utiliser le jeu de données imputé (sans NaN). La standardisation (StandardScaler) est nécessaire pour la Régression Logistique mais pas pour le Dummy Regressor et la Random Forest.<br>
Pour les modèles Lightgbm et CatBoosting nous n'avons ni besoin de compléter les nan ni besoin de standardiser les données

In [143]:
# Fonction pour préparer les données avec différentes techniques de rééquilibrage
def prepare_data(X, y, technique=None):
    if technique == "SMOTE":
        smote = SMOTE(random_state=42)
        X_res, y_res = smote.fit_resample(X, y)
    elif technique == "Undersampling":
        undersample = RandomUnderSampler(random_state=42)
        X_res, y_res = undersample.fit_resample(X, y)
    else:
        X_res, y_res = X, y
    return X_res, y_res

In [144]:
# Fonction pour effectuer une validation croisée et évaluer les modèles
def cross_validate_model(model, X, y, technique=None):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = cross_val_score(model, X, y, cv=skf, scoring=make_scorer(roc_auc_score))
    accuracy_scores = cross_val_score(model, X, y, cv=skf, scoring=make_scorer(accuracy_score))
    return np.mean(auc_scores), np.mean(accuracy_scores)

In [145]:
# Fonction pour effectuer la recherche d'hyperparamètres
def hyperparameter_search(model, param_grid, X, y, technique=None):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=skf, scoring='roc_auc', n_jobs=-1)
    X_res, y_res = prepare_data(X, y, technique)
    grid_search.fit(X_res, y_res)
    return grid_search.best_estimator_, grid_search.best_params_, grid_search.best_score_

In [146]:
# Fonction pour entraîner et enregistrer les modèles avec MLFlow
def train_and_log_model(model, model_name, X_train, y_train, X_val=None, y_val=None, X_test=None, y_test=None, technique=None):
    with mlflow.start_run(run_name=f"{model_name}_{technique}"):
        if X_val is not None and y_val is not None:
            # Entraînement des modèles qui utilisent un ensemble de validation (LightGBM, CatBoost)
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=True)])
        else:
            # Entraînement des modèles qui n'utilisent pas d'ensemble de validation (DummyClassifier, Logistic Regression, Random Forest)
            model.fit(X_train, y_train)
        
        if hasattr(model, "predict_proba"):
            y_pred_proba = model.predict_proba(X_test)[:, 1]
            y_pred = (y_pred_proba >= 0.5).astype(int)
            auc_score = roc_auc_score(y_test, y_pred_proba)
        else:
            y_pred = model.predict(X_test)
            auc_score = None
        
        accuracy = accuracy_score(y_test, y_pred)

        mlflow.log_param("model", model_name)
        mlflow.log_param("balancing_technique", technique)
        if auc_score is not None:
            mlflow.log_metric("ROC AUC", auc_score)
        mlflow.log_metric("Accuracy", accuracy)
        
        mlflow.sklearn.log_model(model, f"{model_name}_model")

        print(f"{model_name} with {technique} - ROC AUC: {auc_score}, Accuracy: {accuracy}")

### DummyClassifier

In [147]:
dummy_model = DummyClassifier(strategy='most_frequent')
train_and_log_model(dummy_model, "Dummy Classifier", X_train_imp_selected, y_train_imputed, None, None,
                    X_test_imp_selected, y_test_imputed, None)

Dummy Classifier with None - ROC AUC: 0.5, Accuracy: 0.9192696053722703


### Logistic Regression

In [148]:
# sans augmentation de données
logistic_model = LogisticRegression(random_state=42, max_iter=1000)
train_and_log_model(logistic_model, "Logistic Regression", X_train_scaled, y_train, None, None,
                    X_test_scaled, y_test, None)

Logistic Regression with None - ROC AUC: 0.7681224992094798, Accuracy: 0.9195785434383181


In [149]:
# avec SMOTE
logistic_model_smote = LogisticRegression(random_state=42, max_iter=1000)
X_rl_smote, y_rl_smote = prepare_data(X_train_scaled, y_train, 'SMOTE')
train_and_log_model(logistic_model_smote, "Logistic Regression SMOTE", X_rl_smote, y_rl_smote, None, None,
                    X_test_scaled, y_test, "SMOTE")

Logistic Regression SMOTE with SMOTE - ROC AUC: 0.7566674375930794, Accuracy: 0.7102486138436773


In [150]:
# avec Undersamplig
logistic_model_us = LogisticRegression(random_state=42, max_iter=1000)
X_rl_us, y_rl_us = prepare_data(X_train_scaled, y_train, 'Undersampling')
train_and_log_model(logistic_model_us, "Logistic Regression Undersampling", X_rl_us, y_rl_us, None, None,
                    X_test_scaled, y_test, "Undersampling")

Logistic Regression Undersampling with Undersampling - ROC AUC: 0.7659396980219966, Accuracy: 0.7055495032601096


### Random Forest

In [151]:
# sans augmentation de données
rf_model = RandomForestClassifier()
train_and_log_model(rf_model, "Random Forest", X_train_imp_selected, y_train_imputed, None, None,
                    X_test_imp_selected, y_test_imputed, None)

Random Forest with None - ROC AUC: 0.7288077583839673, Accuracy: 0.9193996845579747


In [None]:
# avec SMOTE
rf_model_smote = RandomForestClassifier()
X_rl_smote, y_rl_smote = prepare_data(X_train_imp_selected, y_train_imputed, 'SMOTE')
train_and_log_model(rf_model_smote, "Random Forest SMOTE", X_rl_smote, y_rl_smote, None, None,
                    X_test_imp_selected, y_test_imputed, "SMOTE")

In [None]:
# avec Undersamplig
rf_model_us = RandomForestClassifier()
X_rl_us, y_rl_us = prepare_data(X_train_imp_selected, y_train_imputed, 'Undersampling')
train_and_log_model(rf_model_us, "Random Forest Undersampling", X_rl_us, y_rl_us, None, None,
                    X_test_imp_selected, y_test_imputed, "Undersampling")

### LightGBM

In [None]:
# Division des données d'entraînement en sous-ensembles d'entraînement et de validation
X_train_part, X_val_part, y_train_part, y_val_part = train_test_split(X_train_slected, y_train,
                                                                      test_size=0.2, random_state=42,
                                                                      stratify=y_train)

In [None]:
# sans augmentation de données
lgbm_model = LGBMClassifier()
train_and_log_model(lgbm_model, "LightGBM", X_train_part, y_train_part, X_val_part, y_val_part,
                   X_test_selected, y_test, None)

Pas de gestion de déséquilibre de la target avec SMOTE car : "SMOTE does not accept missing values encoded as NaN"

In [None]:
# avec Undersamplig
lgbm_model_us = LGBMClassifier()
X_lgbm_us, y_lgbm_us = prepare_data(X_train_part, y_train_part, "Undersampling")
train_and_log_model(lgbm_model_us, "LightGBM", X_lgbm_us, y_lgbm_us, X_val_part, y_val_part,
                   X_test_selected, y_test, "Undersampling")

## CatBoost

In [None]:
# sans augmentation de données
catboost_model = CatBoostClassifier(verbose=0)
train_and_log_model(catboost_model, "CatBoost", X_train_part, X_val_part, y_train_part, y_val_part,
                   X_test_selected, y_test, None)

Pas de gestion de déséquilibre de la target avec SMOTE car : "SMOTE does not accept missing values encoded as NaN"

In [None]:
# avec Undersamplig
catboost_model_us = CatBoostClassifier(verbose=0)
X_catb_us, y_catb_us = prepare_data(X_train_part, y_train_part, "Undersampling")
train_and_log_model(catboost_model_us, "CatBoost", X_catb_us, y_catb_us, X_val_part, y_val_part,
                   X_test_slected, y_test, "Undersampling")

## Recherche des hyperparamètres