<b><font color="SteelBlue" size="+3">Implémentez un modèle de scoring</font></b>

# Introduction

Ce note book est la suite de l'EDA, il va permettre de présenter le prétraitement des données, le feature engineering et la modélisation.

In [1]:
# Chargement des librairies

# Built-in
import os

# Data Manipulation and Analysis
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
import mlflow
import mlflow.sklearn

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
import lightgbm as lgb
import re
from catboost import CatBoostClassifier

# Custom Feature Engineering Pipeline
from feature_pipeline import FeatureEngineeringPipeline, remove_highly_correlated_features

In [2]:
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
mlflow.set_experiment("Credit_Scoring_Experiment")

<Experiment: artifact_location='mlflow-artifacts:/185199612184583695', creation_time=1715766694823, experiment_id='185199612184583695', last_update_time=1715766694823, lifecycle_stage='active', name='Credit_Scoring_Experiment', tags={}>

# Data préparation

## Chargement des données

In [2]:
os.listdir("data/sources/")

['application_test.csv',
 'application_train.csv',
 'bureau.csv',
 'bureau_balance.csv',
 'credit_card_balance.csv',
 'HomeCredit_columns_description.csv',
 'installments_payments.csv',
 'POS_CASH_balance.csv',
 'previous_application.csv',
 'Projet+Mise+en+prod+-+home-credit-default-risk.zip',
 'sample_submission.csv']

In [3]:
PATH = "./data/sources/"

In [4]:
application_train = pd.read_csv(PATH+"/application_train.csv")
application_test = pd.read_csv(PATH+"/application_test.csv")
bureau = pd.read_csv(PATH+"/bureau.csv")
bureau_balance = pd.read_csv(PATH+"/bureau_balance.csv")
credit_card_balance = pd.read_csv(PATH+"/credit_card_balance.csv")
installments_payments = pd.read_csv(PATH+"/installments_payments.csv")
previous_application = pd.read_csv(PATH+"/previous_application.csv")
POS_CASH_balance = pd.read_csv(PATH+"/POS_CASH_balance.csv")

## Split du jeu de données

In [5]:
# Division des données en ensembles d'entraînement et de test
train, test = train_test_split(application_train, test_size=0.2, random_state=42, stratify=application_train['TARGET'])

In [6]:
train.shape

(246008, 122)

## Gestion des valeurs manquantes et abérrantes

In [7]:
# Supprimer les lignes où TARGET est manquant dans application_train
train.dropna(subset=['TARGET'], inplace=True)
test.dropna(subset=['TARGET'], inplace=True)

In [8]:
# Suppression des lignes où les clés sont absentes
def drop_missing_keys(df, key, reference_df, reference_key):
    valid_keys = reference_df[reference_key].unique()
    return df[df[key].isin(valid_keys)]

# Supprimer les lignes où les clés sont absentes
bureau = drop_missing_keys(bureau, 'SK_ID_CURR', application_train, 'SK_ID_CURR')
bureau_balance = drop_missing_keys(bureau_balance, 'SK_ID_BUREAU', bureau, 'SK_ID_BUREAU')
previous_application = drop_missing_keys(previous_application, 'SK_ID_CURR', application_train, 'SK_ID_CURR')
POS_CASH_balance = drop_missing_keys(POS_CASH_balance, 'SK_ID_PREV', previous_application, 'SK_ID_PREV')
installments_payments = drop_missing_keys(installments_payments, 'SK_ID_PREV', previous_application, 'SK_ID_PREV')
credit_card_balance = drop_missing_keys(credit_card_balance, 'SK_ID_PREV', previous_application, 'SK_ID_PREV')

In [9]:
# Fonction pour supprimer les colonnes avec plus de 80% de valeurs manquantes
def drop_missing_columns(df, threshold=0.8):
    initial_columns = df.shape[1]
    missing_percentage = df.isnull().mean()
    columns_to_drop = missing_percentage[missing_percentage > threshold].index
    df = df.drop(columns=columns_to_drop)
    final_columns = df.shape[1]
    print(f"Colonnes supprimées: {initial_columns - final_columns}")
    print(f"Colonnes restantes: {final_columns}")
    return df

In [10]:
# Fonction pour gérer les valeurs aberrantes
def cap_values(series, threshold=0.2):
    lower_percentile = np.percentile(series, 1)
    upper_percentile = np.percentile(series, 99)
    outliers = (series < lower_percentile) | (series > upper_percentile)

    outlier_pct = outliers.sum() / len(series)

    if outlier_pct > threshold:
        print("Significant outliers detected, not capping values.")
        return series  # Ne pas appliquer le cap si les valeurs aberrantes sont significatives
    else:
        return np.clip(series, lower_percentile, upper_percentile)


In [11]:
def cap_outliers(df):
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if col != 'SK_ID_CURR':  # Exclure la colonne avec la clé
            df[col] = cap_values(df[col])
    return df

In [12]:
# Appliquer le filtre des colonnes manquantes et la gestion des valeurs aberrantes à chaque DataFrame
dfs = {
    "Train": train,
    "Test": test,
    "Application Test": application_test,
    "Bureau": bureau,
    "Bureau Balance": bureau_balance,
    "Credit Card Balance": credit_card_balance,
    "Installments Payments": installments_payments,
    "Previous Application": previous_application,
    "POS CASH Balance": POS_CASH_balance
}

In [13]:
for name, df in dfs.items():
    print(f"\n{name}:")
    df = drop_missing_columns(df)
    df = cap_outliers(df)
    dfs[name] = df


Train:
Colonnes supprimées: 0
Colonnes restantes: 122

Test:
Colonnes supprimées: 0
Colonnes restantes: 122

Application Test:
Colonnes supprimées: 0
Colonnes restantes: 121

Bureau:
Colonnes supprimées: 0
Colonnes restantes: 17

Bureau Balance:
Colonnes supprimées: 0
Colonnes restantes: 3

Credit Card Balance:
Colonnes supprimées: 0
Colonnes restantes: 23

Installments Payments:
Colonnes supprimées: 0
Colonnes restantes: 8

Previous Application:
Colonnes supprimées: 2
Colonnes restantes: 35

POS CASH Balance:
Colonnes supprimées: 0
Colonnes restantes: 8


In [14]:
dfs['Train'].shape

(246008, 122)

## Imputation

In [15]:
dfs_imputed = dfs

In [16]:
# Fonction pour imputer les valeurs manquantes
def impute_missing_values(df):
    # Imputation pour les colonnes numériques
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        df[col].fillna(df[col].median(), inplace=True)
    
    # Imputation pour les colonnes catégorielles
    categorical_cols = df.select_dtypes(include=[object]).columns
    for col in categorical_cols:
        df[col].fillna(df[col].mode()[0], inplace=True)
    
    return df

In [17]:
for name, df in dfs_imputed.items():
    df = impute_missing_values(df)
    dfs_imputed[name] = df

In [18]:
# Fonction pour vérifier les valeurs manquantes dans un DataFrame
def check_missing_values(df):
    missing_values = df.isnull().sum()
    missing_values = missing_values[missing_values > 0]
    if not missing_values.empty:
        print("Colonnes avec des valeurs manquantes :")
        print(missing_values)
    else:
        print("Aucune valeur manquante détectée.")

# Vérification des valeurs manquantes dans chaque DataFrame
for name, df in dfs_imputed.items():
    print(f"\n{name}:")
    check_missing_values(df)


Train:
Aucune valeur manquante détectée.

Test:
Aucune valeur manquante détectée.

Application Test:
Aucune valeur manquante détectée.

Bureau:
Aucune valeur manquante détectée.

Bureau Balance:
Aucune valeur manquante détectée.

Credit Card Balance:
Aucune valeur manquante détectée.

Installments Payments:
Aucune valeur manquante détectée.

Previous Application:
Aucune valeur manquante détectée.

POS CASH Balance:
Aucune valeur manquante détectée.


In [19]:
dfs_imputed['Train'].shape

(246008, 122)

## Sauvegarde des données nettoyées

In [20]:
# Fonction pour sauvegarder un DataFrame
def save_dataframe(df, filename, output_dir):
    output_path = os.path.join(output_dir, filename)
    df.to_csv(output_path, index=False)
    print(f"DataFrame sauvegardé sous : {output_path}")

In [21]:
# Dictionnaire contenant les noms des fichiers
original_names = {
    "Train": "train.csv",
    "Test": "test.csv",
    "Application Test": "application_test.csv",
    "Bureau": "bureau.csv",
    "Bureau Balance": "bureau_balance.csv",
    "Credit Card Balance": "credit_card_balance.csv",
    "Installments Payments": "installments_payments.csv",
    "Previous Application": "previous_application.csv",
    "POS CASH Balance": "POS_CASH_balance.csv"
}

### Sans l'imputation

In [2]:
# Chemin du dossier où sauvegarder les DataFrames nettoyés
output_dir = "data/Cleaned"

In [23]:
# Sauvegarder chaque DataFrame dans le dossier output_dir avec le nom d'origine
for name, df in dfs.items():
    save_dataframe(df, original_names[name], output_dir)

DataFrame sauvegardé sous : data/Cleaned\train.csv
DataFrame sauvegardé sous : data/Cleaned\test.csv
DataFrame sauvegardé sous : data/Cleaned\application_test.csv
DataFrame sauvegardé sous : data/Cleaned\bureau.csv
DataFrame sauvegardé sous : data/Cleaned\bureau_balance.csv
DataFrame sauvegardé sous : data/Cleaned\credit_card_balance.csv
DataFrame sauvegardé sous : data/Cleaned\installments_payments.csv
DataFrame sauvegardé sous : data/Cleaned\previous_application.csv
DataFrame sauvegardé sous : data/Cleaned\POS_CASH_balance.csv


### Avec l'imputation

In [3]:
# Chemin du dossier où sauvegarder les DataFrames nettoyés
output_dir_imputed = "data/Cleaned/Imputed"

In [25]:
# Sauvegarder chaque DataFrame dans le dossier output_dir avec le nom d'origine
for name, df in dfs_imputed.items():
    save_dataframe(df, original_names[name], output_dir_imputed)

DataFrame sauvegardé sous : data/Cleaned/Imputed\train.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\test.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\application_test.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\bureau.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\bureau_balance.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\credit_card_balance.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\installments_payments.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\previous_application.csv
DataFrame sauvegardé sous : data/Cleaned/Imputed\POS_CASH_balance.csv


# Feature engineering

## Sur jeu non imputé

### Sur le df train

In [26]:
# Initialiser le pipeline
pipeline = FeatureEngineeringPipeline(data_directory=output_dir + '/')

In [27]:
# Entraîner le pipeline
train_transformed = pipeline.fit()

feature_engineering - done
Bureau and bureau_balance data - done in 39s
previous_application - done in 40s
previous applications balances - done in 516s
Colonnes supprimées: 50, Colonnes restantes: 684
Gestion des outliers - done
Colonnes supprimées (constantes): 28, Colonnes restantes: 656
Initial df memory usage is 1236.16 MB for 774 columns
Final memory usage is: 453.03 MB - decreased by 63.4%
Variables peu corrélées supprimées: 256


In [28]:
# Sauvegarder les paramètres de transformation
pipeline.save(os.path.join(output_dir, "param"))

In [29]:
train_transformed.shape

(246006, 518)

In [30]:
train_transformed.head(5)

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,ORGANIZATION_TYPE_Police,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_School,ORGANIZATION_TYPE_Self-employed,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_XNA,FONDKAPREMONT_MODE_reg oper account,FONDKAPREMONT_MODE_reg oper spec account,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick"
0,0,0,0,0,2,90000.0,227520.0,13189.5,180000.0,0.008232,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0,0,1,1,0,90000.0,161730.0,13095.0,135000.0,0.003069,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,1,0,1,0,0,135000.0,728847.0,26307.0,553500.0,0.020706,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,1,0,1,0,0,135000.0,474183.0,34636.5,391500.0,0.011703,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0,0,0,0,0,180000.0,254700.0,27558.0,225000.0,0.00663,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [31]:
check_missing_values(train_transformed)

Colonnes avec des valeurs manquantes :
DAYS_EMPLOYED                       44143
DAYS_LAST_PHONE_CHANGE              30085
INCOME_TO_EMPLOYED_RATIO            44144
EMPLOYED_TO_BIRTH_RATIO             44143
CAR_TO_EMPLOYED_RATIO               44144
                                    ...  
PAYMENT_MEAN_TO_ANNUITY_RATIO       14513
CTA_CREDIT_TO_ANNUITY_MAX_RATIO     14000
CTA_CREDIT_TO_ANNUITY_MEAN_RATIO    14000
DAYS_CREDIT_MEAN_TO_BIRTH           35244
DAYS_DECISION_MEAN_TO_EMPLOYED      55782
Length: 378, dtype: int64


### Sur le df test

In [32]:
test = dfs["Test"]

In [33]:
# Charger les paramètres de transformation
pipeline.load(os.path.join(output_dir, "param"))

# Transformer les données de test
test_transformed = pipeline.transform(test)

Bureau and bureau_balance data - done in 38s
previous_application - done in 40s
Colonnes supprimées: 50, Colonnes restantes: 684
Initial df memory usage is 242.35 MB for 518 columns
Final memory usage is: 74.78 MB - decreased by 69.1%
previous applications balances - done in 515s


In [34]:
test_transformed.shape

(61501, 518)

## Sur jeu imputé

### Sur le df train

In [35]:
# Initialiser le pipeline
pipeline_imp = FeatureEngineeringPipeline(data_directory=output_dir_imputed + '/')

In [36]:
# Entraîner le pipeline
train_imputed_transformed = pipeline_imp.fit()

feature_engineering - done
Bureau and bureau_balance data - done in 39s
previous_application - done in 40s
previous applications balances - done in 514s
Colonnes supprimées: 50, Colonnes restantes: 684
Gestion des outliers - done
Gestion des valeurs manquantes - done
Colonnes supprimées (constantes): 28, Colonnes restantes: 656
Initial df memory usage is 1236.16 MB for 774 columns
Final memory usage is: 453.03 MB - decreased by 63.4%
Variables peu corrélées supprimées: 271


In [37]:
# Sauvegarder les paramètres de transformation
pipeline_imp.save(os.path.join(output_dir_imputed, "param"))

In [38]:
train_imputed_transformed.shape

(246006, 503)

In [39]:
train_imputed_transformed.head(5)

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,ORGANIZATION_TYPE_Police,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_School,ORGANIZATION_TYPE_Self-employed,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_XNA,FONDKAPREMONT_MODE_reg oper account,FONDKAPREMONT_MODE_reg oper spec account,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick"
0,0,0,0,0,2,90000.0,227520.0,13189.5,180000.0,0.008232,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0,0,1,1,0,90000.0,161730.0,13095.0,135000.0,0.003069,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,1,0,1,0,0,135000.0,728847.0,26307.0,553500.0,0.020706,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,1,0,1,0,0,135000.0,474183.0,34636.5,391500.0,0.011703,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0,0,0,0,0,180000.0,254700.0,27558.0,225000.0,0.00663,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [40]:
check_missing_values(train_imputed_transformed)

Aucune valeur manquante détectée.


### Sur le df test

In [41]:
test_imputed = dfs_imputed["Test"]

In [50]:
# Charger les paramètres de transformation
pipeline_imp.load(os.path.join(output_dir_imputed, "param"))

# Transformer les données de test
test_imputed_transformed = pipeline_imp.transform(test_imputed)

Bureau and bureau_balance data - done in 42s
previous_application - done in 43s
Colonnes supprimées: 50, Colonnes restantes: 684
Gestion des valeurs manquantes - done
Initial df memory usage is 235.31 MB for 503 columns
Final memory usage is: 71.61 MB - decreased by 69.6%
previous applications balances - done in 511s


In [51]:
test_imputed.shape

(61503, 122)

In [52]:
test_imputed_transformed.shape

(61501, 503)

In [53]:
# Vérification des colonnes avec des valeurs manquantes
missing_values = test_imputed_transformed.isnull().sum()
missing_columns = missing_values[missing_values > 0].index.tolist()
print(f"Colonnes avec des valeurs manquantes : {missing_columns}")

Colonnes avec des valeurs manquantes : []


## Sauvegarde / Chargement des données

In [54]:
output_feat_dir = "data/Featured"

# Enregistrer les DataFrames dans des fichiers CSV
train_transformed.to_csv(os.path.join(output_feat_dir, "train_transformed.csv"), index=False)
test_transformed.to_csv(os.path.join(output_feat_dir, "test_transformed.csv"), index=False)
train_imputed_transformed.to_csv(os.path.join(output_feat_dir, "train_imputed_transformed.csv"), index=False)
test_imputed_transformed.to_csv(os.path.join(output_feat_dir, "test_imputed_transformed.csv"), index=False)

print("DataFrames enregistrés avec succès.")

DataFrames enregistrés avec succès.


In [3]:
output_feat_dir = "data/Featured"

# Charger les DataFrames à partir des fichiers CSV
train_transformed = pd.read_csv(os.path.join(output_feat_dir, "train_transformed.csv"))
test_transformed = pd.read_csv(os.path.join(output_feat_dir, "test_transformed.csv"))
train_imputed_transformed = pd.read_csv(os.path.join(output_feat_dir, "train_imputed_transformed.csv"))
test_imputed_transformed = pd.read_csv(os.path.join(output_feat_dir, "test_imputed_transformed.csv"))

print("DataFrames chargés avec succès.")

DataFrames chargés avec succès.


In [4]:
print(f"train_transformed : {train_transformed.shape},\n"
      f"test_transformed : {test_transformed.shape},\n"
      f"train_imputed_transformed : {train_imputed_transformed.shape},\n"
      f"test_imputed_transformed : {test_imputed_transformed.shape}")

train_transformed : (246006, 518),
test_transformed : (61501, 518),
train_imputed_transformed : (246006, 503),
test_imputed_transformed : (61501, 503)


In [5]:
# nettoyage des noms de colonnes
def clean_column_names(df):
    df.columns = [re.sub(r'[^A-Za-z0-9_]+', '', col) for col in df.columns]
    return df

train_transformed = clean_column_names(train_transformed)
test_transformed = clean_column_names(test_transformed)
train_imputed_transformed = clean_column_names(train_imputed_transformed)
test_imputed_transformed = clean_column_names(test_imputed_transformed)

In [39]:
# Déterminer les colonnes présentes dans X_train_imputed mais pas dans X_train
extra_columns = set(train_imputed_transformed.columns) - set(train_transformed.columns)
print(f"Nombre de colonnes présentes dans X_train_imputed mais pas dans X_train: {len(extra_columns)}")

Nombre de colonnes présentes dans X_train_imputed mais pas dans X_train: 13


# Définition du Score Métier

In [172]:
def cost_function(y_true, y_pred):
    fp_cost = 1
    fn_cost = 10
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    return fp_cost * fp + fn_cost * fn

In [173]:
custom_scorer = make_scorer(cost_function, greater_is_better=False)

# Modèles de machine learning

In [27]:
# Séparer les features et la cible sur le jeu sans imputation
X_train = train_transformed.drop(columns=['TARGET'])
y_train = train_transformed['TARGET']
X_test = test_transformed.drop(columns=['TARGET'])
y_test = test_transformed['TARGET']

## Séléction des variables

### Suppression des variables hautement corrélées

In [28]:
train_imputed_cleaned, high_corr_features = remove_highly_correlated_features(train_imputed_transformed, threshold=0.9)
print(f"Variables très corrélées supprimées: {len(high_corr_features)}")

Variables très corrélées supprimées: 140


In [29]:
train_imputed_cleaned.shape

(246006, 363)

In [30]:
# Aligner les colones du jeu de test

train_imputed_cleaned_columns = list(train_imputed_cleaned.columns)
test_imputed_cleaned = test_imputed_transformed.copy()

# Supprimer les colonnes en trop
extra_cols = set(test_imputed_cleaned.columns) - set(train_imputed_cleaned_columns)
if extra_cols:
    test_imputed_cleaned.drop(columns=list(extra_cols), inplace=True)

# Réordonner les colonnes pour correspondre à celles du train
test_imputed_cleaned = test_imputed_cleaned[train_imputed_cleaned_columns]

test_imputed_cleaned.shape

(61501, 363)

In [31]:
# Séparer les features et la cible
X_train_imputed = train_imputed_cleaned.drop(columns=['TARGET'])
y_train_imputed = train_imputed_cleaned['TARGET']
X_test_imputed = test_imputed_cleaned.drop(columns=['TARGET'])
y_test_imputed = test_imputed_cleaned['TARGET']

### Recherche de la feature importance avec randomforest

In [11]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_imputed, y_train_imputed)

In [21]:
# Obtenir les importances des features
importances = rf.feature_importances_

feature_importances = pd.DataFrame({'Feature': X_train_imputed.columns, 'Importance': importances})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

In [24]:
feature_importances.head(20)

Unnamed: 0,Feature,Importance
48,EXT_SOURCES_PROD,0.01649
49,EXT_SOURCES_WEIGHTED,0.0128
50,EXT_SOURCES_MIN,0.010827
51,EXT_SOURCES_MAX,0.010507
22,EXT_SOURCE_2,0.009903
52,EXT_SOURCES_NANMEDIAN,0.009846
23,EXT_SOURCE_3,0.007563
53,EXT_SOURCES_VAR,0.007039
60,CAR_TO_EMPLOYED_RATIO,0.005914
11,DAYS_ID_PUBLISH,0.00567


In [32]:
# Sélectionner les features les plus importantes au-dessus de la moyenne
threshold = feature_importances['Importance'].mean()
selected_features = feature_importances[feature_importances['Importance'] > threshold]['Feature']

#print(f"Selected Features: {selected_features.values}")
print(f"Number of selected features: {len(selected_features)}")

Number of selected features: 177


In [34]:
X_train_imputed.shape

(246006, 362)

In [35]:
X_train.shape

(246006, 517)

In [37]:
# Déterminer les colonnes présentes dans X_train_imputed mais pas dans X_train
extra_columns = set(X_train_imputed.columns) - set(X_train.columns)
print(f"Nombre de colonnes présentes dans X_train_imputed mais pas dans X_train: {len(extra_columns)}")
print(f"Colonnes supplémentaires: {extra_columns}")

Nombre de colonnes présentes dans X_train_imputed mais pas dans X_train: 10
Colonnes supplémentaires: {'PREV_CHANNEL_TYPE_Creditandcashoffices_MEAN', 'PREV_APPLICATION_CREDIT_DIFF_MEAN', 'PREV_LAST12M_APPLICATION_CREDIT_DIFF_MIN', 'INS_36M_DBD_MAX', 'CC_AMT_DRAWINGS_POS_CURRENT_MAX', 'POS_REMAINING_INSTALMENTS', 'REFUSED_APPLICATION_CREDIT_DIFF_MIN', 'REFUSED_NAME_CONTRACT_TYPE_Revolvingloans_MEAN', 'PREV_PRODUCT_COMBINATION_CardXSell_MEAN', 'BUREAU_CLOSED_ENDDATE_DIF_MEAN'}


In [33]:
# Filtrer les jeux de données pour ne garder que les features sélectionnées
X_train_imp_selected = X_train_imputed[selected_features]
X_test_imp_selected = X_test_imputed[selected_features]


# Séparer les features et la cible sur le jeu sans imputation
X_train_slected = X_train[selected_features]
X_test_selected = X_test[selected_features]

KeyError: "['INS_36M_DBD_MAX', 'BUREAU_CLOSED_ENDDATE_DIF_MEAN', 'PREV_APPLICATION_CREDIT_DIFF_MEAN', 'PREV_LAST12M_APPLICATION_CREDIT_DIFF_MIN', 'POS_REMAINING_INSTALMENTS'] not in index"

## Standardisation

In [None]:
# Normaliser les données
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imp_selected)
X_test_scaled = scaler.transform(X_test_imp_selected)

## Tests de différents modèles

Pour les modèles Dummy Regressor, Régression Logistique et Random Forest, nous devons utiliser le jeu de données imputé (sans NaN). La standardisation (StandardScaler) est nécessaire pour la Régression Logistique mais pas pour le Dummy Regressor et la Random Forest.<br>
Pour les modèles Lightgbm et CatBoosting nous n'avons ni besoin de compléter les nan ni besoin de standardiser les données

In [None]:
# Fonction pour entraîner et enregistrer les modèles avec MLFlow
def train_and_log_model(model, model_name, X_train, y_train, X_test, y_test, balancing_technique=None):
    with mlflow.start_run(run_name=f"{model_name}_{balancing_technique}"):
        model.fit(X_train, y_train)
        
        # Prédictions des probabilités pour le calcul de l'AUC
        if hasattr(model, "predict_proba"):
            y_pred_proba = model.predict_proba(X_test)[:, 1]
            y_pred = (y_pred_proba >= 0.5).astype(int)
            auc_score = roc_auc_score(y_test, y_pred_proba)
        else:
            y_pred = model.predict(X_test)
            auc_score = None
        
        # Calcul de l'accuracy
        accuracy = accuracy_score(y_test, y_pred)

        # Enregistrement des paramètres et des métriques dans MLFlow
        mlflow.log_param("model", model_name)
        mlflow.log_param("balancing_technique", balancing_technique)
        if auc_score is not None:
            mlflow.log_metric("ROC AUC", auc_score)
        mlflow.log_metric("Accuracy", accuracy)

        # Enregistrement du modèle dans MLFlow
        mlflow.sklearn.log_model(model, f"{model_name}_model")

        print(f"{model_name} with {balancing_technique} - ROC AUC: {auc_score}, Accuracy: {accuracy}")

In [182]:
# Modèles à tester
models = {
    "Dummy Classifier": DummyClassifier(strategy='most_frequent'),
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
#    "Random Forest": RandomForestClassifier(),
#    "LightGBM": lgb.LGBMClassifier(),
#    "CatBoost": CatBoostClassifier(verbose=0)
}

In [183]:
# Scénarios de rééquilibrage
balancing_techniques = [None, "SMOTE", "Undersampling"]

In [184]:
for model_name, model in models.items():
    for technique in balancing_techniques:
        if model_name == "Logistic Regression":
            X_train_bal, y_train_bal = X_train_scaled, y_train_imputed
            X_test_bal, y_test_bal = X_test_scaled, y_test_imputed
        elif model_name == "LightGBM":
            # validation croisée pour LightGBM
            skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
            auc_scores = []
            accuracy_scores = []

            for train_index, val_index in skf.split(X_train, y_train):
                X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
                y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

                if technique == "SMOTE":
                    smote = SMOTE(random_state=42)
                    X_train_fold, y_train_fold = smote.fit_resample(X_train_fold, y_train_fold)
                elif technique == "Undersampling":
                    undersample = RandomUnderSampler(random_state=42)
                    X_train_fold, y_train_fold = undersample.fit_resample(X_train_fold, y_train_fold)

                model.fit(X_train_fold, y_train_fold)
                y_pred_proba = model.predict_proba(X_val_fold)[:, 1]
                y_pred = (y_pred_proba >= 0.5).astype(int)

                auc_score = roc_auc_score(y_val_fold, y_pred_proba)
                accuracy = accuracy_score(y_val_fold, y_pred)

                auc_scores.append(auc_score)
                accuracy_scores.append(accuracy)

            auc_score = np.mean(auc_scores)
            accuracy = np.mean(accuracy_scores)

            mlflow.log_param("model", model_name)
            mlflow.log_param("balancing_technique", technique)
            mlflow.log_metric("ROC AUC", auc_score)
            mlflow.log_metric("Accuracy", accuracy)

            mlflow.sklearn.log_model(model, f"{model_name}_model")

            print(f"{model_name} with {technique} - ROC AUC: {auc_score}, Accuracy: {accuracy}")
            continue
        else:
            X_train_bal, y_train_bal = X_train_imp_selected, y_train_imputed
            X_test_bal, y_test_bal = X_test_imp_selected, y_test_imputed
        
        if technique == "SMOTE":
            smote = SMOTE(random_state=42)
            X_train_bal, y_train_bal = smote.fit_resample(X_train_bal, y_train_bal)
        elif technique == "Undersampling":
            undersample = RandomUnderSampler(random_state=42)
            X_train_bal, y_train_bal = undersample.fit_resample(X_train_bal, y_train_bal)
        
        train_and_log_model(model, model_name, X_train_bal, y_train_bal, X_test_bal, y_test_bal, technique)

Dummy Classifier with None - ROC AUC: 0.5, Accuracy: 0.9192696053722703
Dummy Classifier with SMOTE - ROC AUC: 0.5, Accuracy: 0.9192696053722703
Dummy Classifier with Undersampling - ROC AUC: 0.5, Accuracy: 0.9192696053722703
Logistic Regression with None - ROC AUC: 0.7747197091113669, Accuracy: 0.9195785434383181
Logistic Regression with SMOTE - ROC AUC: 0.7607358806110013, Accuracy: 0.8254012129884067
Logistic Regression with Undersampling - ROC AUC: 0.7729039957215722, Accuracy: 0.7161184370985838
Random Forest with None - ROC AUC: 0.7306577377428045, Accuracy: 0.9193671647615486


KeyboardInterrupt: 

## Lightgbm

In [31]:
def clean_column_names(df):
    df.columns = [re.sub(r'[^A-Za-z0-9_]+', '', col) for col in df.columns]
    return df

# Appliquer le nettoyage des noms de colonnes
X_train = clean_column_names(X_train)
X_test = clean_column_names(X_test)

In [32]:
# Diviser les données d'entraînement en un ensemble d'entraînement et de validation
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

In [33]:
# Créer les ensembles de données LightGBM
train_data = lgb.Dataset(X_train_split, label=y_train_split)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

In [34]:
# Définir les paramètres du modèle
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': True,  # Paramètre pour gérer le déséquilibre des classes
    'lambda_l1': 0.1,  # L1 regularization
    'lambda_l2': 0.1,  # L2 regularization
    'feature_fraction': 0.8,  # Utilise 80% des caractéristiques à chaque itération
    'bagging_fraction': 0.8,  # Utilise 80% des données à chaque itération
    'bagging_freq': 5,  # Effectue le bagging toutes les 5 itérations
    'max_depth': 10,  # Profondeur maximale des arbres
    'num_leaves': 31,  # Nombre maximum de feuilles
    'min_data_in_leaf': 20,  # Nombre minimal de données dans une feuille
    'min_gain_to_split': 0.01,  # Gain minimal pour effectuer une division
    'learning_rate': 0.01,
    'verbose': -1
}

In [35]:
# Entraîner le modèle
print("Training LightGBM model...")
callbacks = [lgb.early_stopping(stopping_rounds=50, verbose=True)]
gbm = lgb.train(params,
                train_data,
                num_boost_round=1000,
                valid_sets=[train_data, val_data],
                callbacks=callbacks)

Training LightGBM model...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	training's auc: 0.851892	valid_1's auc: 0.788459


In [36]:
# Prédire sur l'ensemble de test
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
y_pred_class = (y_pred > 0.5).astype(int)

In [37]:
# Évaluer le modèle
print("Classification Report:")
print(classification_report(y_test, y_pred_class))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_class))

print("ROC AUC Score:")
print(roc_auc_score(y_test, y_pred))

# Afficher l'importance des caractéristiques
importance = gbm.feature_importance(importance_type='split')
feature_names = X_train.columns
feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importance})
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

print("Importance des caractéristiques :")
print(feature_importance.head(10))

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.73      0.83     56536
           1       0.19      0.71      0.30      4965

    accuracy                           0.73     61501
   macro avg       0.58      0.72      0.56     61501
weighted avg       0.90      0.73      0.79     61501

Confusion Matrix:
[[41337 15199]
 [ 1450  3515]]
ROC AUC Score:
0.7892407600336928
Importance des caractéristiques :
                                     feature  importance
90                   CREDIT_TO_ANNUITY_RATIO         827
23                              EXT_SOURCE_1         370
87                          EXT_SOURCES_MEAN         366
83                          EXT_SOURCES_PROD         338
6                                AMT_ANNUITY         323
9                                 DAYS_BIRTH         305
97                     CAR_TO_EMPLOYED_RATIO         291
91                     CREDIT_TO_GOODS_RATIO         280
167  BUREAU_CONSUMER