<b><font color="SteelBlue" size="+3">Implémentez un modèle de scoring</font></b>

# Introduction

Ce note book est la suite de l'EDA, il va permettre de présenter le prétraitement des données, le feature engineering et la modélisation.

In [50]:
# Chargement des librairies

# Built-in
import os

# Data Manipulation and Analysis
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
import mlflow
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import re
from catboost import CatBoostClassifier

# Custom Feature Engineering Pipeline
from feature_pipeline import FeatureEngineeringPipeline, remove_highly_correlated_features, remove_high_vif_features, free_memory

In [2]:
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
mlflow.set_experiment("Credit_Scoring_Experiment")

<Experiment: artifact_location='mlflow-artifacts:/185199612184583695', creation_time=1715766694823, experiment_id='185199612184583695', last_update_time=1715766694823, lifecycle_stage='active', name='Credit_Scoring_Experiment', tags={}>

# Data préparation

## Chargement des données

In [3]:
os.listdir("data/sources/")

['application_test.csv',
 'application_train.csv',
 'bureau.csv',
 'bureau_balance.csv',
 'credit_card_balance.csv',
 'HomeCredit_columns_description.csv',
 'installments_payments.csv',
 'POS_CASH_balance.csv',
 'previous_application.csv',
 'Projet+Mise+en+prod+-+home-credit-default-risk.zip',
 'sample_submission.csv']

In [4]:
PATH = "./data/sources/"

In [5]:
application_train = pd.read_csv(PATH+"/application_train.csv")
application_test = pd.read_csv(PATH+"/application_test.csv")
bureau = pd.read_csv(PATH+"/bureau.csv")
bureau_balance = pd.read_csv(PATH+"/bureau_balance.csv")
credit_card_balance = pd.read_csv(PATH+"/credit_card_balance.csv")
installments_payments = pd.read_csv(PATH+"/installments_payments.csv")
previous_application = pd.read_csv(PATH+"/previous_application.csv")
POS_CASH_balance = pd.read_csv(PATH+"/POS_CASH_balance.csv")

## Split du jeu de données

In [6]:
# Division des données en ensembles d'entraînement et de test
train, test = train_test_split(application_train, test_size=0.2, random_state=42, stratify=application_train['TARGET'])

## Gestion des valeurs manquantes et abérrantes

In [7]:
# Supprimer les lignes où TARGET est manquant dans application_train
train.dropna(subset=['TARGET'], inplace=True)
test.dropna(subset=['TARGET'], inplace=True)

In [8]:
# Suppression des lignes où les clés sont absentes
def drop_missing_keys(df, key, reference_df, reference_key):
    valid_keys = reference_df[reference_key].unique()
    return df[df[key].isin(valid_keys)]

# Supprimer les lignes où les clés sont absentes
bureau = drop_missing_keys(bureau, 'SK_ID_CURR', application_train, 'SK_ID_CURR')
bureau_balance = drop_missing_keys(bureau_balance, 'SK_ID_BUREAU', bureau, 'SK_ID_BUREAU')
previous_application = drop_missing_keys(previous_application, 'SK_ID_CURR', application_train, 'SK_ID_CURR')
POS_CASH_balance = drop_missing_keys(POS_CASH_balance, 'SK_ID_PREV', previous_application, 'SK_ID_PREV')
installments_payments = drop_missing_keys(installments_payments, 'SK_ID_PREV', previous_application, 'SK_ID_PREV')
credit_card_balance = drop_missing_keys(credit_card_balance, 'SK_ID_PREV', previous_application, 'SK_ID_PREV')

In [9]:
# Fonction pour supprimer les colonnes avec plus de 80% de valeurs manquantes
def drop_missing_columns(df, threshold=0.8):
    initial_columns = df.shape[1]
    missing_percentage = df.isnull().mean()
    columns_to_drop = missing_percentage[missing_percentage > threshold].index
    df = df.drop(columns=columns_to_drop)
    final_columns = df.shape[1]
    print(f"Colonnes supprimées: {initial_columns - final_columns}")
    print(f"Colonnes restantes: {final_columns}")
    return df

In [10]:
# Fonction pour gérer les valeurs aberrantes
def cap_values(series, target, threshold=0.05):
    lower_percentile = np.percentile(series, 1)
    upper_percentile = np.percentile(series, 99)
    outliers = (series < lower_percentile) | (series > upper_percentile)

    if target is not None:
        outlier_pct_target1 = (outliers & (target == 1)).sum() / (target == 1).sum()
        outlier_pct_target0 = (outliers & (target == 0)).sum() / (target == 0).sum()

        if outlier_pct_target1 > threshold or outlier_pct_target0 > threshold:
            print("Significant outliers detected, not capping values.")
            return series  # Ne pas appliquer le cap si les valeurs aberrantes sont significatives
        else:
            return np.clip(series, lower_percentile, upper_percentile)
    else:
        return np.clip(series, lower_percentile, upper_percentile)

In [11]:
def cap_outliers(df, target=None):
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        df[col] = cap_values(df[col], target)
    return df

In [12]:
# Fonction pour imputer les valeurs manquantes
def impute_missing_values(df, target=None):
    # Imputation pour les colonnes numériques
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if target is not None:
            missing_pct_target1 = df[col][target == 1].isnull().mean()
            missing_pct_target0 = df[col][target == 0].isnull().mean()
            if abs(missing_pct_target1 - missing_pct_target0) < 0.05:  # Seuil de 5% de différence
                df[col].fillna(df[col].median(), inplace=True)
            else:
                df.loc[target == 1, col] = df[col][target == 1].fillna(df[col][target == 1].median())
                df.loc[target == 0, col] = df[col][target == 0].fillna(df[col][target == 0].median())
        else:
            df[col].fillna(df[col].median(), inplace=True)
    
    # Imputation pour les colonnes catégorielles
    categorical_cols = df.select_dtypes(include=[object]).columns
    for col in categorical_cols:
        if target is not None:
            mode_target1 = df[col][target == 1].mode()[0]
            mode_target0 = df[col][target == 0].mode()[0]
            df.loc[target == 1, col] = df[col][target == 1].fillna(mode_target1)
            df.loc[target == 0, col] = df[col][target == 0].fillna(mode_target0)
        else:
            df[col].fillna(df[col].mode()[0], inplace=True)
    
    return df

In [13]:
# Appliquer le filtre des colonnes manquantes et la gestion des valeurs aberrantes à chaque DataFrame
dfs = {
    "Train": train,
    "Test": test,
    "Application Test": application_test,
    "Bureau": bureau,
    "Bureau Balance": bureau_balance,
    "Credit Card Balance": credit_card_balance,
    "Installments Payments": installments_payments,
    "Previous Application": previous_application,
    "POS CASH Balance": POS_CASH_balance
}

In [14]:
for name, df in dfs.items():
    print(f"\n{name}:")
    df = drop_missing_columns(df)
    if name in ["Train", "Test"]:
        df = cap_outliers(df, df['TARGET'])
        df = impute_missing_values(df, df['TARGET'])
    else:
        df = cap_outliers(df)
        df = impute_missing_values(df)
    dfs[name] = df


Train:
Colonnes supprimées: 0
Colonnes restantes: 122

Test:
Colonnes supprimées: 0
Colonnes restantes: 122

Application Test:
Colonnes supprimées: 0
Colonnes restantes: 121

Bureau:
Colonnes supprimées: 0
Colonnes restantes: 17

Bureau Balance:
Colonnes supprimées: 0
Colonnes restantes: 3

Credit Card Balance:
Colonnes supprimées: 0
Colonnes restantes: 23

Installments Payments:
Colonnes supprimées: 0
Colonnes restantes: 8

Previous Application:
Colonnes supprimées: 2
Colonnes restantes: 35

POS CASH Balance:
Colonnes supprimées: 0
Colonnes restantes: 8


In [15]:
# Fonction pour vérifier les valeurs manquantes dans un DataFrame
def check_missing_values(df):
    missing_values = df.isnull().sum()
    missing_values = missing_values[missing_values > 0]
    if not missing_values.empty:
        print("Colonnes avec des valeurs manquantes :")
        print(missing_values)
    else:
        print("Aucune valeur manquante détectée.")

# Vérification des valeurs manquantes dans chaque DataFrame
for name, df in dfs.items():
    print(f"\n{name}:")
    check_missing_values(df)



Train:
Aucune valeur manquante détectée.

Test:
Aucune valeur manquante détectée.

Application Test:
Aucune valeur manquante détectée.

Bureau:
Aucune valeur manquante détectée.

Bureau Balance:
Aucune valeur manquante détectée.

Credit Card Balance:
Aucune valeur manquante détectée.

Installments Payments:
Aucune valeur manquante détectée.

Previous Application:
Aucune valeur manquante détectée.

POS CASH Balance:
Aucune valeur manquante détectée.


## Sauvegarde des données nettoyées

In [16]:
# Chemin du dossier où sauvegarder les DataFrames nettoyés
output_dir = "data/Cleaned"

In [17]:
# Fonction pour sauvegarder un DataFrame
def save_dataframe(df, filename, output_dir):
    output_path = os.path.join(output_dir, filename)
    df.to_csv(output_path, index=False)
    print(f"DataFrame sauvegardé sous : {output_path}")

In [18]:
# Dictionnaire contenant les noms des fichiers
original_names = {
    "Train": "train.csv",
    "Test": "test.csv",
    "Application Test": "application_test.csv",
    "Bureau": "bureau.csv",
    "Bureau Balance": "bureau_balance.csv",
    "Credit Card Balance": "credit_card_balance.csv",
    "Installments Payments": "installments_payments.csv",
    "Previous Application": "previous_application.csv",
    "POS CASH Balance": "POS_CASH_balance.csv"
}

In [19]:
# Sauvegarder chaque DataFrame dans le dossier output_dir avec le nom d'origine
for name, df in dfs.items():
    save_dataframe(df, original_names[name], output_dir)

DataFrame sauvegardé sous : data/Cleaned\train.csv
DataFrame sauvegardé sous : data/Cleaned\test.csv
DataFrame sauvegardé sous : data/Cleaned\application_test.csv
DataFrame sauvegardé sous : data/Cleaned\bureau.csv
DataFrame sauvegardé sous : data/Cleaned\bureau_balance.csv
DataFrame sauvegardé sous : data/Cleaned\credit_card_balance.csv
DataFrame sauvegardé sous : data/Cleaned\installments_payments.csv
DataFrame sauvegardé sous : data/Cleaned\previous_application.csv
DataFrame sauvegardé sous : data/Cleaned\POS_CASH_balance.csv


# Feature engineering

## Sur le df train

In [20]:
# Initialiser le pipeline
pipeline = FeatureEngineeringPipeline()

In [21]:
train.shape

(246008, 122)

In [22]:
# Entraîner le pipeline
train_transformed = pipeline.fit(train)

Bureau and bureau_balance data - done in 13s
previous_application - done in 21s
previous applications balances - done in 477s
Colonnes supprimées: 47, Colonnes restantes: 687
Gestion des outliers - done
Gestion des valeurs manquantes - done
Colonnes supprimées (constantes): 36, Colonnes restantes: 651
Initial df memory usage is 1226.77 MB for 769 columns
Final memory usage is: 426.28 MB - decreased by 65.3%
Variables peu corrélées supprimées: 232


In [23]:
# Sauvegarder les paramètres de transformation
pipeline.save('param')

In [24]:
train_transformed.shape

(246005, 537)

In [25]:
train_transformed.head(5)

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,ORGANIZATION_TYPE_Police,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_School,ORGANIZATION_TYPE_Self-employed,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_XNA,FONDKAPREMONT_MODE_reg oper account,FONDKAPREMONT_MODE_reg oper spec account,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick"
0,0,0,0,0,2,90000.0,227520.0,13189.5,180000.0,0.008232,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0,0,1,1,0,90000.0,161730.0,13095.0,135000.0,0.003069,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,1,0,1,0,0,135000.0,728847.0,26307.0,553500.0,0.020706,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,1,0,1,0,0,135000.0,474183.0,34636.5,391500.0,0.011703,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0,0,0,0,0,180000.0,254700.0,27558.0,225000.0,0.00663,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [26]:
train_transformed.dtypes.value_counts()

float16    405
float32    111
int8        19
int16        2
Name: count, dtype: int64

In [27]:
# Vérification des colonnes avec des valeurs manquantes
missing_values = train_transformed.isnull().sum()
missing_columns = missing_values[missing_values > 0].index.tolist()
print(f"Colonnes avec des valeurs manquantes : {missing_columns}")

Colonnes avec des valeurs manquantes : []


## Sur le df test

In [28]:
# Charger les paramètres de transformation
pipeline.load('param')

# Transformer les données de test
test_transformed = pipeline.transform(test)

Bureau and bureau_balance data - done in 12s
previous_application - done in 20s
Colonnes supprimées: 47, Colonnes restantes: 687
Initial df memory usage is 251.26 MB for 537 columns
Final memory usage is: 72.79 MB - decreased by 71.0%
previous applications balances - done in 481s


In [29]:
test.shape

(61503, 122)

In [30]:
test_transformed.shape

(61501, 537)

In [31]:
# Vérification des colonnes avec des valeurs manquantes
missing_values = test_transformed.isnull().sum()
missing_columns = missing_values[missing_values > 0].index.tolist()
print(f"Colonnes avec des valeurs manquantes : {missing_columns}")

Colonnes avec des valeurs manquantes : []


# Définition du Score Métier

In [32]:
def cost_function(y_true, y_pred):
    fp_cost = 1
    fn_cost = 10
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    return fp_cost * fp + fn_cost * fn

In [33]:
custom_scorer = make_scorer(cost_function, greater_is_better=False)

# Modèles de machine learning

## Gestion du déséquilibre des classes

In [34]:
# Application de SMOTE
X = train_transformed.drop(columns=['TARGET'])
y = train_transformed['TARGET']

In [35]:
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

In [36]:
# Mettre à jour le DataFrame de avec les données rééquilibrées
train_transformed = pd.concat([X_res, y_res], axis=1)

In [37]:
# Vérification de la répartition des classes après SMOTE
print("Répartition des classes après SMOTE :")
print(y_res.value_counts())

Répartition des classes après SMOTE :
TARGET
0    226146
1    226146
Name: count, dtype: int64


## Régression logistique

In [38]:
train_df_cleaned, high_corr_features = remove_highly_correlated_features(train_transformed, threshold=0.9)
print(f"Variables très corrélées supprimées: {len(high_corr_features)}")

Variables très corrélées supprimées: 130


In [40]:
train_df_cleaned.shape

(452292, 407)

In [42]:
# Aligner les colones du jeu de test
test_transformed_cleaned = test_transformed.copy()

# Supprimer les colonnes en trop
extra_cols = set(test_transformed.columns) - set(train_df_cleaned_columns)
if extra_cols:
    test_transformed_cleaned.drop(columns=list(extra_cols), inplace=True)

# Réordonner les colonnes pour correspondre à celles du train
test_transformed_cleaned = test_transformed_cleaned[train_df_cleaned_columns]

In [43]:
test_transformed_cleaned.shape

(61501, 407)

In [62]:
# Séparer les features et la cible
X_train = train_df_cleaned.drop('TARGET', axis=1)
y_train = train_transformed['TARGET']
X_test = test_transformed_cleaned.drop('TARGET', axis=1)
y_test = test_transformed['TARGET']

In [63]:
X_test.shape

(61501, 406)

In [64]:
# Normaliser les données
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [65]:
# Initialiser et entraîner le modèle de régression logistique
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

In [66]:
# Prédire sur l'ensemble de test
y_pred = log_reg.predict(X_test_scaled)
y_pred_proba = log_reg.predict_proba(X_test_scaled)[:, 1]

In [67]:
# Évaluer le modèle
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("ROC AUC Score:")
print(roc_auc_score(y_test, y_pred_proba))

# Afficher les coefficients de la régression logistique
coefficients = pd.DataFrame(log_reg.coef_[0], index=X_train.columns, columns=['Coefficient'])
print("Coefficients de la régression logistique :")
print(coefficients.sort_values(by='Coefficient', ascending=False))

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95     56536
           1       0.47      0.46      0.46      4965

    accuracy                           0.91     61501
   macro avg       0.71      0.71      0.71     61501
weighted avg       0.91      0.91      0.91     61501

Confusion Matrix:
[[53940  2596]
 [ 2684  2281]]
ROC AUC Score:
0.8738429548797149


ValueError: Shape of passed values is (406, 1), indices imply (407, 1)

## Random forest

## Lightgbm

In [54]:
def clean_column_names(df):
    df.columns = [re.sub(r'[^A-Za-z0-9_]+', '', col) for col in df.columns]
    return df

# Appliquer le nettoyage des noms de colonnes
train_trans = clean_column_names(train_transformed)
test_trans = clean_column_names(test_transformed)

In [55]:
# Séparer les features et la cible
X_train = train_trans.drop('TARGET', axis=1)
y_train = train_trans['TARGET']
X_test = test_trans.drop('TARGET', axis=1)
y_test = test_trans['TARGET']

In [56]:
# Diviser les données d'entraînement en un ensemble d'entraînement et de validation
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

In [57]:
# Créer les ensembles de données LightGBM
train_data = lgb.Dataset(X_train_split, label=y_train_split)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

In [58]:
# Définir les paramètres du modèle
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['binary_logloss', 'auc'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

In [59]:
# Entraîner le modèle
print("Training LightGBM model...")
callbacks = [lgb.early_stopping(stopping_rounds=50, verbose=True)]
gbm = lgb.train(params,
                train_data,
                num_boost_round=1000,
                valid_sets=[train_data, val_data],
                callbacks=callbacks)

Training LightGBM model...
[LightGBM] [Info] Number of positive: 180917, number of negative: 180916
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.559575 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 131219
[LightGBM] [Info] Number of data points in the train set: 361833, number of used features: 536
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500001 -> initscore=0.000006
[LightGBM] [Info] Start training from score 0.000006
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[461]	training's binary_logloss: 0.000287567	training's auc: 1	valid_1's binary_logloss: 0.0029298	valid_1's auc: 0.999994


In [60]:
# Prédire sur l'ensemble de test
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
y_pred_class = (y_pred > 0.5).astype(int)

In [61]:
# Évaluer le modèle
print("Classification Report:")
print(classification_report(y_test, y_pred_class))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_class))

print("ROC AUC Score:")
print(roc_auc_score(y_test, y_pred))

# Afficher l'importance des caractéristiques
importance = gbm.feature_importance(importance_type='split')
feature_names = X_train.columns
feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importance})
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

print("Importance des caractéristiques :")
print(feature_importance.head(10))

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.92      0.95     56536
           1       0.49      0.93      0.64      4965

    accuracy                           0.92     61501
   macro avg       0.74      0.92      0.80     61501
weighted avg       0.95      0.92      0.93     61501

Confusion Matrix:
[[51766  4770]
 [  338  4627]]
ROC AUC Score:
0.9877600398202732
Importance des caractéristiques :
                                    feature  importance
156           BUREAU_CLOSED_DAYS_CREDIT_VAR         412
185     BUREAU_LAST6M_DEBT_CREDIT_DIFF_MEAN         365
184   BUREAU_LAST6M_AMT_CREDIT_SUM_DEBT_SUM         327
115                  BUREAU_DAYS_CREDIT_MAX         291
221  PREV_NAME_CONTRACT_STATUS_Refused_MEAN         286
29                          YEARS_BUILD_AVG         249
535           WALLSMATERIAL_MODE_Stonebrick         225
43                         YEARS_BUILD_MODE         199
104                 AMT_INCOM

## CatBoosting