<b><font color="SteelBlue" size="+3">Implémentez un modèle de scoring</font></b>

# Introduction

Ce notebook fait suite à l'EDA : il présente le prétraitement des données, le feature engineering et la modélisation.

In [1]:
# Chargement des librairies

# Built-in
import os

# Data Manipulation and Analysis
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
import lightgbm as lgb
from catboost import CatBoostClassifier

# Custom Feature Engineering Pipeline
from feature_pipeline import FeatureEngineeringPipeline

# Préparation des données

## Chargement des données

In [2]:
# List the entries of the raw data directory.
os.listdir("data/sources/")

['application_test.csv',
 'application_train.csv',
 'bureau.csv',
 'bureau_balance.csv',
 'credit_card_balance.csv',
 'HomeCredit_columns_description.csv',
 'installments_payments.csv',
 'POS_CASH_balance.csv',
 'previous_application.csv',
 'Projet+Mise+en+prod+-+home-credit-default-risk.zip',
 'sample_submission.csv']

In [3]:
# Base directory of the raw CSV sources (note: the value already ends with a slash).
PATH = "./data/sources/"

In [4]:
# Load every raw table. os.path.join avoids the doubled separators produced by
# concatenating PATH (which already ends with "/") with "/<file>.csv".
application_train = pd.read_csv(os.path.join(PATH, "application_train.csv"))
application_test = pd.read_csv(os.path.join(PATH, "application_test.csv"))
bureau = pd.read_csv(os.path.join(PATH, "bureau.csv"))
bureau_balance = pd.read_csv(os.path.join(PATH, "bureau_balance.csv"))
credit_card_balance = pd.read_csv(os.path.join(PATH, "credit_card_balance.csv"))
installments_payments = pd.read_csv(os.path.join(PATH, "installments_payments.csv"))
previous_application = pd.read_csv(os.path.join(PATH, "previous_application.csv"))
POS_CASH_balance = pd.read_csv(os.path.join(PATH, "POS_CASH_balance.csv"))

## Split du jeu de données

In [5]:
# Stratified hold-out split of the labelled applications (20% kept for testing)
train, test = train_test_split(
    application_train,
    test_size=0.2,
    random_state=42,
    stratify=application_train['TARGET'],
)

## Gestion des valeurs manquantes et aberrantes

In [6]:
# Drop rows whose TARGET is missing in both splits.
# Rebinding the names (instead of inplace=True) keeps the cell idempotent and
# avoids mutating in place the frames returned by train_test_split.
train = train.dropna(subset=['TARGET'])
test = test.dropna(subset=['TARGET'])

In [7]:
# Filter out rows whose key has no match in a reference table
def drop_missing_keys(df, key, reference_df, reference_key):
    """Return the rows of ``df`` whose ``key`` value exists in ``reference_df[reference_key]``.

    The original index labels of the surviving rows are preserved.
    """
    known_keys = reference_df[reference_key].unique()
    has_match = df[key].isin(known_keys)
    return df.loc[has_match]

# Discard child-table rows whose foreign key has no match in the parent table.
# Tables keyed on the applicant id are filtered against application_train.
bureau, previous_application = (
    drop_missing_keys(child, 'SK_ID_CURR', application_train, 'SK_ID_CURR')
    for child in (bureau, previous_application)
)
# bureau_balance is filtered against the already-filtered bureau table.
bureau_balance = drop_missing_keys(bureau_balance, 'SK_ID_BUREAU', bureau, 'SK_ID_BUREAU')
# Balance/payment tables are filtered against the already-filtered previous applications.
POS_CASH_balance, installments_payments, credit_card_balance = (
    drop_missing_keys(child, 'SK_ID_PREV', previous_application, 'SK_ID_PREV')
    for child in (POS_CASH_balance, installments_payments, credit_card_balance)
)

In [8]:
# Drop the columns whose share of missing values exceeds a threshold (80% by default)
def drop_missing_columns(df, threshold=0.8):
    """Return a copy of ``df`` without the columns whose null ratio is strictly above ``threshold``.

    Prints how many columns were removed and how many remain.
    """
    n_before = df.shape[1]
    null_share = df.isnull().mean()
    sparse_cols = null_share.index[null_share > threshold]
    trimmed = df.drop(columns=sparse_cols)
    n_after = trimmed.shape[1]
    print(f"Colonnes supprimées: {n_before - n_after}")
    print(f"Colonnes restantes: {n_after}")
    return trimmed

In [9]:
# Cap the extreme values of a numeric column at its 1st / 99th percentiles
def cap_values(series, target, threshold=0.05):
    """Clip ``series`` to its [1st, 99th] percentile range unless outliers look informative.

    Parameters
    ----------
    series : numeric pandas Series (may contain NaN).
    target : Series aligned with ``series`` holding 0/1 labels, or None.
    threshold : maximum share of outliers tolerated within either class
        before capping is skipped.

    Returns
    -------
    The clipped series, or ``series`` unchanged when it has no usable value or
    when the share of outliers inside class 0 or class 1 exceeds ``threshold``.

    Percentiles are computed with NaN ignored: ``np.percentile`` returns NaN as
    soon as one value is missing, which silently turned the capping into a
    no-op for every column that still had missing values.
    """
    values = np.asarray(series, dtype=float)
    if np.isnan(values).all():
        # Empty or entirely missing column: nothing to cap.
        return series

    lower_percentile, upper_percentile = np.nanpercentile(values, [1, 99])
    outliers = (series < lower_percentile) | (series > upper_percentile)

    if target is not None:
        def class_outlier_rate(label):
            # Share of rows of the given class flagged as outliers; 0 when the
            # class is absent, so an empty class never triggers a division by zero.
            in_class = target == label
            class_size = in_class.sum()
            return (outliers & in_class).sum() / class_size if class_size else 0.0

        if class_outlier_rate(1) > threshold or class_outlier_rate(0) > threshold:
            print("Significant outliers detected, not capping values.")
            return series  # Keep the raw values when outliers are significant

    return series.clip(lower_percentile, upper_percentile)

In [10]:
def cap_outliers(df, target=None, exclude=("TARGET",), exclude_prefixes=("SK_ID",)):
    """Cap the outliers of every numeric feature column of ``df`` (modified in place).

    Parameters
    ----------
    df : DataFrame whose numeric columns are capped with ``cap_values``.
    target : optional label Series forwarded to ``cap_values``.
    exclude : column names that are never capped (the label by default).
    exclude_prefixes : name prefixes of columns that are never capped
        (identifier columns by default).

    Returns
    -------
    The same ``df`` object, with capped columns.

    Identifier and label columns are numeric but are not measurements: clipping
    them replaces every key beyond the percentile bounds with the bound itself,
    which breaks joins between tables.
    NOTE(review): the fractional SK_ID_CURR shown in the transformed-data preview
    is consistent with this kind of clipping — confirm where it originates.
    """
    skipped_names = set(exclude)
    skipped_prefixes = tuple(exclude_prefixes)
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if col in skipped_names or str(col).startswith(skipped_prefixes):
            continue
        df[col] = cap_values(df[col], target)
    return df

In [11]:
# Impute the missing values of a DataFrame
def impute_missing_values(df, target=None):
    """Fill missing values of ``df`` in place and return it.

    Numeric columns are filled with the median, object columns with the mode.
    When ``target`` (a 0/1 Series aligned with ``df``) is given:
      * numeric columns use the global median if the missing rates of the two
        classes differ by less than 5 points, otherwise the per-class median;
      * object columns use the per-class mode.

    Columns are reassigned (``df[col] = ...``) rather than filled through
    ``df[col].fillna(..., inplace=True)``, because that chained form acts on a
    temporary object and is not guaranteed to update ``df``. A class slice with
    no observed value is left untouched instead of raising on an empty mode.

    NOTE(review): per-class imputation builds features from the label, which
    cannot be reproduced at prediction time — confirm this is intended.
    """
    if target is not None:
        class_masks = (target == 1, target == 0)

    # Numeric columns
    for col in df.select_dtypes(include=[np.number]).columns:
        column = df[col]
        if not column.isnull().any():
            continue  # nothing to fill
        if target is None:
            df[col] = column.fillna(column.median())
            continue
        missing_pct_target1 = column[class_masks[0]].isnull().mean()
        missing_pct_target0 = column[class_masks[1]].isnull().mean()
        if abs(missing_pct_target1 - missing_pct_target0) < 0.05:  # 5-point gap
            df[col] = column.fillna(column.median())
        else:
            for mask in class_masks:
                df.loc[mask, col] = column[mask].fillna(column[mask].median())

    # Categorical (object) columns
    for col in df.select_dtypes(include=[object]).columns:
        column = df[col]
        if not column.isnull().any():
            continue  # nothing to fill
        if target is None:
            modes = column.mode()
            if not modes.empty:
                df[col] = column.fillna(modes.iloc[0])
            continue
        for mask in class_masks:
            modes = column[mask].mode()
            if not modes.empty:
                df.loc[mask, col] = column[mask].fillna(modes.iloc[0])

    return df

In [12]:
# Label -> DataFrame mapping of every table that goes through the cleaning steps
# (sparse-column filter, outlier capping, imputation).
dfs = {
    "Train": train,
    "Test": test,
    "Application Test": application_test,
    "Bureau": bureau,
    "Bureau Balance": bureau_balance,
    "Credit Card Balance": credit_card_balance,
    "Installments Payments": installments_payments,
    "Previous Application": previous_application,
    "POS CASH Balance": POS_CASH_balance
}

In [13]:
# Clean every table: drop sparse columns, cap outliers, impute missing values.
# Only the training split is cleaned with label-aware rules. The hold-out
# "Test" split is processed like unlabeled data so that its own labels never
# shape its features (label leakage into the evaluation set).
for name, df in dfs.items():
    print(f"\n{name}:")
    df = drop_missing_columns(df)
    target = df['TARGET'] if name == "Train" else None
    df = cap_outliers(df, target)
    df = impute_missing_values(df, target)
    dfs[name] = df


Train:
Colonnes supprimées: 0
Colonnes restantes: 122

Test:
Colonnes supprimées: 0
Colonnes restantes: 122

Application Test:
Colonnes supprimées: 0
Colonnes restantes: 121

Bureau:
Colonnes supprimées: 0
Colonnes restantes: 17

Bureau Balance:
Colonnes supprimées: 0
Colonnes restantes: 3

Credit Card Balance:
Colonnes supprimées: 0
Colonnes restantes: 23

Installments Payments:
Colonnes supprimées: 0
Colonnes restantes: 8

Previous Application:
Colonnes supprimées: 2
Colonnes restantes: 35

POS CASH Balance:
Colonnes supprimées: 0
Colonnes restantes: 8


In [14]:
# Report the columns of a DataFrame that still contain missing values
def check_missing_values(df):
    """Print the per-column count of missing values, or a message when there are none."""
    null_counts = df.isnull().sum()
    affected = null_counts[null_counts > 0]
    if affected.empty:
        print("Aucune valeur manquante détectée.")
        return
    print("Colonnes avec des valeurs manquantes :")
    print(affected)

# Check every DataFrame of dfs for remaining missing values
for name, df in dfs.items():
    print(f"\n{name}:")
    check_missing_values(df)



Train:
Aucune valeur manquante détectée.

Test:
Aucune valeur manquante détectée.

Application Test:
Aucune valeur manquante détectée.

Bureau:
Aucune valeur manquante détectée.

Bureau Balance:
Aucune valeur manquante détectée.

Credit Card Balance:
Aucune valeur manquante détectée.

Installments Payments:
Aucune valeur manquante détectée.

Previous Application:
Aucune valeur manquante détectée.

POS CASH Balance:
Aucune valeur manquante détectée.


## Sauvegarde des données nettoyées

In [15]:
# Directory where the cleaned DataFrames are saved
output_dir = "data/Cleaned"

In [16]:
# Save a DataFrame as CSV
def save_dataframe(df, filename, output_dir):
    """Write ``df`` to ``output_dir/filename`` as CSV (without the index).

    The output directory is created when missing, so a fresh checkout does not
    fail on the first save. Prints the path that was written.
    """
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, filename)
    df.to_csv(output_path, index=False)
    print(f"DataFrame sauvegardé sous : {output_path}")

In [17]:
# Output file name for each DataFrame label used in dfs
original_names = {
    "Train": "train.csv",
    "Test": "test.csv",
    "Application Test": "application_test.csv",
    "Bureau": "bureau.csv",
    "Bureau Balance": "bureau_balance.csv",
    "Credit Card Balance": "credit_card_balance.csv",
    "Installments Payments": "installments_payments.csv",
    "Previous Application": "previous_application.csv",
    "POS CASH Balance": "POS_CASH_balance.csv"
}

In [18]:
# Save every DataFrame of dfs into output_dir under its mapped file name
for name, df in dfs.items():
    save_dataframe(df, original_names[name], output_dir)

DataFrame sauvegardé sous : data/Cleaned\train.csv
DataFrame sauvegardé sous : data/Cleaned\test.csv
DataFrame sauvegardé sous : data/Cleaned\application_test.csv
DataFrame sauvegardé sous : data/Cleaned\bureau.csv
DataFrame sauvegardé sous : data/Cleaned\bureau_balance.csv
DataFrame sauvegardé sous : data/Cleaned\credit_card_balance.csv
DataFrame sauvegardé sous : data/Cleaned\installments_payments.csv
DataFrame sauvegardé sous : data/Cleaned\previous_application.csv
DataFrame sauvegardé sous : data/Cleaned\POS_CASH_balance.csv


# Feature engineering

## Sur le df train

In [19]:
# Instantiate the feature-engineering pipeline
pipeline = FeatureEngineeringPipeline()

In [20]:
# Shape of the training split before feature engineering
train.shape

(246008, 122)

In [21]:
# Fit the pipeline on the training split and keep the frame it returns.
# NOTE(review): `train` is the frame produced by the split cell, not the cleaned
# copy stored in dfs["Train"] — confirm which one the pipeline is meant to receive.
train_transformed = pipeline.fit(train)

Bureau and bureau_balance data - done in 13s
previous_application - done in 20s
previous applications balances - done in 558s
Colonnes supprimées: 40, Colonnes restantes: 734


  diff_b_a = subtract(b, a)


In [22]:
# Persist the transformation parameters under the name 'param'
pipeline.save('param')

In [23]:
# Shape of the training data after feature engineering
train_transformed.shape

(246005, 694)

In [24]:
# Preview the first five transformed rows
train_transformed.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,CURRENT_TO_APPROVED_ANNUITY_MEAN_RATIO,PAYMENT_MIN_TO_ANNUITY_RATIO,PAYMENT_MAX_TO_ANNUITY_RATIO,PAYMENT_MEAN_TO_ANNUITY_RATIO,CTA_CREDIT_TO_ANNUITY_MAX_RATIO,CTA_CREDIT_TO_ANNUITY_MEAN_RATIO,DAYS_DECISION_MEAN_TO_BIRTH,DAYS_CREDIT_MEAN_TO_BIRTH,DAYS_DECISION_MEAN_TO_EMPLOYED,DAYS_CREDIT_MEAN_TO_EMPLOYED
0,310536.0,0,Cash loans,F,N,N,2,90000.0,227520.0,13189.5,...,0.851831,0.005728,2.388263,0.732455,1.159415,0.546073,0.095804,0.095609,1.245455,1.242918
1,365516.0,0,Cash loans,M,Y,Y,0,90000.0,161730.0,13095.0,...,0.649658,0.292223,1.068567,0.744035,0.789781,0.608937,0.021545,0.065316,3.600543,0.514865
2,242055.0,1,Cash loans,M,N,Y,0,135000.0,728847.0,26307.0,...,0.493823,0.493613,0.493823,0.493771,0.12669,0.12669,0.130056,0.0707,1.525516,0.829283
3,452674.96,1,Cash loans,M,N,N,0,135000.0,474183.0,34636.5,...,0.490376,0.10236,1.550433,0.555726,0.745151,0.543613,0.060188,0.065316,0.601681,0.514865
4,448321.0,0,Cash loans,F,N,Y,0,180000.0,254700.0,27558.0,...,0.112585,0.112585,0.219177,0.122275,0.974059,0.974059,0.0819,0.065316,0.711361,0.514865


In [25]:
# Number of columns per dtype
train_transformed.dtypes.value_counts()

float64    637
int64       41
object      16
Name: count, dtype: int64

In [26]:
def unique_values_per_object_column(df):
    """Print, for each object-typed column of ``df``, its number of distinct values."""
    text_columns = df.select_dtypes(include=['object']).columns
    for col in text_columns:
        # nunique() ignores missing values
        print(f"{col}: {df[col].nunique()} valeurs uniques")

# Cardinality of each object column of the transformed training data
unique_values_per_object_column(train_transformed)

NAME_CONTRACT_TYPE: 2 valeurs uniques
CODE_GENDER: 2 valeurs uniques
FLAG_OWN_CAR: 2 valeurs uniques
FLAG_OWN_REALTY: 2 valeurs uniques
NAME_TYPE_SUITE: 7 valeurs uniques
NAME_INCOME_TYPE: 8 valeurs uniques
NAME_EDUCATION_TYPE: 5 valeurs uniques
NAME_FAMILY_STATUS: 6 valeurs uniques
NAME_HOUSING_TYPE: 6 valeurs uniques
OCCUPATION_TYPE: 18 valeurs uniques
WEEKDAY_APPR_PROCESS_START: 7 valeurs uniques
ORGANIZATION_TYPE: 58 valeurs uniques
FONDKAPREMONT_MODE: 4 valeurs uniques
HOUSETYPE_MODE: 3 valeurs uniques
WALLSMATERIAL_MODE: 7 valeurs uniques
EMERGENCYSTATE_MODE: 2 valeurs uniques


In [27]:
def missing_data(data):
    """Summarise missing values per column.

    Returns a DataFrame indexed by column name with the number of missing
    values ('Total') and their percentage ('Percent'), sorted in descending order.
    """
    null_flags = data.isnull()
    null_counts = null_flags.sum()
    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / null_flags.count() * 100).sort_values(ascending=False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

In [28]:
# First ten rows of the missing-value summary of the transformed training data
missing_data(train_transformed).head(10)

Unnamed: 0,Total,Percent
SK_ID_CURR,0,0.0
PREV_LAST12M_SIMPLE_INTERESTS_MEAN,0,0.0
PREV_Cash_APPLICATION_CREDIT_RATIO_MEAN,0,0.0
PREV_Cash_DAYS_DECISION_MAX,0,0.0
PREV_Cash_DAYS_LAST_DUE_1ST_VERSION_MAX,0,0.0
PREV_Cash_DAYS_LAST_DUE_1ST_VERSION_MEAN,0,0.0
PREV_Cash_CNT_PAYMENT_MEAN,0,0.0
PREV_LAST12M_AMT_CREDIT_SUM,0,0.0
PREV_LAST12M_AMT_ANNUITY_MEAN,0,0.0
PREV_LAST12M_AMT_ANNUITY_MAX,0,0.0


## Sur le df test

In [29]:
# Load the transformation parameters saved under the name 'param'
pipeline.load('param')

# Transform the hold-out split with the loaded parameters
test_transformed = pipeline.transform(test)

Bureau and bureau_balance data - done in 12s
previous_application - done in 20s
Colonnes supprimées: 40, Colonnes restantes: 734


  diff_b_a = subtract(b, a)


previous applications balances - done in 576s


In [30]:
# Shape of the hold-out split before feature engineering
test.shape

(61503, 122)

In [31]:
# Shape of the hold-out split after feature engineering
test_transformed.shape

(61501, 694)

# Stratégie d'Évaluation et d'Optimisation

## Définition du Score Métier

In [None]:
def cost_function(y_true, y_pred, fp_cost=1, fn_cost=10):
    """Business cost of a set of binary predictions.

    Parameters
    ----------
    y_true : array-like of 0/1 ground-truth labels.
    y_pred : array-like of 0/1 predicted labels, same length as ``y_true``.
    fp_cost : cost of one false positive (predicted 1, actual 0).
    fn_cost : cost of one false negative (predicted 0, actual 1).

    Returns
    -------
    ``fp_cost * FP + fn_cost * FN``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same number of elements.

    Inputs are converted to flat NumPy arrays so that the comparison is always
    element-wise and positional: plain lists would otherwise compare as a whole
    (yielding a cost of 0), and pandas Series with different indexes would be
    aligned by label instead of by position.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.size} and {y_pred.size}"
        )
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    return fp_cost * fp + fn_cost * fn

In [None]:
# Wrap the business cost as a scorer; greater_is_better=False declares it as a loss (lower is better)
custom_scorer = make_scorer(cost_function, greater_is_better=False)

## Stratégie d'évaluation

# Modèles de machine learning

## Gestion du déséquilibre des classes

## Régression logistique

## Random forest

## LightGBM

## CatBoost