In [14]:
import json
import os

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.pipeline import make_pipeline
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Read the training table from its CSV file.
df_reduced = pd.read_csv(filepath_or_buffer="df_data_non_scalées.csv")


# List every column that was loaded.
print(df_reduced.columns)


Index(['CNT_CHILDREN', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'FLAG_WORK_PHONE',
       'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT_W_CITY', 'EXT_SOURCE_1',
       'EXT_SOURCE_2', 'EXT_SOURCE_3', 'FLOORSMAX_MEDI',
       'OBS_30_CNT_SOCIAL_CIRCLE', 'FLAG_DOCUMENT_3', 'PAYMENT_RATE',
       'BURO_AMT_CREDIT_SUM_SUM',
       'PREV_NAME_PAYMENT_TYPE_Cash_through_the_bank_MEAN',
       'PREV_NAME_CLIENT_TYPE_New_MEAN', 'APPROVED_APP_CREDIT_PERC_MIN',
       'APPROVED_RATE_DOWN_PAYMENT_MAX', 'APPROVED_CNT_PAYMENT_MEAN',
       'POS_MONTHS_BALANCE_MAX', 'POS_MONTHS_BALANCE_SIZE',
       'INSTAL_NUM_INSTALMENT_VERSION_NUNIQUE', 'INSTAL_DPD_MAX',
       'INSTAL_AMT_PAYMENT_MIN', 'INSTAL_AMT_PAYMENT_SUM', 'DAYS_ID_PUBLISH',
       'ANNUITY_INCOME_PERC', 'AMT_ANNUITY', 'TARGET'],
      dtype='object')


In [2]:
def check_nan_count(df, message=""):
    """Print the number of missing values found in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        Data to inspect. Anything that is not already a DataFrame is wrapped
        in a temporary DataFrame first, so its columns are labelled by
        position.
    message : str, optional
        Label printed (preceded by a blank line) above the counts.

    Returns
    -------
    None
        The per-column counts are printed, not returned.
    """
    # Normalise the input to a DataFrame so one code path does the counting.
    frame = df if isinstance(df, pd.DataFrame) else pd.DataFrame(df)
    nan_count = frame.isna().sum()
    print(f"\n{message}")
    print(nan_count)

In [3]:
# Per-column missing-value counts of the training frame as loaded (the imputation pipeline runs in a later cell).
check_nan_count(df_reduced)



CNT_CHILDREN                                             0
DAYS_BIRTH                                               0
DAYS_EMPLOYED                                         6940
FLAG_WORK_PHONE                                          0
CNT_FAM_MEMBERS                                          0
REGION_RATING_CLIENT_W_CITY                              0
EXT_SOURCE_1                                         15384
EXT_SOURCE_2                                             7
EXT_SOURCE_3                                          6505
FLOORSMAX_MEDI                                       17510
OBS_30_CNT_SOCIAL_CIRCLE                                19
FLAG_DOCUMENT_3                                          0
PAYMENT_RATE                                            17
BURO_AMT_CREDIT_SUM_SUM                               4827
PREV_NAME_PAYMENT_TYPE_Cash_through_the_bank_MEAN      691
PREV_NAME_CLIENT_TYPE_New_MEAN                         691
APPROVED_APP_CREDIT_PERC_MIN                          

In [4]:
# Split the training frame into features (X_train) and label (Y_train).
# The label is selected by NAME rather than by position: if the CSV column
# order ever changes, this raises a KeyError or keeps working, instead of
# silently using a feature as the target and leaking TARGET into X_train.

X_train = df_reduced.drop(columns="TARGET")  # every column except the label
Y_train = df_reduced["TARGET"]               # label column

In [5]:
# Preview the first rows of the feature frame.
X_train.head()

Unnamed: 0,CNT_CHILDREN,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_WORK_PHONE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT_W_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,FLOORSMAX_MEDI,...,APPROVED_CNT_PAYMENT_MEAN,POS_MONTHS_BALANCE_MAX,POS_MONTHS_BALANCE_SIZE,INSTAL_NUM_INSTALMENT_VERSION_NUNIQUE,INSTAL_DPD_MAX,INSTAL_AMT_PAYMENT_MIN,INSTAL_AMT_PAYMENT_SUM,DAYS_ID_PUBLISH,ANNUITY_INCOME_PERC,AMT_ANNUITY
0,2,-10349,-3608.0,0,3.0,2,0.511762,0.667579,0.15664,0.1667,...,8.5,-10.0,38.0,2.0,7.0,143.91,510546.375,-453,0.078933,10656.0
1,2,-14757,-5751.0,0,4.0,2,0.482605,0.660309,0.450747,0.3333,...,13.0,-1.0,70.0,3.0,10.0,16.695,2924252.775,-4839,0.091291,22594.5
2,0,-16891,-970.0,0,2.0,2,,0.622271,0.733815,,...,15.0,-2.0,32.0,1.0,44.0,1.35,586393.29,-424,0.170906,24610.5
3,1,-18186,-4381.0,0,3.0,2,,0.563785,0.531686,,...,6.0,-51.0,7.0,1.0,0.0,6143.895,36871.875,-1734,0.2305,20745.0
4,0,-9105,-1579.0,0,1.0,3,0.136241,0.268036,0.336062,,...,7.75,-11.0,32.0,3.0,5.0,3920.94,236219.94,-744,0.109894,23242.5


In [6]:
# (number of rows, number of feature columns) of the feature frame.
X_train.shape

(36558, 28)

In [7]:
# Build an ImbPipeline chaining the preprocessing steps and a final SMOTE step.
# The step names are looked up again when the fitted transformers are saved.
preprocessing_pipeline = ImbPipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),  # fill missing values with the median
        ("scaler", StandardScaler()),                   # scale the features
        ("smote", SMOTE(random_state=42)),              # resample with SMOTE (fixed seed)
    ]
)


In [8]:
# verifier le pipeline
# Sanity check: show the pipeline's steps keyed by name.
print(preprocessing_pipeline.named_steps)

{'imputer': SimpleImputer(strategy='median'), 'scaler': StandardScaler(), 'smote': SMOTE(random_state=42)}


In [9]:
# Étape 3 : Prétraiter les données et appliquer SMOTE
# Step 3: fit the pipeline on the training data and get back the preprocessed,
# SMOTE-resampled features together with the matching resampled labels.
X_train_preprocessed, Y_train_resampled = preprocessing_pipeline.fit_resample(X_train, Y_train)

In [10]:
# Vérifier le nombre de NaN après imputation
# Check the NaN count on the pipeline output. The printed label only mentions
# imputation, but this data has gone through the whole pipeline
# (imputer, scaler and SMOTE), not just the imputer.
check_nan_count(X_train_preprocessed, "Après l'imputation")


Après l'imputation
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
dtype: int64


In [16]:
# Charger les hyperparamètres à partir du fichier JSON
with open("best_model_params.json", "r") as json_file:
    best_params = json.load(json_file)

# Afficher les hyperparamètres pour vérification
print(best_params)

{'boosting_type': 'gbdt', 'class_weight': 'balanced', 'colsample_bytree': 0.4, 'importance_type': 'split', 'learning_rate': 0.05, 'max_depth': 15, 'min_child_samples': 500, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 75, 'n_jobs': -1, 'num_leaves': 31, 'objective': 'binary', 'random_state': 76, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 0.3, 'subsample_for_bin': 200000, 'subsample_freq': 0}


In [17]:
# Créer le modèle LGBMClassifier avec les hyperparamètres chargés
lgbm_classifier = LGBMClassifier(**best_params)

# Entraîner le modèle avec les données d'entraînement
lgbm_classifier.fit(X_train_preprocessed, Y_train_resampled)

[LightGBM] [Info] Number of positive: 24557, number of negative: 24557
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001394 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6973
[LightGBM] [Info] Number of data points in the train set: 49114, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [18]:
# Sauvegarder le modèle et le pipeline
joblib.dump(preprocessing_pipeline, "artifacts/preprocessing_pipeline.joblib")
joblib.dump(lgbm_classifier, "artifacts/lgmb_model.joblib")


# Sauvegarder les transformateurs
joblib.dump(preprocessing_pipeline.named_steps['imputer'],"artifacts/imputer.joblib")
joblib.dump(preprocessing_pipeline.named_steps['scaler'],"artifacts/scaler.joblib")


['artifacts/scaler.joblib']