In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.pipeline import make_pipeline 
from lightgbm import LGBMClassifier

import joblib
import json

import os


In [2]:
# Change the current working directory to the project's data folder.
# NOTE(review): this is an absolute, machine-specific path and a process-wide side effect;
# the relative file name below only resolves because of this chdir.
os.chdir(r"C:\Users\Vincent\Open Classroom\Projet7_GIT\data")



# File name of the training CSV, relative to the working directory set above
file_path = "df_data_train_non_scalées.csv"


# Load the CSV file into a DataFrame
df_import_train = pd.read_csv(file_path)

print("Fichier chargé avec succès.")



Fichier chargé avec succès.


In [3]:
# Display the (rows, columns) dimensions of the loaded training frame
df_import_train.shape

(34594, 29)

In [4]:
# Split positionally: X_train gets every column except the last, Y_train gets the last column (the target).
# NOTE(review): the first feature column is displayed as "Unnamed: 0" by X_train.head(); that looks like
# a row index written out with the CSV — confirm whether it is really meant to be a model input.

X_train = df_import_train.iloc[:, :-1]  # All columns except the last one
Y_train = df_import_train.iloc[:, -1]   # Last column

In [5]:
# Preview the first rows of the feature frame
X_train.head()

Unnamed: 0,CNT_CHILDREN,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_WORK_PHONE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT_W_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,FLOORSMAX_MEDI,...,APPROVED_CNT_PAYMENT_MEAN,POS_MONTHS_BALANCE_MAX,POS_MONTHS_BALANCE_SIZE,INSTAL_NUM_INSTALMENT_VERSION_NUNIQUE,INSTAL_DPD_MAX,INSTAL_AMT_PAYMENT_MIN,INSTAL_AMT_PAYMENT_SUM,DAYS_ID_PUBLISH,ANNUITY_INCOME_PERC,AMT_ANNUITY
0,1,-10679,-335.0,1,2.0,2,,0.468021,0.365617,0.1667,...,11.333333,-3.0,34.0,2.0,3.0,5.985,180160.965,-1553,0.066747,4747.5
1,0,-14124,-231.0,1,2.0,3,0.644363,0.449388,0.647977,,...,17.4,-5.0,26.0,10.0,12.0,354.78,2576693.07,-4776,0.159491,39474.0
2,2,-9953,-2769.0,1,4.0,2,,0.332827,0.185202,0.1667,...,10.0,-25.0,48.0,3.0,0.0,76.455,601048.305,-2581,0.179744,34780.5
3,0,-19004,-1533.0,0,2.0,2,,0.314252,0.758393,0.1667,...,12.0,-2.0,23.0,2.0,0.0,21136.59,554475.96,-2528,0.195429,30780.0
4,0,-17650,-1462.0,0,2.0,3,,0.559628,0.595456,,...,10.666667,-8.0,34.0,2.0,0.0,6144.03,249639.03,-1181,0.448,40320.0


In [6]:
# Display the (rows, columns) dimensions of the feature frame
X_train.shape

(34594, 28)

In [7]:
# Build an ImbPipeline chaining the preprocessing steps and SMOTE, in this order:
#   imputer -> fill missing values with the median strategy
#   scaler  -> scale the data
#   smote   -> resample with SMOTE (seeded for repeatability)
preprocessing_pipeline = ImbPipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("smote", SMOTE(random_state=42)),
    ]
)


In [8]:
# Check the pipeline: print its steps keyed by step name
print(preprocessing_pipeline.named_steps)

{'imputer': SimpleImputer(strategy='median'), 'scaler': StandardScaler(), 'smote': SMOTE(random_state=42)}


In [9]:
# Step 3: preprocess the data and apply SMOTE.
# fit_resample returns both the features and the target because the resampling step changes the number of rows.
X_train_preprocessed, Y_train_resampled = preprocessing_pipeline.fit_resample(X_train, Y_train)

In [11]:
# Helper that reports the number of NaN values per column
def check_nan_count(df, message=""):
    """Print ``message`` followed by the NaN count of every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        Data to inspect. Anything that is not already a DataFrame (for
        example a NumPy array) is wrapped in a temporary DataFrame first.
    message : str, optional
        Heading printed, after a blank line, above the counts.

    Returns
    -------
    None
        The counts are printed, not returned.
    """
    frame = df if isinstance(df, pd.DataFrame) else pd.DataFrame(df)
    per_column_nans = frame.isna().sum()
    # One print call with a newline separator emits the same text as two consecutive prints.
    print(f"\n{message}", per_column_nans, sep="\n")

In [12]:
# Check the NaN count after imputation.
# X_train_preprocessed is the output of the whole pipeline (imputer, scaler, smote), not of the imputer alone.
check_nan_count(X_train_preprocessed, "Après l'imputation")


Après l'imputation
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
dtype: int64


In [13]:
# Change the current working directory to the folder holding the saved model parameters
os.chdir(r"C:\Users\Vincent\Open Classroom\Projet7_GIT\Models")


# Load the hyperparameters from the JSON file.
# The encoding is passed explicitly: without it open() falls back to the platform's locale
# encoding, whereas JSON files are UTF-8 by convention.
with open("best_model_params.json", "r", encoding="utf-8") as json_file:
    best_params = json.load(json_file)

# Print the hyperparameters for verification
print(best_params)

{'boosting_type': 'gbdt', 'class_weight': 'balanced', 'colsample_bytree': 0.6, 'importance_type': 'split', 'learning_rate': 0.05, 'max_depth': 15, 'min_child_samples': 500, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 80, 'n_jobs': -1, 'num_leaves': 31, 'objective': 'binary', 'random_state': 76, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 0.3, 'subsample_for_bin': 200000, 'subsample_freq': 0}


In [14]:
# Create the LGBMClassifier from the loaded hyperparameters.
# NOTE(review): best_params includes class_weight='balanced', but the training data has already been
# resampled by the SMOTE step, so the weighting presumably has no effect here — confirm it is intended.
lgbm_classifier = LGBMClassifier(**best_params)

# Train the model on the preprocessed, resampled training data
lgbm_classifier.fit(X_train_preprocessed, Y_train_resampled)

[LightGBM] [Info] Number of positive: 31781, number of negative: 31781
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003571 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7140
[LightGBM] [Info] Number of data points in the train set: 63562, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [15]:
# Change the current working directory to the artifacts folder so the dumps below are written there
os.chdir(r"C:\Users\Vincent\Open Classroom\Projet7_GIT\artifacts")


# Save the model and the pipeline.
# NOTE(review): the dumped pipeline still contains the 'smote' step; presumably inference code relies on
# the separately saved imputer and scaler instead — confirm how these files are consumed.
joblib.dump(preprocessing_pipeline, "preprocessing_pipeline.joblib")
# NOTE(review): "lgmb" looks like a transposition of "lgbm"; left unchanged because any loader must use the same name.
joblib.dump(lgbm_classifier, "lgmb_model.joblib")


# Save the individual transformers taken from the pipeline that fit_resample was called on
joblib.dump(preprocessing_pipeline.named_steps['imputer'],"imputer.joblib")
joblib.dump(preprocessing_pipeline.named_steps['scaler'],"scaler.joblib")


['scaler.joblib']