In [7]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.pipeline import make_pipeline
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the training data from its CSV export.
TRAINING_CSV = "df_data_non_scalées.csv"
df_reduced = pd.read_csv(TRAINING_CSV)

# Print the full list of column labels.
print(df_reduced.columns)


Index(['CNT_CHILDREN', 'DAYS_BIRTH', 'DAYS_ID_PUBLISH', 'FLAG_WORK_PHONE',
       'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT_W_CITY', 'EXT_SOURCE_1',
       'EXT_SOURCE_2', 'EXT_SOURCE_3', 'YEARS_BEGINEXPLUATATION_MEDI',
       'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
       'FLAG_DOCUMENT_3', 'PAYMENT_RATE', 'BURO_DAYS_CREDIT_MEAN',
       'ACTIVE_DAYS_CREDIT_MAX', 'PREV_APP_CREDIT_PERC_VAR',
       'PREV_RATE_DOWN_PAYMENT_MIN', 'PREV_NAME_YIELD_GROUP_high_MEAN',
       'APPROVED_HOUR_APPR_PROCESS_START_MAX', 'POS_MONTHS_BALANCE_MAX',
       'POS_MONTHS_BALANCE_SIZE', 'INSTAL_NUM_INSTALMENT_VERSION_NUNIQUE',
       'INSTAL_DPD_MAX', 'INSTAL_AMT_PAYMENT_SUM', 'TARGET'],
      dtype='object')


In [8]:
def check_nan_count(df, message=""):
    """Print the number of missing values found in each column.

    Args:
        df: A pandas DataFrame, or any object accepted by the
            ``pd.DataFrame`` constructor (e.g. a 2-D ndarray), which is
            wrapped in a temporary DataFrame for counting.
        message: Optional heading printed (after a blank line) above the counts.

    Returns:
        None. The per-column counts are written to standard output.
    """
    # Normalise the input so the counting below has a single code path.
    frame = df if isinstance(df, pd.DataFrame) else pd.DataFrame(df)
    missing_per_column = frame.isna().sum()
    print(f"\n{message}", missing_per_column, sep="\n")

In [9]:
# Report the number of missing values in each column of the raw training frame.
check_nan_count(df_reduced)



CNT_CHILDREN                                 0
DAYS_BIRTH                                   0
DAYS_ID_PUBLISH                              0
FLAG_WORK_PHONE                              0
CNT_FAM_MEMBERS                              0
REGION_RATING_CLIENT_W_CITY                  0
EXT_SOURCE_1                             12991
EXT_SOURCE_2                                48
EXT_SOURCE_3                              4584
YEARS_BEGINEXPLUATATION_MEDI             11176
OBS_30_CNT_SOCIAL_CIRCLE                    76
DEF_30_CNT_SOCIAL_CIRCLE                    76
FLAG_DOCUMENT_3                              0
PAYMENT_RATE                                 1
BURO_DAYS_CREDIT_MEAN                     3347
ACTIVE_DAYS_CREDIT_MAX                    6853
PREV_APP_CREDIT_PERC_VAR                  1259
PREV_RATE_DOWN_PAYMENT_MIN                1259
PREV_NAME_YIELD_GROUP_high_MEAN           1259
APPROVED_HOUR_APPR_PROCESS_START_MAX      1336
POS_MONTHS_BALANCE_MAX                    1349
POS_MONTHS_

In [10]:
# Split the frame into features and label by column name rather than by
# position, so that a reordered CSV cannot silently swap the label for a
# feature (a missing label column now fails loudly with a KeyError).
TARGET_COLUMN = "TARGET"

X_train = df_reduced.drop(columns=[TARGET_COLUMN])  # every column except the label
Y_train = df_reduced[TARGET_COLUMN]  # label to predict

In [11]:
# Preview the first rows of the feature matrix.
X_train.head()

Unnamed: 0,CNT_CHILDREN,DAYS_BIRTH,DAYS_ID_PUBLISH,FLAG_WORK_PHONE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT_W_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,YEARS_BEGINEXPLUATATION_MEDI,...,ACTIVE_DAYS_CREDIT_MAX,PREV_APP_CREDIT_PERC_VAR,PREV_RATE_DOWN_PAYMENT_MIN,PREV_NAME_YIELD_GROUP_high_MEAN,APPROVED_HOUR_APPR_PROCESS_START_MAX,POS_MONTHS_BALANCE_MAX,POS_MONTHS_BALANCE_SIZE,INSTAL_NUM_INSTALMENT_VERSION_NUNIQUE,INSTAL_DPD_MAX,INSTAL_AMT_PAYMENT_SUM
0,4,-13203,-3420,0,6.0,2,,0.034773,0.228883,0.9786,...,-698.0,0.065379,0.0,0.55,17.0,-1.0,42.0,2.0,27.0,473998.815
1,0,-15141,-4071,0,2.0,2,,0.26652,0.633032,,...,,0.02808,0.0,0.6,15.0,-1.0,60.0,2.0,251.0,341375.085
2,0,-9123,-1763,0,1.0,2,,0.228621,0.517297,0.9712,...,-87.0,0.071709,0.015412,0.0,15.0,-12.0,6.0,2.0,0.0,357790.185
3,1,-16652,-197,0,2.0,2,0.652153,0.614909,0.432962,,...,-81.0,0.001583,0.0,0.1,15.0,-1.0,39.0,1.0,0.0,233086.725
4,0,-16818,-370,1,2.0,2,0.668817,0.630694,0.189595,,...,-276.0,0.11635,0.046677,0.0,14.0,-7.0,22.0,13.0,15.0,605062.89


In [12]:
# Dimensions of the feature matrix as (rows, columns).
X_train.shape

(23063, 25)

In [13]:
# Preprocessing and resampling chain, assembled as an imbalanced-learn
# pipeline so that the SMOTE sampler can sit alongside the transformers.
preprocessing_steps = [
    ("imputer", SimpleImputer(strategy="median")),  # fill missing values with the column median
    ("scaler", StandardScaler()),                   # standardise the features
    ("smote", SMOTE(random_state=42)),              # oversample with SMOTE (seeded)
]
preprocessing_pipeline = ImbPipeline(steps=preprocessing_steps)


In [14]:
# Inspect the pipeline: print the mapping of step name to estimator.
print(preprocessing_pipeline.named_steps)

{'imputer': SimpleImputer(strategy='median'), 'scaler': StandardScaler(), 'smote': SMOTE(random_state=42)}


In [15]:
# Step 3: fit the preprocessing steps (imputer, scaler) on the training data
# and apply SMOTE; the call returns the transformed features together with the
# resampled target.
X_train_preprocessed, Y_train_resampled = preprocessing_pipeline.fit_resample(X_train, Y_train)

In [16]:
# Confirm that no missing values remain after imputation.
check_nan_count(X_train_preprocessed, "Après l'imputation")


Après l'imputation
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
dtype: int64


In [17]:
# Load the tuned hyperparameters from their CSV export (first row = one
# parameter set, one column per parameter).
best_params_df = pd.read_csv("best_model_params.csv")

# Convert the row into a plain dict of keyword arguments.
# A CSV round-trip turns any blank cell (e.g. a parameter whose value was None)
# into NaN and may yield numpy scalars instead of native Python values, so:
#   - NaN entries are dropped, letting the estimator fall back to its default;
#   - numpy scalars are unwrapped with .item() into int / float / bool.
best_params = {
    name: value.item() if isinstance(value, np.generic) else value
    for name, value in best_params_df.iloc[0].items()
    if not pd.isna(value)
}

# Print the hyperparameters for a visual check.
print(best_params)

{'boosting_type': 'gbdt', 'class_weight': 'balanced', 'colsample_bytree': 0.5, 'importance_type': 'split', 'learning_rate': 0.5, 'max_depth': -1, 'min_child_samples': 300, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 200, 'n_jobs': -1, 'num_leaves': 31, 'objective': 'binary', 'random_state': 76, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 0.4, 'subsample_for_bin': 200000, 'subsample_freq': 0}


In [18]:
# Build the LGBMClassifier from the loaded hyperparameters.
# NOTE(review): the loaded parameters include class_weight='balanced' while the
# training data has already been rebalanced by SMOTE — confirm both are intended.
lgbm_classifier = LGBMClassifier(**best_params)

# Train the model on the preprocessed, SMOTE-resampled training data.
lgbm_classifier.fit(X_train_preprocessed, Y_train_resampled)

[LightGBM] [Info] Number of positive: 21203, number of negative: 21203
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002344 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6361
[LightGBM] [Info] Number of data points in the train set: 42406, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [19]:
# Make sure the output directory exists: joblib.dump does not create missing
# parent directories, so a fresh checkout would otherwise fail here.
Path("artifacts").mkdir(parents=True, exist_ok=True)

# Save the fitted preprocessing pipeline and the trained model.
# NOTE(review): the saved pipeline ends with the SMOTE sampler; presumably
# inference code relies on the standalone imputer/scaler saved below — confirm.
# NOTE(review): "lgmb_model.joblib" looks like a typo for "lgbm"; the name is
# kept unchanged because other code may load the file under this exact path.
joblib.dump(preprocessing_pipeline, "artifacts/preprocessing_pipeline.joblib")
joblib.dump(lgbm_classifier, "artifacts/lgmb_model.joblib")


# Also save the fitted transformers (imputer and scaler) on their own.
joblib.dump(preprocessing_pipeline.named_steps['imputer'],"artifacts/imputer.joblib")
joblib.dump(preprocessing_pipeline.named_steps['scaler'],"artifacts/scaler.joblib")


['artifacts/scaler.joblib']