In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.pipeline import make_pipeline 
from lightgbm import LGBMClassifier

import joblib

In [25]:
# Load the persisted preprocessing transformers and the trained model.
# NOTE(review): joblib files are pickle-based, so only load artifacts that
# come from a trusted source.
imputer = joblib.load(filename="artifacts/imputer.joblib")
scaler = joblib.load(filename="artifacts/scaler.joblib")
model = joblib.load(filename="artifacts/lgmb_model.joblib")


In [26]:
# Assemble the production pipeline from the objects loaded above.
# Steps run in the order listed: imputation, then scaling, then the classifier.
production_pipeline = Pipeline(
    [
        ("imputer", imputer),    # same imputer as the one used during training
        ("scaler", scaler),      # same scaler as the one used during training
        ("classifier", model),   # the trained model
    ]
)

In [29]:
# Example production record used to exercise the pipeline.
# It is a single-row DataFrame that deliberately contains missing values
# (EXT_SOURCE_2 and FLAG_DOCUMENT_3 are NaN).
# NOTE(review): many of these values look already standardized, while the
# pipeline applies the scaler again -- confirm the expected input scale.
X_production = pd.DataFrame(
    {
        "CNT_CHILDREN": 4.961729233406375,
        "DAYS_BIRTH": 0.6494141368593394,
        "DAYS_ID_PUBLISH": -0.28208800158267566,
        "FLAG_WORK_PHONE": -0.49899677178931173,
        "CNT_FAM_MEMBERS": 4.2246985025429815,
        "REGION_RATING_CLIENT_W_CITY": -0.0626928526971028,
        "EXT_SOURCE_1": 0.016460129067917988,
        "EXT_SOURCE_2": np.nan,
        "EXT_SOURCE_3": -1.4471805253488148,
        "YEARS_BEGINEXPLUATATION_MEDI": 0.014154247869191033,
        "OBS_30_CNT_SOCIAL_CIRCLE": -0.17585232135929785,
        "DEF_30_CNT_SOCIAL_CIRCLE": -0.32107538979547706,
        "FLAG_DOCUMENT_3": np.nan,
        "PAYMENT_RATE": 0.6042966523494969,
        "BURO_DAYS_CREDIT_MEAN": -1.746064981800085,
        "ACTIVE_DAYS_CREDIT_MAX": -698.0,
        "PREV_APP_CREDIT_PERC_VAR": -0.02082433473905336,
        "PREV_RATE_DOWN_PAYMENT_MIN": -0.5668721219826327,
        "PREV_NAME_YIELD_GROUP_high_MEAN": 1.0907783598714118,
        "APPROVED_HOUR_APPR_PROCESS_START_MAX": 17.0,
        "POS_MONTHS_BALANCE_MAX": 0.6215807000708212,
        "POS_MONTHS_BALANCE_SIZE": 0.5032035038702302,
        "INSTAL_NUM_INSTALMENT_VERSION_NUNIQUE": -0.16220038715397642,
        "INSTAL_DPD_MAX": 0.0845668354102822,
        "INSTAL_AMT_PAYMENT_SUM": -0.232566621486729,
    },
    index=[0],  # scalar values need an explicit index; this yields one row labelled 0
)

In [28]:
# Count the missing values per column in the raw input, before it goes
# through the production pipeline.
# NOTE(review): the output saved with this cell reports 0 NaN for
# FLAG_DOCUMENT_3 although the data cell sets it to NaN, and this cell's
# execution count is lower than the data cell's -- the output looks stale;
# re-run after the data cell.
print("NaN avant imputation :\n", X_production.isna().sum())


NaN avant imputation :
 CNT_CHILDREN                             0
DAYS_BIRTH                               0
DAYS_ID_PUBLISH                          0
FLAG_WORK_PHONE                          0
CNT_FAM_MEMBERS                          0
REGION_RATING_CLIENT_W_CITY              0
EXT_SOURCE_1                             0
EXT_SOURCE_2                             1
EXT_SOURCE_3                             0
YEARS_BEGINEXPLUATATION_MEDI             0
OBS_30_CNT_SOCIAL_CIRCLE                 0
DEF_30_CNT_SOCIAL_CIRCLE                 0
FLAG_DOCUMENT_3                          0
PAYMENT_RATE                             0
BURO_DAYS_CREDIT_MEAN                    0
ACTIVE_DAYS_CREDIT_MAX                   0
PREV_APP_CREDIT_PERC_VAR                 0
PREV_RATE_DOWN_PAYMENT_MIN               0
PREV_NAME_YIELD_GROUP_high_MEAN          0
APPROVED_HOUR_APPR_PROCESS_START_MAX     0
POS_MONTHS_BALANCE_MAX                   0
POS_MONTHS_BALANCE_SIZE                  0
INSTAL_NUM_INSTALMENT_VERSION_

In [16]:
# Sanity check of the input dimensions as (rows, columns).
X_production.shape

(1, 25)

In [23]:
# Run the production pipeline on the sample. `predict` pushes the data through
# the imputer and scaler steps and then calls the classifier's `predict`.
# NOTE(review): despite its name, X_production_processed holds the model's
# predictions, not the transformed feature matrix; consider renaming it.
X_production_processed = production_pipeline.predict(X_production)  # predictions, not transformed data

In [24]:
# Persist the assembled production pipeline so it can be reloaded as a single
# object. joblib.dump returns the list of file names it wrote.
joblib.dump(production_pipeline,"artifacts/production_pipeline.joblib")

['artifacts/production_pipeline.joblib']