In [1]:
import numpy as np
import pandas as pd
import gc
import time
import re
from contextlib import contextmanager
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import lightgbm

from sklearn import pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, fbeta_score, make_scorer
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMClassifier, early_stopping

import mlflow
from mlflow.models import infer_signature

from bayes_opt import BayesianOptimization

num_rows = None
nan_as_category = False

# Read data and merge
train_df = pd.read_csv('Input_data/application_train.csv', nrows= num_rows)
test_df = pd.read_csv('Input_data/application_test.csv', nrows= num_rows)
print("Train samples: {}, test samples: {}".format(len(train_df), len(test_df)))
df = pd.concat([train_df, test_df]).reset_index(drop=True)
# Optional: Remove 4 applications with XNA CODE_GENDER (train set)
df = df[df['CODE_GENDER'] != 'XNA']

# Categorical features with Binary encode (0 or 1; two categories)
for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
    df[bin_feature], uniques = pd.factorize(df[bin_feature])

# Categorical features with One-Hot encode
original_columns = list(df.columns)
categorical_columns = [col for col in df.columns if df[col].dtype == 'object']

# instancie objet OHE
onehot_encoder = OneHotEncoder(sparse_output=False)

# Utilise OHE
df_new_cols = onehot_encoder.fit_transform(df[categorical_columns])

# Créez un DataFrame avec les nouvelles colonnes encodées
encoded_df = pd.DataFrame(df_new_cols, columns=onehot_encoder.get_feature_names_out(categorical_columns))

# Supprimer les colonnes catégorielles d'origine du DataFrame initial
df = df.drop(categorical_columns, axis=1)

# Concaténez le DataFrame d'origine avec le DataFrame contenant les nouvelles colonnes encodées
df = pd.concat([df, encoded_df], axis=1)

# NaN values for DAYS_EMPLOYED: 365.243 -> nan
df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)
# Some simple new features (percentages)
df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']

# Divide in training/validation and test data
train_df = df[df['TARGET'].notnull()]
test_df = df[df['TARGET'].isnull()]

# Séparation de la variable cible et des features
y = train_df["TARGET"]
X = train_df.drop("TARGET", axis=1)

# Diviser les données en ensembles d'entraînement et de validation
x_train_strat, x_valid_strat, y_train_strat, y_valid_strat = train_test_split(X, y, test_size=0.15, stratify=y, random_state=88)

# Prétraitement : suppression des caractères spéciaux des noms de colonnes
x_train_strat = x_train_strat.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))
x_valid_strat = x_valid_strat.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

# Fonction d'évaluation à optimiser
def lgb_evaluate(n_estimators, num_leaves, colsample_bytree, subsample, max_depth, reg_alpha, reg_lambda, min_split_gain):
    # Définir le modèle avec les hyperparamètres à optimiser
    clf = LGBMClassifier(
        n_estimators=int(n_estimators),
        num_leaves=int(num_leaves),
        colsample_bytree=max(min(colsample_bytree, 1), 0),
        subsample=max(min(subsample, 1), 0),
        max_depth=int(max_depth),
        reg_alpha=max(reg_alpha, 0),
        reg_lambda=max(reg_lambda, 0),
        min_split_gain=min_split_gain
    )

    # Entraîner le modèle sur l'ensemble d'entraînement
    clf.fit(x_train_strat, y_train_strat, eval_set=[(x_valid_strat, y_valid_strat)], eval_metric='auc', callbacks=[early_stopping(stopping_rounds=10)])
    
    # Prédire les probabilités sur le jeu de validation
    y_proba = clf.predict_proba(x_valid_strat)[:, 1]
    y_pred = (y_proba >= 0.081).astype(int)

    roc_auc = roc_auc_score(y_valid_strat, y_proba)

    return roc_auc

# Définir les limites de recherche pour chaque hyperparamètre
param_V1 = {
            'n_estimators': (100, 1000),              # Nombre d'estimations entre 100 et 1000
            'num_leaves': (10, 100),                  # Nombre de feuilles entre 10 et 100
            'colsample_bytree': (0.5, 1),             # Pourcentage d'échantillons par arbre
            'subsample': (0.5, 1),                    # Pourcentage d'échantillons dans chaque arbre
            'max_depth': (5, 20),                     # Profondeur maximale de l'arbre
            'reg_alpha': (0, 1),                      # Régularisation L1
            'reg_lambda': (0, 1),                     # Régularisation L2
            'min_split_gain': (0, 0.1)                # Gain minimal pour diviser un nœud
            }

# Instancier l'optimiseur bayésien
optimizer = BayesianOptimization(f=lgb_evaluate, pbounds=param_V1, random_state=88)

# Lancer l'optimisation bayésienne
optimizer.maximize(init_points=5, n_iter=25)

# Enregistrer les meilleurs hyperparamètres obtenus dans MLflow
best_params = optimizer.max['params']

# Hyperparamètres optimisés
param_V2 = {
            'n_estimators': int(optimizer.max['params']['n_estimators']),
            'num_leaves': int(optimizer.max['params']['num_leaves']),
            'colsample_bytree': optimizer.max['params']['colsample_bytree'],
            'subsample': optimizer.max['params']['subsample'],
            'max_depth': int(optimizer.max['params']['max_depth']),
            'reg_alpha': optimizer.max['params']['reg_alpha'],
            'reg_lambda': optimizer.max['params']['reg_lambda'],
            'min_split_gain': optimizer.max['params']['min_split_gain']
            }

# Créer le modèle avec ces paramètres
clf = lightgbm.LGBMClassifier(**param_V2)

# Créer le pipeline avec le modèle défini
pipeline = pipeline.Pipeline([('Classifier', clf)])

pipeline.fit(x_train_strat, y_train_strat)

joblib.dump(pipeline, 'pipeline_scoring.joblib')

signature = infer_signature(x_train_strat, y_train_strat)

mlflow.sklearn.save_model(pipeline, 'mlflow_model', signature=signature)

Train samples: 307511, test samples: 48744
|   iter    |  target   | colsam... | max_depth | min_sp... | n_esti... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------
[LightGBM] [Info] Number of positive: 21101, number of negative: 240279
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039328 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12979
[LightGBM] [Info] Number of data points in the train set: 261380, number of used features: 243
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432480
[LightGBM] [Info] Start training from score -2.432480
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[90]	valid_0's auc: 0.762934	valid_0's binary_logloss: 



Early stopping, best iteration is:
[167]	valid_0's auc: 0.762914	valid_0's binary_logloss: 0.244643
| [0m6        [0m | [0m0.7629   [0m | [0m0.5631   [0m | [0m5.926    [0m | [0m0.01662  [0m | [0m988.5    [0m | [0m90.93    [0m | [0m0.9064   [0m | [0m0.03397  [0m | [0m0.624    [0m |
[LightGBM] [Info] Number of positive: 21101, number of negative: 240279
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047300 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12979
[LightGBM] [Info] Number of data points in the train set: 261380, number of used features: 243
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432480
[LightGBM] [Info] Start training from score -2.432480
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[72]	valid_0's auc: 0.761338	valid_0's b

Early stopping, best iteration is:
[133]	valid_0's auc: 0.761751	valid_0's binary_logloss: 0.244857
| [0m10       [0m | [0m0.7618   [0m | [0m0.7832   [0m | [0m6.437    [0m | [0m0.009297 [0m | [0m156.8    [0m | [0m51.12    [0m | [0m0.2362   [0m | [0m0.8646   [0m | [0m0.9836   [0m |
[LightGBM] [Info] Number of positive: 21101, number of negative: 240279
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.060720 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12979
[LightGBM] [Info] Number of data points in the train set: 261380, number of used features: 243
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432480
[LightGBM] [Info] Start training from score -2.432480
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[130]	valid_0's auc: 0.763371	valid_0's 

[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432480
[LightGBM] [Info] Start training from score -2.432480
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[90]	valid_0's auc: 0.76364	valid_0's binary_logloss: 0.244582
| [0m17       [0m | [0m0.7636   [0m | [0m0.6797   [0m | [0m9.574    [0m | [0m0.05011  [0m | [0m536.4    [0m | [0m71.75    [0m | [0m0.034    [0m | [0m0.7631   [0m | [0m0.673    [0m |
[LightGBM] [Info] Number of positive: 21101, number of negative: 240279
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036238 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12979
[LightGBM] [Info] Number of data points in the train set: 261380, number of used features: 243
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.4324

Early stopping, best iteration is:
[154]	valid_0's auc: 0.762744	valid_0's binary_logloss: 0.244606
| [0m21       [0m | [0m0.7627   [0m | [0m0.7004   [0m | [0m5.518    [0m | [0m0.08999  [0m | [0m172.2    [0m | [0m92.78    [0m | [0m0.9572   [0m | [0m0.1628   [0m | [0m0.7899   [0m |
[LightGBM] [Info] Number of positive: 21101, number of negative: 240279
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047219 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12979
[LightGBM] [Info] Number of data points in the train set: 261380, number of used features: 243


[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432480
[LightGBM] [Info] Start training from score -2.432480
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[91]	valid_0's auc: 0.763001	valid_0's binary_logloss: 0.244723
| [0m22       [0m | [0m0.763    [0m | [0m0.8982   [0m | [0m11.38    [0m | [0m0.007951 [0m | [0m856.3    [0m | [0m40.87    [0m | [0m0.3116   [0m | [0m0.3673   [0m | [0m0.7371   [0m |
[LightGBM] [Info] Number of positive: 21101, number of negative: 240279
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040362 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12979
[LightGBM] [Info] Number of data points in the train set: 261380, number of used features: 243
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432



MlflowException: Path 'mlflow_model' already exists and is not empty