## Importation des librairies :

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import mlflow
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
import joblib
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV

### Importation du fichier : 

In [2]:
# Chargement des données
chemin_dossier = "C:/Users/paulm/Documents/Projet 7/Projet7withCSV/data/" 
df = pd.read_csv(os.path.join(chemin_dossier, 'X_predictionV1.csv'))

#### Premier modèle : 

In [20]:
# Adresse de MLflow
mlflow_url = "http://127.0.0.1:5000"

# Coût d'un faux positif et d'un faux négatif
cost_fp = 1  
cost_fn = 10  

# Conserver la colonne d'identifiant unique
id_column = 'SK_ID_CURR'
df_id = test[[id_column]]  # Conserver les identifiants

# Séparation des caractéristiques (X) et de la cible (y)
X = test.drop(columns=[id_column, 'TARGET'])  
y = test['TARGET']  

# Séparation des données en ensemble d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Création du modèle :
class_weight = (len(y_train) - sum(y_train)) / sum(y_train)
model = XGBClassifier(scale_pos_weight=class_weight, random_state=42)
params = {'n_estimators': [50, 100], 'max_depth': [3, 6, 9]}  

# Configurer MLflow
mlflow.set_tracking_uri(mlflow_url)
mlflow.set_experiment("Credit Scoring Experiment")

# Échantillonner les données
sample_size = 0.1 
X_train_sample, _, y_train_sample, _ = train_test_split(X_train, y_train, test_size=(1 - sample_size), random_state=42)
X_test_sample, _, y_test_sample, _ = train_test_split(X_test, y_test, test_size=(1 - sample_size), random_state=42)

# Entraînement du modèle XGBoost et obtention des meilleurs paramètres
with mlflow.start_run(run_name="XGBoost"):
    print("Entraînement du modèle XGBoost...")

    grid_search = GridSearchCV(estimator=model, param_grid=params, scoring='f1', cv=3)
    grid_search.fit(X_train_sample, y_train_sample)

    # Loguer les paramètres dans MLflow
    mlflow.log_params(grid_search.best_params_)

    # Affichage des meilleurs paramètres et le score F1
    print(f"Meilleurs paramètres pour XGBoost: {grid_search.best_params_}")
    print(f"Score F1 moyen sur le jeu de validation: {grid_search.best_score_:.3f}")

    # Utilisation du meilleur modèle pour les prédictions
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test_sample)

    # Évaluation du modèle avec le meilleur seuil pour minimiser le coût métier
    y_prob = best_model.predict_proba(X_test_sample)[:, 1]
    thresholds = np.linspace(0, 1, 100)
    costs = []
    for threshold in thresholds:
        y_pred_thresholded = (y_prob > threshold).astype(int)
        fp = np.sum((y_pred_thresholded == 1) & (y_test_sample == 0)) * cost_fp
        fn = np.sum((y_pred_thresholded == 0) & (y_test_sample == 1)) * cost_fn
        total_cost = fp + fn
        costs.append(total_cost)

    best_threshold = thresholds[np.argmin(costs)]
    print(f"Meilleur seuil pour minimiser le coût métier avec XGBoost: {best_threshold}")

    y_pred_best_threshold = (y_prob > best_threshold).astype(int)

    # Affichage du rapport de classification et de la matrice de confusion
    classification_report_str = classification_report(y_test_sample, y_pred_best_threshold)
    confusion_matrix_str = confusion_matrix(y_test_sample, y_pred_best_threshold)

    print(f"Rapport de classification avec le meilleur seuil pour XGBoost:")
    print(classification_report_str)

    print(f"Matrice de confusion avec le meilleur seuil pour XGBoost:")
    print(confusion_matrix_str)
    print()

    # Loguer les métriques et les résultats dans MLflow
    mlflow.log_metric("best_score_f1", grid_search.best_score_)
    mlflow.log_metric("best_threshold", best_threshold)
    mlflow.log_text(classification_report_str, "classification_report.txt")
    mlflow.log_text(str(confusion_matrix_str), "confusion_matrix.txt")

    # Loguer le modèle
    model_path = 'model/xgboost_model.pkl'
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    mlflow.sklearn.log_model(best_model, "model")

    # Sauvegarder le modèle dans un fichier .pkl
    joblib.dump(best_model, model_path)

Entraînement du modèle XGBoost...
Meilleurs paramètres pour XGBoost: {'max_depth': 3, 'n_estimators': 50}
Score F1 moyen sur le jeu de validation: 0.121
Meilleur seuil pour minimiser le coût métier avec XGBoost: 0.6060606060606061
Rapport de classification avec le meilleur seuil pour XGBoost:
              precision    recall  f1-score   support

           0       0.92      0.96      0.94      5642
           1       0.10      0.04      0.06       507

    accuracy                           0.89      6149
   macro avg       0.51      0.50      0.50      6149
weighted avg       0.85      0.89      0.87      6149

Matrice de confusion avec le meilleur seuil pour XGBoost:
[[5439  203]
 [ 485   22]]



In [18]:
# Chargement des données
chemin_dossier_test = "C:/Users/paulm/Documents/Projet 7/Projet7withCSV/script/" 
test = pd.read_csv(os.path.join(chemin_dossier, 'X_predictionV1.csv'))

In [11]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307492 entries, 0 to 307491
Columns: 426 entries, SK_ID_CURR to TARGET
dtypes: float64(424), int64(2)
memory usage: 999.4 MB


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307492 entries, 0 to 307491
Columns: 426 entries, SK_ID_CURR to TARGET
dtypes: float64(424), int64(2)
memory usage: 999.4 MB


In [13]:
df.describe()

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_NaN,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_NaN,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,TARGET
count,307492.0,307492.0,307492.0,307492.0,307492.0,307492.0,307492.0,307492.0,307492.0,307492.0,...,307492.0,307492.0,307492.0,307492.0,307492.0,307492.0,307492.0,307492.0,307492.0,307492.0
mean,278169.515916,2.278108e-10,-6.916689e-10,-1.715779e-09,-6.006369e-11,-2.892853e-09,-2.737011e-09,-7.488488e-11,3.118813e-09,1.101622e-12,...,-4.163562e-09,1.255545e-08,2.968939e-09,1.603359e-08,-2.311491e-08,-1.44329e-09,-1.413656e-08,3.084308e-09,2.808476e-09,0.080731
std,102783.820231,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,...,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,0.272421
min,100002.0,-0.5775304,-0.6036714,-1.37647,-1.758958,-1.348269,-1.487793,-2.106351,-0.5784959,-5.587935,...,-0.07628357,-1.016924,-0.07288873,-0.5229786,-0.5167911,-0.1332191,-0.9492149,-1.037639,-0.08734234,0.0
25%,189139.75,-0.5775304,-0.2374159,-0.817457,-0.7302934,-0.812094,-0.7853317,-0.8352686,-0.4712448,-0.7078923,...,-0.07628357,-1.016924,-0.07288873,-0.5229786,-0.5167911,-0.1332191,-0.9492149,-1.037639,-0.08734234,0.0
50%,278192.5,-0.5775304,-0.09129329,-0.2124032,-0.1521547,-0.2393616,-0.1459154,0.0657403,-0.4603016,0.1368615,...,-0.07628357,0.9833576,-0.07288873,-0.5229786,-0.5167911,-0.1332191,-0.9492149,0.9637264,-0.08734234,0.0
75%,367125.25,0.8072851,0.1421234,0.5208211,0.5166481,0.3821141,0.5635589,0.8304063,-0.4537542,0.844797,...,-0.07628357,0.9833576,-0.07288873,-0.5229786,-0.5167911,-0.1332191,1.053502,0.9637264,-0.08734234,0.0
max,456235.0,25.73397,492.6894,8.573968,15.93296,9.509276,3.733528,1.95873,2.133608,1.415346,...,13.10898,0.9833576,13.71954,1.912124,1.935018,7.506432,1.053502,0.9637264,11.4492,1.0


In [19]:
test.describe()

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_NaN,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_NaN,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,TARGET
count,307492.0,307492.0,307492.0,307492.0,307492.0,307492.0,307492.0,307492.0,307492.0,307492.0,...,307492.0,307492.0,307492.0,307492.0,307492.0,307492.0,307492.0,307492.0,307492.0,307492.0
mean,278169.515916,2.278108e-10,-6.916689e-10,-1.715779e-09,-6.006369e-11,-2.892853e-09,-2.737011e-09,-7.488488e-11,3.118813e-09,1.101622e-12,...,-4.163562e-09,1.255545e-08,2.968939e-09,1.603359e-08,-2.311491e-08,-1.44329e-09,-1.413656e-08,3.084308e-09,2.808476e-09,0.080731
std,102783.820231,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,...,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,0.272421
min,100002.0,-0.5775304,-0.6036714,-1.37647,-1.758958,-1.348269,-1.487793,-2.106351,-0.5784959,-5.587935,...,-0.07628357,-1.016924,-0.07288873,-0.5229786,-0.5167911,-0.1332191,-0.9492149,-1.037639,-0.08734234,0.0
25%,189139.75,-0.5775304,-0.2374159,-0.817457,-0.7302934,-0.812094,-0.7853317,-0.8352686,-0.4712448,-0.7078923,...,-0.07628357,-1.016924,-0.07288873,-0.5229786,-0.5167911,-0.1332191,-0.9492149,-1.037639,-0.08734234,0.0
50%,278192.5,-0.5775304,-0.09129329,-0.2124032,-0.1521547,-0.2393616,-0.1459154,0.0657403,-0.4603016,0.1368615,...,-0.07628357,0.9833576,-0.07288873,-0.5229786,-0.5167911,-0.1332191,-0.9492149,0.9637264,-0.08734234,0.0
75%,367125.25,0.8072851,0.1421234,0.5208211,0.5166481,0.3821141,0.5635589,0.8304063,-0.4537542,0.844797,...,-0.07628357,0.9833576,-0.07288873,-0.5229786,-0.5167911,-0.1332191,1.053502,0.9637264,-0.08734234,0.0
max,456235.0,25.73397,492.6894,8.573968,15.93296,9.509276,3.733528,1.95873,2.133608,1.415346,...,13.10898,0.9833576,13.71954,1.912124,1.935018,7.506432,1.053502,0.9637264,11.4492,1.0
