In [18]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, mean_squared_error
import joblib

# 1. Chargement des données

In [5]:
data = pd.read_csv('credit_risk_dataset.csv', sep=';')

In [6]:
data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


# 2. Prétraitement des données

In [7]:
# Recherche de valeurs manquantes
data.isnull().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [10]:
# Gestion des valeurs manquantes pour les colonnes numériques
numerical_cols = data.select_dtypes(include=[np.number]).columns
data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].median())
data.isnull().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

# Transformation des variables catégorielles

In [11]:
# Encodage des valeurs catégorielles en numériques afin qu'elles soient interprétables par les algorithmes de machine learning  
label_encoders = {}
for col in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

In [12]:
data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,3,123.0,4,3,35000,16.02,1,0.59,1,3
1,21,9600,2,5.0,1,1,1000,11.14,0,0.1,0,2
2,25,9600,0,1.0,3,2,5500,12.87,1,0.57,0,3
3,23,65500,3,4.0,3,2,35000,15.23,1,0.53,0,2
4,24,54400,3,8.0,3,2,35000,14.27,1,0.55,1,4


# Séparation des features et de la target

In [14]:
target = 'loan_status'
X = data.drop(columns=[target])
y = data[target]

# Division en train et test

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalisation des données

In [16]:
# Normalisation des données afin de les mettre à la même échelle et accélérer l'apprentissage
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 3. Entraînement des modèles

In [17]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\nModèle: {name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("ROC-AUC:", roc_auc_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))


Modèle: Logistic Regression
Accuracy: 0.8371950283872948
ROC-AUC: 0.6952269328588737
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.95      0.90      5072
           1       0.72      0.44      0.55      1445

    accuracy                           0.84      6517
   macro avg       0.79      0.70      0.72      6517
weighted avg       0.83      0.84      0.82      6517


Modèle: Decision Tree
Accuracy: 0.8843025932177382
ROC-AUC: 0.8417880786569591
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.92      0.93      5072
           1       0.73      0.77      0.75      1445

    accuracy                           0.88      6517
   macro avg       0.83      0.84      0.84      6517
weighted avg       0.89      0.88      0.89      6517



In [19]:
mse_scores = {}

for name, model in models.items():
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mse_scores[name] = mse
    print(f"\nModèle: {name}")
    print("Mean Squared Error:", mse)

# Comparaison des MSE
best_model = min(mse_scores, key=mse_scores.get)
print(f"\nLe modèle avec le plus petit MSE est: {best_model} avec un MSE de {mse_scores[best_model]}")


Modèle: Logistic Regression
Mean Squared Error: 0.16280497161270524

Modèle: Decision Tree
Mean Squared Error: 0.11569740678226177

Le modèle avec le plus petit MSE est: Decision Tree avec un MSE de 0.11569740678226177


Le modèle Decision Tree a de meilleures performances en général, nous allons donc prendre celui-ci.

# 4. Sauvegarde du meilleur modèle (on choisit le Decision Tree ici)

In [20]:
best_model = models['Decision Tree']
joblib.dump(best_model, 'model.joblib')
joblib.dump(scaler, 'scaler.joblib')

['scaler.joblib']