In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from skimpy import skim
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
!pip install explainerdashboard
from explainerdashboard import ClassifierExplainer, ExplainerDashboard

## Step 2: Load Data

In [10]:
# Read the training CSV, then take a copy of the freshly loaded frame.
df = pd.read_csv(filepath_or_buffer='data_set/train.csv')
df_copy = df.copy(deep=True)

## Step 3: Data Cleaning and Preprocessing for Categorical Data

In [11]:
# Keep only the object-typed columns, leaving out 'Loan_ID'.
df_cat = df.select_dtypes(include=['object']).drop(columns='Loan_ID')

# Fill each column's missing entries with that column's first-listed mode.
df_cat = df_cat.fillna({name: df_cat[name].mode()[0] for name in df_cat.columns})

df_cat.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,Urban,Y
1,Male,Yes,1,Graduate,No,Rural,N
2,Male,Yes,0,Graduate,Yes,Urban,Y
3,Male,Yes,0,Not Graduate,No,Urban,Y
4,Male,No,0,Graduate,No,Urban,Y


In [12]:
# Separate the target variable before encoding.
# The membership check keeps this cell safe to re-run: once 'Loan_Status' has
# been split off, executing the cell again is a no-op instead of a KeyError.
if 'Loan_Status' in df_cat.columns:
    target = df_cat['Loan_Status']
    df_cat = df_cat.drop(columns=['Loan_Status'])

## Step 4: Data Preprocessing for Numerical Data

In [13]:
# Names of the int64/float64 columns in the loaded frame.
numeric = df.select_dtypes(include=['int64', 'float64']).columns

# Fill every numeric column's missing entries with that column's own median,
# writing the result back into df.
df[numeric] = df[numeric].apply(lambda column: column.fillna(column.median()))

# Numeric-only view used later when the feature frame is assembled.
df_num = df[numeric]

## Step 5: Encode Categorical Variables

In [14]:
# One-hot encode the categorical features.
# 'Dependents' contains the label '3+'; it is swapped for 3 so the column can be cast to int.
df_cat['Dependents'] = df_cat['Dependents'].replace({'3+': 3}).astype(int)

# drop='first' leaves out the first category of each feature;
# sparse_output=False makes the encoder return a dense array.
one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)
df_encoded = one_hot_encoder.fit(df_cat).transform(df_cat)

## Step 6: Concatenate DataFrames

In [15]:
# Wrap the encoded array in a DataFrame labelled with the encoder's feature names.
df_encoded = pd.DataFrame(
    data=df_encoded,
    columns=one_hot_encoder.get_feature_names_out(),
)

# Put the numeric columns to the right of the encoded ones; df_num's index is
# reset first so the column-wise concat lines rows up by position.
df_concat = pd.concat(objs=[df_encoded, df_num.reset_index(drop=True)], axis='columns')

In [18]:
# Re-attach the label as a numeric column: 'Y' becomes 1 and 'N' becomes 0.
df_concat['Loan_Status'] = (
    target
    .reset_index(drop=True)
    .map({'Y': 1, 'N': 0})
)

## Step 7: Data Information

In [19]:
# Summarize the assembled frame: column names, non-null counts and dtypes.
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Gender_Male              614 non-null    float64
 1   Married_Yes              614 non-null    float64
 2   Dependents_1             614 non-null    float64
 3   Dependents_2             614 non-null    float64
 4   Dependents_3             614 non-null    float64
 5   Education_Not Graduate   614 non-null    float64
 6   Self_Employed_Yes        614 non-null    float64
 7   Property_Area_Semiurban  614 non-null    float64
 8   Property_Area_Urban      614 non-null    float64
 9   ApplicantIncome          614 non-null    int64  
 10  CoapplicantIncome        614 non-null    float64
 11  LoanAmount               614 non-null    float64
 12  Loan_Amount_Term         614 non-null    float64
 13  Credit_History           614 non-null    float64
 14  Loan_Status              6

## Step 8: Split Data into Training and Test Sets

In [20]:
# The label is 'Loan_Status'; the features are every other column.
y = df_concat['Loan_Status']
X = df_concat.drop('Loan_Status', axis=1)

# Hold out 20% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21
)

## Step 9: Normalize Data

In [21]:
# Normalize the data.
# The scaler is fitted on the training split only and then applied to both splits.
scaler = StandardScaler()

# Rebuild DataFrames (needed for the ExplainerDashboard) and carry over each
# split's original row index: the scaler returns bare arrays, and a fresh
# 0..n-1 index would no longer line up with y_train / y_test, breaking any
# index-aligned pandas operation between features and labels.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)


## Step 10: Train Logistic Regression Model

In [22]:
# Fit a logistic regression on the training split, with max_iter set to 1000.
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X=X_train, y=y_train)

## Step 11: Model Prediction

In [23]:
# Predicted class labels for the test split; the bare name on the last line displays the array.
predict_lr = model_lr.predict(X_test)
predict_lr

array([0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1], dtype=int64)

## Step 12: Evaluate Model Performance

In [24]:
# Accuracy of the logistic regression on each split, displayed as a (train, test) pair.
train_score_lr = accuracy_score(y_train, model_lr.predict(X_train))
test_score_lr = accuracy_score(y_test, model_lr.predict(X_test))
(train_score_lr, test_score_lr)

(0.8085539714867617, 0.8048780487804879)

## Step 13: Correlation Matrix

In [25]:
# Pairwise Pearson correlations between all columns of the assembled frame (label included).
matrix = df_concat.corr(method='pearson')
matrix

Unnamed: 0,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
Gender_Male,1.0,0.364569,-0.004466,0.129953,0.096319,0.045364,-0.000525,-0.108623,0.03453,0.058809,0.082912,0.106904,-0.07403,0.00917,0.017987
Married_Yes,0.364569,1.0,0.113853,0.249547,0.132566,0.012304,0.004489,0.005845,0.000546,0.051708,0.075948,0.146546,-0.100912,0.010938,0.091478
Dependents_1,-0.004466,0.113853,1.0,-0.198046,-0.134337,-0.013355,0.082044,0.011661,0.06932,0.040861,-0.029769,0.061976,-0.088492,0.009757,-0.03874
Dependents_2,0.129953,0.249547,-0.198046,1.0,-0.133547,0.020822,0.032434,-0.012017,0.016569,-0.03465,0.010016,0.020126,-0.010609,0.007987,0.062384
Dependents_3,0.096319,0.132566,-0.134337,-0.133547,1.0,0.055288,0.003278,0.007863,-0.04746,0.156687,0.041491,0.15285,-0.077273,-0.060473,-0.026123
Education_Not Graduate,0.045364,0.012304,-0.013355,0.020822,0.055288,1.0,-0.010383,-0.03941,-0.034279,-0.14076,-0.06229,-0.168759,-0.073928,-0.073658,-0.085884
Self_Employed_Yes,-0.000525,0.004489,0.082044,0.032434,0.003278,-0.010383,1.0,0.00871,-0.030338,0.12718,-0.0161,0.1151,-0.033739,-0.00155,-0.0037
Property_Area_Semiurban,-0.108623,0.005845,0.011661,-0.012017,0.007863,-0.03941,0.00871,1.0,-0.547574,-0.014246,-0.027044,-0.005804,0.059141,0.035976,0.13654
Property_Area_Urban,0.03453,0.000546,0.06932,0.016569,-0.04746,-0.034279,-0.030338,-0.547574,1.0,-0.000598,0.022776,-0.03605,-0.094279,-0.016934,-0.043621
ApplicantIncome,0.058809,0.051708,0.040861,-0.03465,0.156687,-0.14076,0.12718,-0.014246,-0.000598,1.0,-0.116605,0.565181,-0.046531,-0.018615,-0.00471


## Step 14: Train Random Forest Model with Hyperparameter Tuning

In [None]:
# Hyperparameter values to explore: 3 * 3 * 3 * 3 * 2 = 162 combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Base estimator; random_state fixes the forest's randomness.
model_rf = RandomForestClassifier(random_state=21)

# Exhaustive search with 3-fold cross-validation, using all available cores.
grid_search = GridSearchCV(
    estimator=model_rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning combination.
best_params = grid_search.best_params_
print(f"Meilleurs paramètres : {best_params}")

# Score the best estimator found by the search on both splits.
best_model = grid_search.best_estimator_
train_score_rf = best_model.score(X_train, y_train)
test_score_rf = best_model.score(X_test, y_test)
print(f"Score d'entraînement : {train_score_rf}")
print(f"Score de test : {test_score_rf}")

Fitting 3 folds for each of 162 candidates, totalling 486 fits
Meilleurs paramètres : {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
Score d'entraînement : 0.835030549898167
Score de test : 0.8130081300813008


## Step 15: Cross-Validation

In [None]:
# 5-fold cross-validation of the tuned random forest on the training split.
cv_scores_rf = cross_val_score(estimator=best_model, X=X_train, y=y_train, cv=5)
cv_score_mean_rf = np.mean(cv_scores_rf)

# Show the earlier train/test scores next to the cross-validated mean.
print(f"Score d'entraînement : {train_score_rf}")
print(f"Score de test : {test_score_rf}")
print(f"Score de validation croisée (moyenne) : {cv_score_mean_rf}")

Score d'entraînement : 0.835030549898167
Score de test : 0.8130081300813008
Score de validation croisée (moyenne) : 0.8064316635745208


## Step 16: Create ExplainerDashboard for Logistic Regression

In [None]:
# Build an explainer for the logistic regression on the test split and serve
# its dashboard on localhost:8050.
# NOTE(review): the library's own log suggests passing mode='external' or
# mode='inline' to keep the notebook interactive while the dashboard runs.
explainer_lr = ClassifierExplainer(model=model_lr, X=X_test, y=y_test)
dashboard_lr = ExplainerDashboard(explainer=explainer_lr, title="Logistic Regression Explainer Dashboard")
dashboard_lr.run(host='localhost', port=8050)

Note: model_output='probability' is currently not supported for linear classifiers models with shap. So defaulting to model_output='logodds' If you really need probability outputs use shap='kernel' instead.
Note: shap values for shap='linear' get calculated against X_background, but paramater X_background=None, so using X instead...
Generating self.shap_explainer = shap.LinearExplainer(model, X)...
Building ExplainerDashboard..
Detected notebook environment, consider setting mode='external', mode='inline' or mode='jupyterlab' to keep the notebook interactive while the dashboard is running...
For this type of model and model_output interactions don't work, so setting shap_interaction=False...
The explainer object has no decision_trees property. so setting decision_trees=False...
Generating layout...
Calculating shap values...
Calculating prediction probabilities...
Calculating metrics...
Calculating confusion matrices...
Calculating classification_dfs...
Calculating roc auc curves...
Ca

## Step 17: Create ExplainerDashboard for Random Forest

In [None]:
# Build an explainer for the tuned random forest on the test split and serve
# its dashboard on localhost:8050.
# NOTE(review): port 8050 is also the port requested by the logistic-regression
# dashboard cell; confirm that dashboard has been stopped before running this one.
explainer_rf = ClassifierExplainer(model=best_model, X=X_test, y=y_test)
dashboard_rf = ExplainerDashboard(explainer=explainer_rf, title="Random Forest Explainer Dashboard")
dashboard_rf.run(host='localhost', port=8050)

Detected RandomForestClassifier model: Changing class type to RandomForestClassifierExplainer...
Note: model_output=='probability', so assuming that raw shap output of RandomForestClassifier is in probability space...
Generating self.shap_explainer = shap.TreeExplainer(model)
Building ExplainerDashboard..
Detected notebook environment, consider setting mode='external', mode='inline' or mode='jupyterlab' to keep the notebook interactive while the dashboard is running...
Generating layout...
Calculating shap values...
Calculating prediction probabilities...
Calculating metrics...
Calculating confusion matrices...
Calculating classification_dfs...
Calculating roc auc curves...
Calculating pr auc curves...
Calculating liftcurve_dfs...
Calculating shap interaction values... (this may take a while)
Reminder: TreeShap computational complexity is O(TLD^2), where T is the number of trees, L is the maximum number of leaves in any tree and D the maximal depth of any tree. So reducing these will s