In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from skimpy import skim
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
!pip install explainerdashboard
from explainerdashboard import ClassifierExplainer, ExplainerDashboard

## Step 2: Load Data

In [2]:
# Read the training set; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='data_set/train.csv')

# Independent snapshot of the frame exactly as loaded (a later cell writes
# imputed values back into `df`). NOTE(review): `df_copy` is not referenced
# again in the visible cells.
df_copy = df.copy(deep=True)

## Step 3: Data Cleaning and Preprocessing for Categorical Data

In [3]:
# Keep every object-dtype (string) column except the 'Loan_ID' identifier.
df_cat = df.select_dtypes(include='object').drop(columns='Loan_ID')

# Impute missing categories with each column's most frequent value.
# DataFrame.mode() lists the modes of every column row by row, so its first
# row holds the first mode of each column; fillna then matches on column label.
most_frequent = df_cat.mode().iloc[0]
df_cat = df_cat.fillna(most_frequent)

df_cat.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,Urban,Y
1,Male,Yes,1,Graduate,No,Rural,N
2,Male,Yes,0,Graduate,Yes,Urban,Y
3,Male,Yes,0,Not Graduate,No,Urban,Y
4,Male,No,0,Graduate,No,Urban,Y


## Step 4: Data Preprocessing for Numerical Data

In [4]:
# Labels of the int64 / float64 columns of the raw frame.
numeric = df.select_dtypes(include=['int64', 'float64']).columns

# Median imputation for all numeric columns at once: median() yields one value
# per column and fillna matches them on column label. As before, the imputed
# values are written back into `df` itself.
df[numeric] = df[numeric].fillna(df[numeric].median())

# Numeric part of the dataset, now free of missing values.
df_num = df[numeric]

## Step 5: Encode Categorical Variables

In [5]:
# Encode the categorical variables with OneHotEncoder.

# 'Dependents' uses the label '3+'; rewrite it as '3' so the whole column can
# be cast to int. Replacing with a string (not the int 3) avoids an
# intermediate column that mixes str and int values.
df_cat['Dependents'] = df_cat['Dependents'].replace('3+', '3').astype(int)

# drop='first' removes the first category of every feature, so a feature with
# k categories yields k - 1 indicator columns.
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4; try the current
# spelling first and fall back for older installations.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    one_hot_encoder = OneHotEncoder(sparse=False, drop='first')

# Dense 2-D array of indicators, one row per row of df_cat.
df_encoded = one_hot_encoder.fit_transform(df_cat)

## Step 6: Concatenate DataFrames

In [6]:
# Wrap the encoded array in a DataFrame, labelling each indicator column with
# the name the encoder generated for it.
encoded_feature_names = one_hot_encoder.get_feature_names_out(df_cat.columns)
df_encoded = pd.DataFrame(data=df_encoded, columns=encoded_feature_names)

# Put the encoded categorical block and the numeric block side by side.
# df_num's index is reset so both frames are aligned on a fresh 0..n-1 index.
df_concat = pd.concat(
    objs=[df_encoded, df_num.reset_index(drop=True)],
    axis='columns',
)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,0,0,0,1,0,0,1,5849,0.0,128.0,360.0,1.0
1,0,1,1,1,0,1,0,4583,1508.0,128.0,360.0,1.0
2,0,1,0,1,1,0,1,3000,0.0,66.0,360.0,1.0
3,0,1,0,0,0,0,1,2583,2358.0,120.0,360.0,1.0
4,0,0,0,1,0,0,1,6000,0.0,141.0,360.0,1.0


## Step 7: Data Information

In [7]:
# Print the column names, non-null counts and dtypes of the assembled frame,
# which makes it easy to check that no missing values remain after imputation.
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             614 non-null    int32  
 1   Married            614 non-null    int32  
 2   Dependents         614 non-null    int32  
 3   Education          614 non-null    int32  
 4   Self_Employed      614 non-null    int32  
 5   Property_Area      614 non-null    int32  
 6   Loan_Status        614 non-null    int32  
 7   ApplicantIncome    614 non-null    int64  
 8   CoapplicantIncome  614 non-null    float64
 9   LoanAmount         614 non-null    float64
 10  Loan_Amount_Term   614 non-null    float64
 11  Credit_History     614 non-null    float64
dtypes: float64(4), int32(7), int64(1)
memory usage: 40.9 KB


## Step 8: Split Data into Training and Test Sets

In [8]:
# Split the data into features / target and into train / test sets.
#
# The one-hot encoder names its output columns '<feature>_<category>' (for
# example 'Loan_Status_Y'), so exact labels such as 'Loan_Status' or 'Gender'
# are not guaranteed to exist in df_concat. Columns are therefore selected by
# the feature they were derived from, which works for both naming schemes.
def _from_feature(column, feature):
    """Return True if `column` is `feature` itself or an indicator derived from it."""
    return column == feature or column.startswith(f'{feature}_')


target_columns = [c for c in df_concat.columns if _from_feature(c, 'Loan_Status')]
if len(target_columns) != 1:
    raise ValueError(
        f"Expected exactly one 'Loan_Status' column in df_concat, found: {target_columns}"
    )

# Features deliberately left out of the model.
excluded_features = ('Loan_Amount_Term', 'Gender', 'Self_Employed')
excluded_columns = [
    c for c in df_concat.columns
    if any(_from_feature(c, feature) for feature in excluded_features)
]

X = df_concat.drop(columns=target_columns + excluded_columns)
# The encoder produces floats; use integer class labels under a stable name.
y = df_concat[target_columns[0]].astype(int).rename('Loan_Status')

# 80 % train / 20 % test, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=21)

## Step 9: Standardize Data

In [9]:
# Standardize the features (zero mean, unit variance per column).
# The scaler is fitted on the training split only and then applied unchanged
# to the test split.
scaler = StandardScaler()

# The scaler returns plain arrays, so wrap them back into DataFrames for the
# ExplainerDashboard. The original row index is passed along explicitly:
# without it the frames would get a fresh 0..n-1 index and would no longer
# line up by label with y_train / y_test, which keep the shuffled index
# produced by train_test_split.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)


## Step 10: Train Logistic Regression Model

In [10]:
# Fit a logistic-regression classifier on the training split.
# max_iter is raised to 1000 to give the solver more iterations to converge.
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X=X_train, y=y_train)

## Step 11: Model Prediction

In [11]:
# Predicted class label for every row of the test split.
predict_lr = model_lr.predict(X=X_test)
predict_lr

array([0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1])

## Step 12: Evaluate Model Performance

In [12]:
# Accuracy of the logistic regression on both splits. For a classifier,
# `.score` is the accuracy of its predictions, computed here explicitly.
train_score_lr = accuracy_score(y_train, model_lr.predict(X_train))
test_score_lr = accuracy_score(y_test, model_lr.predict(X_test))
(train_score_lr, test_score_lr)

(0.8085539714867617, 0.8130081300813008)

## Step 13: Correlation Matrix

In [13]:
# Pairwise Pearson correlation between all columns of the assembled frame
# (Pearson is also the method `corr` uses when none is given).
matrix = df_concat.corr(method='pearson')
matrix

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
Gender,1.0,-0.364569,-0.172914,0.045364,0.000525,0.082045,-0.017987,-0.058809,-0.082912,-0.106904,0.07403,-0.00917
Married,-0.364569,1.0,0.334216,-0.012304,0.004489,0.003071,0.091478,0.051708,0.075948,0.146546,-0.100912,0.010938
Dependents,-0.172914,0.334216,1.0,-0.055752,0.056798,0.001781,0.010118,0.118202,0.03043,0.163103,-0.103864,-0.04016
Education,0.045364,-0.012304,-0.055752,1.0,0.010383,0.003592,0.085884,0.14076,0.06229,0.168759,0.073928,0.073658
Self_Employed,0.000525,0.004489,0.056798,0.010383,1.0,0.021996,-0.0037,0.12718,-0.0161,0.1151,-0.033739,-0.00155
Property_Area,0.082045,0.003071,0.001781,0.003592,0.021996,1.0,0.103253,-0.007894,-0.028356,0.016808,0.086879,0.030248
Loan_Status,-0.017987,0.091478,0.010118,0.085884,-0.0037,0.103253,1.0,-0.00471,-0.059187,-0.033214,-0.022549,0.540556
ApplicantIncome,-0.058809,0.051708,0.118202,0.14076,0.12718,-0.007894,-0.00471,1.0,-0.116605,0.565181,-0.046531,-0.018615
CoapplicantIncome,-0.082912,0.075948,0.03043,0.06229,-0.0161,-0.028356,-0.059187,-0.116605,1.0,0.189218,-0.059383,0.011134
LoanAmount,-0.106904,0.146546,0.163103,0.168759,0.1151,0.016808,-0.033214,0.565181,0.189218,1.0,0.03696,-0.000607


## Step 14: Train Random Forest Model with Hyperparameter Tuning

In [14]:
# Base estimator; the seed keeps the forest reproducible.
model_rf = RandomForestClassifier(random_state=21)

# Hyperparameter values explored by the search (3 * 3 * 3 * 3 * 2 = 162 combinations).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Exhaustive search with 3-fold cross-validation on all available cores.
grid_search = GridSearchCV(model_rf, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best hyperparameter combination found by the search.
best_params = grid_search.best_params_
print(f"Meilleurs paramètres : {best_params}")

# Estimator refitted on the full training split with the best parameters,
# scored on both splits.
best_model = grid_search.best_estimator_
train_score_rf = best_model.score(X_train, y_train)
test_score_rf = best_model.score(X_test, y_test)
print(f"Score d'entraînement : {train_score_rf}")
print(f"Score de test : {test_score_rf}")

Fitting 3 folds for each of 162 candidates, totalling 486 fits
Meilleurs paramètres : {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
Score d'entraînement : 0.835030549898167
Score de test : 0.8130081300813008


## Step 15: Cross-Validation

In [15]:
# 5-fold cross-validation of the tuned random forest on the training split.
cv_scores_rf = cross_val_score(estimator=best_model, X=X_train, y=y_train, cv=5)
cv_score_mean_rf = np.mean(cv_scores_rf)

# Recap of the train / test scores next to the cross-validated mean.
print(f"Score d'entraînement : {train_score_rf}")
print(f"Score de test : {test_score_rf}")
print(f"Score de validation croisée (moyenne) : {cv_score_mean_rf}")

Score d'entraînement : 0.835030549898167
Score de test : 0.8130081300813008
Score de validation croisée (moyenne) : 0.8064316635745208


## Step 16: Create ExplainerDashboard for Logistic Regression

In [16]:
# Build the explainer for the logistic regression on the held-out test split.
explainer_lr = ClassifierExplainer(model_lr, X_test, y_test)

# mode='external' serves the dashboard in the background instead of blocking
# the kernel, so the cells below still execute under "Run All" (the library
# itself recommends this when it detects a notebook environment).
dashboard_lr = ExplainerDashboard(
    explainer_lr,
    title="Logistic Regression Explainer Dashboard",
    mode='external',
)

# Served on its own port (8051) because the Random Forest dashboard of this
# notebook uses 8050; two servers cannot listen on the same port at once.
# NOTE(review): a background dashboard should be stoppable with
# ExplainerDashboard.terminate(8051) — confirm against the installed version.
dashboard_lr.run(port=8051, host='localhost')

Note: model_output='probability' is currently not supported for linear classifiers models with shap. So defaulting to model_output='logodds' If you really need probability outputs use shap='kernel' instead.
Note: shap values for shap='linear' get calculated against X_background, but paramater X_background=None, so using X instead...
Generating self.shap_explainer = shap.LinearExplainer(model, X)...
Building ExplainerDashboard..
Detected notebook environment, consider setting mode='external', mode='inline' or mode='jupyterlab' to keep the notebook interactive while the dashboard is running...
For this type of model and model_output interactions don't work, so setting shap_interaction=False...
The explainer object has no decision_trees property. so setting decision_trees=False...
Generating layout...
Calculating shap values...
Calculating prediction probabilities...
Calculating metrics...
Calculating confusion matrices...
Calculating classification_dfs...
Calculating roc auc curves...
Ca

## Step 17: Create ExplainerDashboard for Random Forest

In [17]:
# Build the explainer for the tuned random forest on the held-out test split.
explainer_rf = ClassifierExplainer(best_model, X_test, y_test)

# mode='external' serves the dashboard in the background instead of blocking
# the kernel, keeping the notebook interactive while the dashboard is up (the
# library itself recommends this when it detects a notebook environment).
dashboard_rf = ExplainerDashboard(
    explainer_rf,
    title="Random Forest Explainer Dashboard",
    mode='external',
)

# NOTE(review): port 8050 must be free when this cell runs; stop any other
# dashboard still bound to it first.
dashboard_rf.run(port=8050, host='localhost')

Detected RandomForestClassifier model: Changing class type to RandomForestClassifierExplainer...
Note: model_output=='probability', so assuming that raw shap output of RandomForestClassifier is in probability space...
Generating self.shap_explainer = shap.TreeExplainer(model)
Building ExplainerDashboard..
Detected notebook environment, consider setting mode='external', mode='inline' or mode='jupyterlab' to keep the notebook interactive while the dashboard is running...
Generating layout...
Calculating shap values...
Calculating prediction probabilities...
Calculating metrics...
Calculating confusion matrices...
Calculating classification_dfs...
Calculating roc auc curves...
Calculating pr auc curves...
Calculating liftcurve_dfs...
Calculating shap interaction values... (this may take a while)
Reminder: TreeShap computational complexity is O(TLD^2), where T is the number of trees, L is the maximum number of leaves in any tree and D the maximal depth of any tree. So reducing these will s