In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Load the cleaned loan-default dataset into a DataFrame.
# The path is relative, so it is resolved against the current working directory.

df = pd.read_csv('cleaned_loan_default.csv')

## Encoding Categorical Columns

I convert all categorical columns into numerical values using one-hot encoding.
This is required because machine learning models such as Logistic Regression and Decision Tree cannot work with text labels.

I use `pd.get_dummies()` because it is a simple and reliable way to perform this encoding.

In [None]:
# Drop the ID column because it is not useful for prediction.
# This cell reassigns `df` from itself, so running it a second time would
# raise KeyError once 'ID' is already gone; errors='ignore' makes the drop a
# no-op in that case and keeps the cell safe to re-run.

df = df.drop(columns='ID', errors='ignore')

In [None]:
# Encoding features: one-hot encode the categorical columns of df.
# drop_first=True removes the first level of each encoded column, so a column
# with k categories becomes k-1 dummy columns.

df_encoded = pd.get_dummies(df, drop_first=True)

In [None]:
# (rows, columns) of the encoded frame.
df_encoded.shape

In [None]:
# Preview the first rows of the encoded frame.
df_encoded.head()

## Train–Test Split

I separate the data into features (X) and target (y) and then split it into training and testing sets.
The training set is used to fit the model, and the test set is used to evaluate how well the model generalizes to unseen data.

In [None]:
# Features: every encoded column except the target.
# `columns=` already names the axis being dropped from, so the redundant
# `axis=1` argument (ignored by pandas when `columns=` is given) is removed.
X = df_encoded.drop(columns='Status')

# Target: the 'Status' column.
Y = df_encoded['Status']

In [None]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

In [None]:
# Shapes of the four splits, in the order X_train, X_test, Y_train, Y_test.
tuple(split.shape for split in (X_train, X_test, Y_train, Y_test))

## Train Logistic Regression Model

I train a Logistic Regression model to predict loan default.
This algorithm works well for binary classification problems such as default vs. non-default.
I first scale the numeric features for better performance, then fit the model and generate predictions.

In [None]:
# Learn the scaling statistics from the training rows only, then apply that
# same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# fit() returns the estimator itself, so construction and training can chain.
log_reg = LogisticRegression(max_iter=1000).fit(X_train_scaled, Y_train)

# Class predictions for the held-out rows.
Y_pred_log = log_reg.predict(X_test_scaled)

In [None]:
# Evaluation: confusion matrix for Logistic Regression, drawn as a heatmap.

# Rows are the actual classes, columns are the predicted classes.
cm_log = confusion_matrix(Y_test, Y_pred_log)

# Explicit figure/axes pair; the heatmap annotates each cell with its integer count.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm_log, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix - Logistic Regression")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Report & Accuracy

# Fraction of test rows that were classified correctly.
log_reg_accuracy = accuracy_score(Y_test, Y_pred_log)
print("Logistic Regression Accuracy:", log_reg_accuracy)

# Per-class precision, recall and F1. sep="\n" puts the header and the report
# on separate lines.
log_reg_report = classification_report(Y_test, Y_pred_log)
print("Classification Report - Logistic Regression:", log_reg_report, sep="\n")


## Train Decision Tree Classifier

I also train a Decision Tree classifier on the same train–test split.
Decision Trees can capture non-linear patterns and don't require feature scaling, so I train one as well to see which model performs better.


In [None]:

# Decision tree with its depth capped at 5 and a fixed seed so the fit is
# repeatable. It is trained on the unscaled features.
dt = DecisionTreeClassifier(max_depth=5, random_state=42).fit(X_train, Y_train)

# Class predictions for the held-out rows.
y_pred_dt = dt.predict(X_test)

In [None]:
# Evaluation: confusion matrix for the Decision Tree, drawn as a heatmap.

# Rows are the actual classes, columns are the predicted classes.
cm_dt = confusion_matrix(Y_test, y_pred_dt)

# Explicit figure/axes pair; the heatmap annotates each cell with its integer count.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm_dt, annot=True, fmt='d', cmap='Greens', ax=ax)
ax.set_title("Confusion Matrix - Decision Tree")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Classification report: per-class precision, recall and F1.
# sep="\n" puts the header and the report on separate lines.
dt_report = classification_report(Y_test, y_pred_dt)
print("Classification Report - Decision Tree:", dt_report, sep="\n")

# Accuracy: fraction of test rows that were classified correctly.
dt_accuracy = accuracy_score(Y_test, y_pred_dt)
print("Decision Tree Accuracy:", dt_accuracy)

## Accuracy Comparison 

I compare Logistic Regression and Decision Tree based on their accuracy using a bar chart.

In [None]:
# Accuracy comparison bar plot: one bar per model on a shared 0-1 scale.
# The dict keeps insertion order, so names and scores stay aligned.
scores_by_model = {
    'Logistic Regression': log_reg_accuracy,
    'Decision Tree': dt_accuracy,
}
model_names = list(scores_by_model)
accuracies = list(scores_by_model.values())

fig, ax = plt.subplots(figsize=(7, 5))
sns.barplot(x=model_names, y=accuracies, ax=ax)
ax.set_title("Model Accuracy Comparison")
ax.set_ylabel("Accuracy")
ax.set_ylim(0, 1)
plt.show()

Both models performed well, but the Decision Tree achieved slightly higher accuracy, which suggests that some relationships in this dataset are non-linear and are better captured by a tree-based model.
However, Logistic Regression still delivered strong performance and offers more interpretable coefficients.