## 1. Import Libraries

In [71]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer


## 2. Load Basic Engineered Dataset

In [72]:
df = pd.read_csv("../data/processed/credit_default_engineered.csv")
df.head()


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,max_delay,avg_delay,delay_count,total_bill_6m,avg_bill_6m,bill_trend,total_pay_6m,avg_pay_6m,utilization_ratio,payment_ratio
0,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1,2,-0.333333,2,7704,1284.0,3913,689,114.833333,0.3852,0.089422
1,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1,2,0.5,2,17077,2846.166667,-579,5000,833.333333,0.142308,0.292774
2,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0,0,0.0,0,101653,16942.166667,13690,11018,1836.333333,1.129478,0.108387
3,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0,0,0.0,0,231334,38555.666667,17443,8388,1398.0,4.62668,0.036259
4,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0,0,-0.333333,0,109339,18223.166667,-10514,59049,9841.5,2.18678,0.540049


## 3. Trainâ€“Test Split

In [74]:
X = df.drop("default payment next month", axis=1)
y = df["default payment next month"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


## 4. Handle Missing Values

In [75]:
imputer = SimpleImputer(strategy='median')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)


## 5. Baseline Models

### 5.1 Logistic Regression

In [76]:
log_reg = LogisticRegression(max_iter=2000)
log_reg.fit(X_train, y_train)

y_pred_lr = log_reg.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))
print(confusion_matrix(y_test, y_pred_lr))


Logistic Regression Accuracy: 0.8038333333333333
              precision    recall  f1-score   support

           0       0.82      0.96      0.88      4673
           1       0.64      0.26      0.37      1327

    accuracy                           0.80      6000
   macro avg       0.73      0.61      0.63      6000
weighted avg       0.78      0.80      0.77      6000

[[4473  200]
 [ 977  350]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### 5.2 Decision Tree

In [77]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)

print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))
print(confusion_matrix(y_test, y_pred_dt))


Decision Tree Accuracy: 0.7196666666666667
              precision    recall  f1-score   support

           0       0.83      0.81      0.82      4673
           1       0.38      0.40      0.39      1327

    accuracy                           0.72      6000
   macro avg       0.60      0.61      0.60      6000
weighted avg       0.73      0.72      0.72      6000

[[3785  888]
 [ 794  533]]


### 5.3 Random Forest

In [78]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
print(confusion_matrix(y_test, y_pred_rf))


Random Forest Accuracy: 0.809
              precision    recall  f1-score   support

           0       0.84      0.94      0.88      4673
           1       0.62      0.36      0.45      1327

    accuracy                           0.81      6000
   macro avg       0.73      0.65      0.67      6000
weighted avg       0.79      0.81      0.79      6000

[[4382  291]
 [ 855  472]]


### 5.4 XGBoost (Default)

In [79]:
xgb = XGBClassifier(
    eval_metric='logloss',
    random_state=42
)
xgb.fit(X_train, y_train)

y_pred_xgb = xgb.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))
print(confusion_matrix(y_test, y_pred_xgb))


XGBoost Accuracy: 0.8126666666666666
              precision    recall  f1-score   support

           0       0.84      0.94      0.89      4673
           1       0.63      0.38      0.47      1327

    accuracy                           0.81      6000
   macro avg       0.73      0.66      0.68      6000
weighted avg       0.79      0.81      0.79      6000

[[4376  297]
 [ 827  500]]


## 6. Comparison Summary

In [82]:
print("\nBASELINE ACCURACY COMPARISON:")
print()
print("Logistic Regression:", accuracy_score(y_test, y_pred_lr))
print("Decision Tree:", accuracy_score(y_test, y_pred_dt))
print("Random Forest:", accuracy_score(y_test, y_pred_rf))
print("XGBoost:", accuracy_score(y_test, y_pred_xgb))



BASELINE ACCURACY COMPARISON:

Logistic Regression: 0.8038333333333333
Decision Tree: 0.7196666666666667
Random Forest: 0.809
XGBoost: 0.8126666666666666
