# 1- Importing Libraries

In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import mlxtend
# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


#Evauluation Metrics
from sklearn.metrics import accuracy_score,f1_score,classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

import pickle
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)

# 2 - Importing Dataset

In [None]:
df = pd.read_csv("../data/cleaned_data.csv")
df.head()

Unnamed: 0,person_age,person_income,home_ownership_type,employement_duration,loan_intent,loan_grade,loan_amount,loan_int_rate,loan_status,loan_percent_income,loan_default,credit_history_duration
0,22.0,59000.0,RENT,14.5,PERSONAL,D,23125.0,16.02,1,0.44,Y,3.0
1,21.0,9600.0,OWN,5.0,EDUCATION,B,1000.0,11.14,0,0.1,N,2.0
2,25.0,9600.0,MORTGAGE,1.0,MEDICAL,C,5500.0,12.87,1,0.44,N,3.0
3,23.0,65500.0,RENT,4.0,MEDICAL,C,23125.0,15.23,1,0.44,N,2.0
4,24.0,54400.0,RENT,8.0,MEDICAL,C,23125.0,14.27,1,0.44,Y,4.0


# 3- Encoding Categorical Variables

In [43]:
import pandas as pd

# One-hot encode with drop_first=True to avoid dummy variable trap
dummies_ownership = pd.get_dummies(df['home_ownership_type'], prefix='own', drop_first=True).astype(int)
dummies_intent = pd.get_dummies(df['loan_intent'], prefix='intent', drop_first=True).astype


In [44]:
from sklearn.preprocessing import OrdinalEncoder

# Define order
ordinal = OrdinalEncoder(categories=[['A', 'B', 'C', 'D', 'E', 'F', 'G']])
df['loan_grade_encoded'] = ordinal.fit_transform(df[['loan_grade']])


In [45]:
# Drop original columns
df_cleaned = df.drop(['home_ownership_type', 'loan_intent', 'loan_grade'], axis=1)

# Concatenate encoded features
df_encoded = pd.concat([df_cleaned, dummies_ownership, dummies_intent], axis=1)


In [46]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df_encoded['loan_default'] = le.fit_transform(df_encoded['loan_default'])

In [47]:
X = df_encoded.drop('loan_default', axis=1)
y = df_encoded['loan_default']
y

0        1
1        0
2        0
3        0
4        1
        ..
32411    0
32412    0
32413    0
32414    0
32415    0
Name: loan_default, Length: 32416, dtype: int64

In [48]:
X_train,X_test,y_train,y_test  =train_test_split(X,y,test_size=0.3,random_state=42)

# 4- Training Multiple Models

In [69]:
models = {
    "LogisticRegression": LogisticRegression(solver="saga", penalty="elasticnet", max_iter=10),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "CatBoostClassifier": CatBoostClassifier(verbose=0),
    "XGBClassifier": XGBClassifier(eval_metric="logloss", use_label_encoder=False),
    "KNeighborsClassifier": KNeighborsClassifier()
}

params = {
    "LogisticRegression": {
        "l1_ratio": [0.0, 0.5, 1.0],
        "C": [0.1, 1, 10]
    },
    "AdaBoostClassifier": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 1]
    },
    "RandomForestClassifier": {
        "n_estimators": [100, 200],
        "max_depth": [10, 30, 50],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2]
    },
    "CatBoostClassifier": {
        "iterations": [100, 200],
        "depth": [4, 6, 10],
        "learning_rate": [0.01, 0.1]
    },
    "XGBClassifier": {
        "n_estimators": [100, 200],
        "max_depth": [3, 6, 10],
        "learning_rate": [0.01, 0.1],
        "subsample": [0.8, 1]
    },
    "KNeighborsClassifier": {
        "n_neighbors": [5, 10, 15],
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan"]
    }
}


In [120]:
from sklearn.model_selection import GridSearchCV

best_models = {}

for name, model in models.items():
    print(f"\n Running GridSearchCV for: {name}")
    grid = GridSearchCV(estimator=model, param_grid=params[name], cv=2, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train, y_train)
    best_models[name] = grid.best_estimator_
    print(f"✅ Best Score: {grid.best_score_:.4f}")
    print(f"✅ Best Params: {grid.best_params_}")



 Running GridSearchCV for: LogisticRegression




✅ Best Score: 0.8246
✅ Best Params: {'C': 0.1, 'l1_ratio': 0.0}

 Running GridSearchCV for: AdaBoostClassifier




✅ Best Score: 0.8316
✅ Best Params: {'learning_rate': 1, 'n_estimators': 200}

 Running GridSearchCV for: RandomForestClassifier
✅ Best Score: 0.8305
✅ Best Params: {'max_depth': 50, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}

 Running GridSearchCV for: CatBoostClassifier
✅ Best Score: 0.8307
✅ Best Params: {'depth': 10, 'iterations': 100, 'learning_rate': 0.1}

 Running GridSearchCV for: XGBClassifier


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

✅ Best Score: 0.8314
✅ Best Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}

 Running GridSearchCV for: KNeighborsClassifier
✅ Best Score: 0.8232
✅ Best Params: {'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'uniform'}


In [122]:
best_cat = best_models["CatBoostClassifier"]
best_xgb = best_models["XGBClassifier"]
best_rf  = best_models["RandomForestClassifier"]


In [125]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('clf', RandomForestClassifier(max_depth=50, n_estimators=200))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)


# 6 -Evaulation Model Performance

In [127]:
print(f"accuracy_score:{accuracy_score(y_test,y_pred)}")
print(f"f1:{f1_score(y_test,y_pred)}")
print(f"classification_report:{classification_report(y_test,y_pred)}")
print(f"confusion_matrix:{confusion_matrix(y_test,y_pred)}")

accuracy_score:0.8190231362467867
f1:0.5582329317269076
classification_report:              precision    recall  f1-score   support

           0       0.91      0.86      0.89      7974
           1       0.50      0.64      0.56      1751

    accuracy                           0.82      9725
   macro avg       0.71      0.75      0.72      9725
weighted avg       0.84      0.82      0.83      9725

confusion_matrix:[[6853 1121]
 [ 639 1112]]
