Data - pandas, numpy

Visualisation - matplotlib, seaborn

Encoding - OneHotEncoder, LabelEncoder

Pipelining - imblearn.Pipeline, sklearn.compose.ColumnTransformer

Scaling - StandardScaler

PCA - PCA

Resampling - imblearn.under_sampling.RandomUnderSampler()

Model selection - sklearn models, xgboost, lightgbm

Hyperparameter tuning - GridSearchCV

Ensembling - sklearn.ensemble.VotingClassifier

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('dataset/loan_dataset_2.csv')
data = data.drop(columns=['LoanID'], axis=1)
# data
X = data.drop(columns=['Default'], axis=1)
y = data['Default']
# X
# y
# data
data['Education'].value_counts

<bound method IndexOpsMixin.value_counts of 0          Bachelor's
1            Master's
2            Master's
3         High School
4          Bachelor's
             ...     
255342     Bachelor's
255343    High School
255344    High School
255345    High School
255346     Bachelor's
Name: Education, Length: 255347, dtype: object>

In [3]:
data

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255342,19,37979,210682,541,109,4,14.11,12,0.85,Bachelor's,Full-time,Married,No,No,Other,No,0
255343,32,51953,189899,511,14,2,11.55,24,0.21,High School,Part-time,Divorced,No,No,Home,No,1
255344,56,84820,208294,597,70,3,5.29,60,0.50,High School,Self-employed,Married,Yes,Yes,Auto,Yes,0
255345,42,85109,60575,809,40,1,20.90,48,0.44,High School,Part-time,Single,Yes,Yes,Other,No,0


In [4]:
numerical_columns=['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio']
ordinal_columns=['Education']
nominal_columns=['EmploymentType', 'MaritalStatus', 'LoanPurpose']
binary_columns=['HasMortgage', 'HasDependents', 'HasCoSigner']

For ordinal_columns - Ordinal Encoder
For nominal_columns - OneHotEncoder
for binary_columns - OneHotEncoder, drop='if_binary'
for numerical_columns - StandardScaler()

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
from sklearn.compose import ColumnTransformer

education_order = [
    "High School",
    "Bachelor's",
    "Master's",
    "PhD"
]

preprocessor = ColumnTransformer([
    ('ordinal', OrdinalEncoder(categories=[education_order]), ordinal_columns),
    ('nominal', OneHotEncoder(handle_unknown='ignore', sparse_output=False), nominal_columns),
    ('binary', OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False), binary_columns),
    ('scaling', StandardScaler(), numerical_columns),
])

In [8]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.under_sampling import RandomUnderSampler

#models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [9]:
# model = Pipeline([
#     ('preprocessing', preprocessor),
#     ('oversampling', ),
#     # ('pca', PCA(n_components=0.95)),
#     # ('model', RandomForestClassifier(
#     #     n_estimators=300,
#     #     random_state=42,
#     #     max_depth=5,
#     #     class_weight='balanced',
#     #     n_jobs=-1
#     # ))
#     ('model', DecisionTreeClassifier())
# ])
# model = SVC(gamma='auto')
# model = LogisticRegression()
processed_data = Pipeline([
    ('preprocessing', preprocessor),
    ('undersampling', RandomUnderSampler(
        random_state=42
    )),
    # ('PCA', PCA(n_components=5, random_state=42))
])

In [10]:
y = data['Default']
X = data.drop(['Default'], axis=1)
# X = data

In [11]:
pca = PCA(
    n_components=5,
    random_state=42,
)
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X, y ,test_size=0.2,random_state=42, stratify=y)

X_train_undersampled, y_train_undersampled = processed_data.fit_resample(X_train_raw, y_train_raw)

X_train = pca.fit_transform(X_train_undersampled)
X_test = pca.transform(preprocessor.transform(X_test_raw))

y_train = y_train_undersampled
y_test = y_test_raw

print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)

X_train shape:  (47444, 5)
y_train shape:  (47444,)
X_test shape:  (51070, 5)
y_test shape:  (51070,)


In [12]:
models = {
    'LogisticRegression' : LogisticRegression(),
    'RandomForestClassifier' : RandomForestClassifier(
        max_depth= 4,
        n_estimators= 100,
        n_jobs= -1,
    ),
    'DecisionTreeClassifier' : DecisionTreeClassifier(
        max_depth= 2,
    ),
    'XGBoost' : XGBClassifier(
        n_jobs= -1,
        n_estimators=100,
    ),
    'LightGBM' : LGBMClassifier(
        n_jobs= -1,
    ),
    'SVM' : SVC(),
    'MLPClassifier' : MLPClassifier(),
    'KNeighborsClassifier' : KNeighborsClassifier(),
}

In [None]:
from sklearn.model_selection import cross_val_score

res_train, res_cross_val = [], []
for item in models:
    model = models[item]
    model.fit(X_train, y_train)
    res_train.append(model.score(X_train, y_train))
    res_cross_val.append(cross_val_score(model, X_train, y_train, scoring='precision').mean())

pd.DataFrame({'Model' : models.keys(), 'Training score' : res_train, 'Cross_val_score_mean' : res_cross_val})

[LightGBM] [Info] Number of positive: 23722, number of negative: 23722
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001045 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 47444, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 18978, number of negative: 18977
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000423 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 37955, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500013 -> initscore=0.000053
[LightGBM] [Info] Start training from score 0.000053




[LightGBM] [Info] Number of positive: 18978, number of negative: 18977
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000518 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 37955, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500013 -> initscore=0.000053
[LightGBM] [Info] Start training from score 0.000053
[LightGBM] [Info] Number of positive: 18977, number of negative: 18978
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000470 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 37955, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499987 -> initscore=-0.000053
[LightGBM] [Info] Start training from score -0.000053




[LightGBM] [Info] Number of positive: 18977, number of negative: 18978
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000523 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 37955, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499987 -> initscore=-0.000053
[LightGBM] [Info] Start training from score -0.000053
[LightGBM] [Info] Number of positive: 18978, number of negative: 18978
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000400 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 37956, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




____Ensembling KNN and XGBoost____

In [None]:
# # After fitting the pipeline
# smote_step = model.named_steps['oversampling']

# # Get the resampled X and y from the SMOTE step
# X_resampled, y_resampled = smote_step.fit_resample(
#     model.named_steps['preprocessing'].transform(X_train),  # after preprocessing
#     y_train
# )

# print("After RandomUnderSampling — class distribution:")
# print(pd.Series(y_resampled).value_counts())

In [None]:
y_pred = model.predict(X_test)
y_pred_train = model.predict(X_train)

Metrics - cross_val_score, accuracy_score, precision_score, f1_score,

In [None]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, classification_report, zero_one_loss
from sklearn.model_selection import cross_val_score

In [None]:
accuracy_test = accuracy_score(y_test, y_pred)
accuracy_train = accuracy_score(y_train, y_pred_train)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Training accuracy: ", accuracy_train)
print("Testing accuracy: ", accuracy_test)
print("Precision: ", precision)
print("F1: ", f1)


from sklearn.metrics import classification_report, f1_score, roc_auc_score, recall_score

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, digits=3))

# Macro-averaged F1 (treats both classes equally)
print("F1-macro :", f1_score(y_test, y_pred, average='macro'))
print("Recall :", recall_score(y_test, y_pred))
print("Zero_One_loss :", zero_one_loss(y_test, y_pred))
# print("AUC      :", roc_auc_score(y_test, model.predict_proba(X_test)[:,1]))

In [None]:
# def debug_pipeline(pipe, X):
#     """Print shape + feature names after every step."""
#     X_cur = X.copy()
#     print("\n=== PIPELINE DEBUG ===")
#     for name, step in pipe.named_steps.items():
#         # Fit the step (if not already fitted)
#         if hasattr(step, "fit_transform"):
#             X_cur = step.fit_transform(X_cur) if name != 'clf' else X_cur
#         elif hasattr(step, "fit_resample"):
#             X_cur, _ = step.fit_resample(X_cur, y_train)
#         else:
#             continue

#         # After pre-processing we can get names
#         if name == 'prep':
#             names = step.get_feature_names_out()
#             print(f"After '{name}': shape {X_cur.shape} → {len(names)} columns")
#             print("  Sample names:", names[:8], "...")
#         else:
#             print(f"After '{name}': shape {X_cur.shape}")

# debug_pipeline(model, X_train)
# kept_names = preprocessor.get_feature_names_out()
# original_cols = X_train.columns.tolist()

# dropped = [c for c in original_cols if c not in [n.split('__')[-1] for n in kept_names]]
# print("\nColumns that were **dropped** by ColumnTransformer:", dropped)

In [None]:
# prep = model.named_steps['preprocessing']
# kept_names = prep.get_feature_names_out()
# original_cols = X_train.columns.tolist()

# dropped = [c for c in original_cols if c not in [n.split('__')[-1] for n in kept_names]]
# print("\nColumns that were **dropped** by ColumnTransformer:", dropped)