Data - pandas, numpy

Visualisation - matplotlib, seaborn

Encoding - OneHotEncoder, LabelEncoder

Pipelining - imblearn.Pipeline, sklearn.compose.ColumnTransformer

Scaling - StandardScaler

PCA - PCA

Resampling - imblearn.under_sampling.RandomUnderSampler()

Model selection - sklearn models, xgboost, lightgbm

Hyperparameter tuning - GridSearchCV

Ensembling - sklearn.ensemble.VotingClassifier

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data = pd.read_csv('dataset/loan_dataset_2.csv')
data = data.drop(columns=['LoanID'], axis=1)
# data
X = data.drop(columns=['Default'], axis=1)
y = data['Default']
# X
# y
# data
data['Education'].value_counts()

Education
Bachelor's     64366
High School    63903
Master's       63541
PhD            63537
Name: count, dtype: int64

In [4]:
numerical_columns=['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio']
ordinal_columns=['Education']
nominal_columns=['EmploymentType', 'MaritalStatus', 'LoanPurpose']
binary_columns=['HasMortgage', 'HasDependents', 'HasCoSigner']

For ordinal_columns - Ordinal Encoder
For nominal_columns - OneHotEncoder
for binary_columns - OneHotEncoder, drop='if_binary'
for numerical_columns - StandardScaler()

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
from sklearn.compose import ColumnTransformer

education_order = [
    "High School",
    "Bachelor's",
    "Master's",
    "PhD"
]

preprocessor = ColumnTransformer([
    ('ordinal', OrdinalEncoder(categories=[education_order]), ordinal_columns),
    ('nominal', OneHotEncoder(handle_unknown='ignore', sparse_output=False), nominal_columns),
    ('binary', OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False), binary_columns),
    ('scaling', StandardScaler(), numerical_columns),
])

In [8]:
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import ADASYN, SMOTE

In [9]:
model = Pipeline([
    ('preprocessing', preprocessor),
    ('oversampling', ADASYN()),
    ('pca', PCA(n_components=0.95)),
    ('model', RandomForestClassifier(
        n_estimators=300,
        random_state=42,
        max_depth=5,
        class_weight='balanced',
        n_jobs=-1
    ))
])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [11]:
model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessing', ...), ('oversampling', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('ordinal', ...), ('nominal', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,"[['High School', ""Bachelor's"", ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,'auto'
,drop,'if_binary'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,sampling_strategy,'auto'
,random_state,
,n_neighbors,5

0,1,2
,n_components,0.95
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [12]:
# After fitting the pipeline
smote_step = model.named_steps['oversampling']

# Get the resampled X and y from the SMOTE step
X_resampled, y_resampled = smote_step.fit_resample(
    model.named_steps['preprocessing'].transform(X_train),  # after preprocessing
    y_train
)

print("After SMOTE — class distribution:")
print(pd.Series(y_resampled).value_counts())

After SMOTE — class distribution:
Default
0    180524
1    176673
Name: count, dtype: int64


In [13]:
y_pred = model.predict(X_test)

Metrics - cross_val_score, accuracy_score, precision_score, f1_score,

In [14]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, classification_report
from sklearn.model_selection import cross_val_score

In [15]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("F1: ", f1)


from sklearn.metrics import classification_report, f1_score, roc_auc_score, recall_score

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, digits=3))

# Macro-averaged F1 (treats both classes equally)
print("F1-macro :", f1_score(y_test, y_pred, average='macro'))
print("Recall :", recall_score(y_test, y_pred))
print("AUC      :", roc_auc_score(y_test, model.predict_proba(X_test)[:,1]))

Accuracy:  0.6687879381241433
Precision:  0.2109985831977751
F1:  0.3222342428977842
              precision    recall  f1-score   support

           0      0.941     0.667     0.781     45170
           1      0.211     0.682     0.322      5900

    accuracy                          0.669     51070
   macro avg      0.576     0.674     0.552     51070
weighted avg      0.857     0.669     0.728     51070

F1-macro : 0.551539882937821
Recall : 0.6815254237288135
AUC      : 0.7324741616417076


In [None]:
def debug_pipeline(pipe, X):
    """Print shape + feature names after every step."""
    X_cur = X.copy()
    print("\n=== PIPELINE DEBUG ===")
    for name, step in pipe.named_steps.items():
        # Fit the step (if not already fitted)
        if hasattr(step, "fit_transform"):
            X_cur = step.fit_transform(X_cur) if name != 'clf' else X_cur
        elif hasattr(step, "fit_resample"):
            X_cur, _ = step.fit_resample(X_cur, y_train)
        else:
            continue

        # After pre-processing we can get names
        if name == 'prep':
            names = step.get_feature_names_out()
            print(f"After '{name}': shape {X_cur.shape} → {len(names)} columns")
            print("  Sample names:", names[:8], "...")
        else:
            print(f"After '{name}': shape {X_cur.shape}")

debug_pipeline(model, X_train)
kept_names = preprocessor.get_feature_names_out()
original_cols = X_train.columns.tolist()

dropped = [c for c in original_cols if c not in [n.split('__')[-1] for n in kept_names]]
print("\nColumns that were **dropped** by ColumnTransformer:", dropped)


=== PIPELINE DEBUG ===
After 'preprocessing': shape (204277, 25)


In [None]:
prep = model.named_steps['preprocessing']
kept_names = prep.get_feature_names_out()
original_cols = X_train.columns.tolist()

dropped = [c for c in original_cols if c not in [n.split('__')[-1] for n in kept_names]]
print("\nColumns that were **dropped** by ColumnTransformer:", dropped)