In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.inspection import permutation_importance
import joblib

In [88]:
DATASET_PATH = "../data/Loan_Default.csv"

MAP_OPEN_CREDIT      = {'opc': 1, 'nopc': 0}
MAP_NEG_AMMO         = {'neg_amm': 1, 'not_neg': 0}
MAP_INTEREST_ONLY    = {'int_only': 1, 'not_int': 0}
MAP_LUMP_SUM_PAYMENT = {'lpsm': 1, 'not_lpsm': 0}
MAP_AGE              = {'<25': 0, '25-34': 1, '35-44': 2, '45-54': 3, '55-64': 4, '65-74': 5, '>74': 6 }
MAP_REGION           = {'south':0, 'North': 1, 'central': 2, 'North-East': 3}
MAP_BS_OR_COMM       = {'nob/c': 0, 'b/c': 1}
MAP_OCC_TYPE         = {'pr': 0, 'sr': 1, 'ir': 2}
MAP_SECURED_BY       = {'home': 0, 'land': 1}

LOW_QUANTILE = 0.10
HIGH_QUANTILE = 0.90
IRQ_COEFF = 3

# Data Processing

### Loading Dataset

In [89]:
ds = pd.read_csv(DATASET_PATH)
important_df =  ds[['loan_amount', 'rate_of_interest', 'Interest_rate_spread', 'Upfront_charges',
                    'term', 'property_value', 'LTV', 'Credit_Score', 'income', 'dtir1',
                    'open_credit', 'Neg_ammortization', 'interest_only', 'lump_sum_payment',
                    'age', 'Region', 'business_or_commercial', 'occupancy_type', 'Secured_by',
                    'Status']]

important_df.loc[:, 'open_credit']            = important_df['open_credit'].map(MAP_OPEN_CREDIT)
important_df.loc[:, 'Neg_ammortization']      = important_df['Neg_ammortization'].map(MAP_NEG_AMMO)
important_df.loc[:, 'interest_only']          = important_df['interest_only'].map(MAP_INTEREST_ONLY)
important_df.loc[:, 'lump_sum_payment']       = important_df['lump_sum_payment'].map(MAP_LUMP_SUM_PAYMENT)
important_df.loc[:, 'age']                    = important_df['age'].map(MAP_AGE)
important_df.loc[:, 'Region']                 = important_df['Region'].map(MAP_REGION)
important_df.loc[:, 'business_or_commercial'] = important_df['business_or_commercial'].map(MAP_BS_OR_COMM)
important_df.loc[:, 'occupancy_type']         = important_df['occupancy_type'].map(MAP_OCC_TYPE)
important_df.loc[:, 'Secured_by']             = important_df['Secured_by'].map(MAP_SECURED_BY)

important_df = important_df.apply(lambda col: pd.to_numeric(col, errors='coerce') if col.dtype == 'object' else col)

important_df.drop(columns=['LTV', 'dtir1', 'Interest_rate_spread'], inplace=True)

In [90]:
important_df.head(5)

Unnamed: 0,loan_amount,rate_of_interest,Upfront_charges,term,property_value,Credit_Score,income,open_credit,Neg_ammortization,interest_only,lump_sum_payment,age,Region,business_or_commercial,occupancy_type,Secured_by,Status
0,116500,,,360.0,118000.0,758,1740.0,0,0.0,0,0,1.0,0,0,0,0,1
1,206500,,,360.0,,552,4980.0,0,0.0,0,1,4.0,1,1,0,0,1
2,406500,4.56,595.0,360.0,508000.0,834,9480.0,0,1.0,0,0,2.0,0,0,0,0,0
3,456500,4.25,,360.0,658000.0,587,11880.0,0,0.0,0,0,3.0,1,0,0,0,0
4,696500,4.0,0.0,360.0,758000.0,602,10440.0,0,0.0,0,0,1.0,1,0,0,0,0


### Handling NaN values

In [91]:
missing = list()
for x in important_df.columns:
    if important_df[x].isnull().sum() != 0:
        print(f"{x:<30}{important_df[x].isnull().sum():<10}{(important_df[x].isnull().sum() / important_df.shape[0])*100}%")
        missing.append(x)

rate_of_interest              36439     24.509988565278807%
Upfront_charges               39642     26.664424564471652%
term                          41        0.027577856998722002%
property_value                15098     10.15537768211475%
income                        9150      6.154570525324544%
Neg_ammortization             121       0.08138830967915517%
age                           200       0.13452613170108293%


In [92]:
for col in missing:
    if col == 'Neg_ammortization' or col == 'age':
        important_df[col].fillna(important_df[col].mode()[0], inplace=True)
    else:
        important_df[col].fillna(important_df[col].median(), inplace=True)

print(important_df.isnull().sum())

loan_amount               0
rate_of_interest          0
Upfront_charges           0
term                      0
property_value            0
Credit_Score              0
income                    0
open_credit               0
Neg_ammortization         0
interest_only             0
lump_sum_payment          0
age                       0
Region                    0
business_or_commercial    0
occupancy_type            0
Secured_by                0
Status                    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  important_df[col].fillna(important_df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  important_df[col].fillna(important_df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermedia

In [93]:
columns_to_convert = ['Neg_ammortization', 'age']
important_df[columns_to_convert] = important_df[columns_to_convert].astype(int)

print(important_df.dtypes)

loan_amount                 int64
rate_of_interest          float64
Upfront_charges           float64
term                      float64
property_value            float64
Credit_Score                int64
income                    float64
open_credit                 int64
Neg_ammortization           int64
interest_only               int64
lump_sum_payment            int64
age                         int64
Region                      int64
business_or_commercial      int64
occupancy_type              int64
Secured_by                  int64
Status                      int64
dtype: object


In [94]:
important_df.head(5)

Unnamed: 0,loan_amount,rate_of_interest,Upfront_charges,term,property_value,Credit_Score,income,open_credit,Neg_ammortization,interest_only,lump_sum_payment,age,Region,business_or_commercial,occupancy_type,Secured_by,Status
0,116500,3.99,2596.45,360.0,118000.0,758,1740.0,0,0,0,0,1,0,0,0,0,1
1,206500,3.99,2596.45,360.0,418000.0,552,4980.0,0,0,0,1,4,1,1,0,0,1
2,406500,4.56,595.0,360.0,508000.0,834,9480.0,0,1,0,0,2,0,0,0,0,0
3,456500,4.25,2596.45,360.0,658000.0,587,11880.0,0,0,0,0,3,1,0,0,0,0
4,696500,4.0,0.0,360.0,758000.0,602,10440.0,0,0,0,0,1,1,0,0,0,0


### Detecting anomalies

In [95]:
def detect_anomalies(df, column):
    Q1 = df[column].quantile(LOW_QUANTILE)
    Q3 = df[column].quantile(HIGH_QUANTILE)
    IQR = Q3 - Q1
    lower_bound = Q1 - IRQ_COEFF * IQR
    upper_bound = Q3 + IRQ_COEFF * IQR
    anomalies = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    return anomalies

numerical_columns = ['loan_amount', 'rate_of_interest', 'Upfront_charges', 'term', 'property_value', 'Credit_Score', 'income']
for col in numerical_columns:
    anomalies = detect_anomalies(important_df, col)
    print(f"Anomalies in {col}:")
    print(anomalies.shape[0])

Anomalies in loan_amount:
50
Anomalies in rate_of_interest:
0
Anomalies in Upfront_charges:
33
Anomalies in term:
0
Anomalies in property_value:
344
Anomalies in Credit_Score:
0
Anomalies in income:
484


In [96]:
def replace_anomalies_with_minmax_values(df, column):
    sorted_values = df[column].sort_values()
    Q1 = sorted_values.quantile(LOW_QUANTILE)
    Q3 = sorted_values.quantile(HIGH_QUANTILE)
    IQR = Q3 - Q1
    lower_bound = Q1 - IRQ_COEFF * IQR
    upper_bound = Q3 + IRQ_COEFF * IQR

    min_real_value = sorted_values[sorted_values >= lower_bound].min()
    max_real_value = sorted_values[sorted_values <= upper_bound].max()

    df.loc[df[column] < lower_bound, column] = min_real_value
    df.loc[df[column] > upper_bound, column] = max_real_value

numerical_columns = ['loan_amount', 'rate_of_interest', 'Upfront_charges', 'term', 'property_value', 'Credit_Score', 'income']
for col in numerical_columns:
    replace_anomalies_with_minmax_values(important_df, col)

print("Anomalies replaced with real values. Updated DataFrame:")
important_df.head(5)

Anomalies replaced with real values. Updated DataFrame:


Unnamed: 0,loan_amount,rate_of_interest,Upfront_charges,term,property_value,Credit_Score,income,open_credit,Neg_ammortization,interest_only,lump_sum_payment,age,Region,business_or_commercial,occupancy_type,Secured_by,Status
0,116500,3.99,2596.45,360.0,118000.0,758,1740.0,0,0,0,0,1,0,0,0,0,1
1,206500,3.99,2596.45,360.0,418000.0,552,4980.0,0,0,0,1,4,1,1,0,0,1
2,406500,4.56,595.0,360.0,508000.0,834,9480.0,0,1,0,0,2,0,0,0,0,0
3,456500,4.25,2596.45,360.0,658000.0,587,11880.0,0,0,0,0,3,1,0,0,0,0
4,696500,4.0,0.0,360.0,758000.0,602,10440.0,0,0,0,0,1,1,0,0,0,0


### Scaling

In [97]:
def scale_columns(df, columns):
    for col in columns:
        mean = df[col].mean()
        std = df[col].std()
        df[col] = (df[col] - mean) / std

if False:
    numerical_columns = ['loan_amount', 'rate_of_interest', 'Upfront_charges', 'term', 'property_value', 'Credit_Score', 'income']
    scale_columns(important_df, numerical_columns)

    print("Numerical columns scaled using z-score normalization:")
    important_df.head(5)

# Model Training

In [98]:
status_counts = important_df['Status'].value_counts()
print("Count of rows where Status is 1:", status_counts.get(1, 0))
print("Count of rows where Status is 0:", status_counts.get(0, 0))

Count of rows where Status is 1: 36639
Count of rows where Status is 0: 112031


In [99]:
X = important_df.drop(columns=['Status'])
y = important_df['Status']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

models = {
    'Logistic Regression': Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(solver='liblinear'))
    ]),
    'Naive Bayes': Pipeline([
        ('model', GaussianNB())
    ]),
    'SVM': Pipeline([
        ('scaler', StandardScaler()),
        ('model', SVC(kernel='rbf', probability=True))
    ])
}

In [100]:
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("\nCross-Validation Results on Training Data:\n")
for name, pipeline in models.items():
    print(f"Model: {name}")
    results = cross_validate(pipeline, X_train, y_train, cv=cv, scoring=scoring)
    for metric in scoring:
        print(f"{metric:10s}: {np.mean(results['test_' + metric]):.4f}")
    print()


Cross-Validation Results on Training Data:

Model: Logistic Regression
accuracy  : 0.7750
precision : 0.6907
recall    : 0.1574
f1        : 0.2563
roc_auc   : 0.6906

Model: Naive Bayes
accuracy  : 0.9274
precision : 0.7782
recall    : 0.9866
f1        : 0.8701
roc_auc   : 0.9738

Model: SVM
accuracy  : 0.9351
precision : 0.8099
recall    : 0.9628
f1        : 0.8798
roc_auc   : 0.9865



In [102]:
print("\nFinal Evaluation on Validation Set:\n")
for name, pipeline in models.items():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)
    y_proba = pipeline.predict_proba(X_val)[:, 1] if hasattr(pipeline.named_steps['model'], 'predict_proba') else None

    acc  = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec  = recall_score(y_val, y_pred)
    f1   = f1_score(y_val, y_pred)
    roc  = roc_auc_score(y_val, y_proba) if y_proba is not None else None

    print(f"Model: {name}")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1 Score : {f1:.4f}")
    if roc is not None:
        print(f"ROC AUC  : {roc:.4f}")
    print()


Final Evaluation on Validation Set:

Model: Logistic Regression
Accuracy : 0.7793
Precision: 0.7173
Recall   : 0.1728
F1 Score : 0.2785
ROC AUC  : 0.6928

Model: Naive Bayes
Accuracy : 0.9259
Precision: 0.7743
Recall   : 0.9869
F1 Score : 0.8678
ROC AUC  : 0.9724

Model: SVM
Accuracy : 0.9371
Precision: 0.8106
Recall   : 0.9720
F1 Score : 0.8840
ROC AUC  : 0.9870



### Saving models

In [104]:
for name, pipeline in models.items():
    filename = f"{name.lower().replace(' ', '_')}_model.pkl"
    joblib.dump(pipeline, filename)
    print(f"Saved {name} to {filename}\n")

Saved Logistic Regression to logistic_regression_model.pkl

Saved Naive Bayes to naive_bayes_model.pkl

Saved SVM to svm_model.pkl



# Feature Importance

In [85]:
log_model = models['Logistic Regression'].named_steps['model']
feature_names = X_train.columns

coeffs = log_model.coef_[0]
logreg_importance = pd.DataFrame({
    'Feature': feature_names,
    'Importance': coeffs,
    'Abs_Importance': np.abs(coeffs)
}).sort_values('Abs_Importance', ascending=False)

print("Top Logistic Regression Features:")
print(logreg_importance.head(20))

Top Logistic Regression Features:
                   Feature  Importance  Abs_Importance
4           property_value   -0.415809        0.415809
0              loan_amount    0.361626        0.361626
8        Neg_ammortization    0.351474        0.351474
10        lump_sum_payment    0.339224        0.339224
2          Upfront_charges   -0.292692        0.292692
1         rate_of_interest   -0.224170        0.224170
6                   income   -0.219272        0.219272
11                     age    0.182858        0.182858
14          occupancy_type    0.174710        0.174710
13  business_or_commercial    0.115967        0.115967
3                     term   -0.079339        0.079339
9            interest_only    0.070162        0.070162
15              Secured_by    0.066425        0.066425
7              open_credit    0.047593        0.047593
12                  Region   -0.040926        0.040926
5             Credit_Score   -0.017689        0.017689


In [None]:
nb_model = models['Naive Bayes'].named_steps['model']
class_means = nb_model.theta_

mean_diff = class_means[1] - class_means[0]
nb_importance = pd.DataFrame({
    'Feature': feature_names,
    'Importance': mean_diff,
    'Abs_Importance': np.abs(mean_diff)
}).sort_values('Abs_Importance', ascending=False)

print("Top Naive Bayes Features:")
print(nb_importance.head(20))

Top Naive Bayes Features:
                   Feature    Importance  Abs_Importance
4           property_value -66369.567259    66369.567259
0              loan_amount -16937.851638    16937.851638
6                   income   -972.076249      972.076249
2          Upfront_charges   -611.976775      611.976775
5             Credit_Score      0.839423        0.839423
3                     term     -0.237280        0.237280
11                     age      0.143266        0.143266
8        Neg_ammortization      0.109438        0.109438
13  business_or_commercial      0.074087        0.074087
10        lump_sum_payment      0.062635        0.062635
1         rate_of_interest     -0.052415        0.052415
12                  Region     -0.033523        0.033523
14          occupancy_type      0.030856        0.030856
9            interest_only      0.008086        0.008086
7              open_credit     -0.001445        0.001445
15              Secured_by      0.000785        0.000785


In [None]:
svm_pipeline = models['SVM']
result = permutation_importance(svm_pipeline, X_val, y_val, n_repeats=10, random_state=42, scoring='accuracy')

svm_importance = pd.DataFrame({
    'Feature': feature_names,
    'Importance': result.importances_mean,
    'Std': result.importances_std
}).sort_values('Importance', ascending=False)

print("Top SVM Features (via permutation importance):")
print(svm_importance.head(20))