In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib

In [3]:
DATASET_PATH = "../data/Loan_Default.csv"

MAP_OPEN_CREDIT      = {'opc': 1, 'nopc': 0}
MAP_NEG_AMMO         = {'neg_amm': 1, 'not_neg': 0}
MAP_INTEREST_ONLY    = {'int_only': 1, 'not_int': 0}
MAP_LUMP_SUM_PAYMENT = {'lpsm': 1, 'not_lpsm': 0}
MAP_AGE              = {'<25': 0, '25-34': 1, '35-44': 2, '45-54': 3, '55-64': 4, '65-74': 5, '>74': 6 }
MAP_REGION           = {'south':0, 'North': 1, 'central': 2, 'North-East': 3}
MAP_BS_OR_COMM       = {'nob/c': 0, 'b/c': 1}
MAP_OCC_TYPE         = {'pr': 0, 'sr': 1, 'ir': 2}
MAP_SECURED_BY       = {'home': 0, 'land': 1}

LOW_QUANTILE = 0.10
HIGH_QUANTILE = 0.90
IRQ_COEFF = 3

# Data Processing

### Loading Dataset

In [4]:
ds = pd.read_csv(DATASET_PATH)
important_df =  ds[['loan_amount', 'rate_of_interest', 'Interest_rate_spread', 'Upfront_charges',
                    'term', 'property_value', 'LTV', 'Credit_Score', 'income', 'dtir1',
                    'open_credit', 'Neg_ammortization', 'interest_only', 'lump_sum_payment',
                    'age', 'Region', 'business_or_commercial', 'occupancy_type', 'Secured_by',
                    'Status']]

important_df.loc[:, 'open_credit']            = important_df['open_credit'].map(MAP_OPEN_CREDIT)
important_df.loc[:, 'Neg_ammortization']      = important_df['Neg_ammortization'].map(MAP_NEG_AMMO)
important_df.loc[:, 'interest_only']          = important_df['interest_only'].map(MAP_INTEREST_ONLY)
important_df.loc[:, 'lump_sum_payment']       = important_df['lump_sum_payment'].map(MAP_LUMP_SUM_PAYMENT)
important_df.loc[:, 'age']                    = important_df['age'].map(MAP_AGE)
important_df.loc[:, 'Region']                 = important_df['Region'].map(MAP_REGION)
important_df.loc[:, 'business_or_commercial'] = important_df['business_or_commercial'].map(MAP_BS_OR_COMM)
important_df.loc[:, 'occupancy_type']         = important_df['occupancy_type'].map(MAP_OCC_TYPE)
important_df.loc[:, 'Secured_by']             = important_df['Secured_by'].map(MAP_SECURED_BY)

important_df = important_df.apply(lambda col: pd.to_numeric(col, errors='coerce') if col.dtype == 'object' else col)

important_df.drop(columns=['LTV', 'dtir1', 'Interest_rate_spread'], inplace=True)

In [5]:
important_df.head(5)

Unnamed: 0,loan_amount,rate_of_interest,Upfront_charges,term,property_value,Credit_Score,income,open_credit,Neg_ammortization,interest_only,lump_sum_payment,age,Region,business_or_commercial,occupancy_type,Secured_by,Status
0,116500,,,360.0,118000.0,758,1740.0,0,0.0,0,0,1.0,0,0,0,0,1
1,206500,,,360.0,,552,4980.0,0,0.0,0,1,4.0,1,1,0,0,1
2,406500,4.56,595.0,360.0,508000.0,834,9480.0,0,1.0,0,0,2.0,0,0,0,0,0
3,456500,4.25,,360.0,658000.0,587,11880.0,0,0.0,0,0,3.0,1,0,0,0,0
4,696500,4.0,0.0,360.0,758000.0,602,10440.0,0,0.0,0,0,1.0,1,0,0,0,0


### Handling NaN values

In [6]:
missing = list()
for x in important_df.columns:
    if important_df[x].isnull().sum() != 0:
        print(f"{x:<30}{important_df[x].isnull().sum():<10}{(important_df[x].isnull().sum() / important_df.shape[0])*100}%")
        missing.append(x)

rate_of_interest              36439     24.509988565278807%
Upfront_charges               39642     26.664424564471652%
term                          41        0.027577856998722002%
property_value                15098     10.15537768211475%
income                        9150      6.154570525324544%
Neg_ammortization             121       0.08138830967915517%
age                           200       0.13452613170108293%


In [7]:
for col in missing:
    if col == 'Neg_ammortization' or col == 'age':
        important_df[col].fillna(important_df[col].mode()[0], inplace=True)
    else:
        important_df[col].fillna(important_df[col].median(), inplace=True)

print(important_df.isnull().sum())

loan_amount               0
rate_of_interest          0
Upfront_charges           0
term                      0
property_value            0
Credit_Score              0
income                    0
open_credit               0
Neg_ammortization         0
interest_only             0
lump_sum_payment          0
age                       0
Region                    0
business_or_commercial    0
occupancy_type            0
Secured_by                0
Status                    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  important_df[col].fillna(important_df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  important_df[col].fillna(important_df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermedia

In [8]:
columns_to_convert = ['Neg_ammortization', 'age']
important_df[columns_to_convert] = important_df[columns_to_convert].astype(int)

print(important_df.dtypes)

loan_amount                 int64
rate_of_interest          float64
Upfront_charges           float64
term                      float64
property_value            float64
Credit_Score                int64
income                    float64
open_credit                 int64
Neg_ammortization           int64
interest_only               int64
lump_sum_payment            int64
age                         int64
Region                      int64
business_or_commercial      int64
occupancy_type              int64
Secured_by                  int64
Status                      int64
dtype: object


In [9]:
important_df.head(5)

Unnamed: 0,loan_amount,rate_of_interest,Upfront_charges,term,property_value,Credit_Score,income,open_credit,Neg_ammortization,interest_only,lump_sum_payment,age,Region,business_or_commercial,occupancy_type,Secured_by,Status
0,116500,3.99,2596.45,360.0,118000.0,758,1740.0,0,0,0,0,1,0,0,0,0,1
1,206500,3.99,2596.45,360.0,418000.0,552,4980.0,0,0,0,1,4,1,1,0,0,1
2,406500,4.56,595.0,360.0,508000.0,834,9480.0,0,1,0,0,2,0,0,0,0,0
3,456500,4.25,2596.45,360.0,658000.0,587,11880.0,0,0,0,0,3,1,0,0,0,0
4,696500,4.0,0.0,360.0,758000.0,602,10440.0,0,0,0,0,1,1,0,0,0,0


### Detecting anomalies

In [10]:
def detect_anomalies(df, column):
    Q1 = df[column].quantile(LOW_QUANTILE)
    Q3 = df[column].quantile(HIGH_QUANTILE)
    IQR = Q3 - Q1
    lower_bound = Q1 - IRQ_COEFF * IQR
    upper_bound = Q3 + IRQ_COEFF * IQR
    anomalies = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    return anomalies

numerical_columns = ['loan_amount', 'rate_of_interest', 'Upfront_charges', 'term', 'property_value', 'Credit_Score', 'income']
for col in numerical_columns:
    anomalies = detect_anomalies(important_df, col)
    print(f"Anomalies in {col}:")
    print(anomalies.shape[0])

Anomalies in loan_amount:
50
Anomalies in rate_of_interest:
0
Anomalies in Upfront_charges:
33
Anomalies in term:
0
Anomalies in property_value:
344
Anomalies in Credit_Score:
0
Anomalies in income:
484


In [11]:
def replace_anomalies_with_minmax_values(df, column):
    sorted_values = df[column].sort_values()
    Q1 = sorted_values.quantile(LOW_QUANTILE)
    Q3 = sorted_values.quantile(HIGH_QUANTILE)
    IQR = Q3 - Q1
    lower_bound = Q1 - IRQ_COEFF * IQR
    upper_bound = Q3 + IRQ_COEFF * IQR

    min_real_value = sorted_values[sorted_values >= lower_bound].min()
    max_real_value = sorted_values[sorted_values <= upper_bound].max()

    df.loc[df[column] < lower_bound, column] = min_real_value
    df.loc[df[column] > upper_bound, column] = max_real_value

numerical_columns = ['loan_amount', 'rate_of_interest', 'Upfront_charges', 'term', 'property_value', 'Credit_Score', 'income']
for col in numerical_columns:
    replace_anomalies_with_minmax_values(important_df, col)

print("Anomalies replaced with real values. Updated DataFrame:")
important_df.head(5)

Anomalies replaced with real values. Updated DataFrame:


Unnamed: 0,loan_amount,rate_of_interest,Upfront_charges,term,property_value,Credit_Score,income,open_credit,Neg_ammortization,interest_only,lump_sum_payment,age,Region,business_or_commercial,occupancy_type,Secured_by,Status
0,116500,3.99,2596.45,360.0,118000.0,758,1740.0,0,0,0,0,1,0,0,0,0,1
1,206500,3.99,2596.45,360.0,418000.0,552,4980.0,0,0,0,1,4,1,1,0,0,1
2,406500,4.56,595.0,360.0,508000.0,834,9480.0,0,1,0,0,2,0,0,0,0,0
3,456500,4.25,2596.45,360.0,658000.0,587,11880.0,0,0,0,0,3,1,0,0,0,0
4,696500,4.0,0.0,360.0,758000.0,602,10440.0,0,0,0,0,1,1,0,0,0,0


# Hyperparams tuning

In [None]:
param_grid = {
    'model__var_smoothing': np.logspace(-12, -6, 7)
}

pipeline = Pipeline([
    ('model', GaussianNB())
])

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score)
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=scoring,
    refit='roc_auc',
    cv=cv,
    return_train_score=True,
    n_jobs=-1
)

In [13]:
X = important_df.drop(columns=['Status'])
y = important_df['Status']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

grid.fit(X_train, y_train)

print("The best hyperparams:")
print(grid.best_params_)
print(f"The best ROC AUC: {grid.best_score_:.4f}")

results_df = pd.DataFrame(grid.cv_results_)
print("\nTop results (ROC AUC):")
print(results_df.sort_values(by='mean_test_roc_auc', ascending=False)[[
    'param_model__var_smoothing', 'mean_test_accuracy',
    'mean_test_precision', 'mean_test_recall', 'mean_test_f1', 'mean_test_roc_auc'
]].head(20))

The best hyperparams:
{'model__var_smoothing': np.float64(1e-12)}
The best ROC AUC: 0.9612

Top results (ROC AUC):
   param_model__var_smoothing  mean_test_accuracy  mean_test_precision  \
0                1.000000e-12            0.947453             0.830688   
1                1.000000e-11            0.927931             0.779212   
2                1.000000e-10            0.927460             0.778146   
3                1.000000e-09            0.927244             0.777606   
4                1.000000e-08            0.926272             0.775222   
5                1.000000e-07            0.918216             0.756236   
6                1.000000e-06            0.879121             0.675687   

   mean_test_recall  mean_test_f1  mean_test_roc_auc  
0          0.988294      0.902649           0.961195  
1          0.987445      0.871043           0.947956  
2          0.987202      0.870282           0.947562  
3          0.987202      0.869944           0.947419  
4          0.9871

### Saving model

In [15]:
best_model = grid.best_estimator_
joblib.dump(best_model, 'best_naive_bayes_model.pkl')

['best_naive_bayes_model.pkl']