## Importing cleaned data

In [86]:
import pandas as pd
loans = pd.read_csv("loans_cleaned.csv")

In [87]:
loans = loans.drop(columns = ["monthly_utilization", "age", "NumberOfTime30-59DaysPastDueNotWorse", "DebtRatio", "MonthlyIncome", "NumberOfOpenCreditLinesAndLoans", "NumberOfTimes90DaysLate", "NumberRealEstateLoansOrLines", "NumberOfTime60-89DaysPastDueNotWorse", "NumberOfDependents"])

In [88]:
loans.sample(10)

Unnamed: 0,Sr_No,SeriousDlqin2yrs,util_new,num_30_59_dpd_new,MonthlyIncome_ind,MonthlyIncome_new,age_new,DebtRatio_new,Open_Credit_lines_new,num_90_dpd_new,Real_estate_loans_new,num_60_89_dpd_new,NumberOfDependents_new
137251,137252,0,0.063825,0,1,12000.0,36,0.270811,8,0,2,0,1.0
94315,94316,0,0.014222,0,0,5400.0,52,0.366508,7,0,0,0,0.0
47225,47226,0,0.229548,4,1,7240.0,52,0.223174,8,0,2,0,0.0
139529,139530,0,0.015524,0,1,3000.0,70,0.007997,5,0,0,0,0.0
117148,117149,0,0.732174,0,1,12583.0,56,0.394787,15,0,3,0,0.0
115006,115007,0,0.063596,0,1,9100.0,61,0.337545,15,0,2,0,0.0
354,355,1,0.953344,2,0,5400.0,48,0.366508,5,1,2,2,0.0
5968,5969,0,0.022763,0,1,14166.0,51,0.177102,8,0,2,0,1.0
36546,36547,0,0.889421,0,1,8000.0,37,0.338083,12,0,1,0,2.0
107882,107883,0,0.0,0,0,5400.0,45,0.366508,6,0,1,0,0.0


In [89]:
loans.isnull().sum()

Unnamed: 0,0
Sr_No,0
SeriousDlqin2yrs,0
util_new,0
num_30_59_dpd_new,0
MonthlyIncome_ind,0
MonthlyIncome_new,0
age_new,0
DebtRatio_new,0
Open_Credit_lines_new,0
num_90_dpd_new,0


## 3) Logistic Regression Model building

This model is easy to explain as we can clearly see the impact of each variable on the output 
-> So, due to it's interpretability, Logistic Regression is preferrred here 

In [90]:
import statsmodels.api as sm
import statsmodels.formula.api as smf 
import numpy as np
from sklearn.model_selection import train_test_split

X=loans.drop(["SeriousDlqin2yrs","Sr_No"],axis=1)
y=loans["SeriousDlqin2yrs"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=111)

In [92]:
model=sm.Logit(y_train, X_train).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.205680
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:       SeriousDlqin2yrs   No. Observations:               120000
Model:                          Logit   Df Residuals:                   119989
Method:                           MLE   Df Model:                           10
Date:                Mon, 23 Feb 2026   Pseudo R-squ.:                  0.1692
Time:                        18:17:12   Log-Likelihood:                -24682.
converged:                       True   LL-Null:                       -29710.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
util_new                   0.9453      0.033     28.617      0.000       0.881      

In [93]:
pred_train=model.predict(X_train)
pred_test=model.predict(X_test)

In [96]:
from sklearn.metrics import confusion_matrix,accuracy_score

# Confusion matrix and Accuracy on train data
confusion_matrix(y_train,np.round(pred_train))   # np.round is used to convert the predicted probabilities into binary outcomes (0 or 1) based on a threshold of 0.5
acc=accuracy_score(y_train,np.round(pred_train))
print("Train Accuracy", acc)

# Confusion matrix and Accuracy on test data
confusion_matrix(y_test,np.round(pred_test))
acc=accuracy_score(y_test,np.round(pred_test))
print("Test Accuracy", acc)

Train Accuracy 0.935075
Test Accuracy 0.9389333333333333


### Multicollinearity Checking

In [98]:
#Vif Function
import statsmodels.formula.api as sm

def vif_cal(input_data, dependent_col):
    x_vars=input_data.drop([dependent_col], axis=1)
    xvar_names=x_vars.columns
    for i in range(0,xvar_names.shape[0]):
        y=x_vars[xvar_names[i]]
        x=x_vars[xvar_names.drop(xvar_names[i])]
        rsq=sm.ols(formula="y~x", data=x_vars).fit().rsquared
        vif=round(1/(1-rsq),2)
        print(xvar_names[i], " VIF = " , vif)

In [99]:
vif_cal(loans.drop(["Sr_No"],axis=1),"SeriousDlqin2yrs")

util_new  VIF =  1.18
num_30_59_dpd_new  VIF =  1.31
MonthlyIncome_ind  VIF =  1.1
MonthlyIncome_new  VIF =  1.03
age_new  VIF =  1.14
DebtRatio_new  VIF =  1.41
Open_Credit_lines_new  VIF =  1.35
num_90_dpd_new  VIF =  1.23
Real_estate_loans_new  VIF =  1.49
num_60_89_dpd_new  VIF =  1.36
NumberOfDependents_new  VIF =  1.12


### Final Model

In [103]:
# Dropping DebtRatio_new based on the p-value

X=loans.drop(["SeriousDlqin2yrs","Sr_No", "DebtRatio_new","Open_Credit_lines_new"],axis=1)
y=loans["SeriousDlqin2yrs"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=111)

import statsmodels.api as sm
model=sm.Logit(y_train,X_train).fit()
print(model.summary())

train_pred=model.predict(X_train)
test_pred=model.predict(X_test)

train_accuracy=accuracy_score(y_train,np.round(train_pred))
test_accuracy=accuracy_score(y_test,np.round(test_pred))


print("Train Accuracy", train_accuracy)
print("Test Accuracy", test_accuracy)


Optimization terminated successfully.
         Current function value: 0.205689
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:       SeriousDlqin2yrs   No. Observations:               120000
Model:                          Logit   Df Residuals:                   119991
Method:                           MLE   Df Model:                            8
Date:                Mon, 23 Feb 2026   Pseudo R-squ.:                  0.1692
Time:                        18:36:34   Log-Likelihood:                -24683.
converged:                       True   LL-Null:                       -29710.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
util_new                   0.9364      0.032     29.437      0.000       0.874      

## 4) Model Validation & Class Imbalance

In [104]:
#Classwise Accuracy
loans["SeriousDlqin2yrs"].value_counts()/len(loans)

Unnamed: 0_level_0,count
SeriousDlqin2yrs,Unnamed: 1_level_1
0,0.93316
1,0.06684


In [105]:
train_cm=confusion_matrix(y_train,np.round(train_pred))
test_cm=confusion_matrix(y_test,np.round(test_pred))

print("Train Confusion Matrix\n", train_cm)
print("Test Confusion Matrix\n", test_cm)

Train Confusion Matrix
 [[111012    867]
 [  6921   1200]]
Test Confusion Matrix
 [[27890   205]
 [ 1623   282]]


In [None]:
#Class-0 and Class-1 Accuracy
class_0_acc=train_cm[0,0]/(train_cm[0,0]+train_cm[0,1])
class_1_acc=train_cm[1,1]/(train_cm[1,0]+train_cm[1,1])

print("Class-0 Accuracy", class_0_acc)
print("Class-1 Accuracy", class_1_acc)

# Class 1 Accuracy is more important as we want to identify the defaulters correctly. 
# So, we will try to improve the Class-1 Accuracy by using SMOTE technique.

Class-0 Accuracy 0.9922505564046872
Class-1 Accuracy 0.1477650535648319


### Handling Class Imablance using SMOTE technique

In [120]:
from imblearn.over_sampling import SMOTE
smote=SMOTE(sampling_strategy=0.8, random_state=44)

X_train_smote, y_train_smote=smote.fit_resample(X_train,y_train)

import collections
print("Before SMOTE", collections.Counter(y_train))
print("After SMOTE", collections.Counter(y_train_smote))

Before SMOTE Counter({0: 111879, 1: 8121})
After SMOTE Counter({0: 111879, 1: 89503})


### Model with balanced data - SMOTE

In [121]:
model=sm.Logit(y_train_smote,X_train_smote).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.532253
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:       SeriousDlqin2yrs   No. Observations:               201382
Model:                          Logit   Df Residuals:                   201373
Method:                           MLE   Df Model:                            8
Date:                Mon, 23 Feb 2026   Pseudo R-squ.:                  0.2252
Time:                        18:58:55   Log-Likelihood:            -1.0719e+05
converged:                       True   LL-Null:                   -1.3834e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
util_new                   1.9518      0.015    129.920      0.000       1.922      

In [122]:
#Confusion matrix and Accuracy
train_pred=model.predict(X_train_smote)
test_pred=model.predict(X_test)

#confusion matrix
train_cm=confusion_matrix(y_train_smote,np.round(train_pred))
test_cm=confusion_matrix(y_test,np.round(test_pred))

print("Train Confusion Matrix\n", train_cm)
print("Test Confusion Matrix\n", test_cm)

Train Confusion Matrix
 [[90402 21477]
 [31338 58165]]
Test Confusion Matrix
 [[22635  5460]
 [  604  1301]]


In [123]:
#Class-0 and Class-1 Accuracy on training data
class_0_acc=train_cm[0,0]/(train_cm[0,0]+train_cm[0,1])
class_1_acc=train_cm[1,1]/(train_cm[1,0]+train_cm[1,1])

print("Class-0 Accuracy", class_0_acc)
print("Class-1 Accuracy", class_1_acc)

# Before SMOTE:
#Class-0 Accuracy 0.9922505564046872
#Class-1 Accuracy 0.1477650535648319

Class-0 Accuracy 0.8080336792427534
Class-1 Accuracy 0.6498664849222932


In [124]:
#Class-0 and Class-1 Accuracy on test data
class_0_acc=test_cm[0,0]/(test_cm[0,0]+test_cm[0,1])
class_1_acc=test_cm[1,1]/(test_cm[1,0]+test_cm[1,1])

print("Class-0 Accuracy", class_0_acc)
print("Class-1 Accuracy", class_1_acc)

Class-0 Accuracy 0.8056593699946609
Class-1 Accuracy 0.6829396325459317
