In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from tabulate import tabulate
%matplotlib inline
import os
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    recall_score,
    precision_score,
    roc_auc_score,
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

In [2]:
data = pd.read_csv('C:/Users/Nikhil_Chamle/Desktop/Debt_collection_project/DataFile1.csv')

In [3]:
df = data.copy()

In [4]:
pd.set_option('display.max_columns', 50)

In [5]:
# Dropping columns

df = df.drop(['Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25'], axis = 1 )

In [6]:
# checking null values

df.isnull().sum()/len(df) * 100

EntityID                       0.000000
OriginalCreditor[Redacted]     0.000000
AccountID                      0.000000
CurrentBalance                 0.000000
DebtLoadPrincipal              0.000000
BalanceAtDebtLoad              0.000000
PurchasePrice                  0.662364
ProductOrDebtType              0.000000
CollectionStatus               0.000000
IsStatBarred                   0.000000
ClosureReason                 97.778177
InBankruptcy                   0.000000
AccountInsolvencyType         99.929876
CustomerInsolvencyType        97.900955
IsLegal                        0.000000
LastPaymentAmount             74.416556
LastPaymentMethod             74.416556
NumLiableParties               0.030018
CustomerAge                    7.254019
NumPhones                      0.000000
NumEmails                      0.000000
NumAddresses                   0.000000
dtype: float64

In [7]:
# checking duplicates

df.duplicated().sum()

0

In [8]:
#dropping columns with max null values

df = df.drop(['ClosureReason', 'AccountInsolvencyType', 'CustomerInsolvencyType', 'LastPaymentMethod', 'LastPaymentAmount'], axis = 1 )

In [9]:
# Converting columns to float
columns_to_convert = ['CurrentBalance', 'DebtLoadPrincipal', 'BalanceAtDebtLoad']

In [10]:
for column in columns_to_convert:
    df[column] = df[column].str.replace(',', '').astype(float)

In [11]:
df.isnull().sum()/len(df) * 100

EntityID                      0.000000
OriginalCreditor[Redacted]    0.000000
AccountID                     0.000000
CurrentBalance                0.000000
DebtLoadPrincipal             0.000000
BalanceAtDebtLoad             0.000000
PurchasePrice                 0.662364
ProductOrDebtType             0.000000
CollectionStatus              0.000000
IsStatBarred                  0.000000
InBankruptcy                  0.000000
IsLegal                       0.000000
NumLiableParties              0.030018
CustomerAge                   7.254019
NumPhones                     0.000000
NumEmails                     0.000000
NumAddresses                  0.000000
dtype: float64

In [12]:
df.describe()

Unnamed: 0,EntityID,AccountID,CurrentBalance,DebtLoadPrincipal,BalanceAtDebtLoad,PurchasePrice,NumLiableParties,CustomerAge,NumPhones,NumEmails,NumAddresses
count,406423.0,406423.0,406423.0,406423.0,406423.0,403731.0,406301.0,376941.0,406423.0,406423.0,406423.0
mean,39704430.0,395438000.0,1301.866266,1539.010928,1600.933847,5.618606,1.017145,45.728061,0.435652,0.208113,0.847354
std,46980700.0,465476900.0,4030.51371,4416.229311,4531.889319,5.441934,0.132866,12.950395,0.714852,0.434321,0.45969
min,160.0,3677.0,-7717.2,0.0,0.0,0.19,1.0,-41.0,0.0,0.0,0.0
25%,3010600.0,30230880.0,85.33,246.97,249.875,3.07,1.0,36.0,0.0,0.0,1.0
50%,3010949.0,30450750.0,457.51,619.0,630.74,4.22,1.0,44.0,0.0,0.0,1.0
75%,99901310.0,990189100.0,1159.365,1393.78,1433.755,6.59,1.0,54.0,1.0,0.0,1.0
max,99901590.0,990495800.0,441681.52,844343.0,844343.0,52.18,4.0,133.0,8.0,5.0,7.0


In [13]:
# converting negative age to positive

df['CustomerAge'] = df['CustomerAge'].abs()

In [14]:
# imputing null values in age column

mean_age = df['CustomerAge'].mean()
df['CustomerAge'].fillna(mean_age, inplace=True)

In [15]:
# dropping null rows

df.dropna(inplace=True)

In [16]:
# dropping insignificant columns for our analysis

df = df.drop(['EntityID', 'AccountID', 'OriginalCreditor[Redacted]'], axis = 1 )

In [17]:
# seperating columns on basis of dataypes and putting in a list

obj1 = []
int1 = []
flo1 = []

for i in df.columns:
    if df[i].dtypes == 'object':
        obj1.append(i)
    elif df[i].dtypes == 'int64':
        int1.append(i)
    elif df[i].dtypes == 'float':
        flo1.append(i)

In [18]:
# renaming column

df.rename(columns={'CurrentBalance': 'Debt_Pending'}, inplace=True)

In [19]:
#dropping columns

df = df.drop(['Debt_Pending', 'DebtLoadPrincipal', 'InBankruptcy', 'IsLegal'], axis = 1 )

In [20]:
# mapping values for statue barred column

df['IsStatBarred'] = df['IsStatBarred'].map({'Y': 1, 'N': 0})

In [21]:
df

Unnamed: 0,BalanceAtDebtLoad,PurchasePrice,ProductOrDebtType,CollectionStatus,IsStatBarred,NumLiableParties,CustomerAge,NumPhones,NumEmails,NumAddresses
0,1160.20,4.22,Other,PAID_IN_FULL,0,1.0,53.000000,0,0,1
1,182.90,4.22,Other,CANCELLED_WITHDRAWN,1,1.0,45.730273,0,0,1
2,538.57,4.22,Other,PAID_IN_FULL,0,1.0,45.730273,1,0,1
3,8279.50,4.22,Other,PASSIVE,1,1.0,45.730273,1,0,1
4,523.00,4.22,Other,PAID_IN_FULL,1,1.0,46.000000,2,0,1
...,...,...,...,...,...,...,...,...,...,...
406418,448.20,7.38,Finance Company - Other,ACTIVE,0,1.0,36.000000,1,1,1
406419,1678.37,7.38,Finance Company - Other,ACTIVE,0,1.0,37.000000,0,1,1
406420,3512.60,7.38,Finance Company - Other,ACTIVE,0,1.0,71.000000,1,1,1
406421,4477.31,7.38,Finance Company - Other,ACTIVE,0,1.0,38.000000,1,1,1


In [22]:
categorical_features = ['ProductOrDebtType', 'CollectionStatus']

In [23]:
# Applying one-hot encoding to the categorical columns
df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=True)

In [24]:
df = df_encoded.copy()

In [25]:
df

Unnamed: 0,BalanceAtDebtLoad,PurchasePrice,IsStatBarred,NumLiableParties,CustomerAge,NumPhones,NumEmails,NumAddresses,ProductOrDebtType_Credit Cards,ProductOrDebtType_Finance Company - Other,ProductOrDebtType_Hire Purchase,ProductOrDebtType_Loans,ProductOrDebtType_Other,ProductOrDebtType_Personal Loans,ProductOrDebtType_Residential Electricity,ProductOrDebtType_Store Cards,ProductOrDebtType_Utilities/Telco - Other,CollectionStatus_CANCELLED_WITHDRAWN,CollectionStatus_CLOSED,CollectionStatus_HOLDING,CollectionStatus_LEGAL,CollectionStatus_LEGAL_ARRANGEMENT,CollectionStatus_NON_COLLECTION,CollectionStatus_PAID_IN_FULL,CollectionStatus_PASSIVE,CollectionStatus_PENDING,CollectionStatus_SETTLED FOR LESS,CollectionStatus_UNDER_ARRANGEMENT
0,1160.20,4.22,0,1.0,53.000000,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,182.90,4.22,1,1.0,45.730273,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,538.57,4.22,0,1.0,45.730273,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,8279.50,4.22,1,1.0,45.730273,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,523.00,4.22,1,1.0,46.000000,2,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
406418,448.20,7.38,0,1.0,36.000000,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
406419,1678.37,7.38,0,1.0,37.000000,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
406420,3512.60,7.38,0,1.0,71.000000,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
406421,4477.31,7.38,0,1.0,38.000000,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [26]:
y = df['IsStatBarred'].astype(int)  # Target variable
X = df.drop(columns=['IsStatBarred']).astype(int)  # Features

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size =0.2)

In [28]:
# standardization 

from sklearn.preprocessing import StandardScaler

standard_scaler = StandardScaler()

X_train = standard_scaler.fit_transform(X_train)
X_test = standard_scaler.transform(X_test)

X_standardized = standard_scaler.fit_transform(X)

In [30]:
#model training and prediction

models = []
models.append(("LR", LogisticRegression()))
models.append(("LDA", LinearDiscriminantAnalysis()))
models.append(("KNN", KNeighborsClassifier()))
models.append(("NB", GaussianNB()))
models.append(("RF", RandomForestClassifier()))

results = []
names = []

for name, model in models:
    kfold = KFold(n_splits=10)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    msg = "%s Accuracy: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

    # Calculating training accuracy
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    print(f"Training Accuracy for {name}: {train_accuracy}")
    print()

    # Calculating test accuracy
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy for {name}: {test_accuracy}")

    # Calculating additional metrics
    print(f"Confusion Matrix for {name}:\n{confusion_matrix(y_test, y_pred)}")
    print()
    print(f"Classification Report for {name}:\n{classification_report(y_test, y_pred)}")
    print()
    print(f"Recall for {name}: {recall_score(y_test, y_pred)}")
    print()
    print(f"Precision for {name}: {precision_score(y_test, y_pred)}")
    print()
    print(f"ROC AUC for {name}: {roc_auc_score(y_test, y_pred)}")
    print("----------------------------------------------------------------------------")

LR Accuracy: 0.949187 (0.001183)
Training Accuracy for LR: 0.94925484997894

Test Accuracy for LR: 0.9502743951537975
Confusion Matrix for LR:
[[21515  2632]
 [ 1382 55194]]

Classification Report for LR:
              precision    recall  f1-score   support

           0       0.94      0.89      0.91     24147
           1       0.95      0.98      0.96     56576

    accuracy                           0.95     80723
   macro avg       0.95      0.93      0.94     80723
weighted avg       0.95      0.95      0.95     80723


Recall for LR: 0.9755726809954751

Precision for LR: 0.9544841420814166

ROC AUC for LR: 0.9332868167473752
----------------------------------------------------------------------------
LDA Accuracy: 0.941568 (0.001353)
Training Accuracy for LDA: 0.9415617799360769

Test Accuracy for LDA: 0.9426557486713824
Confusion Matrix for LDA:
[[20484  3663]
 [  966 55610]]

Classification Report for LDA:
              precision    recall  f1-score   support

           0   