In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from tabulate import tabulate
%matplotlib inline
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    recall_score,
    precision_score,
    roc_auc_score,
)
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

In [2]:
data = pd.read_csv('C:/Users/Nikhil_Chamle/Desktop/P1/Collection prediction/Data.csv')

In [3]:
df = data.copy()

In [4]:
pd.set_option('display.max_columns', 50)

In [5]:
df.head(7)

Unnamed: 0,EntityID,OriginalCreditor[Redacted],AccountID,CurrentBalance,DebtLoadPrincipal,BalanceAtDebtLoad,PurchasePrice,ProductOrDebtType,CollectionStatus,IsStatBarred,ClosureReason,InBankruptcy,AccountInsolvencyType,CustomerInsolvencyType,IsLegal,LastPaymentAmount,LastPaymentMethod,NumLiableParties,CustomerAge,NumPhones,NumEmails,NumAddresses,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,932,Creditor 1,3677,0.0,1160.2,1160.2,4.22,Other,PAID_IN_FULL,N,,N,,,Y,10.0,Cheque,1.0,53.0,0,0,1,,,,
1,160,Creditor 2,4276,182.9,182.9,182.9,4.22,Other,CANCELLED_WITHDRAWN,Y,,N,,,N,,,1.0,,0,0,1,,,,
2,932,Creditor 1,8525,0.0,538.57,538.57,4.22,Other,PAID_IN_FULL,N,,N,,,N,5.37,Cheque,1.0,,1,0,1,,,,
3,160,Creditor 2,9859,8279.5,8279.5,8279.5,4.22,Other,PASSIVE,Y,,N,,,N,,,1.0,,1,0,1,,,,
4,932,Creditor 1,12807,0.0,523.0,523.0,4.22,Other,PAID_IN_FULL,Y,,N,,,Y,5.0,Cheque,1.0,46.0,2,0,1,,,,
5,932,Creditor 1,13465,1118.74,790.3,790.3,4.22,Other,PASSIVE,Y,,N,,,Y,10.0,Cheque,1.0,,0,0,1,,,,
6,932,Creditor 1,18664,0.0,71.89,71.89,4.22,Other,PAID_IN_FULL,N,,N,,,Y,91.27,Cheque,1.0,50.0,2,0,1,,,,


In [6]:
# Dropping columns

df = df.drop(['Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25'], axis = 1 )

In [7]:
# checking null values

df.isnull().sum()/len(df) * 100

EntityID                       0.000000
OriginalCreditor[Redacted]     0.000000
AccountID                      0.000000
CurrentBalance                 0.000000
DebtLoadPrincipal              0.000000
BalanceAtDebtLoad              0.000000
PurchasePrice                  0.662364
ProductOrDebtType              0.000000
CollectionStatus               0.000000
IsStatBarred                   0.000000
ClosureReason                 97.778177
InBankruptcy                   0.000000
AccountInsolvencyType         99.929876
CustomerInsolvencyType        97.900955
IsLegal                        0.000000
LastPaymentAmount             74.416556
LastPaymentMethod             74.416556
NumLiableParties               0.030018
CustomerAge                    7.254019
NumPhones                      0.000000
NumEmails                      0.000000
NumAddresses                   0.000000
dtype: float64

In [8]:
# checking duplicates

df.duplicated().sum()

0

In [9]:
#dropping columns with max null values

df = df.drop(['ClosureReason', 'AccountInsolvencyType', 'CustomerInsolvencyType', 'LastPaymentMethod', 'LastPaymentAmount'], axis = 1 )

In [10]:
# Converting columns to float
columns_to_convert = ['CurrentBalance', 'DebtLoadPrincipal', 'BalanceAtDebtLoad']
for column in columns_to_convert:
    df[column] = df[column].str.replace(',', '').astype(float)

In [11]:
df.describe()

Unnamed: 0,EntityID,AccountID,CurrentBalance,DebtLoadPrincipal,BalanceAtDebtLoad,PurchasePrice,NumLiableParties,CustomerAge,NumPhones,NumEmails,NumAddresses
count,406423.0,406423.0,406423.0,406423.0,406423.0,403731.0,406301.0,376941.0,406423.0,406423.0,406423.0
mean,39704430.0,395438000.0,1301.866266,1539.010928,1600.933847,5.618606,1.017145,45.728061,0.435652,0.208113,0.847354
std,46980700.0,465476900.0,4030.51371,4416.229311,4531.889319,5.441934,0.132866,12.950395,0.714852,0.434321,0.45969
min,160.0,3677.0,-7717.2,0.0,0.0,0.19,1.0,-41.0,0.0,0.0,0.0
25%,3010600.0,30230880.0,85.33,246.97,249.875,3.07,1.0,36.0,0.0,0.0,1.0
50%,3010949.0,30450750.0,457.51,619.0,630.74,4.22,1.0,44.0,0.0,0.0,1.0
75%,99901310.0,990189100.0,1159.365,1393.78,1433.755,6.59,1.0,54.0,1.0,0.0,1.0
max,99901590.0,990495800.0,441681.52,844343.0,844343.0,52.18,4.0,133.0,8.0,5.0,7.0


In [12]:
# converting negative age to positive

df['CustomerAge'] = df['CustomerAge'].abs()

In [13]:
# imputing null values in age column

mean_age = df['CustomerAge'].mean()
df['CustomerAge'].fillna(mean_age, inplace=True)

In [14]:
# dropping null rows

df.dropna(inplace=True)

In [15]:
# dropping insignificant columns for our analysis

df = df.drop(['EntityID', 'AccountID', 'OriginalCreditor[Redacted]'], axis = 1 )

In [16]:
# renaming column

df.rename(columns={'CurrentBalance': 'Debt_Pending'}, inplace=True)

In [17]:
#dropping columns

df = df.drop(['Debt_Pending', 'DebtLoadPrincipal', 'InBankruptcy', 'IsLegal'], axis = 1 )

In [18]:
# mapping values for statue barred column

df['IsStatBarred'] = df['IsStatBarred'].map({'Y': 1, 'N': 0})

In [19]:
categorical_features = ['ProductOrDebtType', 'CollectionStatus']

label_encoder = LabelEncoder()

for col in categorical_features:
    df[col] = label_encoder.fit_transform(df[col])

In [20]:
df.head(7)

Unnamed: 0,BalanceAtDebtLoad,PurchasePrice,ProductOrDebtType,CollectionStatus,IsStatBarred,NumLiableParties,CustomerAge,NumPhones,NumEmails,NumAddresses
0,1160.2,4.22,5,7,0,1.0,53.0,0,0,1
1,182.9,4.22,5,1,1,1.0,45.730273,0,0,1
2,538.57,4.22,5,7,0,1.0,45.730273,1,0,1
3,8279.5,4.22,5,8,1,1.0,45.730273,1,0,1
4,523.0,4.22,5,7,1,1.0,46.0,2,0,1
5,790.3,4.22,5,8,1,1.0,45.730273,0,0,1
6,71.89,4.22,5,7,0,1.0,50.0,2,0,1


In [21]:
y = df['IsStatBarred'].astype(int)  # Target variable
X = df.drop(columns=['IsStatBarred']).astype(int)  # Features

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size =0.2)

In [23]:
standard_scaler = StandardScaler()

X_train = standard_scaler.fit_transform(X_train)
X_test = standard_scaler.transform(X_test)

X_standardized = standard_scaler.fit_transform(X)

In [24]:
#model training and prediction

models = []
models.append(("LR", LogisticRegression()))
models.append(("LDA", LinearDiscriminantAnalysis()))
models.append(("KNN", KNeighborsClassifier()))
models.append(("NB", GaussianNB()))
models.append(("RF", RandomForestClassifier()))

results = []
names = []

for name, model in models:
    kfold = KFold(n_splits=10)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    msg = "%s Accuracy: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

    # Calculating training accuracy
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    print(f"Training Accuracy for {name}: {train_accuracy}")
    print()

    # Calculating test accuracy
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy for {name}: {test_accuracy}")

    # Calculating additional metrics
    print(f"Confusion Matrix for {name}:\n{confusion_matrix(y_test, y_pred)}")
    print()
    print(f"Classification Report for {name}:\n{classification_report(y_test, y_pred)}")
    print()
    print(f"Recall for {name}: {recall_score(y_test, y_pred)}")
    print()
    print(f"Precision for {name}: {precision_score(y_test, y_pred)}")
    print()
    print(f"ROC AUC for {name}: {roc_auc_score(y_test, y_pred)}")
    print("----------------------------------------------------------------------------")

LR Accuracy: 0.810151 (0.001775)
Training Accuracy for LR: 0.8101075295458487

Test Accuracy for LR: 0.8126184606617692
Confusion Matrix for LR:
[[13767 10138]
 [ 4988 51830]]

Classification Report for LR:
              precision    recall  f1-score   support

           0       0.73      0.58      0.65     23905
           1       0.84      0.91      0.87     56818

    accuracy                           0.81     80723
   macro avg       0.79      0.74      0.76     80723
weighted avg       0.81      0.81      0.81     80723


Recall for LR: 0.912210919074941

Precision for LR: 0.8363994319648851

ROC AUC for LR: 0.7440577707694304
----------------------------------------------------------------------------
LDA Accuracy: 0.812505 (0.002151)
Training Accuracy for LDA: 0.8125263249176184

Test Accuracy for LDA: 0.8151456214461802
Confusion Matrix for LDA:
[[12891 11014]
 [ 3908 52910]]

Classification Report for LDA:
              precision    recall  f1-score   support

           0  