In [1]:
# Importing the libraries (refer 05-A-RF-01)

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

import pandasql as psql

# pip install pandasql

# import datetime class from datetime module

from datetime import datetime

In [2]:
# Load the Loan data

loans = pd.read_csv(r"D:\00 Datasets\Others\Data-01\loan_data.csv", header=0)
loans.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [3]:
loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   credit.policy      9578 non-null   int64  
 1   purpose            9578 non-null   object 
 2   int.rate           9578 non-null   float64
 3   installment        9578 non-null   float64
 4   log.annual.inc     9578 non-null   float64
 5   dti                9578 non-null   float64
 6   fico               9578 non-null   int64  
 7   days.with.cr.line  9578 non-null   float64
 8   revol.bal          9578 non-null   int64  
 9   revol.util         9578 non-null   float64
 10  inq.last.6mths     9578 non-null   int64  
 11  delinq.2yrs        9578 non-null   int64  
 12  pub.rec            9578 non-null   int64  
 13  not.fully.paid     9578 non-null   int64  
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB


In [4]:
# Change the name of variable

loans = loans.rename(columns = {'not.fully.paid': 'NFPaid'}, inplace = False)
loans.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,NFPaid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [5]:
# Count the target or dependent variable by '0' & '1' and 
# their proportion (> 10 : 1, then the dataset is imbalance dataset)

Target_count = loans.NFPaid.value_counts()
print('Class 0:', Target_count[0])
print('Class 1:', Target_count[1])
print('Proportion:', round(Target_count[0] / Target_count[1], 2), ': 1')
print('Total loans Trans:', len(loans))

Class 0: 8045
Class 1: 1533
Proportion: 5.25 : 1
Total loans Trans: 9578


In [6]:
cat_cols = ['purpose']

loans = pd.get_dummies(loans,columns=cat_cols)
loans=pd.DataFrame(loans)
loans.head()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,NFPaid,purpose_all_other,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,1,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0,0,0,1,0,0,0,0
1,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0,0,1,0,0,0,0,0
2,1,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0,0,0,1,0,0,0,0
3,1,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0,0,0,1,0,0,0,0
4,1,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0,0,1,0,0,0,0,0


In [7]:
# Identify the independent and Target (dependent) variables

IndepVar = []
for col in loans.columns:
    if col != 'NFPaid':
        IndepVar.append(col)

TargetVar = 'NFPaid'

x = loans[IndepVar]
y = loans[TargetVar]

In [8]:
# Splitting the dataset into train and test 

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 42) 

In [9]:
loans.columns

Index(['credit.policy', 'int.rate', 'installment', 'log.annual.inc', 'dti',
       'fico', 'days.with.cr.line', 'revol.bal', 'revol.util',
       'inq.last.6mths', 'delinq.2yrs', 'pub.rec', 'NFPaid',
       'purpose_all_other', 'purpose_credit_card',
       'purpose_debt_consolidation', 'purpose_educational',
       'purpose_home_improvement', 'purpose_major_purchase',
       'purpose_small_business'],
      dtype='object')

In [10]:
loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   credit.policy               9578 non-null   int64  
 1   int.rate                    9578 non-null   float64
 2   installment                 9578 non-null   float64
 3   log.annual.inc              9578 non-null   float64
 4   dti                         9578 non-null   float64
 5   fico                        9578 non-null   int64  
 6   days.with.cr.line           9578 non-null   float64
 7   revol.bal                   9578 non-null   int64  
 8   revol.util                  9578 non-null   float64
 9   inq.last.6mths              9578 non-null   int64  
 10  delinq.2yrs                 9578 non-null   int64  
 11  pub.rec                     9578 non-null   int64  
 12  NFPaid                      9578 non-null   int64  
 13  purpose_all_other           9578 

In [11]:
cols1 = ['int.rate', 'installment', 'log.annual.inc', 'dti', 'fico', 'days.with.cr.line', 'revol.bal', 'revol.util']

In [12]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train[cols1] = mmscaler.fit_transform(x_train[cols1])
x_train = pd.DataFrame(x_train)

x_test[cols1] = mmscaler.fit_transform(x_test[cols1])
x_test = pd.DataFrame(x_test)

# KNN Algorithm with Random Sampling

In [13]:
# Initialize an array that stores the Accuracy

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import sklearn.metrics as metrics
from sklearn.metrics import roc_curve, roc_auc_score

from sklearn.metrics import classification_report, confusion_matrix

accuracy = []

for a in range(1, 10, 1):
    k = a
    loansKNN = KNeighborsClassifier(n_neighbors=k)
    loansKNN.fit(x_train, y_train)
    y_pred = loansKNN.predict(x_test)
    print('KNN_K_value = ', a)
    print("Accuracy:", (round(metrics.accuracy_score(y_test, y_pred) * 100, 2)), "%")
    print("Precision:", (round(metrics.precision_score(y_test, y_pred, average='micro') * 100, 2)), '%')
    print("Recall:", (round(metrics.recall_score(y_test, y_pred, average='micro') * 100, 2)), "%")
    print("f1-score:", (round(metrics.f1_score(y_test, y_pred, average='micro') * 100, 2)), '%')
    print('roc_auc_score:', round(roc_auc_score(y_test, y_pred), 3))
    print(confusion_matrix(y_test, y_pred))
    print('----------------------------------------------------------------------------------------------')

KNN_K_value =  1
Accuracy: 75.85 %
Precision: 75.85 %
Recall: 75.85 %
f1-score: 75.85 %
roc_auc_score: 0.543
[[2076  332]
 [ 362  104]]
----------------------------------------------------------------------------------------------
KNN_K_value =  2
Accuracy: 82.01 %
Precision: 82.01 %
Recall: 82.01 %
f1-score: 82.01 %
roc_auc_score: 0.502
[[2342   66]
 [ 451   15]]
----------------------------------------------------------------------------------------------
KNN_K_value =  3
Accuracy: 80.34 %
Precision: 80.34 %
Recall: 80.34 %
f1-score: 80.34 %
roc_auc_score: 0.522
[[2260  148]
 [ 417   49]]
----------------------------------------------------------------------------------------------
KNN_K_value =  4
Accuracy: 82.85 %
Precision: 82.85 %
Recall: 82.85 %
f1-score: 82.85 %
roc_auc_score: 0.51
[[2363   45]
 [ 448   18]]
----------------------------------------------------------------------------------------------
KNN_K_value =  5
Accuracy: 81.77 %
Precision: 81.77 %
Recall: 81.77 %
f1-scor

# Random forest with random sampling

In [17]:
# Build Random Forest classification model and Train the model using the training sets

from sklearn.ensemble import RandomForestClassifier  

loansRF = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                                 criterion='entropy', max_depth=None, max_features='auto',
                                 max_leaf_nodes=None, max_samples=None,
                                 min_impurity_decrease=0.0, min_impurity_split=None,
                                 min_samples_leaf=1, min_samples_split=2,
                                 min_weight_fraction_leaf=0.0, n_estimators=500,
                                 n_jobs=None, oob_score=False, random_state=0, verbose=0,
                                 warm_start=False)

loansRF = loansRF.fit(x_train, y_train)

# Predict the model with test data set

y1_pred = loansRF.predict(x_test)

# Display confusion matrix and classifiction report

from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, y1_pred))
print(classification_report(y_test, y1_pred))

# Evaluate the model performance by metrics

from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score

# Model Accuracy: how often is the classifier correct?
print("Accuracy:", (round(metrics.accuracy_score(y_test, y1_pred) * 100, 2)), "%")

# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:", (round(metrics.precision_score(y_test, y1_pred, average='micro') * 100, 2)), '%')

# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:", (round(metrics.recall_score(y_test, y1_pred, average='micro') * 100, 2)), "%")

# Model f1-score: weighted average of Precision & Recall
print("f1-score:", (round(metrics.f1_score(y_test, y1_pred, average='micro') * 100, 2)), '%')

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y1_pred), 3))

[[2402    6]
 [ 462    4]]
              precision    recall  f1-score   support

           0       0.84      1.00      0.91      2408
           1       0.40      0.01      0.02       466

    accuracy                           0.84      2874
   macro avg       0.62      0.50      0.46      2874
weighted avg       0.77      0.84      0.77      2874

Accuracy: 83.72 %
Precision: 83.72 %
Recall: 83.72 %
f1-score: 83.72 %
roc_auc_score: 0.503


# Decision Tree with Random Sampling

In [20]:
# To build the decision tree model with Over sampling 

from sklearn.tree import DecisionTreeClassifier 

loansDT = DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                                 max_depth=None, max_features=None, max_leaf_nodes=None,
                                 min_impurity_decrease=0.0, min_impurity_split=None,
                                 min_samples_leaf=1, min_samples_split=2,min_weight_fraction_leaf=0.0,
                                 random_state=None, splitter='best')

loansDT = loansDT.fit(x_train,y_train)

# Predict with test data

y3_pred = loansDT.predict(x_test)

# Display confusion matrix and classifiction report

from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, y3_pred))
print(classification_report(y_test, y3_pred))

# Evaluate the model performance by metrics

from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score

# Model Accuracy: how often is the classifier correct?
print("Accuracy:", (round(metrics.accuracy_score(y_test, y3_pred) * 100, 2)), "%")

# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:", (round(metrics.precision_score(y_test, y3_pred) * 100, 2)), '%')

# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:", (round(metrics.recall_score(y_test, y3_pred) * 100, 2)), "%")

# Model f1-score: weighted average of Precision & Recall
print("f1-score:", (round(metrics.f1_score(y_test, y3_pred) * 100, 2)), '%')

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y3_pred), 3))

[[1948  460]
 [ 340  126]]
              precision    recall  f1-score   support

           0       0.85      0.81      0.83      2408
           1       0.22      0.27      0.24       466

    accuracy                           0.72      2874
   macro avg       0.53      0.54      0.53      2874
weighted avg       0.75      0.72      0.73      2874

Accuracy: 72.16 %
Precision: 21.5 %
Recall: 27.04 %
f1-score: 23.95 %
roc_auc_score: 0.54


# Logistic Regression with Random Sampling

In [21]:
# To build the 'Logistic Regression' model with random sampling

from sklearn.linear_model import LogisticRegression

loansLR = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                             intercept_scaling=1, max_iter=100, multi_class='auto', 
                             n_jobs=None, penalty='l2', random_state=None,
                             solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

loansLR = loansLR.fit(x_train,y_train)

# Predict the model with test data set

y5_pred = loansLR.predict(x_test)

# Display confusion matrix and classifiction report

from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, y5_pred))
print(classification_report(y_test, y5_pred))

# Evaluate the model performance by metrics

from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score

# Model Accuracy: how often is the classifier correct?
print("Accuracy:", (round(metrics.accuracy_score(y_test, y5_pred) * 100, 2)), "%")

# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:", (round(metrics.precision_score(y_test, y5_pred, average='micro') * 100, 2)), '%')

# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:", (round(metrics.recall_score(y_test, y5_pred, average='micro') * 100, 2)), "%")

# Model f1-score: weighted average of Precision & Recall
print("f1-score:", (round(metrics.f1_score(y_test, y5_pred, average='micro') * 100, 2)), '%')

# Area under ROC curve 
print('roc_auc_score:', round(roc_auc_score(y_test, y5_pred), 3))

[[2395   13]
 [ 451   15]]
              precision    recall  f1-score   support

           0       0.84      0.99      0.91      2408
           1       0.54      0.03      0.06       466

    accuracy                           0.84      2874
   macro avg       0.69      0.51      0.49      2874
weighted avg       0.79      0.84      0.77      2874

Accuracy: 83.86 %
Precision: 83.86 %
Recall: 83.86 %
f1-score: 83.86 %
roc_auc_score: 0.513
