## Importing Liberaries

In [1]:
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use("WebAgg")
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
import sklearn.metrics as sm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle

## Reading Data

In [2]:
df = pd.read_csv("loan_data.csv")

In [3]:
df

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.10,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.000000,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.000000,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.10,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.000000,4740,39.5,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9573,0,all_other,0.1461,344.76,12.180755,10.39,672,10474.000000,215372,82.1,2,0,0,1
9574,0,all_other,0.1253,257.70,11.141862,0.21,722,4380.000000,184,1.1,5,0,0,1
9575,0,debt_consolidation,0.1071,97.81,10.596635,13.09,687,3450.041667,10036,82.9,8,0,0,1
9576,0,home_improvement,0.1600,351.58,10.819778,19.18,692,1800.000000,0,3.2,5,0,0,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   credit.policy      9578 non-null   int64  
 1   purpose            9578 non-null   object 
 2   int.rate           9578 non-null   float64
 3   installment        9578 non-null   float64
 4   log.annual.inc     9578 non-null   float64
 5   dti                9578 non-null   float64
 6   fico               9578 non-null   int64  
 7   days.with.cr.line  9578 non-null   float64
 8   revol.bal          9578 non-null   int64  
 9   revol.util         9578 non-null   float64
 10  inq.last.6mths     9578 non-null   int64  
 11  delinq.2yrs        9578 non-null   int64  
 12  pub.rec            9578 non-null   int64  
 13  not.fully.paid     9578 non-null   int64  
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB


## Check Null Values

In [5]:
df.isnull().sum()

credit.policy        0
purpose              0
int.rate             0
installment          0
log.annual.inc       0
dti                  0
fico                 0
days.with.cr.line    0
revol.bal            0
revol.util           0
inq.last.6mths       0
delinq.2yrs          0
pub.rec              0
not.fully.paid       0
dtype: int64

In [6]:
df.describe()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
count,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0
mean,0.80497,0.12264,319.089413,10.932117,12.606679,710.846314,4560.767197,16913.96,46.799236,1.577469,0.163708,0.062122,0.160054
std,0.396245,0.026847,207.071301,0.614813,6.88397,37.970537,2496.930377,33756.19,29.014417,2.200245,0.546215,0.262126,0.366676
min,0.0,0.06,15.67,7.547502,0.0,612.0,178.958333,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.1039,163.77,10.558414,7.2125,682.0,2820.0,3187.0,22.6,0.0,0.0,0.0,0.0
50%,1.0,0.1221,268.95,10.928884,12.665,707.0,4139.958333,8596.0,46.3,1.0,0.0,0.0,0.0
75%,1.0,0.1407,432.7625,11.291293,17.95,737.0,5730.0,18249.5,70.9,2.0,0.0,0.0,0.0
max,1.0,0.2164,940.14,14.528354,29.96,827.0,17639.95833,1207359.0,119.0,33.0,13.0,5.0,1.0


## Label Encoding

In [7]:
df["purpose"].unique()

array(['debt_consolidation', 'credit_card', 'all_other',
       'home_improvement', 'small_business', 'major_purchase',
       'educational'], dtype=object)

In [8]:
encode = LabelEncoder()

In [9]:
df["purpose"] = encode.fit_transform(df["purpose"])

In [10]:
df["purpose"].unique()

array([2, 1, 0, 4, 6, 5, 3])

## Check Correlation Between Features

In [11]:
plt.figure(figsize=(12,10))
cor = df.corr()
sns.heatmap(cor, annot=True, cmap = plt.cm.CMRmap_r)
plt.show()

Press Ctrl+C to stop WebAgg server


RuntimeError: This event loop is already running

## Standard Scaling

In [12]:
scaler = StandardScaler()

In [13]:
x = df.drop(columns = "not.fully.paid")

In [14]:
result = scaler.fit_transform(x)

In [15]:
result

array([[ 0.49222226,  0.03317632, -0.13931753, ..., -0.71698894,
        -0.29973008, -0.23700318],
       [ 0.49222226, -0.55966463, -0.57886837, ..., -0.71698894,
        -0.29973008, -0.23700318],
       [ 0.49222226,  0.03317632,  0.48648368, ..., -0.26247044,
        -0.29973008, -0.23700318],
       ...,
       [-2.03160257,  0.03317632, -0.57886837, ...,  2.91915909,
        -0.29973008, -0.23700318],
       [-2.03160257,  1.2188582 ,  1.39166043, ...,  1.55560358,
        -0.29973008, -0.23700318],
       [-2.03160257,  0.03317632,  0.61685894, ...,  2.01012208,
        -0.29973008, -0.23700318]])

In [16]:
x_scaler = pd.DataFrame(result, columns=x.columns)

In [17]:
x_scaler.describe()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec
count,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0
mean,-9.495664000000001e-17,4.8962020000000004e-17,-4.7478320000000004e-17,-4.8962020000000004e-17,1.348681e-15,-7.121748000000001e-17,2.848699e-16,-5.93479e-17,-1.1869580000000001e-17,4.1543530000000005e-17,2.3739160000000002e-17,1.1869580000000001e-17,5.638051000000001e-17
std,1.000052,1.000052,1.000052,1.000052,1.000052,1.000052,1.000052,1.000052,1.000052,1.000052,1.000052,1.000052,1.000052
min,-2.031603,-1.152506,-2.333347,-1.465366,-5.505403,-1.831405,-2.603373,-1.75497,-0.5010888,-1.613049,-0.7169889,-0.2997301,-0.2370032
25%,0.4922223,-0.5596646,-0.6980686,-0.7501161,-0.607865,-0.7836264,-0.7597422,-0.6971993,-0.4066715,-0.8340853,-0.7169889,-0.2997301,-0.2370032
50%,0.4922223,0.03317632,-0.02011729,-0.2421486,-0.00525971,0.008472466,-0.1013026,-0.1685393,-0.2464259,-0.01720737,-0.2624704,-0.2997301,-0.2370032
75%,0.4922223,0.03317632,0.672734,0.5489849,0.584234,0.7762382,0.6888249,0.4682925,0.03956625,0.8306913,0.1920481,-0.2997301,-0.2370032
max,0.4922223,2.40454,3.492564,2.999368,5.849627,2.520962,3.059207,5.238382,35.26782,2.488574,14.28212,23.50167,18.83877


In [18]:
x_scaler["not.fully.paid"] = df["not.fully.paid"]

## Check VIF Score of Features

In [19]:
vif = pd.DataFrame()
vif['VIF'] = [variance_inflation_factor(x_scaler.values, i) for i in range(x_scaler.shape[1])]
vif['variable'] = x_scaler.columns
print(vif)

         VIF           variable
0   1.667380      credit.policy
1   1.073101            purpose
2   2.836731           int.rate
3   1.609075        installment
4   1.535681     log.annual.inc
5   1.200531                dti
6   3.363273               fico
7   1.305372  days.with.cr.line
8   1.367487          revol.bal
9   1.734463         revol.util
10  1.466583     inq.last.6mths
11  1.140323        delinq.2yrs
12  1.049257            pub.rec
13  1.047676     not.fully.paid


## Handeling Outliers

In [20]:
fig,ax = plt.subplots()
plt.figure(figsize=(40,80))
sns.boxplot(data = x_scaler, ax = ax)
plt.show()

In [21]:
q = x_scaler[x_scaler.columns[7]].quantile(.94)
df_new = x_scaler[x_scaler[x_scaler.columns[7]]<q]

In [22]:
q = df_new[df_new.columns[4]].quantile(.98)
df_new = df_new[df_new[df_new.columns[4]]<q]

q = df_new[df_new.columns[4]].quantile(.98)
df_new = df_new[df_new[df_new.columns[4]]> -q]

In [23]:
q = df_new[df_new.columns[11]].quantile(.98)
df_new = df_new[df_new[df_new.columns[11]]<q]

In [24]:
q = df_new[df_new.columns[10]].quantile(.98)
df_new = df_new[df_new[df_new.columns[10]]<q]

In [25]:
q = df_new[df_new.columns[8]].quantile(.98)
df_new = df_new[df_new[df_new.columns[8]]<q]

In [26]:
q = df_new[df_new.columns[12]].quantile(.99)
df_new = df_new[df_new[df_new.columns[12]]<q]

In [27]:
q = df_new[df_new.columns[1]].quantile(.99)
df_new = df_new[df_new[df_new.columns[1]]<q]

In [28]:
q = df_new[df_new.columns[2]].quantile(.99)
df_new = df_new[df_new[df_new.columns[2]]<q]

In [29]:
fig,ax = plt.subplots()
plt.figure(figsize=(40,80))
sns.boxplot(data = df_new, ax = ax)
plt.show()

In [30]:
df_new.shape

(6762, 14)

## Separating Independent and Dependent Variables

In [31]:
x = df_new.drop(columns="not.fully.paid")

In [32]:
y = df_new["not.fully.paid"]

## Train Test Split

In [33]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state= 22)

## Model Building

1) Logistic Regression

In [34]:
"""r = 500
l = []
for i in range(500):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=i)
    xgb_model1 = LogisticRegression()
    xgb_model1.fit(x_train, y_train)
    test = xgb_model1.score(x_test, y_test)
    train = xgb_model1.score(x_train, y_train)
    if test > .7:
        l.append([test, train, i])"""

'r = 500\nl = []\nfor i in range(500):\n    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=i)\n    xgb_model1 = LogisticRegression()\n    xgb_model1.fit(x_train, y_train)\n    test = xgb_model1.score(x_test, y_test)\n    train = xgb_model1.score(x_train, y_train)\n    if test > .7:\n        l.append([test, train, i])'

In [35]:
"""for i in l:
    if i[1] > .86:
        print(i)"""

'for i in l:\n    if i[1] > .86:\n        print(i)'

In [36]:
logistic = LogisticRegression()

In [37]:
logistic.fit(x_train, y_train)

LogisticRegression()

In [38]:
logistic.score(x_train, y_train)

0.8649904922881894

In [39]:
logistic.score(x_test, y_test)

0.8649581074420897

In [40]:
y_test_pred1 = logistic.predict(x_test)

In [41]:
print("Accuracy Score =", round(sm.accuracy_score(y_test, y_test_pred1), 2)) 
print("Precision Score =", sm.precision_score(y_test, y_test_pred1))
print("Confucion Matrix =\n", sm.confusion_matrix(y_test, y_test_pred1))
print("Recall Score =", sm.recall_score(y_test, y_test_pred1))
print("AUC Score =",sm. roc_auc_score(y_test, y_test_pred1))
print("Classification Report =\n", sm.classification_report(y_test, y_test_pred1))


Accuracy Score = 0.86
Precision Score = 1.0
Confucion Matrix =
 [[1753    0]
 [ 274    2]]
Recall Score = 0.007246376811594203
AUC Score = 0.5036231884057971
Classification Report =
               precision    recall  f1-score   support

           0       0.86      1.00      0.93      1753
           1       1.00      0.01      0.01       276

    accuracy                           0.86      2029
   macro avg       0.93      0.50      0.47      2029
weighted avg       0.88      0.86      0.80      2029



2) Decision Tree Classifier

In [42]:
tree = DecisionTreeClassifier()

In [43]:
tree.fit(x_train, y_train)

DecisionTreeClassifier()

In [44]:
tree.score(x_train, y_train)

1.0

In [45]:
tree.score(x_test, y_test)

0.7654016757023164

In [46]:
y_test_pred2 = tree.predict(x_test)

In [47]:
print("Accuracy Score =", round(sm.accuracy_score(y_test, y_test_pred2), 2)) 
print("Precision Score =", sm.precision_score(y_test, y_test_pred2))
print("Confucion Matrix =\n", sm.confusion_matrix(y_test, y_test_pred2))
print("Recall Score =", sm.recall_score(y_test, y_test_pred2))
print("AUC Score =", sm.roc_auc_score(y_test, y_test_pred2))
print("Classification Report =\n", sm.classification_report(y_test, y_test_pred2))


Accuracy Score = 0.77
Precision Score = 0.16216216216216217
Confucion Matrix =
 [[1505  248]
 [ 228   48]]
Recall Score = 0.17391304347826086
AUC Score = 0.5162206403928669
Classification Report =
               precision    recall  f1-score   support

           0       0.87      0.86      0.86      1753
           1       0.16      0.17      0.17       276

    accuracy                           0.77      2029
   macro avg       0.52      0.52      0.52      2029
weighted avg       0.77      0.77      0.77      2029



3) KNeighbors Classifier

In [48]:
"""mean_acc = np.zeros(20)
for i in range(1,21):
    #Train Model and Predict  
    knn = KNeighborsClassifier(n_neighbors = i).fit(x_train,y_train)
    yhat= knn.predict(x_test)
    mean_acc[i-1] = sm.accuracy_score(y_test, yhat)
mean_acc"""

'mean_acc = np.zeros(20)\nfor i in range(1,21):\n    #Train Model and Predict  \n    knn = KNeighborsClassifier(n_neighbors = i).fit(x_train,y_train)\n    yhat= knn.predict(x_test)\n    mean_acc[i-1] = sm.accuracy_score(y_test, yhat)\nmean_acc'

In [49]:
"""r = 500
l = []
for i in range(500):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=i)
    xgb_model1 = KNeighborsClassifier()
    xgb_model1.fit(x_train, y_train)
    test = xgb_model1.score(x_test, y_test)
    train = xgb_model1.score(x_train, y_train)
    if test > .7:
        l.append([test, train, i])"""

'r = 500\nl = []\nfor i in range(500):\n    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=i)\n    xgb_model1 = KNeighborsClassifier()\n    xgb_model1.fit(x_train, y_train)\n    test = xgb_model1.score(x_test, y_test)\n    train = xgb_model1.score(x_train, y_train)\n    if test > .7:\n        l.append([test, train, i])'

In [50]:
"""for i in l:
    if i[0] > .84:
        print(i)"""

'for i in l:\n    if i[0] > .84:\n        print(i)'

In [51]:
KNN = KNeighborsClassifier(n_neighbors=18)

In [52]:
KNN.fit(x_train, y_train)

KNeighborsClassifier(n_neighbors=18)

In [53]:
KNN.score(x_train, y_train)

0.8654130572575534

In [54]:
KNN.score(x_test, y_test)

0.8639724001971415

In [55]:
y_test_pred3 = KNN.predict(x_test)

In [56]:
print("Accuracy Score =", round(sm.accuracy_score(y_test, y_test_pred3), 2)) 
print("Precision Score =", sm.precision_score(y_test, y_test_pred3))
print("Confucion Matrix =\n", sm.confusion_matrix(y_test, y_test_pred3))
print("Recall Score =", sm.recall_score(y_test, y_test_pred3))
print("AUC Score =", sm.roc_auc_score(y_test, y_test_pred3))
print("Classification Report =\n", sm.classification_report(y_test, y_test_pred3))


Accuracy Score = 0.86
Precision Score = 0.0
Confucion Matrix =
 [[1753    0]
 [ 276    0]]
Recall Score = 0.0
AUC Score = 0.5
Classification Report =
               precision    recall  f1-score   support

           0       0.86      1.00      0.93      1753
           1       0.00      0.00      0.00       276

    accuracy                           0.86      2029
   macro avg       0.43      0.50      0.46      2029
weighted avg       0.75      0.86      0.80      2029



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


4) Random Forest Classifier

In [57]:
random = RandomForestClassifier()

In [58]:
random.fit(x_train, y_train)

RandomForestClassifier()

In [59]:
random.score(x_train, y_train)

1.0

In [60]:
random.score(x_test, y_test)

0.8629866929521932

In [61]:
y_test_pred4 = random.predict(x_test)

In [62]:
print("Accuracy Score =", round(sm.accuracy_score(y_test, y_test_pred4), 2)) 
print("Precision Score =", sm.precision_score(y_test, y_test_pred4))
print("Confucion Matrix =\n", sm.confusion_matrix(y_test, y_test_pred4))
print("Recall Score =", sm.recall_score(y_test, y_test_pred4))
print("AUC Score =", sm.roc_auc_score(y_test, y_test_pred4))
print("Classification Report =\n", sm.classification_report(y_test, y_test_pred4))

Accuracy Score = 0.86
Precision Score = 0.3333333333333333
Confucion Matrix =
 [[1749    4]
 [ 274    2]]
Recall Score = 0.007246376811594203
AUC Score = 0.5024822870937605
Classification Report =
               precision    recall  f1-score   support

           0       0.86      1.00      0.93      1753
           1       0.33      0.01      0.01       276

    accuracy                           0.86      2029
   macro avg       0.60      0.50      0.47      2029
weighted avg       0.79      0.86      0.80      2029



## Finalizing Model and Save into Pickle File

In [63]:
pickle.dump(logistic, open("Loan_model.pkl", "wb"))