In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression, RidgeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import pickle
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.svm import SVR

import time

from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

In [2]:
accepted_df = pd.read_csv("../Resources/lending_club_accepted_klean_v2.csv")
accepted_df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,Employment Length,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,open_rv_24m,max_bal_bc,all_util,chargeoff_within_12_mths,delinq_amnt,mort_acc,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,disbursement_method
0,32000.0,60 months,10.49,687.65,10.0,MORTGAGE,120000.0,Verified,Current,n,...,,,,0.0,0.0,2.0,0.0,0.0,556496.0,Cash
1,9600.0,36 months,12.99,323.42,,RENT,21900.0,Verified,Fully Paid,n,...,,,,0.0,0.0,0.0,1.0,0.0,11600.0,Cash
2,4000.0,36 months,6.68,122.93,4.0,MORTGAGE,83000.0,Not Verified,Fully Paid,n,...,,,,0.0,0.0,2.0,0.0,0.0,222616.0,Cash
3,6025.0,36 months,10.91,197.0,10.0,RENT,52000.0,Not Verified,Fully Paid,n,...,1.0,2071.0,17.0,0.0,0.0,0.0,0.0,0.0,32227.0,Cash
4,25000.0,60 months,26.3,752.96,10.0,OWN,65000.0,Verified,Current,n,...,8.0,14473.0,35.0,0.0,0.0,0.0,0.0,0.0,257219.0,Cash


In [3]:
rejected_df = pd.read_csv("../Resources/lending_club_rejected_klean_v2.csv")
rejected_df.head()

Unnamed: 0,Amount Requested,Application Date,Loan Title,Risk_Score,Debt-To-Income Ratio,State,Employment Length,Policy Code
0,7000.0,2016-10-18,DEBT_CONSOLIDATION,571.0,6.66,OK,0.5,0.0
1,20000.0,2018-01-14,DEBT_CONSOLIDATION,626.0,32.17,FL,0.5,0.0
2,15000.0,2013-09-04,DEBT_CONSOLIDATION,673.0,34.11,NH,0.5,0.0
3,10000.0,2017-02-16,DEBT_CONSOLIDATION,665.0,24.61,PA,0.5,0.0
4,2000.0,2016-03-14,HOME,601.0,0.64,AZ,0.5,0.0


In [4]:
rejected_df_2 = rejected_df.drop(['Application Date', 'Risk_Score'], axis=1)
rejected_df_2.head()

Unnamed: 0,Amount Requested,Loan Title,Debt-To-Income Ratio,State,Employment Length,Policy Code
0,7000.0,DEBT_CONSOLIDATION,6.66,OK,0.5,0.0
1,20000.0,DEBT_CONSOLIDATION,32.17,FL,0.5,0.0
2,15000.0,DEBT_CONSOLIDATION,34.11,NH,0.5,0.0
3,10000.0,DEBT_CONSOLIDATION,24.61,PA,0.5,0.0
4,2000.0,HOME,0.64,AZ,0.5,0.0


In [5]:
accepted_df.columns.to_list()

['loan_amnt',
 'term',
 'int_rate',
 'installment',
 'Employment Length',
 'home_ownership',
 'annual_inc',
 'verification_status',
 'loan_status',
 'pymnt_plan',
 'Loan Title',
 'addr_state',
 'dti',
 'mths_since_last_delinq',
 'open_acc',
 'total_acc',
 'out_prncp',
 'last_pymnt_amnt',
 'last_fico_range_high',
 'mths_since_last_major_derog',
 'Policy Code',
 'application_type',
 'verification_status_joint',
 'acc_now_delinq',
 'tot_coll_amt',
 'tot_cur_bal',
 'open_acc_6m',
 'open_act_il',
 'open_il_12m',
 'open_il_24m',
 'mths_since_rcnt_il',
 'total_bal_il',
 'il_util',
 'open_rv_12m',
 'open_rv_24m',
 'max_bal_bc',
 'all_util',
 'chargeoff_within_12_mths',
 'delinq_amnt',
 'mort_acc',
 'pub_rec_bankruptcies',
 'tax_liens',
 'tot_hi_cred_lim',
 'disbursement_method']

In [6]:
accepted_df_2 = accepted_df[['loan_amnt', 'Loan Title', 'dti', 'addr_state', 'Employment Length', 'Policy Code']]
accepted_df_2 = accepted_df_2.rename(columns={
    'loan_amnt': 'Amount Requested',
    'dti': 'Debt-To-Income Ratio',
    'addr_state': 'State'
})
acc_length = len(accepted_df_2)

In [7]:
rej_length = len(rejected_df_2)
acc_length / rej_length

0.23820809248554914

In [8]:
rejected_sample = rejected_df_2.sample(frac=(acc_length / rej_length), random_state=42)
len(rejected_sample)

53573

In [9]:
# Some policy codes are 2.
# https://news.fintechnexus.com/policy-code-2-loans-lending-club/
# policy code 0 means rejected
# policy code 1 means accepted
# policy code 2 means accepted, but considered sub-prime on credit rating

# We're merely predicting whether or not someone's loan will be accepted or rejected, 
# so we'll change the status and simplify our model
rejected_sample['Policy Code'] = 0
accepted_df_2['Policy Code'] = 1

In [10]:
loan_application_df = pd.DataFrame.append(accepted_df_2, rejected_sample)

len(loan_application_df)

107146

In [11]:
loan_application_df.head()

Unnamed: 0,Amount Requested,Loan Title,Debt-To-Income Ratio,State,Employment Length,Policy Code
0,32000.0,DEBT_CONSOLIDATION,24.05,CA,10.0,1
1,9600.0,DEBT_CONSOLIDATION,10.03,FL,,1
2,4000.0,MAJOR_PURCHASE,19.53,FL,4.0,1
3,6025.0,DEBT_CONSOLIDATION,9.16,MA,10.0,1
4,25000.0,DEBT_CONSOLIDATION,36.26,CA,10.0,1


In [12]:
len(loan_application_df.dropna())

103922

In [13]:
loan_application_df = loan_application_df.dropna()

In [14]:
loan_application_df.head()

Unnamed: 0,Amount Requested,Loan Title,Debt-To-Income Ratio,State,Employment Length,Policy Code
0,32000.0,DEBT_CONSOLIDATION,24.05,CA,10.0,1
2,4000.0,MAJOR_PURCHASE,19.53,FL,4.0,1
3,6025.0,DEBT_CONSOLIDATION,9.16,MA,10.0,1
4,25000.0,DEBT_CONSOLIDATION,36.26,CA,10.0,1
5,20000.0,DEBT_CONSOLIDATION,16.43,NV,10.0,1


In [15]:
loan_state = loan_application_df.astype({
    'Policy Code': int,
    'Amount Requested': int
})
loan_state.head()

Unnamed: 0,Amount Requested,Loan Title,Debt-To-Income Ratio,State,Employment Length,Policy Code
0,32000,DEBT_CONSOLIDATION,24.05,CA,10.0,1
2,4000,MAJOR_PURCHASE,19.53,FL,4.0,1
3,6025,DEBT_CONSOLIDATION,9.16,MA,10.0,1
4,25000,DEBT_CONSOLIDATION,36.26,CA,10.0,1
5,20000,DEBT_CONSOLIDATION,16.43,NV,10.0,1


In [16]:
loan_state.dtypes

Amount Requested          int32
Loan Title               object
Debt-To-Income Ratio    float64
State                    object
Employment Length        object
Policy Code               int32
dtype: object

In [17]:
X = loan_state.drop(['Policy Code'], axis=1)
X.head()

Unnamed: 0,Amount Requested,Loan Title,Debt-To-Income Ratio,State,Employment Length
0,32000,DEBT_CONSOLIDATION,24.05,CA,10.0
2,4000,MAJOR_PURCHASE,19.53,FL,4.0
3,6025,DEBT_CONSOLIDATION,9.16,MA,10.0
4,25000,DEBT_CONSOLIDATION,36.26,CA,10.0
5,20000,DEBT_CONSOLIDATION,16.43,NV,10.0


In [18]:
y = loan_state['Policy Code']
y.head()

0    1
2    1
3    1
4    1
5    1
Name: Policy Code, dtype: int32

In [19]:
X_dummies = pd.get_dummies(X)
# Scale the data
scaler = StandardScaler().fit(X_dummies)
X_scaled = scaler.transform(X_dummies)
# X_train_scaled


# Pickle model scaler

In [20]:
# filename = '../Models/loan_acceptance_scaler.sav'
# pickle.dump(scaler, open(filename, 'wb'))

In [21]:
X_scaled

array([[ 1.80302617, -0.00833908, -0.18115335, ..., -0.15590415,
        -0.15144265, -0.14196244],
       [-0.96643524, -0.00865758, -0.18115335, ..., -0.15590415,
        -0.15144265, -0.14196244],
       [-0.76614384, -0.0093883 , -0.18115335, ..., -0.15590415,
        -0.15144265, -0.14196244],
       ...,
       [-1.26316325, -0.00993652, -0.18115335, ..., -0.15590415,
        -0.15144265, -0.14196244],
       [-1.06534458, -0.00921637, -0.18115335, ..., -0.15590415,
        -0.15144265, -0.14196244],
       [-1.01588991, -0.01003376, -0.18115335, ..., -0.15590415,
        -0.15144265, -0.14196244]])

In [22]:
X_scaled.shape

(103922, 80)

In [23]:
def confusion_score(y_true, y_pred):
    [[TP, FN],[FP,TN]] = confusion_matrix(y_true, y_pred)

    accuracy = round((TP + TN) / (TP + FP + TN + FN),4) # (111 + 128) / (111 + 5 + 128 + 6)
#     print(f"Accuracy: {accuracy.round(4)}")
    precision = round((TP / (TP + FP)),4)
#     print(f'Precision: {precision.round(4)}')
    sensitivity = round(TP / (TP + FN),4)
#     print(f'Sensitivity: {sensitivity.round(4)}')
    specificity = round(TN / (TN + FP),4)
#     print(f'Specificity: {specificity.round(4)}')

    neg_predictive_value = round(TN / (TN + FN),4)
    
    matrix = [
#         ['','Predicted Class','Predicted Class',''],
        ['CONFUSION','Pred. Pos.','Pred. Neg.',''],
        ['MATRIX','_'*50],
        ['Act. Positive',TP,FN,sensitivity,'sensitivity'],
        ['Act. Negative',FP,TN,specificity,'specificity'],
        ['',precision,neg_predictive_value,accuracy,'accuracy'],
        ['','precision','neg. pred. value']
    ]
    
    for i in matrix:
        for j in i:
            print(f'{j:^16}|', end='')
        print("")

In [24]:
# IN CASE OF 3x3 CONFUSION MATRIX:

def confusion_score2(y_true, y_pred):
    [[a1, a2, a3],[b1,b2, b3],[c1,c2,c3]] = confusion_matrix(y_true, y_pred)
    TP = a1
    FN = a2 + a3
    FP = b1 + c1
    TN = b2 + b3 + c2 + c3

    accuracy = (TP + TN) / (TP + FP + TN + FN) # (111 + 128) / (111 + 5 + 128 + 6)
    print(f"Accuracy: {accuracy.round(3)}")
    precision = TP / (TP + FP)
    print(f'Precision: {precision.round(3)}')
    sensitivity = TP / (TP + FN)
    print(f'Sensitivity: {sensitivity.round(3)}')
    specificity = TN / (TN + FP)
    print(f'Specificity: {specificity.round(3)}')

In [25]:
# split the data into test and train

X_train, X_test, y_train, y_test = train_test_split(X_dummies, y, random_state=1)
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_scaled, y, random_state=1)

In [26]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9146149010148702
Testing Data Score: 0.9161695084869712


In [27]:
y_true = y_test
y_pred = classifier.predict(X_test)
print('Confusion Matrix:\n',confusion_matrix(y_true, y_pred), '\n')

confusion_score(y_true, y_pred)

Confusion Matrix:
 [[12115  1166]
 [ 1012 11688]] 

   CONFUSION    |   Pred. Pos.   |   Pred. Neg.   |                |
     MATRIX     |__________________________________________________|
 Act. Positive  |     12115      |      1166      |     0.9122     |  sensitivity   |
 Act. Negative  |      1012      |     11688      |     0.9203     |  specificity   |
                |     0.9229     |     0.9093     |     0.9162     |    accuracy    |
                |   precision    |neg. pred. value|


In [28]:
y.unique()

array([1, 0])

In [29]:
y.value_counts()

0    53573
1    50349
Name: Policy Code, dtype: int64

#### We suspect that employment length is the largest differentiator between a loan application being accepted or rejected.

In [30]:
rejected_sample['Employment Length'].value_counts()

0.5     48702
5.0      1333
10.0     1266
2.0       485
1.0       421
3.0       390
4.0       303
6.0       201
7.0       175
8.0       156
9.0       141
Name: Employment Length, dtype: int64

In [31]:
accepted_df_2['Employment Length'].value_counts()

10.0    17881
2.0      4891
3.0      4297
0.5      4102
1.0      3601
5.0      3440
4.0      3280
6.0      2480
7.0      2291
8.0      2174
9.0      1912
Name: Employment Length, dtype: int64

#### This seems to be a reasonable assumption.

# Check multiple models for score

In [32]:
def test_model(model, data):
    start = time.time()
    X_train_scaled, X_test_scaled, y_train, y_test = data

    reg = model.fit(X_train_scaled, y_train)
    print(f'Model: {type(reg).__name__}')
    print(f'Train score: {reg.score(X_train_scaled, y_train)}')
    print(f'Test Score: {reg.score(X_test_scaled, y_test)}\n')
#     plt.show()    
    y_true = y_test
    y_pred = model.predict(X_test_scaled)
    print('Confusion Matrix:\n',confusion_matrix(y_true, y_pred), '\n\n')
    confusion_score(y_true, y_pred)
    end = time.time()
    print(f'Process time: {round(end-start, 2)} seconds.')

    print('~'*40)
    
    filename = f'../Models/loan_acceptance/{type(reg).__name__}.sav'
    pickle.dump(model, open(filename, 'wb'))

In [33]:
data = [X_train_scaled, X_test_scaled, y_train, y_test]

In [34]:
# multiple regression tests

# test_model(LinearRegression(), data)

# test_model(KNeighborsRegressor(), data)
# test_model(RandomForestRegressor(), data)
# test_model(ExtraTreesRegressor(), data)
# test_model(AdaBoostRegressor(), data)
# test_model(SVR(C=1.0, epsilon=0.2), data)

In [35]:
# multiple classification tests

# test_model(LinearRegression(), data)

test_list = [    
    LogisticRegression(),
    RandomForestClassifier(random_state=1, n_estimators=100),
    tree.DecisionTreeClassifier(),
    KNeighborsClassifier(n_neighbors = 15),
    RandomForestClassifier(),
    ExtraTreesClassifier(),
    AdaBoostClassifier()
]


# test_model(LogisticRegression(),data)
with ProcessPoolExecutor() as executor:

    for model in test_list:

        try:

            test_model(model, data)

            # test_model(SVR(C=1.0, epsilon=0.2), data)
        except Exception as e:
            print('#'*40)
            print('Exception found.')
            print(e)
            print('#'*40)

Model: LogisticRegression
Train score: 0.9150382981999204
Test Score: 0.9165928948077441

Confusion Matrix:
 [[12188  1093]
 [ 1074 11626]] 


   CONFUSION    |   Pred. Pos.   |   Pred. Neg.   |                |
     MATRIX     |__________________________________________________|
 Act. Positive  |     12188      |      1093      |     0.9177     |  sensitivity   |
 Act. Negative  |      1074      |     11626      |     0.9154     |  specificity   |
                |     0.919      |     0.9141     |     0.9166     |    accuracy    |
                |   precision    |neg. pred. value|
Process time: 0.87 seconds.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model: RandomForestClassifier
Train score: 0.9997433956454241
Test Score: 0.9091258996959316

Confusion Matrix:
 [[12021  1260]
 [ 1101 11599]] 


   CONFUSION    |   Pred. Pos.   |   Pred. Neg.   |                |
     MATRIX     |__________________________________________________|
 Act. Positive  |     12021      |      1260      |    

# Pickle Random Forest Classifier or other clf model

In [36]:
# filename = '../Models/loan_acceptance_model.sav'
# pickle.dump(clf, open(filename, 'wb'))