In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import pickle
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.svm import SVR

import time

from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

In [2]:
accepted_df = pd.read_csv("../Resources/lending_club_accepted.csv")
accepted_df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,emp_length,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,disbursement_method
0,32000.0,60 months,10.49,687.65,10+ years,MORTGAGE,120000.0,Verified,Current,n,...,4.0,100.0,28.6,0.0,0.0,556496.0,103647.0,64100.0,72197.0,Cash
1,9600.0,36 months,12.99,323.42,,RENT,21900.0,Verified,Fully Paid,n,...,2.0,100.0,50.0,1.0,0.0,11600.0,4509.0,2400.0,0.0,Cash
2,4000.0,36 months,6.68,122.93,4 years,MORTGAGE,83000.0,Not Verified,Fully Paid,n,...,2.0,100.0,0.0,0.0,0.0,222616.0,64253.0,5600.0,76154.0,Cash
3,6025.0,36 months,10.91,197.0,10+ years,RENT,52000.0,Not Verified,Fully Paid,n,...,0.0,96.0,0.0,0.0,0.0,32227.0,5559.0,11000.0,11127.0,Cash
4,25000.0,60 months,26.3,752.96,10+ years,OWN,65000.0,Verified,Current,n,...,3.0,78.8,10.0,0.0,0.0,257219.0,97647.0,179400.0,65719.0,Cash


In [3]:
rejected_df = pd.read_csv("../Resources/lending_club_rejected.csv")
rejected_df.head()

Unnamed: 0,Amount Requested,Application Date,Loan Title,Risk_Score,Debt-To-Income Ratio,Zip Code,State,Employment Length,Policy Code
0,15000.0,2015-10-29,debt_consolidation,,54.58%,761xx,TX,< 1 year,0.0
1,10000.0,2017-10-28,Credit card refinancing,605.0,28.71%,076xx,NJ,,0.0
2,10000.0,2018-03-09,Debt consolidation,,58.06%,087xx,NJ,< 1 year,0.0
3,7000.0,2016-10-18,Debt consolidation,571.0,6.66%,731xx,OK,< 1 year,0.0
4,5000.0,2017-02-07,other,,2.23%,537xx,WI,< 1 year,0.0


In [4]:
accepted_df_2 = accepted_df[['loan_amnt', 'title', 'dti', 'zip_code','addr_state', 'emp_length', 'policy_code']].rename(columns = {
    'loan_amnt': 'Amount Requested',
    'title': 'Loan Title',
    'dti': 'Debt-To-Income Ratio',
    'zip_code': 'Zip Code',
    'addr_state': 'State',
    'emp_length': 'Employment Length',
    'policy_code': 'Policy Code'
})
accepted_df_2.head()

Unnamed: 0,Amount Requested,Loan Title,Debt-To-Income Ratio,Zip Code,State,Employment Length,Policy Code
0,32000.0,Debt consolidation,24.05,919xx,CA,10+ years,1.0
1,9600.0,Debt consolidation,10.03,331xx,FL,,1.0
2,4000.0,Major purchase,19.53,333xx,FL,4 years,1.0
3,6025.0,Debt consolidation,9.16,021xx,MA,10+ years,1.0
4,25000.0,Debt consolidation,36.26,926xx,CA,10+ years,1.0


In [5]:
rejected_df_2 = rejected_df.drop(['Application Date', 'Risk_Score'], axis=1)
rejected_df_2.head()

Unnamed: 0,Amount Requested,Loan Title,Debt-To-Income Ratio,Zip Code,State,Employment Length,Policy Code
0,15000.0,debt_consolidation,54.58%,761xx,TX,< 1 year,0.0
1,10000.0,Credit card refinancing,28.71%,076xx,NJ,,0.0
2,10000.0,Debt consolidation,58.06%,087xx,NJ,< 1 year,0.0
3,7000.0,Debt consolidation,6.66%,731xx,OK,< 1 year,0.0
4,5000.0,other,2.23%,537xx,WI,< 1 year,0.0


In [6]:
rejected_df_2['Debt-To-Income Ratio'] = rejected_df_2['Debt-To-Income Ratio'].str.replace('%','')
rejected_df_2 = rejected_df_2.astype({
    'Debt-To-Income Ratio': float
})

In [7]:
rejected_df_2.head()

Unnamed: 0,Amount Requested,Loan Title,Debt-To-Income Ratio,Zip Code,State,Employment Length,Policy Code
0,15000.0,debt_consolidation,54.58,761xx,TX,< 1 year,0.0
1,10000.0,Credit card refinancing,28.71,076xx,NJ,,0.0
2,10000.0,Debt consolidation,58.06,087xx,NJ,< 1 year,0.0
3,7000.0,Debt consolidation,6.66,731xx,OK,< 1 year,0.0
4,5000.0,other,2.23,537xx,WI,< 1 year,0.0


In [8]:
len(accepted_df_2)

56518

In [9]:
len(rejected_df_2)

691219

In [10]:
rejected_sample = rejected_df_2.sample(frac=0.0817656922, random_state=42)
len(rejected_sample)

56518

In [11]:
# Some policy codes are 2.
# https://news.fintechnexus.com/policy-code-2-loans-lending-club/
# policy code 0 means rejected
# policy code 1 means accepted
# policy code 2 means accepted, but considered sub-prime on credit rating

# We're merely predicting whether or not someone's loan will be accepted or rejected, 
# so we'll change the status and simplify our model
rejected_sample['Policy Code'] = 0
accepted_df_2['Policy Code'] = 1

In [12]:
loan_application_df = pd.DataFrame.append(accepted_df_2, rejected_sample)

len(loan_application_df)

113036

# Converting Employment Length to numeric for now
## Need to remove after csv files have been changed to numeric

In [13]:
loan_application_df['Employment Length']  = loan_application_df['Employment Length'].replace('< 1 year', 0.5).replace('10+ years', 10).\
replace('2 years', 2).replace('3 years', 3).replace('1 year', 1).replace('4 years', 4).replace('5 years', 5).\
replace('6 years', 6).replace('7 years', 7).replace('8 years', 8).replace('8 years', 8).replace('9 years', 9)

In [14]:
loan_application_df.head()

Unnamed: 0,Amount Requested,Loan Title,Debt-To-Income Ratio,Zip Code,State,Employment Length,Policy Code
0,32000.0,Debt consolidation,24.05,919xx,CA,10.0,1
1,9600.0,Debt consolidation,10.03,331xx,FL,,1
2,4000.0,Major purchase,19.53,333xx,FL,4.0,1
3,6025.0,Debt consolidation,9.16,021xx,MA,10.0,1
4,25000.0,Debt consolidation,36.26,926xx,CA,10.0,1


In [15]:
len(loan_application_df.dropna())

107013

In [16]:
loan_application_df = loan_application_df.dropna()

In [17]:
loan_application_df.head()

Unnamed: 0,Amount Requested,Loan Title,Debt-To-Income Ratio,Zip Code,State,Employment Length,Policy Code
0,32000.0,Debt consolidation,24.05,919xx,CA,10.0,1
2,4000.0,Major purchase,19.53,333xx,FL,4.0,1
3,6025.0,Debt consolidation,9.16,021xx,MA,10.0,1
4,25000.0,Debt consolidation,36.26,926xx,CA,10.0,1
5,20000.0,Debt consolidation,16.43,891xx,NV,10.0,1


In [18]:
loan_state = loan_application_df.drop(['Zip Code'], axis=1)
loan_state.head()

Unnamed: 0,Amount Requested,Loan Title,Debt-To-Income Ratio,State,Employment Length,Policy Code
0,32000.0,Debt consolidation,24.05,CA,10.0,1
2,4000.0,Major purchase,19.53,FL,4.0,1
3,6025.0,Debt consolidation,9.16,MA,10.0,1
4,25000.0,Debt consolidation,36.26,CA,10.0,1
5,20000.0,Debt consolidation,16.43,NV,10.0,1


In [19]:
loan_state = loan_state.astype({
    'Policy Code': int,
    'Amount Requested': int
})
loan_state.head()

Unnamed: 0,Amount Requested,Loan Title,Debt-To-Income Ratio,State,Employment Length,Policy Code
0,32000,Debt consolidation,24.05,CA,10.0,1
2,4000,Major purchase,19.53,FL,4.0,1
3,6025,Debt consolidation,9.16,MA,10.0,1
4,25000,Debt consolidation,36.26,CA,10.0,1
5,20000,Debt consolidation,16.43,NV,10.0,1


In [20]:
X = loan_state.drop(['Policy Code'], axis=1)
X.head()

Unnamed: 0,Amount Requested,Loan Title,Debt-To-Income Ratio,State,Employment Length
0,32000,Debt consolidation,24.05,CA,10.0
2,4000,Major purchase,19.53,FL,4.0
3,6025,Debt consolidation,9.16,MA,10.0
4,25000,Debt consolidation,36.26,CA,10.0
5,20000,Debt consolidation,16.43,NV,10.0


In [21]:
y = loan_state['Policy Code']
y.head()

0    1
2    1
3    1
4    1
5    1
Name: Policy Code, dtype: int32

In [22]:
X_dummies = pd.get_dummies(X)
# Scale the data
scaler = StandardScaler().fit(X_dummies)
X_scaled = scaler.transform(X_dummies)
# X_train_scaled


# Pickle model scaler

In [23]:
# filename = '../Models/loan_acceptance_scaler.sav'
# pickle.dump(scaler, open(filename, 'wb'))

In [24]:
X_scaled

array([[ 1.39292486, -0.02983799,  1.73077146, ..., -0.11711818,
        -0.05937997, -0.0446585 ],
       [-0.80117121, -0.03192195,  0.12878921, ..., -0.11711818,
        -0.05937997, -0.0446585 ],
       [-0.64249105, -0.03670308,  1.73077146, ..., -0.11711818,
        -0.05937997, -0.0446585 ],
       ...,
       [-0.72281063, -0.02933544,  0.39578626, ..., -0.11711818,
        -0.05937997, -0.0446585 ],
       [-0.17428662, -0.02807677, -0.80570043, ..., -0.11711818,
        -0.05937997, -0.0446585 ],
       [ 0.0607951 , -0.01344753, -0.80570043, ..., -0.11711818,
        -0.05937997, -0.0446585 ]])

In [25]:
X_scaled.shape

(107013, 2744)

In [32]:
def confusion_score(y_true, y_pred):
    [[TP, FN],[FP,TN]] = confusion_matrix(y_true, y_pred)

    accuracy = round((TP + TN) / (TP + FP + TN + FN),4) # (111 + 128) / (111 + 5 + 128 + 6)
#     print(f"Accuracy: {accuracy.round(4)}")
    precision = round((TP / (TP + FP)),4)
#     print(f'Precision: {precision.round(4)}')
    sensitivity = round(TP / (TP + FN),4)
#     print(f'Sensitivity: {sensitivity.round(4)}')
    specificity = round(TN / (TN + FP),4)
#     print(f'Specificity: {specificity.round(4)}')

    neg_predictive_value = round(TN / (TN + FN),4)
    
    matrix = [
#         ['','Predicted Class','Predicted Class',''],
        ['CONFUSION','Pred. Pos.','Pred. Neg.',''],
        ['MATRIX','_'*50],
        ['Act. Positive',TP,FN,sensitivity,'sensitivity'],
        ['Act. Negative',FP,TN,specificity,'specificity'],
        ['',precision,neg_predictive_value,accuracy,'accuracy'],
        ['','precision','neg. pred. value']
    ]
    
    for i in matrix:
        for j in i:
            print(f'{j:^16}|', end='')
        print("")

In [27]:
# IN CASE OF 3x3 CONFUSION MATRIX:

def confusion_score2(y_true, y_pred):
    [[a1, a2, a3],[b1,b2, b3],[c1,c2,c3]] = confusion_matrix(y_true, y_pred)
    TP = a1
    FN = a2 + a3
    FP = b1 + c1
    TN = b2 + b3 + c2 + c3

    accuracy = (TP + TN) / (TP + FP + TN + FN) # (111 + 128) / (111 + 5 + 128 + 6)
    print(f"Accuracy: {accuracy.round(3)}")
    precision = TP / (TP + FP)
    print(f'Precision: {precision.round(3)}')
    sensitivity = TP / (TP + FN)
    print(f'Sensitivity: {sensitivity.round(3)}')
    specificity = TN / (TN + FP)
    print(f'Specificity: {specificity.round(3)}')

In [24]:
# split the data into test and train

X_train, X_test, y_train, y_test = train_test_split(X_dummies, y, random_state=1)
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_scaled, y, random_state=1)

In [29]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Data Score: 0.9068391083865984
Testing Data Score: 0.8965388353143455


In [30]:
y_true = y_test
y_pred = classifier.predict(X_test)
print('Confusion Matrix:\n',confusion_matrix(y_true, y_pred), '\n')

confusion_score(y_true, y_pred)

Confusion Matrix:
 [[13019   562]
 [ 2206 10967]] 

Accuracy: 0.8965
Precision: 0.8551
Sensitivity: 0.9586
Specificity: 0.8325


In [28]:
y.unique()

array([1, 0])

In [29]:
y.value_counts()

0    54566
1    52447
Name: Policy Code, dtype: int64

#### We suspect that employment length is the largest differentiator between a loan application being accepted or rejected.

In [30]:
rejected_sample['Employment Length'].value_counts()

< 1 year     46876
5 years       4678
10+ years      903
1 year         551
2 years        423
3 years        385
4 years        261
6 years        144
8 years        127
7 years        118
9 years        101
Name: Employment Length, dtype: int64

In [31]:
accepted_df_2['Employment Length'].value_counts()

10+ years    18695
2 years       5081
< 1 year      4683
3 years       4505
1 year        3753
5 years       3606
4 years       3411
6 years       2562
7 years       2389
8 years       2258
9 years       1981
Name: Employment Length, dtype: int64

#### This seems to be a reasonable assumption.

# Pickle Logistic Regression Model

In [35]:
# filename = '../Models/loan_acceptance_model.sav'
# pickle.dump(classifier, open(filename, 'wb'))

# Random Forest Classifier

In [39]:
# Train a Random Forest Classifier model on the scaled data and print the model score
start = time.time()
clf = RandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)
end = time.time()
print(f'Training Score: {clf.score(X_train, y_train).round(4)}')
print(f'Testing Score: {clf.score(X_test, y_test).round(4)}')

# Random Forest Classifier Confusion Matrix
y_true = y_test
y_pred = clf.predict(X_test)
print('Confusion Matrix:\n',confusion_matrix(y_true, y_pred), '\n')

confusion_score(y_true, y_pred)

# Show time calculation for training model
print(f'\nModel Time: {end-start} secs')

Training Score: 0.9998
Testing Score: 0.9252
Confusion Matrix:
 [[12564  1017]
 [  983 12190]] 

Accuracy: 0.9252
Precision: 0.9274
Sensitivity: 0.9251
Specificity: 0.9254

Model Time: 191.47328901290894 secs


### Multithreaded

In [41]:
# Train a Random Forest Classifier model on the scaled data and print the model score
start = time.time()
with ThreadPoolExecutor() as executor:
    clf = RandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)
end = time.time()
print(f'Training Score: {clf.score(X_train, y_train).round(4)}')
print(f'Testing Score: {clf.score(X_test, y_test).round(4)}')

# Random Forest Classifier Confusion Matrix
y_true = y_test
y_pred = clf.predict(X_test)
print('Confusion Matrix:\n',confusion_matrix(y_true, y_pred), '\n')

confusion_score(y_true, y_pred)

# Show time calculation for training model
print(f'\nModel Time: {end-start} secs')

Training Score: 0.9998
Testing Score: 0.9252
Confusion Matrix:
 [[12564  1017]
 [  983 12190]] 

Accuracy: 0.9252
Precision: 0.9274
Sensitivity: 0.9251
Specificity: 0.9254

Model Time: 191.29166412353516 secs


### Multiprocess

In [42]:
# Train a Random Forest Classifier model on the scaled data and print the model score
start = time.time()
with ProcessPoolExecutor() as executor:
    clf = RandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)
end = time.time()
print(f'Training Score: {clf.score(X_train, y_train).round(4)}')
print(f'Testing Score: {clf.score(X_test, y_test).round(4)}')

# Random Forest Classifier Confusion Matrix
y_true = y_test
y_pred = clf.predict(X_test)
print('Confusion Matrix:\n',confusion_matrix(y_true, y_pred), '\n')

confusion_score(y_true, y_pred)

# Show time calculation for training model
print(f'\nModel Time: {end-start} secs')

Training Score: 0.9998
Testing Score: 0.9252
Confusion Matrix:
 [[12564  1017]
 [  983 12190]] 

Accuracy: 0.9252
Precision: 0.9274
Sensitivity: 0.9251
Specificity: 0.9254

Model Time: 191.14236617088318 secs


# Decision Tree

In [47]:
# Create and score a decision tree classifier

start = time.time()

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)


# Random Forest Classifier Confusion Matrix
y_true = y_test
y_pred = clf.predict(X_test)
print('Confusion Matrix:\n',confusion_matrix(y_true, y_pred), '\n')

confusion_score(y_true, y_pred)

end = time.time()
print(f'\nModel Time: {end-start} secs')

Confusion Matrix:
 [[12315  1266]
 [ 1304 11869]] 

Accuracy: 0.9039
Precision: 0.9043
Sensitivity: 0.9068
Specificity: 0.901

Model Time: 64.37062215805054 secs


# Check multiple models for score

In [28]:
def test_model(model, data):
    start = time.time()
    X_train_scaled, X_test_scaled, y_train, y_test = data

    reg = model.fit(X_train_scaled, y_train)
    print(f'Model: {type(reg).__name__}')
    print(f'Train score: {reg.score(X_train_scaled, y_train)}')
    print(f'Test Score: {reg.score(X_test_scaled, y_test)}\n')
#     plt.show()    
    y_true = y_test
    y_pred = model.predict(X_test_scaled)
    print('Confusion Matrix:\n',confusion_matrix(y_true, y_pred), '\n\n')
    confusion_score(y_true, y_pred)
    end = time.time()
    print(f'Process time: {round(end-start, 2)} seconds.')

    print('~'*40)
    
    filename = f'../Models/loan_acceptance/{type(reg).__name__}.sav'
    pickle.dump(model, open(filename, 'wb'))

In [26]:
data = [X_train_scaled, X_test_scaled, y_train, y_test]

In [None]:
# multiple regression tests

# test_model(LinearRegression(), data)

# test_model(KNeighborsRegressor(), data)
# test_model(RandomForestRegressor(), data)
# test_model(ExtraTreesRegressor(), data)
# test_model(AdaBoostRegressor(), data)
# test_model(SVR(C=1.0, epsilon=0.2), data)

Model: LinearRegression
Train score: 0.5179285866892313
Test Score: -8.198585342967208e+27

Confusion Matrix:
 [[ 8461  5120]
 [  544 12629]] 


Accuracy: 0.7883
Precision: 0.9396
Sensitivity: 0.623
Specificity: 0.9587
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model: KNeighborsRegressor
Train score: 0.8076869286119375
Test Score: 0.7072803577666905

Confusion Matrix:
 [[ 8461  5120]
 [  544 12629]] 


Accuracy: 0.7883
Precision: 0.9396
Sensitivity: 0.623
Specificity: 0.9587
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model: RandomForestRegressor
Train score: 0.9688942464839772
Test Score: 0.7757007881502956

Confusion Matrix:
 [[ 8461  5120]
 [  544 12629]] 


Accuracy: 0.7883
Precision: 0.9396
Sensitivity: 0.623
Specificity: 0.9587
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


In [30]:
# multiple classification tests

# test_model(LinearRegression(), data)

test_list = [    
    LogisticRegression(),
    RandomForestClassifier(random_state=1, n_estimators=100),
    tree.DecisionTreeClassifier(),
    KNeighborsClassifier(n_neighbors = 15),
    RandomForestClassifier(),
    ExtraTreesClassifier(),
    AdaBoostClassifier()
]


# test_model(LogisticRegression(),data)
with ProcessPoolExecutor() as executor:

    for model in test_list:

        try:

            test_model(model, data)

            # test_model(SVR(C=1.0, epsilon=0.2), data)
        except Exception as e:
            print('#'*40)
            print('Exception found.')
            print(e)
            print('#'*40)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: LogisticRegression
Train score: 0.9068391083865984
Test Score: 0.8965388353143455

Confusion Matrix:
 [[13019   562]
 [ 2206 10967]] 


Accuracy: 0.8965
Precision: 0.8551
Sensitivity: 0.9586
Specificity: 0.8325
Process time: 30.75 seconds.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model: RandomForestClassifier
Train score: 0.9998131050723283
Test Score: 0.9252448232040069

Confusion Matrix:
 [[12564  1017]
 [  983 12190]] 


Accuracy: 0.9252
Precision: 0.9274
Sensitivity: 0.9251
Specificity: 0.9254
Process time: 202.2 seconds.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model: DecisionTreeClassifier
Train score: 0.9999377016907761
Test Score: 0.9046123944083128

Confusion Matrix:
 [[12328  1253]
 [ 1299 11874]] 


Accuracy: 0.9046
Precision: 0.9047
Sensitivity: 0.9077
Specificity: 0.9014
Process time: 60.84 seconds.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Model: KNeighborsClassifier
Train score: 0.9086083803685568
Test Score: 0.8981834492038574

Confusion Matrix:
 [[12664   917]
 [

# Pickle Random Forest Classifier or other clf model

In [37]:
# filename = '../Models/loan_acceptance_model.sav'
# pickle.dump(clf, open(filename, 'wb'))