In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [3]:
accepted_df = pd.read_csv("../Resources/lending_club_accepted.csv")
accepted_df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,emp_length,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,disbursement_method
0,32000.0,60 months,10.49,687.65,10+ years,MORTGAGE,120000.0,Verified,Current,n,...,4.0,100.0,28.6,0.0,0.0,556496.0,103647.0,64100.0,72197.0,Cash
1,9600.0,36 months,12.99,323.42,,RENT,21900.0,Verified,Fully Paid,n,...,2.0,100.0,50.0,1.0,0.0,11600.0,4509.0,2400.0,0.0,Cash
2,4000.0,36 months,6.68,122.93,4 years,MORTGAGE,83000.0,Not Verified,Fully Paid,n,...,2.0,100.0,0.0,0.0,0.0,222616.0,64253.0,5600.0,76154.0,Cash
3,6025.0,36 months,10.91,197.0,10+ years,RENT,52000.0,Not Verified,Fully Paid,n,...,0.0,96.0,0.0,0.0,0.0,32227.0,5559.0,11000.0,11127.0,Cash
4,25000.0,60 months,26.3,752.96,10+ years,OWN,65000.0,Verified,Current,n,...,3.0,78.8,10.0,0.0,0.0,257219.0,97647.0,179400.0,65719.0,Cash


In [4]:
rejected_df = pd.read_csv("../Resources/lending_club_rejected.csv")
rejected_df.head()

Unnamed: 0,Amount Requested,Application Date,Loan Title,Risk_Score,Debt-To-Income Ratio,Zip Code,State,Employment Length,Policy Code
0,15000.0,2015-10-29,debt_consolidation,,54.58%,761xx,TX,< 1 year,0.0
1,10000.0,2017-10-28,Credit card refinancing,605.0,28.71%,076xx,NJ,,0.0
2,10000.0,2018-03-09,Debt consolidation,,58.06%,087xx,NJ,< 1 year,0.0
3,7000.0,2016-10-18,Debt consolidation,571.0,6.66%,731xx,OK,< 1 year,0.0
4,5000.0,2017-02-07,other,,2.23%,537xx,WI,< 1 year,0.0


In [27]:
accepted_df_2 = accepted_df[['loan_amnt', 'title', 'dti', 'zip_code','addr_state', 'emp_length', 'policy_code']].rename(columns = {
    'loan_amnt': 'Amount Requested',
    'title': 'Loan Title',
    'dti': 'Debt-To-Income Ratio',
    'zip_code': 'Zip Code',
    'addr_state': 'State',
    'emp_length': 'Employment Length',
    'policy_code': 'Policy Code'
})
accepted_df_2.head()

Unnamed: 0,Amount Requested,Loan Title,Debt-To-Income Ratio,Zip Code,State,Employment Length,Policy Code
0,32000.0,Debt consolidation,24.05,919xx,CA,10+ years,1.0
1,9600.0,Debt consolidation,10.03,331xx,FL,,1.0
2,4000.0,Major purchase,19.53,333xx,FL,4 years,1.0
3,6025.0,Debt consolidation,9.16,021xx,MA,10+ years,1.0
4,25000.0,Debt consolidation,36.26,926xx,CA,10+ years,1.0


In [28]:
rejected_df_2 = rejected_df.drop(['Application Date', 'Risk_Score'], axis=1)
rejected_df_2.head()

Unnamed: 0,Amount Requested,Loan Title,Debt-To-Income Ratio,Zip Code,State,Employment Length,Policy Code
0,15000.0,debt_consolidation,54.58%,761xx,TX,< 1 year,0.0
1,10000.0,Credit card refinancing,28.71%,076xx,NJ,,0.0
2,10000.0,Debt consolidation,58.06%,087xx,NJ,< 1 year,0.0
3,7000.0,Debt consolidation,6.66%,731xx,OK,< 1 year,0.0
4,5000.0,other,2.23%,537xx,WI,< 1 year,0.0


In [29]:
len(accepted_df_2)

56518

In [30]:
len(rejected_df_2)

691219

In [31]:
rejected_sample = rejected_df_2.sample(frac=0.0817656922, random_state=42)
len(rejected_sample)

56518

In [61]:
# Some policy codes are 2.
# https://news.fintechnexus.com/policy-code-2-loans-lending-club/
# policy code 0 means rejected
# policy code 1 means accepted
# policy code 2 means accepted, but considered sub-prime on credit rating

# We're merely predicting whether or not someone's loan will be accepted or rejected, 
# so we'll change the status and simplify our model
rejected_sample['Policy Code'] = 0
accepted_df_2['Policy Code'] = 1

In [32]:
loan_application_df = pd.DataFrame.append(accepted_df_2, rejected_sample)

len(loan_application_df)

113036

In [33]:
loan_application_df.head()

Unnamed: 0,Amount Requested,Loan Title,Debt-To-Income Ratio,Zip Code,State,Employment Length,Policy Code
0,32000.0,Debt consolidation,24.05,919xx,CA,10+ years,1.0
1,9600.0,Debt consolidation,10.03,331xx,FL,,1.0
2,4000.0,Major purchase,19.53,333xx,FL,4 years,1.0
3,6025.0,Debt consolidation,9.16,021xx,MA,10+ years,1.0
4,25000.0,Debt consolidation,36.26,926xx,CA,10+ years,1.0


In [34]:
len(loan_application_df.dropna())

107008

In [35]:
loan_application_df = loan_application_df.dropna()

In [36]:
loan_application_df.head()

Unnamed: 0,Amount Requested,Loan Title,Debt-To-Income Ratio,Zip Code,State,Employment Length,Policy Code
0,32000.0,Debt consolidation,24.05,919xx,CA,10+ years,1.0
2,4000.0,Major purchase,19.53,333xx,FL,4 years,1.0
3,6025.0,Debt consolidation,9.16,021xx,MA,10+ years,1.0
4,25000.0,Debt consolidation,36.26,926xx,CA,10+ years,1.0
5,20000.0,Debt consolidation,16.43,891xx,NV,10+ years,1.0


In [37]:
loan_state = loan_application_df.drop(['Zip Code'], axis=1)
loan_state.head()

Unnamed: 0,Amount Requested,Loan Title,Debt-To-Income Ratio,State,Employment Length,Policy Code
0,32000.0,Debt consolidation,24.05,CA,10+ years,1.0
2,4000.0,Major purchase,19.53,FL,4 years,1.0
3,6025.0,Debt consolidation,9.16,MA,10+ years,1.0
4,25000.0,Debt consolidation,36.26,CA,10+ years,1.0
5,20000.0,Debt consolidation,16.43,NV,10+ years,1.0


In [43]:
loan_state = loan_state.astype({
    'Policy Code': int,
    'Amount Requested': int
})
loan_state.head()

Unnamed: 0,Amount Requested,Loan Title,Debt-To-Income Ratio,State,Employment Length,Policy Code
0,32000,Debt consolidation,24.05,CA,10+ years,1
2,4000,Major purchase,19.53,FL,4 years,1
3,6025,Debt consolidation,9.16,MA,10+ years,1
4,25000,Debt consolidation,36.26,CA,10+ years,1
5,20000,Debt consolidation,16.43,NV,10+ years,1


In [44]:
X = loan_state.drop(['Policy Code'], axis=1)
X.head()

Unnamed: 0,Amount Requested,Loan Title,Debt-To-Income Ratio,State,Employment Length
0,32000,Debt consolidation,24.05,CA,10+ years
2,4000,Major purchase,19.53,FL,4 years
3,6025,Debt consolidation,9.16,MA,10+ years
4,25000,Debt consolidation,36.26,CA,10+ years
5,20000,Debt consolidation,16.43,NV,10+ years


In [45]:
y = loan_state['Policy Code']
y.head()

0    1
2    1
3    1
4    1
5    1
Name: Policy Code, dtype: int32

In [None]:
X_dummies = pd.get_dummies(X)
# Scale the data
scaler = StandardScaler().fit(X_dummies)
X_scaled = scaler.transform(X_dummies)
# X_train_scaled


In [48]:
X_scaled

array([[ 1.3928828 , -0.01729545, -0.00305699, ..., -0.14991207,
        -0.14055279, -0.96396015],
       [-0.80117426, -0.01729545, -0.00305699, ..., -0.14991207,
        -0.14055279, -0.96396015],
       [-0.64249692, -0.01729545, -0.00305699, ..., -0.14991207,
        -0.14055279, -0.96396015],
       ...,
       [-0.72281508, -0.01729545, -0.00305699, ..., -0.14991207,
        -0.14055279, -0.96396015],
       [-0.17430082, -0.01729545, -0.00305699, ..., -0.14991207,
        -0.14055279,  1.03738728],
       [ 0.06077673, -0.01729545, -0.00305699, ..., -0.14991207,
        -0.14055279,  1.03738728]])

In [49]:
def confusion_score(y_true, y_pred):
    [[TP, FN],[FP,TN]] = confusion_matrix(y_true, y_pred)

    accuracy = (TP + TN) / (TP + FP + TN + FN) # (111 + 128) / (111 + 5 + 128 + 6)
    print(f"Accuracy: {accuracy.round(2)}")
    precision = TP / (TP + FP)
    print(f'Precision: {precision.round(2)}')
    sensitivity = TP / (TP + FN)
    print(f'Sensitivity: {sensitivity.round(2)}')
    specificity = TN / (TN + FP)
    print(f'Specificity: {specificity.round(2)}')

In [None]:
def confusion_score(y_true, y_pred):
    [[TP, FN],[FP,TN]] = confusion_matrix(y_true, y_pred)

    accuracy = (TP + TN) / (TP + FP + TN + FN) # (111 + 128) / (111 + 5 + 128 + 6)
    print(f"Accuracy: {accuracy.round(2)}")
    precision = TP / (TP + FP)
    print(f'Precision: {precision.round(2)}')
    sensitivity = TP / (TP + FN)
    print(f'Sensitivity: {sensitivity.round(2)}')
    specificity = TN / (TN + FP)
    print(f'Specificity: {specificity.round(2)}')

In [55]:
def confusion_score2(y_true, y_pred):
    [[a1, a2, a3],[b1,b2, b3],[c1,c2,c3]] = confusion_matrix(y_true, y_pred)
    TP = a1
    FN = a2 + a3
    FP = b1 + c1
    TN = b2 + b3 + c2 + c3

    accuracy = (TP + TN) / (TP + FP + TN + FN) # (111 + 128) / (111 + 5 + 128 + 6)
    print(f"Accuracy: {accuracy.round(3)}")
    precision = TP / (TP + FP)
    print(f'Precision: {precision.round(3)}')
    sensitivity = TP / (TP + FN)
    print(f'Sensitivity: {sensitivity.round(3)}')
    specificity = TN / (TN + FP)
    print(f'Specificity: {specificity.round(3)}')

In [50]:
# split the data into test and train

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=1)

In [51]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Data Score: 0.9994517543859649
Testing Data Score: 0.9935705741626795


In [56]:
y_true = y_test
y_pred = classifier.predict(X_test)
print('Confusion Matrix:\n',confusion_matrix(y_true, y_pred), '\n')

confusion_score(y_true, y_pred)

Confusion Matrix:
 [[13461    67    26]
 [   29 13118    12]
 [   38     0     1]] 

Accuracy: 0.994
Precision: 0.995
Sensitivity: 0.993
Specificity: 0.995


In [58]:
y.unique()

array([1, 0, 2])

In [60]:
y.value_counts()

0    54405
1    52447
2      156
Name: Policy Code, dtype: int64