In [27]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

## Prediction 
I think the logistic regression model will perform better because it lets us draw a line (a linear decision boundary) between risky and non-risky customers. This will then allow us to predict which class a new customer should belong to based on which side of the line they fall on.

In [4]:
# Load the lending dataset (path is relative to the notebook's working directory)
# and show the frame as the cell output.
lending_data = pd.read_csv(Path('Resources/lending_data.csv'))
lending_data

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.430740,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
...,...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1


In [7]:
# Count the rows in each loan_status class to see how balanced the target is
# before modelling (accuracy alone can look high when one class dominates).
lending_data['loan_status'].value_counts()

0    75036
1     2500
Name: loan_status, dtype: int64

In [11]:
# Features: every column except the loan_status label.
X = lending_data.drop('loan_status', axis=1)
# Target: the loan_status label column.
y = lending_data['loan_status']

# Assigning target names: these are passed to classification_report, where the
# first name labels class 0 and the second labels class 1.
names_target = ['non-risky', 'risky']

In [13]:
# Split the data into X_train, X_test, y_train, y_test.
# test_size=0.25 is train_test_split's default, written out here for clarity;
# the fixed random_state keeps the split reproducible across runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.25, random_state=50)
X_test.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
71710,9900.0,7.323,49500,0.393939,4,0,19500
11304,9400.0,7.134,47700,0.371069,3,0,17700
41806,9200.0,7.04,46800,0.358974,3,0,16800
29492,10100.0,7.431,50500,0.405941,4,1,20500
19244,9400.0,7.121,47600,0.369748,3,0,17600


In [16]:
# Logistic model: create an unfitted LogisticRegression with default settings.
# The bare expression on the last line displays the estimator as the cell output.
log_classifier = LogisticRegression()
log_classifier

LogisticRegression()

In [17]:
# Train the logistic regression on the training split; `lr` holds what fit() returns.
lr = log_classifier.fit(X_train, y_train)

# score() gives mean accuracy; compare held-out data against the data it was fit on.
test_score = lr.score(X_test, y_test)
train_score = lr.score(X_train, y_train)
print(f'Testing Data Score: {test_score}')
print(f'Training Data Score: {train_score}')

Testing Data Score: 0.9925196037969459
Training Data Score: 0.9918317512725272


In [19]:
# Predict on the held-out features and tabulate them against the true labels.
y_pred = log_classifier.predict(X_test)
y_true = y_test
confusion_matrix(y_true=y_true, y_pred=y_pred)

array([[18682,    88],
       [   57,   557]], dtype=int64)

In [20]:
# Flatten the 2x2 confusion matrix into its four cells (row-major order).
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
# Accuracy = correct predictions / all predictions.
# For the run shown: (18682 + 557) / (18682 + 88 + 57 + 557)
accuracy = (tn + tp) / (tn + fp + fn + tp)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9925196037969459


In [31]:
# Per-class precision, recall and F1 for the logistic regression predictions,
# with the classes shown under their readable names.
report = classification_report(y_true, y_pred, target_names=names_target)
print(report)

              precision    recall  f1-score   support

   non-risky       1.00      1.00      1.00     18770
       risky       0.86      0.91      0.88       614

    accuracy                           0.99     19384
   macro avg       0.93      0.95      0.94     19384
weighted avg       0.99      0.99      0.99     19384



In [26]:
# Random Forest Classifier: data preparation.
# Re-create the split with the same seed used for the logistic regression so
# both models are trained and evaluated on identical rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=50)

# Learn the scaling statistics from the training data only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [28]:
# Build a 500-tree random forest with a fixed seed, then train it on the scaled features.
clf = RandomForestClassifier(n_estimators=500, random_state=50)
clf.fit(X_train_scaled, y_train)

# Mean accuracy on the data the forest was fit on versus the held-out split.
rf_train_score = clf.score(X_train_scaled, y_train)
rf_test_score = clf.score(X_test_scaled, y_test)
print(f'Training Score: {rf_train_score}')
print(f'Testing Score: {rf_test_score}')

Training Score: 0.9973001788416563
Testing Score: 0.9920553033429633


## Conclusion 
Both models score above 99% accuracy on the testing data, so both would be good models for predicting accurate outcomes. However, the logistic regression model is slightly better because its training and testing scores are closer together, with only a 0.000688 difference (compared with roughly 0.0052 for the random forest), and its testing score is also marginally higher.