# Supervised Machine Learning: Predicting Credit Risk

### Set Up

In [1]:
# Import dependencies
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

In [2]:
# Load the lending data set from the project's Resources folder
file_path = "Resources/lending_data.csv"
lending_data = pd.read_csv(filepath_or_buffer=file_path)

# Display (rows, columns) as a quick sanity check on the load
lending_data.shape

(77536, 8)

In [3]:
# Separate the features from the target: X is every column except
# loan_status, y is the loan_status column itself
X = lending_data.drop("loan_status", axis="columns")
y = lending_data["loan_status"]

In [4]:
# Hold out a test set using scikit-learn's default split size; the fixed
# seed keeps the split reproducible across runs.
# NOTE(review): the target classes look heavily imbalanced — consider
# stratify=y (this would change every downstream metric, so confirm first).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [5]:
# Learn the scaling parameters from the training split only, then apply
# the same fitted scaler to both splits so no test information leaks in
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

### Prediction

I predict that the Random Forest Classifier will perform better. The `borrower_income`, `debt_to_income`, and `total_debt` fields are all related to one another. Logistic regression assumes little or no multicollinearity among the independent variables, and that assumption doesn't seem to hold in this case.

### Logistic Regression

In [6]:
# Instantiate the logistic regression classifier with scikit-learn's default settings

log_res = LogisticRegression()

In [7]:
# Fit the logistic regression on the scaled training features and training labels

log_res.fit(X_train_scaled, y_train)

LogisticRegression()

In [41]:
# Mean accuracy on each split, expressed as a percentage rounded to
# three decimals; comparing the two gives a quick overfitting check
accuracy_lr_train = round(100 * log_res.score(X_train_scaled, y_train), 3)
accuracy_lr_test = round(100 * log_res.score(X_test_scaled, y_test), 3)

print(f"Logistic Regression accuracy on the training set: {accuracy_lr_train}%")
print(f"Logistic Regression accuracy on the test set: {accuracy_lr_test}%")

Logistic Regression accuracy on the training set: 99.412%
Logistic Regression accuracy on the test set: 99.417%


In [9]:
# Compare the held-out labels with the model's predictions on the scaled
# test features; rows of the matrix are true classes, columns are predictions
y_true_lr = y_test
y_pred_lr = log_res.predict(X_test_scaled)

confusion_matrix(y_true=y_true_lr, y_pred=y_pred_lr)

array([[18690,   102],
       [   11,   581]], dtype=int64)

In [10]:
# Unpack the 2x2 confusion matrix row by row:
# first row is the true-negative class, second row the true-positive class
(tn_lr, fp_lr), (fn_lr, tp_lr) = confusion_matrix(y_true_lr, y_pred_lr)

print(f"True Negatives: {tn_lr}, False Positives: {fp_lr}")
print(f"False Negatives: {fn_lr}, True Positives: {tp_lr}")

True Negatives: 18690, False Positives: 102
False Negatives: 11, True Positives: 581


In [50]:
# Precision = TP / (TP + FP): of the loans flagged positive, the share
# that really were positive, as a percentage rounded to three decimals
precision_lr = round(tp_lr / (tp_lr + fp_lr) * 100, 3)
print(f"Logistic Regression Precision: {precision_lr}%")

Logistic Regression Precision: 85.066%


In [52]:
# Sensitivity (recall) = TP / (TP + FN): of the truly positive loans, the
# share the model caught, as a percentage rounded to three decimals
sensitivity_lr = round(tp_lr / (tp_lr + fn_lr) * 100, 3)
print(f"Logistic Regression Sensitivity: {sensitivity_lr}%")

Logistic Regression Sensitivity: 98.142%


### Random Forest Classifier

In [28]:
# Build a 100-tree random forest; the fixed seed makes the fitted
# ensemble reproducible from run to run
rand_forest = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

In [29]:
# Fit the random forest on the scaled training features and training labels
# (the same inputs the logistic regression was trained on)

rand_forest.fit(X_train_scaled, y_train)

RandomForestClassifier(random_state=42)

In [43]:
# Mean accuracy on each split, expressed as a percentage rounded to
# three decimals; a large train/test gap would point to overfitting
accuracy_rf_train = round(100 * rand_forest.score(X_train_scaled, y_train), 3)
accuracy_rf_test = round(100 * rand_forest.score(X_test_scaled, y_test), 3)

print(f"Random Forest Classifier accuracy on the training set: {accuracy_rf_train}%")
print(f"Random Forest Classifier accuracy on the testing set: {accuracy_rf_test}%")

Random Forest Classifier accuracy on the training set: 99.718%
Random Forest Classifier accuracy on the testing set: 99.175%


In [31]:
# Compare the held-out labels with the forest's predictions on the scaled
# test features; rows of the matrix are true classes, columns are predictions
y_true_rf = y_test
y_pred_rf = rand_forest.predict(X_test_scaled)

confusion_matrix(y_true=y_true_rf, y_pred=y_pred_rf)

array([[18694,    98],
       [   62,   530]], dtype=int64)

In [32]:
# Unpack the 2x2 confusion matrix row by row:
# first row is the true-negative class, second row the true-positive class
(tn_rf, fp_rf), (fn_rf, tp_rf) = confusion_matrix(y_true_rf, y_pred_rf)

print(f"True Negatives: {tn_rf}, False Positives: {fp_rf}")
print(f"False Negatives: {fn_rf}, True Positives: {tp_rf}")

True Negatives: 18694, False Positives: 98
False Negatives: 62, True Positives: 530


In [44]:
# Precision = TP / (TP + FP): of the loans flagged positive, the share
# that really were positive, as a percentage rounded to three decimals
precision_rf = round(tp_rf / (tp_rf + fp_rf) * 100, 3)
print(f"Random Forest Classifier Precision: {precision_rf}%")

Random Forest Classifier Precision: 84.395%


In [46]:
# Sensitivity (recall) = TP / (TP + FN): of the truly positive loans, the
# share the model caught, as a percentage rounded to three decimals
sensitivity_rf = round(tp_rf / (tp_rf + fn_rf) * 100, 3)
print(f"Random Forest Classifier Sensitivity: {sensitivity_rf}%")

Random Forest Classifier Sensitivity: 89.527%


### Analysis and Conclusions

In this comparison of logistic regression and random forest classification, I notice that the results are very similar between the two models, but the logistic regression does slightly better. The models are close to each other in their accuracy and precision, but the logistic regression is significantly more sensitive (98.142% vs. 89.527%), meaning it misses far fewer of the truly high-risk loans. I am a little surprised that the logistic regression did better, because I had concerns that the predictor variables were too correlated with each other, but I can see that the logistic regression wins out in spite of that.

The logistic regression was also a lot faster than the random forest classifier, so if I were to approach a problem like this again, I would probably run a logistic regression first, to see how well it does on the data set. If it didn't perform well, I would move on to trying another method, such as a random forest classifier.

In [57]:
# Side-by-side recap of the test-set accuracy, precision, and sensitivity
# for both models, for quick reference

print(f"Logistic Regression accuracy: {accuracy_lr_test}%, Random Forest accuracy: {accuracy_rf_test}%")
print(f"Logistic Regression precision: {precision_lr}%, Random Forest precision: {precision_rf}%")
print(f"Logistic Regression sensitivity: {sensitivity_lr}%, Random Forest sensitivity: {sensitivity_rf}%")

Logistic Regression accuracy: 99.417%, Random Forest accuracy: 99.175%
Logistic Regression precision: 85.066%, Random Forest precision: 84.395%
Logistic Regression sensitivity: 98.142%, Random Forest sensitivity: 89.527%
