# Supervised Machine Learning: Predicting Credit Risk

### Set Up

In [2]:
# Import dependencies
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

In [3]:
# Load the lending CSV (path is relative to the notebook) into a DataFrame,
# then display its (rows, columns) shape as a quick check of the load.
file_path = "Resources/lending_data.csv"
lending_data = pd.read_csv(filepath_or_buffer=file_path)

lending_data.shape

(77536, 8)

In [4]:
# Separate the features from the target:
#   X -> every column except loan_status
#   y -> the loan_status column itself
X = lending_data.drop(columns=["loan_status"])
y = lending_data["loan_status"]

In [5]:
# Partition the features and target into training and testing subsets.
# No test_size is passed, so the library default proportion applies; the
# seed is fixed so the same partition is produced on every run.
# NOTE(review): the confusion matrices printed later in this notebook show
# far fewer positive than negative loans in the test set - consider
# passing stratify=y so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [6]:
# Standardize the features. The scaler learns its parameters from the
# training split only; the same fitted scaler is then applied to the
# testing split so no test-set information enters the fit.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Prediction

I predict that the Random Forest Classifier will perform better. The `borrower_income`, `debt_to_income`, and `total_debt` fields are all related to one another. Logistic regression assumes little or no multicollinearity among the independent variables, and that assumption does not hold in this case.

### Logistic Regression

In [7]:
# Instantiate an unfitted logistic regression classifier; no arguments are
# passed, so every hyperparameter keeps its library default.
log_res = LogisticRegression()

In [8]:
# Fit the logistic regression on the scaled training features and their
# labels. fit() is the last expression, so the fitted estimator is echoed
# as the cell output.
log_res.fit(X=X_train_scaled, y=y_train)

LogisticRegression()

In [9]:
# Report the logistic regression's score() on both splits so the training
# and testing values can be compared side by side.
lr_train_score = log_res.score(X_train_scaled, y_train)
lr_test_score = log_res.score(X_test_scaled, y_test)

print(f'Model score on training set: {lr_train_score}')
print(f'Model score on testing set: {lr_test_score}')

Model score on training set: 0.9941188609162196
Model score on testing set: 0.9941704498555509


In [25]:
# Compare the logistic regression's test-set predictions with the actual
# labels. The confusion matrix is the last expression, so it is displayed
# as the cell output.
y_pred_lr = log_res.predict(X_test_scaled)
y_true_lr = y_test

confusion_matrix(y_true=y_true_lr, y_pred=y_pred_lr)

array([[18690,   102],
       [   11,   581]], dtype=int64)

In [26]:
# Unpack the 2x2 confusion matrix row by row into its four outcome counts:
# first row -> (true negatives, false positives),
# second row -> (false negatives, true positives).
(tn_lr, fp_lr), (fn_lr, tp_lr) = confusion_matrix(y_true_lr, y_pred_lr)

print(f'True Negatives: {tn_lr}, False Positives: {fp_lr}')
print(f'False Negatives: {fn_lr}, True Positives: {tp_lr}')

True Negatives: 18690, False Positives: 102
False Negatives: 11, True Positives: 581


In [27]:
# Precision = TP / (TP + FP): of the loans the logistic regression flagged
# as positive, the fraction that really were positive.
predicted_positive_lr = tp_lr + fp_lr
precision_lr = tp_lr / predicted_positive_lr
print(f'Precision: {precision_lr}')

Precision: 0.8506588579795022


In [28]:
# Sensitivity (recall) = TP / (TP + FN): of the loans that really were
# positive, the fraction the logistic regression caught.
actual_positive_lr = tp_lr + fn_lr
sensitivity_lr = tp_lr / actual_positive_lr
print(f'Sensitivity: {sensitivity_lr}')

Sensitivity: 0.981418918918919


### Random Forest Classifier

In [17]:
# Instantiate an unfitted random forest with 100 trees; the seed is fixed
# so repeated runs build the same forest.
rand_forest = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

In [19]:
# Fit the random forest on the same scaled training features used for the
# logistic regression. fit() is the last expression, so the fitted
# estimator is echoed as the cell output.
rand_forest.fit(X=X_train_scaled, y=y_train)

RandomForestClassifier(random_state=42)

In [20]:
# Report the random forest's score() on both splits so the training and
# testing values can be compared side by side.
rf_train_score = rand_forest.score(X_train_scaled, y_train)
rf_test_score = rand_forest.score(X_test_scaled, y_test)

print(f'Model score on training set: {rf_train_score}')
print(f'Model score on testing set: {rf_test_score}')

Model score on training set: 0.9971798046498831
Model score on testing set: 0.9917457697069748


In [21]:
# Compare the random forest's test-set predictions with the actual labels.
# The confusion matrix is the last expression, so it is displayed as the
# cell output.
y_pred_rf = rand_forest.predict(X_test_scaled)
y_true_rf = y_test

confusion_matrix(y_true=y_true_rf, y_pred=y_pred_rf)

array([[18694,    98],
       [   62,   530]], dtype=int64)

In [22]:
# Unpack the 2x2 confusion matrix row by row into its four outcome counts:
# first row -> (true negatives, false positives),
# second row -> (false negatives, true positives).
(tn_rf, fp_rf), (fn_rf, tp_rf) = confusion_matrix(y_true_rf, y_pred_rf)

print(f'True Negatives: {tn_rf}, False Positives: {fp_rf}')
print(f'False Negatives: {fn_rf}, True Positives: {tp_rf}')

True Negatives: 18694, False Positives: 98
False Negatives: 62, True Positives: 530


In [23]:
# Precision = TP / (TP + FP): of the loans the random forest flagged as
# positive, the fraction that really were positive.
predicted_positive_rf = tp_rf + fp_rf
precision_rf = tp_rf / predicted_positive_rf
print(f'Precision: {precision_rf}')

Precision: 0.8439490445859873


In [24]:
# Sensitivity (recall) = TP / (TP + FN): of the loans that really were
# positive, the fraction the random forest caught.
actual_positive_rf = tp_rf + fn_rf
sensitivity_rf = tp_rf / actual_positive_rf
print(f'Sensitivity: {sensitivity_rf}')

Sensitivity: 0.8952702702702703


### Analysis and Conclusions