# Supervised Machine Learning Homework - Predicting Credit Risk


In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Import the data
lending_data = pd.read_csv(Path('Resources/lending_data.csv'))
lending_data.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [3]:
lending_data.loan_status.unique()

array([0, 1], dtype=int64)

In [4]:
lending_data.isnull().values.any()

False

In [5]:
# Split the data into X_train, X_test, y_train, y_test
y = lending_data['loan_status'].values
X = lending_data.drop('loan_status', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [6]:
np.unique(y)

array([0, 1], dtype=int64)

In [8]:
# Train a Logistic Regression model print the model score
logistic_regression = LogisticRegression().fit(X_train, y_train)

print('LogisticRegression Training score: ', logistic_regression.score(X_train,y_train))
print('LogisticRegression Test score: ', logistic_regression.score(X_test, y_test))

LogisticRegression Training score:  0.9921240885954051
LogisticRegression Test score:  0.9918489475856377


In [10]:
# Train a Random Forest Classifier model and print the model score
RandomForest_classifier = RandomForestClassifier(random_state=1)
RandomForest_classifier.fit(X_train, y_train)
print('RandomForestClassifier Training score: ', RandomForest_classifier.score(X_train,y_train))
print('RandomForestClassifier Test score: ', RandomForest_classifier.score(X_test,y_test))

RandomForestClassifier Training score:  0.9975409272252029
RandomForestClassifier Test score:  0.9914878250103177


In [13]:
# Scale the data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
# Train the Logistic Regression model on the scaled data and print the model score
scaled_logistic = LogisticRegression(
    solver='lbfgs',
    max_iter=100,
    random_state=1
)
scaled_logistic.fit(X_train_scaled, y_train)

print('Scaled LogisticRegression Training score: ', scaled_logistic.score(X_train_scaled, y_train))
print('Scaled LogisticRegression Test score: ',scaled_logistic.score(X_test_scaled, y_test))

Scaled LogisticRegression Training score:  0.9942908240473243
Scaled LogisticRegression Test score:  0.9936545604622369


In [15]:
# Train a Random Forest Classifier model on the scaled data and print the model score
scaled_random_forest = RandomForestClassifier(random_state=0)
scaled_random_forest.fit(X_train_scaled, y_train)
print('Scaled RandomForestClassifier Training score: ', scaled_random_forest.score(X_train_scaled,y_train))
print('Scaled RandomForestClassifier Test score: ', scaled_random_forest.score(X_test_scaled, y_test))

Scaled RandomForestClassifier Training score:  0.9975409272252029
Scaled RandomForestClassifier Test score:  0.9915910028889806


###  Logistic Regression Confusion

In [16]:
# Create a confusion matrix from the test values and predictions for the Logistic Regression model
y_true = y_test
y_logistic_pred = logistic_regression.predict(X_test)
confusion_matrix(y_true, y_logistic_pred)

array([[18663,   102],
       [   56,   563]], dtype=int64)

In [17]:
tn, fp, fn, tp = confusion_matrix(y_true,y_logistic_pred).ravel()
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score= 2 * precision * recall / (precision + recall)
print(f'Logistic Regression precision: {precision}')
print(f'Logistic Regression recall: {recall}')
print(f'Logistic Regression f1_scores: {f1_score}')

Logistic Regression precision: 0.8466165413533835
Logistic Regression recall: 0.9095315024232633
Logistic Regression f1_scores: 0.8769470404984423


###  Random Forest Classifier Confusion

In [18]:
# Create a confusion matrix from the test values and predictions for the Random Forest Classifier model
y_true = y_test
y_random_pred = RandomForest_classifier.predict(X_test)
confusion_matrix(y_true, y_random_pred)

array([[18666,    99],
       [   66,   553]], dtype=int64)

In [19]:
tn, fp, fn, tp = confusion_matrix(y_true, y_random_pred).ravel()
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score= 2 * precision * recall / (precision + recall)
print(f'Random Forest precision: {precision}')
print(f'Random Forest recall: {recall}')
print(f'Random Forest precision f1_score: {f1_score}')

Random Forest precision: 0.848159509202454
Random Forest recall: 0.8933764135702746
Random Forest precision f1_score: 0.8701809598741148
