In [1]:
# Import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [2]:
# Read the file lending_data.csv from the Resources folder into a Pandas DataFrame
lending_data_df = pd.read_csv("../Starter_Code/Resources/lending_data.csv")

# Preview the dataframe
lending_data_df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [3]:
# Separate the data into labels and features
# loan_status will be the labels, all other columns will be features
# note: 
#     loan_status = 0 => the loan is healthy
#     loan_status = 1 => the loan is at high risk of defaulting

# Pull out the first 7 columns into their own dataframe.
features_df = lending_data_df.iloc[:,0:7]

# Preview the features.

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [7]:
# And now separate out the labels.
labels = lending_data_df["loan_status"]

# Preview the labels.
labels.head()

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

In [9]:
# Check the balance of the labels. I should only have values 0 and 1.
labels.value_counts()

0    75036
1     2500
Name: loan_status, dtype: int64

In [10]:
# And we don't have any surprise extra labels, which is good. 
# Since we do have quite a bit more healthy loans in our set, 
# I would think that the model later on will be more inclined to predict new loans as being healthy.

# Next step: Split the data into training and testing datasets. Still have to import the train_test_split module.
from sklearn.model_selection import train_test_split

In [11]:
# Split the data using train_test_split
# This should generate 4 groups of data
# Assign a random_state of 1 to the function

x_train,x_test,y_train,y_test = train_test_split(features_df,labels,random_state=1,stratify=labels)

In [12]:
# Just out of curiosity, check on the size of the training vs testing sets.
len(x_train)

58152

In [13]:
len(x_test)

19384

In [16]:
# Part 1: Create a logistic regression model with the original data.
# Fit a logistic regression model by using the training data.

# We still have to import the logistic regression module.
from sklearn.linear_model import LogisticRegression

In [18]:
# Instantiate the Logistic Regression model and assign a random_state parameter of 1 to the model
classifier_model = LogisticRegression(random_state=1)

# Train the model with our training data
classifier_model.fit(x_train,y_train)

LogisticRegression(random_state=1)

In [19]:
# Check the model's accuracy on the training data. 
classifier_model.score(x_train,y_train)

0.9914878250103177

In [21]:
# Make predictions using the testing data set.
predictions = classifier_model.predict(x_test)

In [23]:
# Evaluate the model's performance with: balanced accuracy score, confusion matrix, and classification report.
# These have already been imported.

# First up is the balanced accuracy score. 
balanced_accuracy_score(y_test,predictions)

0.9442676901753825

In [24]:
# Next is the confusion matrix.
confusion_matrix(y_test,predictions)

array([[18679,    80],
       [   67,   558]], dtype=int64)

In [27]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18759
           1       0.87      0.89      0.88       625

    accuracy                           0.99     19384
   macro avg       0.94      0.94      0.94     19384
weighted avg       0.99      0.99      0.99     19384



In [35]:
# Our scores are spot on for loans with a loan_status of 0, not so much for high risk loans with status 1.
# This is as expected previously.

# Part 2: Predict a logistic regression model with resampled training data.
# We'll have to use the RandomOverSampler module from the imbalanced-learn library to resample the data. 
# First, I still have to import the RandomOverSampler module.
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# Assign a random_state parameter of 1 to the model
oversampler = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
x_oversampled,y_oversampled = oversampler.fit_resample(features_df,labels)

# Be sure to confirm that the labels have an equal number of data points.
y_oversampled.value_counts()

0    75036
1    75036
Name: loan_status, dtype: int64

In [36]:
# And they do! Split that into training and testing sets.
x_train_resampled,x_test_resampled,y_train_resampled,y_test_resampled = train_test_split(x_oversampled,y_oversampled,random_state=1,stratify=y_oversampled)

In [37]:
# Let's make a new instance of the logistic regression classifier and train it with the resampled data.

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
resampled_classifier_model = LogisticRegression(random_state=1)

# Fit the model using the resampled training data
classifier_model.fit(x_train_resampled,y_train_resampled)

# Make a prediction using the testing data
predictions_resampled = classifier_model.predict(x_test_resampled)

In [38]:
# We'll check the accuracy using the same measures as before.

# First up, the balanced accuracy score.
balanced_accuracy_score(y_test_resampled,predictions_resampled)

0.9945359560744176

In [39]:
# And now a confusion matrix.
confusion_matrix(y_test_resampled,predictions_resampled)

array([[18668,    91],
       [  114, 18645]], dtype=int64)

In [40]:
# And a classification report.
print(classification_report(y_test_resampled,predictions_resampled))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     18759
           1       1.00      0.99      0.99     18759

    accuracy                           0.99     37518
   macro avg       0.99      0.99      0.99     37518
weighted avg       0.99      0.99      0.99     37518



In [None]:
# On the whole I'm happier about the precision and recall scores on this version with the resampled data
# since it didn't lose much for the healthy loans and went up significantly from 0.88-ish for high risk loans.