In [13]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np

## Splitting the Data into Training and Testing Sets

In [2]:
# Read the lending_data.csv file into a pandas DataFrame.
df = pd.read_csv("lending_data.csv")

# Review the first five rows of the DataFrame.
df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [3]:
# `loan_status` is the column we want to predict:
# class 0 is a healthy loan and class 1 is a loan at high risk of defaulting.
# Count how many loans fall into each class.
df["loan_status"].value_counts()

0    75036
1     2500
Name: loan_status, dtype: int64

__The data is heavily imbalanced: 75,036 loans are healthy (class 0) while only 2,500 are at high risk of default (class 1), so roughly 3% of the records belong to the minority class.__

In [4]:
# Target: the binary `loan_status` column (0 = healthy loan, 1 = high risk of default).
y = df["loan_status"]

# Features: every column except the target. Dropping the target by name (rather
# than slicing by position) keeps it out of X even if the column order changes.
X = df.drop(columns="loan_status")
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [5]:
# Split features and target into training and test sets; random_state=1 makes the
# shuffle reproducible. No test_size is given, so scikit-learn's default split applies.
# NOTE(review): the split is not stratified; with only ~3% class-1 rows, consider
# stratify=y (this would change every downstream number).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### Creating a Logistic Regression Model with the Original Data

In [6]:
# Baseline model: a logistic regression to be trained on the original (imbalanced) data.
# random_state=7 pins the estimator's seed.
logistic_regression_model = LogisticRegression(random_state=7)

In [8]:
# Train the baseline model on the original training split.
logistic_regression_model.fit(X=X_train, y=y_train)

In [10]:
# Predict a class label for every row of the held-out test set.
predictions = logistic_regression_model.predict(X=X_test)

# Show the predictions next to the true labels (the frame is indexed like y_test).
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
60914,0,0
36843,0,0
1966,0,0
70137,0,0
27237,0,0
...,...,...
45639,0,0
11301,0,0
51614,0,0
4598,0,0


In [11]:
# Overall accuracy of the baseline model on the test set.
# With so few class-1 loans, this figure is dominated by class 0.
accuracy_score(y_true=y_test, y_pred=predictions)

0.9918489475856377

In [14]:
# Confusion matrix for the baseline model: rows are actual classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[18663,   102],
       [   56,   563]], dtype=int64)

In [16]:
# Per-class precision, recall and f1-score for the baseline model.
# print() is used so the report's line breaks render as a table.
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.85      0.91      0.88       619

    accuracy                           0.99     19384
   macro avg       0.92      0.95      0.94     19384
weighted avg       0.99      0.99      0.99     19384



_The logistic regression model reaches an accuracy of 99\% on the test set. Precision is almost 100\% for class 0 (healthy loans) but noticeably lower, 85\%, for class 1 (loans at high risk of default); recall (sensitivity) and the f1-score follow the same pattern, at 91\% and 88\% for class 1. A high accuracy does not necessarily indicate that the model works well here: because about 97\% of the loans are healthy, accuracy is dominated by the majority class and hides the weaker performance on the minority class, so the per-class precision and recall are the more informative measures._

### Creating a Logistic Regression Model with Resampled Training Data

In [17]:
# Import the RandomOverSampler class from imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

In [18]:
# Create the oversampler; random_state=1 makes the resampling reproducible.
random_oversampler = RandomOverSampler(random_state=1)

# Resample the training split only, so the test set keeps its original class distribution.
X_resampled, y_resampled = random_oversampler.fit_resample(X=X_train, y=y_train)

In [19]:
# Count the rows per class in the resampled training target.
y_resampled.value_counts()

0    56271
1    56271
Name: loan_status, dtype: int64

In [20]:
# Second logistic regression model, to be trained on the oversampled data.
# NOTE(review): this uses random_state=1 while the baseline model uses 7 — confirm the
# difference is intentional, since the two models are compared against each other.
ros_model = LogisticRegression(random_state=1)

In [22]:
# Train the second model on the oversampled training data.
ros_model.fit(X=X_resampled, y=y_resampled)

In [23]:
# Predict on the same, untouched test set that was used for the baseline model.
ros_predictions = ros_model.predict(X=X_test)

# Show the predictions next to the true labels (the frame is indexed like y_test).
pd.DataFrame({"Prediction": ros_predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
60914,0,0
36843,0,0
1966,0,0
70137,0,0
27237,0,0
...,...,...
45639,0,0
11301,0,0
51614,0,0
4598,0,0


In [25]:
# Overall accuracy of the oversampled model on the test set.
accuracy_score(y_true=y_test, y_pred=ros_predictions)

0.9938093272802311

In [26]:
# Confusion matrix for the oversampled model: rows are actual classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=ros_predictions)

array([[18649,   116],
       [    4,   615]], dtype=int64)

In [27]:
# Per-class precision, recall and f1-score for the oversampled model.
# print() is used so the report's line breaks render as a table.
print(classification_report(y_true=y_test, y_pred=ros_predictions))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.84      0.99      0.91       619

    accuracy                           0.99     19384
   macro avg       0.92      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384



_After applying random oversampling to address the imbalance, the model became better at predicting class 1, i.e., loans at high risk of default. Recall for class 1 rose from 0.91 to 0.99 (missed high-risk loans fell from 56 to 4), meaning that far more of the actual class 1 instances are identified. The f1-score for class 1 also increased from 0.88 to 0.91, indicating a better balance between precision and recall for the minority class. However, precision for class 1 did not improve: it dipped slightly from 0.85 to 0.84, as the number of healthy loans wrongly flagged as high-risk rose from 102 to 116._

_The model's ability to correctly identify instances of the minority class has substantially improved after oversampling, reducing the impact of the class imbalance issue._