In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
lending_df = pd.read_csv(
    Path("Resources/lending_data.csv"))

# Review the DataFrame
display(lending_df.head())
display(lending_df.tail())

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
77531,19100.0,11.261,86600,0.65358,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1
77535,15600.0,9.742,72300,0.585062,9,2,42300,1


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [3]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = lending_df['loan_status']

# Separate the X variable, the features
X = lending_df.drop(columns=['loan_status'])

In [4]:
# Review the y variable Series
y

0        0
1        0
2        0
3        0
4        0
        ..
77531    1
77532    1
77533    1
77534    1
77535    1
Name: loan_status, Length: 77536, dtype: int64

In [5]:
# Review the X variable DataFrame
X

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.430740,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000
...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600
77532,17700.0,10.662,80900,0.629172,11,2,50900
77533,17600.0,10.595,80300,0.626401,11,2,50300
77534,16300.0,10.068,75300,0.601594,10,2,45300


### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [6]:
# Check the balance of our target values
print(y.value_counts())

loan_status
0    75036
1     2500
Name: count, dtype: int64


### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [7]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 1)

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [8]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression_model = LogisticRegression(random_state = 1)

# Fit the model using training data
lr_model = logistic_regression_model.fit(X_train, y_train)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [9]:
#Generate training predictions
training_predictions = lr_model.predict(X_train)
testing_predictions = logistic_regression_model.predict(X_test)

### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [10]:
# Print the balanced_accuracy score of the model
test_accuracy = balanced_accuracy_score(y_test, testing_predictions)
print(f"Balanced Accuracy Score: {test_accuracy:.5}")

Balanced Accuracy Score: 0.95205


The balanced accuracy score is 0.95205 (or approximately 95.21%). Balanced accuracy is particularly useful in situations where classes are imbalanced. It calculates the average of recall obtained on each class, accounting for any imbalance by treating each class equally. A score closer to 1 indicates a better model performance, so a score of approximately 95.21% is excellent, suggesting that the model does a good job of balancing the prediction accuracy across both healthy and high-risk loans.

In [11]:
# Generate a confusion matrix for the model
test_matrix = confusion_matrix(y_test, testing_predictions)
confusion_df = pd.DataFrame(test_matrix, 
                            index=["Actual 0", "Actual 1"], 
                            columns=["Predicted 0", "Predicted 1"])

print("Confusion Matrix:")
confusion_df

Confusion Matrix:


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,18663,102
Actual 1,56,563


This matrix tells us:

- **True negatives (TN)**: 18663 (healthy loans correctly identified)
- **False positives (FP)**: 102 (healthy loans incorrectly labeled as high-risk)
- **False negatives (FN)**: 56 (high-risk loans incorrectly labeled as healthy)
- **True positives (TP)**: 563 (high-risk loans correctly identified)

The low numbers of false positives and false negatives relative to the true positives and true negatives indicate that the model performs well in identifying both healthy and high-risk loans.

In [12]:
# Print the classification report for the model
training_report = classification_report(y_train, training_predictions, target_names = ["Healthy Loan (0)" , "High-Risk Loan(1)"])
print(f"Classification Report:\n{training_report}")

Classification Report:
                   precision    recall  f1-score   support

 Healthy Loan (0)       1.00      1.00      1.00     56271
High-Risk Loan(1)       0.86      0.90      0.88      1881

         accuracy                           0.99     58152
        macro avg       0.93      0.95      0.94     58152
     weighted avg       0.99      0.99      0.99     58152



**Healthy Loan (0)**:
- **Precision**: 1.00, indicating perfect precision, as the model had no false positives within this class.
- **Recall**: 1.00, showing that it identified 100% of all healthy loans correctly.
- **F1-score**: 1.00, which is the harmonic mean of precision and recall, suggesting excellent model performance for healthy loans.

**High-Risk Loan (1)**:
- **Precision**: 0.86, meaning that when the model predicted a loan as high-risk, it was correct 86% of the time.
- **Recall**: 0.90, indicating that the model identified 90% of all high-risk loans correctly.
- **F1-score**: 0.88, reflecting a balance between precision and recall, which is also quite high.

### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The logistic regression model performs excellently in predicting both healthy (0) and high-risk (1) loan labels. The balanced accuracy score indicates a high level of overall accuracy that accounts for class imbalance, and the detailed metrics from the confusion matrix and classification report show that the model is highly effective at identifying both classes with high precision, recall, and F1-scores. This model is particularly strong in identifying healthy loans with almost perfect precision and recall, and it also performs very well in identifying high-risk loans, although with slightly lower precision and recall.

---

## Predict a Logistic Regression Model with Resampled Training Data

### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [13]:
# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
ros = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [14]:
# Count the distinct values of the resampled labels data
print(y_resampled.value_counts())

loan_status
0    56271
1    56271
Name: count, dtype: int64


### Step 2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [15]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
lr_resampled = LogisticRegression(random_state=1)

# Fit the model using the resampled training data
lr_resampled.fit(X_resampled, y_resampled)

# Make a prediction using the testing data
testing_predictions_resampled = lr_resampled.predict(X_test)

### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [16]:
# Print the balanced_accuracy score of the model 
test_accuracy_resampled = balanced_accuracy_score(y_test, testing_predictions_resampled)
print(f"Balanced Accuracy Score (Resampled): {test_accuracy_resampled:.5}")

Balanced Accuracy Score (Resampled): 0.99368


The balanced accuracy score is 0.99368, or 99.368%. This is an exceptionally high score, suggesting the model performs very well overall in balancing the prediction accuracy across both the healthy loan (0) and high-risk loan (1) labels.

In [17]:
# Generate a confusion matrix for the model
test_matrix_resampled = confusion_matrix(y_test, testing_predictions_resampled)
confusion_df_resampled = pd.DataFrame(test_matrix_resampled, 
                            index=["Actual 0", "Actual 1"], 
                            columns=["Predicted 0", "Predicted 1"])

print("Confusion Matrix (Resampled):")
confusion_df_resampled

Confusion Matrix (Resampled):


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,18649,116
Actual 1,4,615


The confusion matrix shows the following:
- **True Positives (TP) for Healthy Loan (0)**: 18649, meaning the model correctly predicted the majority of healthy loans.
- **True Negatives (TN) for High-Risk Loan (1)**: 615, indicating that it also correctly identified most of the high-risk loans.
- **False Positives (FP) for Healthy Loan (0)**: 116, referring to a small number of healthy loans incorrectly predicted as high-risk.
- **False Negatives (FN) for High-Risk Loan (1)**: 4, which is a very low count, showing that the model rarely misclassified high-risk loans as healthy.

In [18]:
# Print the classification report for the model
classification_report_resampled = classification_report(y_test, testing_predictions_resampled, target_names = ["Healthy Loan (0)" , "High-Risk Loan(1)"])
print(f"Classification Report (Resampled):\n{classification_report_resampled}")

Classification Report (Resampled):
                   precision    recall  f1-score   support

 Healthy Loan (0)       1.00      0.99      1.00     18765
High-Risk Loan(1)       0.84      0.99      0.91       619

         accuracy                           0.99     19384
        macro avg       0.92      0.99      0.95     19384
     weighted avg       0.99      0.99      0.99     19384



**Healthy Loan (0)**:
- **Precision**: 1.00, indicating perfect precision, as the model had no false positives within this class.
- **Recall**: 0.99, showing that it identified 99% of all healthy loans correctly.
- **F1-score**: 1.00, which is the harmonic mean of precision and recall, suggesting excellent model performance for healthy loans.

**High-Risk Loan (1)**:
- **Precision**: 0.84, meaning that when the model predicted a loan as high-risk, it was correct 84% of the time.
- **Recall**: 0.99, indicating that the model identified 99% of all high-risk loans correctly.
- **F1-score**: 0.91, reflecting a balance between precision and recall, which is also quite high.


### Step 4: Answer the following question

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The logistic regression model, when fit with oversampled data, demonstrates excellent performance in predicting both healthy loan (0) and high-risk loan (1) labels. The balanced accuracy score is exceptionally high at 99.368%, indicating that the model is very effective in balancing prediction accuracy for both classes. The model shows near-perfect precision and recall for healthy loans and high precision and recall for high-risk loans. The confusion matrix and classification report details underline the model's strong predictive capabilities, with notably few misclassifications.