In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

# Import the train_test_split function
from sklearn.model_selection import train_test_split

# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

#Import RandomOverSampler for recommended model
from imblearn.over_sampling import RandomOverSampler

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [3]:
# Load the lending data CSV into a Pandas DataFrame. The path is relative to
# the notebook's working directory.
# NOTE(review): the step heading mentions a `Resources` folder, but the file is
# read from the current directory — confirm which location is intended.
lending_data_path = Path('lending_data.csv')
lending_df = pd.read_csv(lending_data_path)

# Preview the first rows of the DataFrame
lending_df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [4]:
# Split the DataFrame into the features (X) and the labels (y)

# Features: every column except the `loan_status` label
X = lending_df.drop(columns='loan_status')

# Labels: the `loan_status` column on its own
y = lending_df['loan_status']


In [5]:
# Review the y variable Series: count how many rows fall into each
# loan_status class to see how balanced the labels are
y.value_counts()

loan_status
0    75036
1     2500
Name: count, dtype: int64

In [6]:
# Review the X variable DataFrame: preview the first rows to confirm that
# `loan_status` is no longer among the feature columns
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [7]:
# Partition the features and labels into training and testing subsets.
# random_state=1 makes the shuffle reproducible; no test_size is given, so
# train_test_split's default split proportion applies.
# NOTE(review): the labels are heavily imbalanced; consider passing stratify=y
# so both subsets keep the same class ratio. Doing so would change the
# results recorded below, so it is left as-is here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [8]:
# Build the baseline classifier; random_state=1 keeps the run reproducible
logistic_regression = LogisticRegression(random_state=1)

# Train the classifier on the original (not resampled) training split
logistic_regression.fit(X=X_train, y=y_train)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [9]:
# Make a prediction using the testing data: predict a loan_status label for
# every row of the held-out features with the model fitted on the original
# training data
y_test_pred = logistic_regression.predict(X_test)

### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [10]:
# Build the confusion matrix for the baseline model and wrap it in a
# labelled DataFrame: rows are the actual classes, columns the predicted ones
actual_labels = ['Actual 0', 'Actual 1']
predicted_labels = ['Predicted 0', 'Predicted 1']

cm = confusion_matrix(y_test, y_test_pred)
cm_df = pd.DataFrame(
    cm,
    index=actual_labels,
    columns=predicted_labels,
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,18663,102
Actual 1,56,563


In [11]:
# Summarise precision, recall and f1-score per class for the baseline model,
# using readable class names in place of the raw 0/1 labels
loan_class_names = ['healthy loan(0)', 'high-risk loan(1)']
baseline_report = classification_report(
    y_test,
    y_test_pred,
    target_names=loan_class_names,
)
print(baseline_report)

                   precision    recall  f1-score   support

  healthy loan(0)       1.00      0.99      1.00     18765
high-risk loan(1)       0.85      0.91      0.88       619

         accuracy                           0.99     19384
        macro avg       0.92      0.95      0.94     19384
     weighted avg       0.99      0.99      0.99     19384



### Step 4: Answer the following question.

---

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:**
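The logistic regression model predicts healthy loans (`0`) almost perfectly, with a precision of 1.00 and a recall of 0.99. It is weaker on high-risk loans (`1`), with a precision of 0.85 and a recall of 0.91, so 56 of the 619 high-risk loans in the test set are missed. However, the data is imbalanced, with significantly more healthy loans (75,036) than high-risk loans (2,500), which might affect training and predictions.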

To address this imbalance, we can either use oversampling or undersampling. Oversampling involves creating duplicate data points for the minority class (1), which may lead to overfitting. Undersampling, on the other hand, reduces the majority class (0) to match the size of the minority class, potentially losing valuable information.

Given the scenario, oversampling seems more appropriate to retain information from the majority class without discarding valuable data. However, this approach may increase training time and memory requirements, representing a trade-off for improved predictions.






## Create a Logistic Regression Model with the Resampled Data


### Step 1: Resample the training data by using `RandomOverSampler` from `imblearn.over_sampling`.

In [12]:
# Create the oversampler; random_state=1 makes the resampling reproducible
random_over_sampler = RandomOverSampler(random_state=1)

# Resample only the training split so the test split stays untouched
X_train_ros, y_train_ros = random_over_sampler.fit_resample(
    X=X_train,
    y=y_train,
)

In [13]:
# Review the y_train_ros variable Series: count the rows per class to
# confirm that both classes are the same size after oversampling
y_train_ros.value_counts()

loan_status
0    56271
1    56271
Name: count, dtype: int64

### Step 2: Fit a logistic regression model by using the resampled data (`X_train_ros` and `y_train_ros`).

In [14]:
# Build a second classifier with the same settings as the baseline
# (random_state=1) so that only the training data differs between the two
logistic_regression_ros = LogisticRegression(random_state=1)

# Train it on the oversampled training data
logistic_regression_ros.fit(X=X_train_ros, y=y_train_ros)

### Step 3: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [15]:
# Make a prediction using the testing data: the model trained on the
# oversampled data is evaluated on the same test features (X_test) used for
# the first model
y_test_ros_pred = logistic_regression_ros.predict(X_test)

### Step 4: Evaluate the model’s performance by doing the following:


---



*   Generate a confusion matrix.

*   Print the classification report.


In [16]:
# Build the confusion matrix for the oversampled model and wrap it in a
# labelled DataFrame: rows are the actual classes, columns the predicted ones
actual_labels = ['Actual 0', 'Actual 1']
predicted_labels = ['Predicted 0', 'Predicted 1']

cm_ros = confusion_matrix(y_test, y_test_ros_pred)
cm_ros_df = pd.DataFrame(
    cm_ros,
    index=actual_labels,
    columns=predicted_labels,
)
cm_ros_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,18649,116
Actual 1,4,615


In [17]:
# Summarise precision, recall and f1-score per class for the oversampled
# model, using readable class names in place of the raw 0/1 labels
loan_class_names = ['healthy loan(0)', 'high-risk loan(1)']
ros_report = classification_report(
    y_test,
    y_test_ros_pred,
    target_names=loan_class_names,
)
print(ros_report)

                   precision    recall  f1-score   support

  healthy loan(0)       1.00      0.99      1.00     18765
high-risk loan(1)       0.84      0.99      0.91       619

         accuracy                           0.99     19384
        macro avg       0.92      0.99      0.95     19384
     weighted avg       0.99      0.99      0.99     19384



**Question:** How well does the logistic regression model with oversampled data predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:**
The logistic regression model with oversampled data predicted healthy loans (0) with a recall score of 0.99 and high-risk loans (1) with a recall score of 0.99.

Both models have a high overall accuracy of 0.99. For the high-risk class, the second (oversampled) model has a slightly lower precision than the first model (0.84 vs. 0.85) but a much higher recall (0.99 vs. 0.91).

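For recommendation, I suggest using the second model because it sharply reduces the number of false negatives: only 4 high-risk loans are misclassified as healthy, compared with 56 for the first model. The cost is a small increase in false positives (116 vs. 102 healthy loans flagged as high-risk). This trade-off is acceptable because missing a high-risk loan is typically far more costly to a lender than reviewing a healthy loan that was flagged by mistake, so the second model is the better choice for minimizing risk.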