In [29]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from imblearn.over_sampling import RandomOverSampler

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [30]:
# Load the lending data CSV into a Pandas DataFrame.
# The path is relative to the notebook's working directory.
file_path = Path("../Credit_Risk/lending_data.csv")
df_lending_data = pd.read_csv(file_path)

# Preview the first and last rows to sanity-check the load
display(df_lending_data.head(), df_lending_data.tail())


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
77531,19100.0,11.261,86600,0.65358,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1
77535,15600.0,9.742,72300,0.585062,9,2,42300,1


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [31]:
# Split the DataFrame into predictors and target.
# Features (x): every column except the "loan_status" target
x = df_lending_data.drop(columns=["loan_status"])

# Labels (y): the "loan_status" column on its own
y = df_lending_data["loan_status"]

In [32]:
# Inspect the first and last rows of the labels Series
display(y.head(), y.tail())

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

77531    1
77532    1
77533    1
77534    1
77535    1
Name: loan_status, dtype: int64

In [33]:
# Check the balance of our target values by counting the rows in each class.
# A value of 0 in the "loan_status" column means that the loan is healthy.
# A value of 1 means that the loan has a high risk of defaulting.
# The resulting counts show how imbalanced the two classes are.
y.value_counts()

loan_status
0    75036
1     2500
Name: count, dtype: int64

In [34]:
# Inspect the first and last rows of the features DataFrame
display(x.head(), x.tail())

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
77531,19100.0,11.261,86600,0.65358,12,2,56600
77532,17700.0,10.662,80900,0.629172,11,2,50900
77533,17600.0,10.595,80300,0.626401,11,2,50300
77534,16300.0,10.068,75300,0.601594,10,2,45300
77535,15600.0,9.742,72300,0.585062,9,2,42300


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [35]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function so the split is repeatable
# NOTE(review): neither test_size nor stratify is passed, so the library defaults apply.
# Given the class imbalance, consider whether stratify=y is wanted — confirm before
# changing, since it would alter every result reported later in the notebook.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, random_state=1)

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `Y_train`).

In [None]:
# Build a logistic regression classifier (random_state=1) and train it on the
# original, un-resampled training split. fit() returns the fitted estimator,
# so construction and training are chained into a single assignment.
lr_model = LogisticRegression(random_state=1).fit(X_train, Y_train)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [37]:
# Predict loan_status labels for the testing features with the fitted model
lr_prediction = lr_model.predict(X_test)

### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

* Calculate the balanced accuracy score of the model

In [38]:
# Generate a confusion matrix for the model.
# A value of 0 in the "loan_status" column means that the loan is healthy.
# A value of 1 means that the loan has a high risk of defaulting.
# labels=[0, 1] pins the class order (and the 2x2 shape) so the matrix always
# lines up with the hard-coded row and column names used for the DataFrame:
# rows are the actual classes, columns are the predicted classes.
cm_model = confusion_matrix(Y_test, lr_prediction, labels=[0, 1])
df_cm_model = pd.DataFrame(
    cm_model,
    index=['Healthy Loans value 0 (low-risk)', 'Non-Healthy Loans value 1 (high-risk)'],
    columns=['Predicted Healthy Loans (low-risk)', 'Predicted Non-Healthy Loans (high-risk)'],
)
df_cm_model

Unnamed: 0,Predicted Healthy Loans (low-risk),Predicted Non-Healthy Loans (high-risk)
Healthy Loans value 0 (low-risk),18655,110
Non-Healthy Loans value 1 (high-risk),36,583


In [39]:
# Print the classification report (per-class precision, recall, f1-score and support)
# comparing the true test labels with the model's predictions
print(classification_report(Y_test, lr_prediction))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.84      0.94      0.89       619

    accuracy                           0.99     19384
   macro avg       0.92      0.97      0.94     19384
weighted avg       0.99      0.99      0.99     19384



In [40]:
# Display the balanced accuracy score of the model on the testing data
# (shown as the cell's output value rather than printed)
balanced_accuracy_score(Y_test,lr_prediction)

0.967989851522121

### Step 4: Answer the following question.

**Question:**  How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** 
The logistic regression model demonstrates strong predictive performance for both the '0' (healthy loan) and '1' (high-risk loan) labels. Here's a summary of its performance based on the provided confusion matrix and classification report:

Healthy Loans (Label '0'):
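Precision: The precision for healthy loans rounds to 1.00, indicating that when the model predicts a loan as healthy, it is correct almost 100% of the time (per the confusion matrix, 36 of the 18,691 loans predicted healthy were actually high-risk).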
Recall: The recall for healthy loans is 0.99, indicating that the model correctly identifies about 99% of all actual healthy loans.
F1-score: The F1-score for healthy loans is 1.00, reflecting a perfect balance between precision and recall.

High-Risk Loans (Label '1'):
Precision: The precision for high-risk loans is 0.84, meaning that when the model predicts a loan as high-risk, it is correct approximately 84% of the time.
Recall: The recall for high-risk loans is 0.94, indicating that the model captures approximately 94% of all actual high-risk loans.
F1-score: The F1-score for high-risk loans is 0.89, indicating a good balance between precision and recall.
These performance metrics suggest that the logistic regression model is highly effective in distinguishing between healthy and high-risk loans. It achieves near-perfect precision and recall for healthy loans, indicating very few false positives and negatives. Additionally, while there is a slightly lower precision for high-risk loans, the model still demonstrates strong performance in correctly identifying the majority of high-risk loans.

The balanced accuracy score of 0.9680 or 96.80% further confirms the model's excellent performance in predicting both classes; the overall accuracy reported in the classification report is 0.99.



## Create a Logistic Regression Model with the Resampled Training Data

### Step 1 : Use the RandomOverSampler module from the imbalanced-learn library to resample the data. 

In [24]:
# Instantiate the random oversampler
# Assign a random_state parameter of 1 so the resampling is repeatable
ros_model = RandomOverSampler(random_state=1)

# Resample only the original training split; the testing split is left untouched
X_oversampled, Y_oversampled = ros_model.fit_resample(X_train, Y_train)

In [25]:
# Count the distinct values of the resampled labels data
# to check the class balance after oversampling
Y_oversampled.value_counts()

loan_status
0    56271
1    56271
Name: count, dtype: int64

### Step 2 : Use the LogisticRegression classifier and the resampled data to fit the model and make predictions.

In [26]:
# Train a second logistic regression classifier (random_state=1), this time on
# the oversampled training data. fit() returns the fitted estimator, so
# construction and training are chained into a single assignment.
lr_oversampled_model = LogisticRegression(random_state=1).fit(X_oversampled, Y_oversampled)

# Predict on the same testing features used for the original model
lr_oversampled_pred = lr_oversampled_model.predict(X_test)

### Step 3 : Evaluate the performance of the model by doing the following:

* Generate a confusion matrix.

* Print the classification report.

* Calculate the balanced accuracy score of the model

In [27]:
# Generate a confusion matrix for the model trained on oversampled data.
# labels=[0, 1] pins the class order (and the 2x2 shape) so the matrix always
# lines up with the hard-coded row and column names used for the DataFrame:
# rows are the actual classes, columns are the predicted classes.
cm_oversampled = confusion_matrix(Y_test, lr_oversampled_pred, labels=[0, 1])
cm_oversampled_df = pd.DataFrame(
    cm_oversampled,
    index=['Healthy Loans value 0 (low-risk)', 'Non-Healthy Loans value 1 (high-risk)'],
    columns=['Predicted Healthy Loans (low-risk)', 'Predicted Non-Healthy Loans (high-risk)'],
)
cm_oversampled_df

Unnamed: 0,Predicted Healthy Loans (low-risk),Predicted Non-Healthy Loans (high-risk)
Healthy Loans value 0 (low-risk),18646,119
Non-Healthy Loans value 1 (high-risk),4,615


In [28]:
# Print the classification report (per-class precision, recall, f1-score and support)
# comparing the true test labels with the oversampled model's predictions
print(classification_report(Y_test, lr_oversampled_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.84      0.99      0.91       619

    accuracy                           0.99     19384
   macro avg       0.92      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384



In [23]:
# Display the balanced accuracy score of the oversampled model on the testing data
# (shown as the cell's output value rather than printed)
balanced_accuracy_score(Y_test, lr_oversampled_pred)

0.9935981855334257

### Step 4: Answer the following question.

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** 

The logistic regression model fitted with oversampled data demonstrates strong performance in predicting both the '0' (healthy loan) and '1' (high-risk loan) labels. Here's a breakdown of its predictive capabilities:

Healthy Loans (Label '0'):
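Precision: The precision for healthy loans rounds to 1.00, indicating that when the model predicts a loan as healthy, it is correct almost 100% of the time (per the confusion matrix, 4 of the 18,650 loans predicted healthy were actually high-risk).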
Recall: The recall for healthy loans is 0.99, indicating that the model correctly identifies about 99% of all actual healthy loans.
F1-score: The F1-score for healthy loans is 1.00, reflecting a perfect balance between precision and recall.
Support: The support for healthy loans is 18,765, indicating a substantial number of instances in this class.

High-Risk Loans (Label '1'):
Precision: The precision for high-risk loans is 0.84, meaning that when the model predicts a loan as high-risk, it is correct approximately 84% of the time.
Recall: The recall for high-risk loans is 0.99, indicating that the model captures approximately 99% of all actual high-risk loans.
F1-score: The F1-score for high-risk loans is 0.91, representing a good balance between precision and recall.
Support: The support for high-risk loans is 619, indicating a smaller but still significant number of instances in this class.
Overall, the logistic regression model fitted with oversampled data performs exceptionally well in predicting healthy loans, achieving high precision, recall, and F1-score. It also demonstrates strong performance in identifying high-risk loans, with slightly lower precision but still excellent recall and F1-score. These results suggest that the model is effective in distinguishing between healthy and high-risk loans, making it valuable for risk assessment in lending scenarios.


In conclusion, the logistic regression model fitted with oversampled data shows exceptional predictive capability for both healthy and high-risk loans, making it valuable for risk assessment in lending scenarios.




---