In [1]:
# Import the modules
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [2]:
# Load the lending data into a Pandas DataFrame.
# The path is relative, so the CSV is read from the notebook's working directory.
lending_csv = Path('lending_data.csv')
credit_data = pd.read_csv(lending_csv)

# Preview the first five rows
credit_data.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [7]:
# Split the DataFrame into the target (labels) and the predictors (features)

# y: the labels, taken from the loan_status column
y = credit_data['loan_status']

# X: the features, i.e. every column except loan_status
X = credit_data.drop(columns=['loan_status'])

In [12]:
# Review the y variable Series
# Series.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 77536 entries, 0 to 77535
Series name: loan_status
Non-Null Count  Dtype
--------------  -----
77536 non-null  int64
dtypes: int64(1)
memory usage: 605.9 KB
None


In [15]:
# Summarize the features DataFrame (column names, non-null counts, dtypes) ...
X.info()

# ... and then preview its first five rows
X.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77536 entries, 0 to 77535
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   loan_size         77536 non-null  float64
 1   interest_rate     77536 non-null  float64
 2   borrower_income   77536 non-null  int64  
 3   debt_to_income    77536 non-null  float64
 4   num_of_accounts   77536 non-null  int64  
 5   derogatory_marks  77536 non-null  int64  
 6   total_debt        77536 non-null  int64  
dtypes: float64(3), int64(4)
memory usage: 4.1 MB


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [17]:
# Count how many loans fall into each class (0 = healthy, 1 = high-risk).
# The counts come back sorted in descending order, so the majority class is first.
y_values = y.value_counts(sort=True, ascending=False)
y_values

0    75036
1     2500
Name: loan_status, dtype: int64

### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [19]:
# train_test_split is imported in the first cell together with the other modules

# Split the features and labels into training and testing sets.
# random_state=1 makes the shuffle reproducible; because no test_size is given,
# scikit-learn's default hold-out of 25% of the rows is used for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Preview the training features
X_train.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
29175,8600.0,6.792,44500,0.325843,3,0,14500
23020,7800.0,6.419,41000,0.268293,2,0,11000
31269,10000.0,7.386,50100,0.401198,4,1,20100
35479,9300.0,7.093,47300,0.365751,3,0,17300
13470,9200.0,7.045,46900,0.360341,3,0,16900


---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [20]:
# LogisticRegression is imported in the first cell together with the other modules

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
log_classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using the original (imbalanced) training data
log_classifier.fit(X_train, y_train)

LogisticRegression(random_state=1)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [27]:
# Predict the loan status for every row of the testing features
log_prediction = log_classifier.predict(X_test)

# Put predictions next to the true labels. The labels are passed as a plain
# array so the resulting frame gets a fresh 0..n-1 index instead of the
# shuffled row labels carried by y_test.
log_prediction_df = pd.DataFrame(
    {
        'prediction': log_prediction,
        'actual': y_test.to_numpy(),
    }
)
log_prediction_df.head()



Unnamed: 0,prediction,actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [32]:
# Print the accuracy and balanced accuracy scores of the model
# (accuracy_score is imported in the first cell with the other metrics)
accuracy = accuracy_score(y_test, log_prediction)
balanced_accuracy = balanced_accuracy_score(y_test, log_prediction)

print(f"The accuracy score is {accuracy}")
print(f"The balanced accuracy score is {balanced_accuracy}")

The accuracy score is 0.9918489475856377
The balanced accuracy score is 0.9520479254722232


In [33]:
# Build the confusion matrix for the baseline model
# (rows are the actual classes, columns are the predicted classes)
confusion_matrix(y_true=y_test, y_pred=log_prediction)

array([[18663,   102],
       [   56,   563]], dtype=int64)

In [34]:
# Build the classification report for the baseline model, labelling
# class 0 as healthy loans and class 1 as high-risk loans, then print it
target_names = ["Healthy Loans", "High-Risk Loans"]
baseline_report = classification_report(
    y_test,
    log_prediction,
    target_names=target_names,
)
print(baseline_report)

                 precision    recall  f1-score   support

  Healthy Loans       1.00      0.99      1.00     18765
High-Risk Loans       0.85      0.91      0.88       619

       accuracy                           0.99     19384
      macro avg       0.92      0.95      0.94     19384
   weighted avg       0.99      0.99      0.99     19384



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The logistic regression model performs well overall: its balanced accuracy score is approximately 95% (and its plain accuracy is about 99%). However, a closer look at the classification report reveals a disparity between high-risk loans and healthy loans. For high-risk loans, precision (0.85) and recall (0.91) are noticeably lower than for healthy loans, which reach a precision of 100% and a recall of 99%.

This discrepancy can be attributed to the disproportionate class sizes. With the Healthy Loan class vastly outnumbering the Risky Loan class, the classifier's primary focus lies in correctly identifying instances within the larger Healthy Loan category. Consequently, the classifier may not allocate sufficient attention to the smaller yet critical high-risk Loan class.

The practical implication is that about 15% of the loans the model flags as high-risk (102 of 665) are actually healthy, so a lender that declines every flagged loan would turn away some creditworthy borrowers. In addition, about 9% of the truly high-risk loans (56 of 619) are misclassified as healthy. This underscores the challenge of accurate classification in scenarios where class imbalances exist and emphasizes the importance of addressing these disparities to avoid potentially costly misclassifications.








---

## Predict a Logistic Regression Model with Resampled Training Data

### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [None]:
# RandomOverSampler (from imbalanced-learn) is imported in the first cell
# together with the other modules

# Instantiate the random oversampler model
# Assign a random_state parameter of 1 to the model
random_sampler = RandomOverSampler(random_state=1)

# Resample only the training data; the testing data is left untouched
# so the evaluation still reflects the original class distribution
X_random_sampler, y_random_sampler = random_sampler.fit_resample(X_train, y_train)

In [None]:
# Count the distinct values of the resampled labels data
resampled_counts = y_random_sampler.value_counts()

# Confirm that oversampling left every class with the same number of rows;
# fail loudly here rather than silently training on data that is still imbalanced
assert resampled_counts.min() == resampled_counts.max(), (
    "Resampled classes are not balanced"
)

resampled_counts

### Step 2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [None]:
# Create the Logistic Regression model (random_state of 1) and fit it on the
# oversampled training data; fit() returns the fitted estimator itself
lr_random = LogisticRegression(random_state=1).fit(
    X_random_sampler,
    y_random_sampler,
)

# Predict on the original, untouched testing features
lr_prediction = lr_random.predict(X_test)

### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [None]:
# Compute the balanced accuracy of the model trained on resampled data,
# then print it rounded to three decimal places
balanced_accuracy_resampled = balanced_accuracy_score(y_test, lr_prediction)
print("Balanced accuracy score: %.3f" % balanced_accuracy_resampled)

In [None]:
# Generate a confusion matrix for the model
# (rows are the actual classes, columns are the predicted classes)
conf_matrix = confusion_matrix(y_test, lr_prediction)

# End the cell with the matrix so it is actually displayed; an assignment
# alone produces no output
conf_matrix

In [None]:
# Build the classification report for the model trained on resampled data,
# labelling class 0 as healthy loans and class 1 as high-risk loans, then print it
resampled_report = classification_report(
    y_test,
    lr_prediction,
    target_names=["Healthy Loans", "High-Risk Loans"],
)
print(resampled_report)

### Step 4: Answer the following question

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** Oversampling significantly boosts the balanced accuracy score from 95% to 99%. This is achieved by randomly oversampling the minority class in the training data and then training the logistic regression model on the resampled training set.

The improvement in scoring results from the ability of the oversampling algorithm to select random samples from the minority class (High-Risk Loans) with replacement, thereby enriching the training data.

While there's a potential risk of overrepresenting a single sample in training, this isn't a major concern here due to the substantial class imbalance, as highlighted in the confusion matrix.

Overall, the achieved accuracy is promising for considering these algorithms in banking. However, a prudent approach involves piloting with new data to assess the model's real-world reliability.