In [179]:
import pandas as pd
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [180]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
df = pd.read_csv("Resources/lending_data.csv")
df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [181]:
# Separate the data into labels and features
y = df['loan_status']
X = df.drop(columns='loan_status')

In [182]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

In [183]:
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [184]:
# Check the balance of our target values
y.value_counts()

loan_status
0    75036
1     2500
Name: count, dtype: int64

### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [185]:
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [186]:
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)
classifier

In [187]:
# Fit the model using training data
logistic_regression_model = LogisticRegression(random_state=1)

lr_model = logistic_regression_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [188]:
# Make a prediction using the Training data
training_predictions = lr_model.predict(X_train)

#Generate testing predictions
testing_predictions = logistic_regression_model.predict(X_test)

### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [189]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, testing_predictions)

0.9924680148576145

In [190]:
training_matrix = confusion_matrix(y_train, training_predictions)
print(training_matrix)

[[55987   284]
 [  127  1754]]


In [191]:
testing_matrix = confusion_matrix(y_test, testing_predictions)
print(testing_matrix)

[[18655   110]
 [   36   583]]


In [192]:
training_report = classification_report(y_train, training_predictions)
print(training_report)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     56271
           1       0.86      0.93      0.90      1881

    accuracy                           0.99     58152
   macro avg       0.93      0.96      0.95     58152
weighted avg       0.99      0.99      0.99     58152



In [193]:
testing_report = classification_report(y_test, testing_predictions)
print(testing_report)


              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.84      0.94      0.89       619

    accuracy                           0.99     19384
   macro avg       0.92      0.97      0.94     19384
weighted avg       0.99      0.99      0.99     19384



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The Logistic Regression model does very well at predicting healthy loans with accuracy, precision and recall close to 100%. It still does well, but performs noticably worse at identifying high-risk loans where it has a precision of 85% and a recall of 91%. This suggests that this model is less able to classify high risk loans than healthy loans. This is likely related to the skew in the data that was provided, with only about 3 percent of the sample being in the high risk loans category.

---

## Predict a Logistic Regression Model with Resampled Training Data

### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [194]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

ROS = RandomOverSampler(random_state=1)

X_resampled, y_resampled= ROS.fit_resample(X_train, y_train)


In [195]:
y_resampled.value_counts()

loan_status
0    56271
1    56271
Name: count, dtype: int64

In [196]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_resampled, y_resampled, random_state=1)

### Step 2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [197]:
# Instantiate the Logistic Regression model
classifierR = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)
classifierR
# Fit the model using the resampled training data
logistic_regression_model = LogisticRegression(random_state=1)

lr_model = logistic_regression_model.fit(X_train1, y_train1)

# Make a prediction using the Training data
training_predictions1 = lr_model.predict(X_train1)

# Make a prediction using the testing data
testing_predictions1 = logistic_regression_model.predict(X_test1)

### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [198]:
# Print the balanced_accuracy score of the model 
accuracy_score(y_test1, testing_predictions1)

0.9941711686096104

In [199]:
training_matrix = confusion_matrix(y_train1, training_predictions1)
print(training_matrix)

[[41940   225]
 [  212 42029]]


In [200]:
testing_matrix = confusion_matrix(y_test1, testing_predictions1)
print(testing_matrix)

[[14016    90]
 [   74 13956]]


In [201]:
# Print the Training classification report for the model
training_report = classification_report(y_train1, training_predictions1)
print(training_report)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     42165
           1       0.99      0.99      0.99     42241

    accuracy                           0.99     84406
   macro avg       0.99      0.99      0.99     84406
weighted avg       0.99      0.99      0.99     84406



In [202]:
# Print the Testing classification report for the model
testing_report = classification_report(y_test1, testing_predictions1)
print(testing_report)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     14106
           1       0.99      0.99      0.99     14030

    accuracy                           0.99     28136
   macro avg       0.99      0.99      0.99     28136
weighted avg       0.99      0.99      0.99     28136



### Step 4: Answer the following question

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** It performs noticably better at identifying high risk loans in addition to identifying healthy loans. The overall prediction rate is consistent across precision, recall, and accuracy.

---

## Model Deployment Preparation

Now let's prepare this model for deployment by saving the trained model and creating deployment files.

In [203]:
# Save the best model (oversampled version) for deployment
import joblib

# The oversampled model performed better, so we'll use that for deployment
# First, let's retrain on the original training data for production use
final_model = LogisticRegression(random_state=1)
final_model.fit(X_resampled, y_resampled)

# Save the model
joblib.dump(final_model, 'credit_risk_model.pkl')

# Also save the feature names for the web app
feature_names = X.columns.tolist()
joblib.dump(feature_names, 'feature_names.pkl')

print("Model saved successfully!")
print(f"Features: {feature_names}")

Model saved successfully!
Features: ['loan_size', 'interest_rate', 'borrower_income', 'debt_to_income', 'num_of_accounts', 'derogatory_marks', 'total_debt']
