In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [3]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
lending_df = pd.read_csv("Resources/lending_data.csv")

# Review the DataFrame
lending_df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [8]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = lending_df["loan_status"].values.reshape(-1,1)

# Separate the X variable, the features
X = lending_df.copy()
X.drop("loan_status",axis=1,inplace=True)

In [5]:
# Review the y variable Series
y[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]], dtype=int64)

In [9]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [44]:
# Check the balance of our target values
healthy = lending_df["loan_status"].value_counts()[0]
high_risk = lending_df["loan_status"].value_counts()[1]
total = healthy + high_risk

print(f"healthy: {healthy}, {(healthy/total)*100}%\nhigh-risk: {high_risk}, {(high_risk/total)*100}%\ntotal: {total}")


healthy: 75036, 96.77569129178704%
high-risk: 2500, 3.224308708212959%
total: 77536


### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [41]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [42]:
print(X_train,X_test,y_train,y_test)

       loan_size  interest_rate  borrower_income  debt_to_income  \
29175     8600.0          6.792            44500        0.325843   
23020     7800.0          6.419            41000        0.268293   
31269    10000.0          7.386            50100        0.401198   
35479     9300.0          7.093            47300        0.365751   
13470     9200.0          7.045            46900        0.360341   
...          ...            ...              ...             ...   
20609     7200.0          6.177            38700        0.224806   
21440    10000.0          7.389            50100        0.401198   
73349    10200.0          7.463            50800        0.409449   
50057    11100.0          7.838            54400        0.448529   
5192     10600.0          7.632            52400        0.427481   

       num_of_accounts  derogatory_marks  total_debt  
29175                3                 0       14500  
23020                2                 0       11000  
31269             

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [59]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
LR_model = LogisticRegression(random_state=1)

# Fit the model using training data
LR_model.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [60]:
# Make a prediction using the testing data
predicted_y_values = LR_model.predict(X_test)
predicted_y_values[:5]

array([0, 0, 0, 0, 0], dtype=int64)

### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [61]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test,predicted_y_values)

0.9520479254722232

In [62]:
# Generate a confusion matrix for the model
confusion_matrix(y_test,predicted_y_values)

array([[18663,   102],
       [   56,   563]], dtype=int64)

In [63]:
# Print the classification report for the model
print(classification_report(y_test,predicted_y_values, target_names= ['healthy','high_risk']))

              precision    recall  f1-score   support

     healthy       1.00      0.99      1.00     18765
   high_risk       0.85      0.91      0.88       619

    accuracy                           0.99     19384
   macro avg       0.92      0.95      0.94     19384
weighted avg       0.99      0.99      0.99     19384



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** Due to the unbalanced nature of the dataset, w/ over 95% being healthy loans, it was able to be very precise w/ healthy loans with a precision score of 1.0, and high-risk loans w/ a score of .85. In terms of accuracy, the model is a very solid choice as the accuracy score is scant of 1.0. 

---

## Predict a Logistic Regression Model with Resampled Training Data

### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [83]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
random = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_res, y_res = random.fit_resample(X,y)

In [93]:
# Count the distinct values of the resampled labels data
resampled_y = pd.DataFrame(y_res)
resampled_x = pd.DataFrame(X_res)
print(resampled_y.count, resampled_x.count)

<bound method DataFrame.count of         0
0       0
1       0
2       0
3       0
4       0
...    ..
150067  1
150068  1
150069  1
150070  1
150071  1

[150072 rows x 1 columns]> <bound method DataFrame.count of         loan_size  interest_rate  borrower_income  debt_to_income  \
0         10700.0          7.672            52800        0.431818   
1          8400.0          6.692            43600        0.311927   
2          9000.0          6.963            46100        0.349241   
3         10700.0          7.664            52700        0.430740   
4         10800.0          7.698            53000        0.433962   
...           ...            ...              ...             ...   
150067    19000.0         11.208            86100        0.651568   
150068    18700.0         11.062            84700        0.645809   
150069    21100.0         12.092            94400        0.682203   
150070    17500.0         10.553            79900        0.624531   
150071    18800.0         1

### Step 2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [94]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
LR_model = LogisticRegression(random_state=1)

# Fit the model using the resampled training data
LR_model.fit(resampled_x,resampled_y)

# Make a prediction using the testing data
predicted_resampled_y = LR_model.predict(resampled_x)
predicted_resampled_y[:5]

  y = column_or_1d(y, warn=True)


array([0, 0, 0, 0, 0], dtype=int64)

### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [96]:
# Print the balanced_accuracy score of the model 
balanced_accuracy_score(y_res,predicted_resampled_y)

0.9945026387334079

In [98]:
# Generate a confusion matrix for the model
confusion_matrix(y_res,predicted_resampled_y)


array([[74614,   422],
       [  403, 74633]], dtype=int64)

In [99]:
# Print the classification report for the model
print(classification_report(y_res,predicted_resampled_y, target_names= ['healthy','high_risk']))

              precision    recall  f1-score   support

     healthy       0.99      0.99      0.99     75036
   high_risk       0.99      0.99      0.99     75036

    accuracy                           0.99    150072
   macro avg       0.99      0.99      0.99    150072
weighted avg       0.99      0.99      0.99    150072



### Step 4: Answer the following question

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** With the resampled data, all scores are scant of 1.0, meaning that the linear regression model is a perfect model to use for this data.