In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [17]:
# Peek at the first lines of the raw CSV (header row plus a few records) before loading it with pandas
! head ../Resources/lending_data.csv

loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
10700.0,7.672000000000000,52800,0.4318181818181820,5,1,22800,0
8400.0,6.692,43600,0.3119266055045870,3,0,13600,0
9000.0,6.963000000000000,46100,0.3492407809110630,3,0,16100,0
10700.0,7.664,52700,0.4307400379506640,5,1,22700,0
10800.0,7.6980000000000000,53000,0.4339622641509430,5,1,23000,0
10100.0,7.438,50600,0.4071146245059290,4,1,20600,0
10300.0,7.49,51100,0.4129158512720160,4,1,21100,0
8800.0,6.857,45100,0.3348115299334810,3,0,15100,0
9300.0,7.096,47400,0.3670886075949370,3,0,17400,0


In [18]:
# Load the lending data CSV from the Resources folder into a Pandas DataFrame
lending_data = pd.read_csv(Path("../Resources") / "lending_data.csv")

# Show the first five rows as a quick sanity check of the columns and values
lending_data.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [21]:
# Separate the data into labels and features

# Labels: the loan_status column (0 = healthy loan, 1 = high-risk loan)
y = lending_data["loan_status"].to_numpy()

# Features: every remaining column. Dropping the label column, rather than
# listing the feature names by hand, keeps X in step with the CSV's columns
# and guarantees the label itself is never included as a feature.
X = lending_data.drop(columns="loan_status").to_numpy()


In [22]:
# Review the y labels; they are displayed as a NumPy array of 0/1 loan statuses
y

array([0, 0, 0, ..., 1, 1, 1])

In [23]:
# Review the X features; they are displayed as a 2-D NumPy array (rows are loans, columns are features)
X

array([[1.0700e+04, 7.6720e+00, 5.2800e+04, ..., 5.0000e+00, 1.0000e+00,
        2.2800e+04],
       [8.4000e+03, 6.6920e+00, 4.3600e+04, ..., 3.0000e+00, 0.0000e+00,
        1.3600e+04],
       [9.0000e+03, 6.9630e+00, 4.6100e+04, ..., 3.0000e+00, 0.0000e+00,
        1.6100e+04],
       ...,
       [1.7600e+04, 1.0595e+01, 8.0300e+04, ..., 1.1000e+01, 2.0000e+00,
        5.0300e+04],
       [1.6300e+04, 1.0068e+01, 7.5300e+04, ..., 1.0000e+01, 2.0000e+00,
        4.5300e+04],
       [1.5600e+04, 9.7420e+00, 7.2300e+04, ..., 9.0000e+00, 2.0000e+00,
        4.2300e+04]])

### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [24]:
# Import the train_test_split function
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing (train_test_split's default size, spelled
# out here for clarity); random_state=1 makes the shuffle reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [25]:
# Import the LogisticRegression estimator from scikit-learn
from sklearn.linear_model import LogisticRegression

# Build the model with random_state=1 and train it on the training split in one
# step; fit() returns the fitted estimator, so it can be assigned directly
lr = LogisticRegression(random_state=1).fit(X_train, y_train)

# Echo the fitted estimator
lr

LogisticRegression(random_state=1)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [27]:
# Predict a loan_status label for every row of the held-out test features
y_pred = lr.predict(X_test)
# Display the predicted labels
y_pred

array([0, 0, 0, ..., 0, 0, 0])

### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [28]:
# Generate a confusion matrix for the logistic regression model.
# Rows are the actual classes and columns are the predicted classes, in label order (0, 1).
# confusion_matrix is already imported in the notebook's first cell, so it is not re-imported here.
cm = confusion_matrix(y_test, y_pred)
cm

array([[18663,   102],
       [   56,   563]])

In [30]:
# Print the classification report (per-class precision, recall, F1 and support)
# for the logistic regression model.
# classification_report is already imported in the notebook's first cell, so it is not re-imported here.
cr = classification_report(y_test, y_pred)
print(cr)


              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.85      0.91      0.88       619

    accuracy                           0.99     19384
   macro avg       0.92      0.95      0.94     19384
weighted avg       0.99      0.99      0.99     19384



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** For healthy loans (`0`), the model has a recall of 0.99, meaning it correctly identifies 99% of the healthy loans. Its precision is almost 100% (1.00 after rounding), meaning that when the model says a loan is healthy, it almost certainly is healthy.

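For high-risk loans (`1`), the recall is 0.91: the model correctly identifies 91% of the high-risk loans and misses about 9% of them (56 of 619). The precision is 0.85: when the model says a loan is high-risk, it is right 85% of the time, and the remaining 15% of those predictions (102 of 665) are healthy loans falsely labeled as high-risk. The F1-score of 0.88 combines the two.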

# Model 2 -- SVM (Support Vector Machine)

In [31]:
from sklearn.svm import SVC

# Train a support vector classifier (default settings, random_state=1) on the
# same training split used for the logistic regression model
svm = SVC(random_state=1).fit(X_train, y_train)

# Echo the fitted estimator
svm

SVC(random_state=1)

In [32]:
# Predict labels for the test features with the fitted SVM
y_pred_svm = svm.predict(X_test)

# Confusion matrix of actual (rows) versus SVM-predicted (columns) labels
confusion_matrix(y_true=y_test, y_pred=y_pred_svm)


array([[18651,   114],
       [    6,   613]])

In [34]:
# Per-class precision, recall and F1 for the SVM predictions
svm_report_text = classification_report(y_true=y_test, y_pred=y_pred_svm)
print(svm_report_text)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.84      0.99      0.91       619

    accuracy                           0.99     19384
   macro avg       0.92      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384



## Balanced Accuracies

---

In [35]:
from sklearn.metrics import balanced_accuracy_score

# Balanced accuracy averages the recall of each class, so the rare high-risk
# class counts as much as the common healthy class.
# Score the logistic regression predictions first, then the SVM predictions.
bacc_lr, bacc_svm = (
    balanced_accuracy_score(y_test, predicted)
    for predicted in (y_pred, y_pred_svm)
)
print(bacc_lr, bacc_svm)

0.9520479254722232 0.9921159034000586
