In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
df = pd.read_csv(
    Path('Resources/lending_data.csv')   
)

# Review the DataFrame
df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77536 entries, 0 to 77535
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   loan_size         77536 non-null  float64
 1   interest_rate     77536 non-null  float64
 2   borrower_income   77536 non-null  int64  
 3   debt_to_income    77536 non-null  float64
 4   num_of_accounts   77536 non-null  int64  
 5   derogatory_marks  77536 non-null  int64  
 6   total_debt        77536 non-null  int64  
 7   loan_status       77536 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 4.7 MB


In [4]:
df.shape

(77536, 8)

### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [5]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = df['loan_status']


# Separate the X variable, the features
X = df.drop(columns='loan_status')

In [6]:
# Review the y variable Series
y[:10]

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: loan_status, dtype: int64

In [7]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [8]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [9]:
# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# Review Scaled Trained data
X_train_scaled[:5]

array([[-0.57708952, -0.56367666, -0.5652314 , -0.63429794, -0.43489843,
        -0.67289855, -0.5652314 ],
       [-0.95927354, -0.98302549, -0.98332378, -1.34143791, -0.96014741,
        -0.67289855, -0.98332378],
       [ 0.09173251,  0.10413354,  0.10371642,  0.29161787,  0.09035056,
         1.04334691,  0.10371642],
       [-0.24267851, -0.22527452, -0.23075749, -0.14393449, -0.43489843,
        -0.67289855, -0.23075749],
       [-0.29045151, -0.27923898, -0.27853948, -0.21040167, -0.43489843,
        -0.67289855, -0.27853948]])

In [11]:
# Review Scaled Test data
X_test_scaled[:5]

array([[ 1.33383057,  1.32170668,  1.32215708,  1.53620797,  1.14084852,
         1.04334691,  1.32215708],
       [-0.00381349, -0.00491964, -0.00379305,  0.15702536,  0.09035056,
        -0.67289855, -0.00379305],
       [ 0.52168953,  0.53584922,  0.5337543 ,  0.7848731 ,  0.61559954,
         1.04334691,  0.5337543 ],
       [ 0.42614353,  0.41892623,  0.41429933,  0.65461753,  0.61559954,
         1.04334691,  0.41429933],
       [ 0.04395951,  0.06703297,  0.06787993,  0.24729423,  0.09035056,
        -0.67289855,  0.06787993]])

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [12]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier



In [13]:
# Fit the model using training data
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [14]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test_scaled)



### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [15]:
# Generate a confusion matrix for the model
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [16]:
# Print the classification report for the model
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,12600,6165
Actual 1,3,616


Accuracy Score : 0.6817994222038795
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.67      0.80     18765
           1       0.09      1.00      0.17       619

    accuracy                           0.68     19384
   macro avg       0.55      0.83      0.48     19384
weighted avg       0.97      0.68      0.78     19384



### Step 4: Answer the following question about the logistic regression model.

**Question:** How well does the **logistic regression model** predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The logistic regresssion model did not perform particularly well. 
- The accuracy score of 68% is somewhat low saying that the model is correct 68% of the time (TP_TN/(TP+TN+FP+FN)
- The precision scores show that the model was 100% correct in predicting Positives for 0-(healthy loans).  However it was only 9% correct in predicting positive for 1-(unhealthy loans).  (TP/TP+FN)
- The recall score shows a ratio of 67% correctly predicting 0-(healthy loans) and a ratio of 100% correctly predictin 1-(unhealthy loans)

- f1 accounts for both precision and recall, we see a low f1-score for 1-(unhealthy loans) at 17%

Overview and Summary:  This model was designed to predict the health of loans based on the data provided.  A healthy loan is indicated with a 0 and an unhealthy loan is indicated with a 1.  The logistic regression model did not perform particularly well based on its accuracy, precision, and recall scores detailed above.

### Lets try a Decision Tree Classifier Model with the Original Data

In [17]:
# Let's try the Decision Tree Model
# Creating the decision tree classifier instance
model = DecisionTreeClassifier()

In [18]:
# Fitting the model
model = model.fit(X_train_scaled, y_train)


In [19]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)

In [20]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [21]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,18667,98
Actual 1,93,526


Accuracy Score : 0.9901465125877011
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     18765
           1       0.84      0.85      0.85       619

    accuracy                           0.99     19384
   macro avg       0.92      0.92      0.92     19384
weighted avg       0.99      0.99      0.99     19384



### Step 4: Answer the following questions about the Decision Tree Classifier Model.

**Question:** How well does the **Decision Tree Classifier** model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The Decision Tree Classifier model outperformed the logistic regresssion model. 
- The accuracy score of 99% is almost too good to be true (TP_TN/(TP+TN+FP+FN)
- The precision scores show that the model was 100% correct in predicting Positives for 0-(healthy loans)and 84% correct in predicting positive for 1-(unhealthy loans).  (TP/TP+FN)
- The recall score shows a ratio of 99% correctly predicting 0-(healthy loans) and a ratio of 85% correctly predictin 1-(unhealthy loans)

- f1 accounts for both precision and recall, we see a low f1-score for 0-(healthy loans) of 99% for 1-(unhealthy loans) of 85%


Overview and Summary:  This model was designed to predict the health of loans based on the data provided.  A healthy loan is indicated with a 0 and an unhealthy loan is indicated with a 1.  The Decision Tree Classifier model is accurate however it does do a better job of prediction healthy loans over unhealthy loans

### Lets try a K Nearest Neighbor Model with the Original Data

In [22]:
# create and fit the model
model = KNeighborsClassifier(n_neighbors=3)
model = model.fit(X_train_scaled, y_train)

In [23]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)

In [24]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [25]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,18658,107
Actual 1,45,574


Accuracy Score : 0.9921584812216261
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.84      0.93      0.88       619

    accuracy                           0.99     19384
   macro avg       0.92      0.96      0.94     19384
weighted avg       0.99      0.99      0.99     19384



### Step 4: Answer the following questions about the K-Nearest Neighbor Model.

**Question:** How well does the **K-Nearest Neighbor** model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The K-Nearest Neighbor model outperformed the logistic regresssion model and performed nearly the same as the Decision Tree Model. 
- The accuracy score of 99% is almost too good to be true (TP_TN/(TP+TN+FP+FN)
- The precision scores show that the model was 100% correct in predicting Positives for 0-(healthy loans)and 84% correct in predicting positive for 1-(unhealthy loans).  (TP/TP+FN)
- The recall score shows a ratio of 99% correctly predicting 0-(healthy loans) and a ratio of 93% correctly predictin 1-(unhealthy loans)

- f1 accounts for both precision and recall, we see a low f1-score for 0-(healthy loans) of 100% for 1-(unhealthy loans) of 88%


Overview and Summary:  This model was designed to predict the health of loans based on the data provided.  A healthy loan is indicated with a 0 and an unhealthy loan is indicated with a 1.  The Decision Tree Classifier model is accurate however it does do a better job of prediction healthy loans over unhealthy loans.



### Of the 3 model types tested.  K-Nearest neighbor performed the best.  It outperformed the Decision Tree model by a slight margin.  Both K-Nearest Neighbor and the Decision Tree model outperformed the Logistic Regression model significantly.


---