# Credit Risk Evaluator

In [48]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

## Retrieve the Data

The data is located in the Challenge Files Folder:

* `lending_data.csv`

Import the data using Pandas. Display the resulting dataframe to confirm the import was successful.

In [49]:
# Import the data
df = pd.read_csv('Resources/lending_data.csv')
df

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.430740,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
...,...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1


In [50]:
# Checking if dataset contains missing data
df.isnull().sum()

loan_size           0
interest_rate       0
borrower_income     0
debt_to_income      0
num_of_accounts     0
derogatory_marks    0
total_debt          0
loan_status         0
dtype: int64

In [51]:
# Checking if data type transformation is needed
df.dtypes

loan_size           float64
interest_rate       float64
borrower_income       int64
debt_to_income      float64
num_of_accounts       int64
derogatory_marks      int64
total_debt            int64
loan_status           int64
dtype: object

In [65]:
# Listing duplicates. Lots of them!
df[df.duplicated()]
df

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.430740,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
...,...,...,...,...,...,...,...,...
77524,16900.0,10.302,77500,0.612903,10,2,47500,1
77526,18300.0,10.895,83100,0.638989,11,2,53100,1
77528,15100.0,9.557,70500,0.574468,9,2,40500,1
77531,19100.0,11.261,86600,0.653580,12,2,56600,1


In [53]:
# Dropping duplicates
df=df.drop_duplicates()
df

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.430740,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
...,...,...,...,...,...,...,...,...
77524,16900.0,10.302,77500,0.612903,10,2,47500,1
77526,18300.0,10.895,83100,0.638989,11,2,53100,1
77528,15100.0,9.557,70500,0.574468,9,2,40500,1
77531,19100.0,11.261,86600,0.653580,12,2,56600,1


## Predict Model Performance

You will be creating and comparing two models on this data: a Logistic Regression, and a Random Forests Classifier. Before you create, fit, and score the models, make a prediction as to which model you think will perform better. You do not need to be correct! 

Write down your prediction in the designated cells in your Jupyter Notebook, and provide justification for your educated guess.

My prediction: According to Kirasich, Smith & Sadler (2018)," logistic regression performs better when the number of noise variables is less than or equal to the number of explanatory variables and random forest has a higher true and false positive rate as the number of explanatory variables increases in a dataset." Thus, I think that Random Forest will perform better, since there are not that many explanatory variables in this data set.

Reference: Kirasich, Kaitlin; Smith, Trace; and Sadler, Bivin (2018) "Random Forest vs Logistic Regression: Binary Classification for Heterogeneous Datasets," SMU Data Science Review: Vol. 1: No. 3, Article 9. Available at: https://scholar.smu.edu/datasciencereview/vol1/iss3/9

## Split the Data into Training and Testing Sets

In [54]:
# Split the data into X_train, X_test, y_train, y_test
y = df["loan_status"].values
X = df.drop("loan_status", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

## Create, Fit and Compare Models

Create a Logistic Regression model, fit it to the data, and print the model's score. Do the same for a Random Forest Classifier. You may choose any starting hyperparameters you like. 

Which model performed better? How does that compare to your prediction? Write down your results and thoughts in the designated markdown cell.

### LOGISTIC REGRESSION MODEL

In [55]:
# Train a Logistic Regression model and print the model score
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=10000)
classifier

LogisticRegression(max_iter=10000)

In [56]:
# Fit (train) model by using training data
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=10000)

In [57]:
# Validatre model by using test data
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8974751338944147
Testing Data Score: 0.9021406727828746


In [58]:
# Create Confusion Matrix
from sklearn.metrics import confusion_matrix
y_true = y_test
y_pred = classifier.predict(X_test)
confusion_matrix(y_true, y_pred)

array([[832,  77],
       [ 51, 348]], dtype=int64)

In [59]:
# Accuracy of the model (TP + TN) / (TP + FP + TN + FN))
# TP=563, TN=18663, FP=102, FN=56
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn) # (563 + 18663) / (563 + 102 + 18663 + 56)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9021406727828746


###  RANDOM FOREST CLASSIFIER

In [60]:
# Train a Random Forest Classifier model and print the model score
from sklearn.ensemble import RandomForestClassifier

In [61]:
# Scale the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [62]:
# Fit data
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Training Score: 0.9676103034940067
Testing Score: 0.8623853211009175


In [63]:
# Create Confusion Matrix
y_true = y_test
y_pred = classifier.predict(X_test_scaled)
confusion_matrix(y_true, y_pred)

  "X does not have valid feature names, but"


array([[802, 107],
       [  3, 396]], dtype=int64)

In [64]:
# Accuracy of the model (TP + TN) / (TP + FP + TN + FN))
# TP=616, TN=11733, FP=7032, FN=3
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9159021406727829


*Which model performed better? How does that compare to your prediction? Replace the text in this markdown cell with your answers to these questions.*

The Logistic Regression and Random Forest models had a test score of 0.9021 and 0.8624, respectively. Thus, based solely on test score, the LR model performed better. However the accuracy for LR and RF models was 0.9021 and 0.9160, suggesting that the Random Forest model had a slight edge against Logistic Regression. Given that the confusion matrix is "a summary of prediction results on a classification problem, and that it provides insight not only into the errors being made by your classifier but more importantly the types of errors that are being made" (Brownlee, 2020), I think the best metric is model accuracy. In conclusion, the Random Forest model performed better for this data set, which contradicts my initial prediction. 

Reference used: Jason Brownlee. 2020. What is a Confusion Matrix in Machine Learning. https://machinelearningmastery.com/confusion-matrix-machine-learning/