# Credit Risk Evaluator

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

## Retrieve the Data

The data is located in the Challenge Files Folder:

* `lending_data.csv`

Import the data using Pandas. Display the resulting dataframe to confirm the import was successful.

In [2]:
# Import the data
# The path is relative, so the notebook must be run from the folder that contains "Resources".
file_path = Path("./Resources/lending_data.csv")
df = pd.read_csv(file_path)
# Display the dataframe as the cell output to confirm the import was successful
df

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.430740,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
...,...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1


## Predict Model Performance

You will be creating and comparing two models on this data: a Logistic Regression, and a Random Forests Classifier. Before you create, fit, and score the models, make a prediction as to which model you think will perform better. You do not need to be correct! 

Write down your prediction in the designated cells in your Jupyter Notebook, and provide justification for your educated guess.

**My Prediction**

Logistic regression is a simple and efficient method for binary classification problems whose classes can be separated by a roughly linear boundary. It is easy to implement and performs very well when the classes are linearly separable. 

A random forest can perform both regression and classification tasks. It can handle large datasets efficiently and works well with non-linear data. Because it combines the votes of many decision trees, the random forest algorithm often predicts outcomes more accurately than a single, simpler classifier, although this is not guaranteed for every dataset.

I think the Random Forest Classifier will perform better here, as the credit risk data might not be linearly separable.

## Split the Data into Training and Testing Sets

In [3]:
# Split the data into X_train, X_test, y_train, y_test
# "loan_status" is the label to predict; every other column is a feature.
y = df["loan_status"].to_numpy()
X = df.drop(columns="loan_status")

# The fixed random_state keeps the train/test partition identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [4]:
# Preview the training features (shown because it is the cell's last expression)
X_train

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
29175,8600.0,6.792,44500,0.325843,3,0,14500
23020,7800.0,6.419,41000,0.268293,2,0,11000
31269,10000.0,7.386,50100,0.401198,4,1,20100
35479,9300.0,7.093,47300,0.365751,3,0,17300
13470,9200.0,7.045,46900,0.360341,3,0,16900
...,...,...,...,...,...,...,...
20609,7200.0,6.177,38700,0.224806,1,0,8700
21440,10000.0,7.389,50100,0.401198,4,1,20100
73349,10200.0,7.463,50800,0.409449,4,1,20800
50057,11100.0,7.838,54400,0.448529,5,1,24400


In [5]:
# Preview the held-out test features (shown because it is the cell's last expression)
X_test

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
60914,12600.0,8.469,60300,0.502488,6,1,30300
36843,9800.0,7.289,49200,0.390244,4,0,19200
1966,10900.0,7.770,53700,0.441341,5,1,23700
70137,10700.0,7.666,52700,0.430740,5,1,22700
27237,9900.0,7.353,49800,0.397590,4,0,19800
...,...,...,...,...,...,...,...
45639,9900.0,7.328,49600,0.395161,4,0,19600
11301,9900.0,7.317,49500,0.393939,4,0,19500
51614,8000.0,6.520,42000,0.285714,2,0,12000
4598,11500.0,8.001,55900,0.463327,5,1,25900


In [6]:
# Scale the data
# The scaler is fitted on the training rows only and then applied unchanged to
# both splits, so no statistics from the test set leak into the transformation.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Train a Logistic Regression model and print the model score

In [7]:
# Create a Logistic Regression model (all hyperparameters left at their defaults)
classifier = LogisticRegression()
# Echo the estimator so the cell output shows how it is configured
classifier

LogisticRegression()

In [8]:
# Fit (train) our model by using the training data
# This model is trained on the unscaled features; the scaled variant is fitted separately as classifier_scaled.
classifier.fit(X_train, y_train)

LogisticRegression()

In [9]:
# Validate the model by using the test data
# score() is evaluated on both splits; a training score far above the testing score would point to overfitting.
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9921240885954051
Testing Data Score: 0.9918489475856377


In [10]:
# Create a confusion matrix
# y_true / y_pred are reused by the accuracy and classification-report cells that follow.
y_true = y_test
y_pred = classifier.predict(X_test)
# Rows are the true classes and columns the predicted classes (0 first, then 1)
confusion_matrix(y_true, y_pred)

array([[18663,   102],
       [   56,   563]], dtype=int64)

In [11]:
# Calculate the accuracy - the accuracy of the model on the test data is (TP + TN) / (TP + FP + TN + FN)
# ravel() flattens the 2x2 matrix row by row, which yields TN, FP, FN, TP in that order.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn) # (563 + 18663) / (563 + 102 + 18663 + 56)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9918489475856377


In [12]:
# Per-class precision, recall and F1 for the predictions currently held in y_pred
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.85      0.91      0.88       619

    accuracy                           0.99     19384
   macro avg       0.92      0.95      0.94     19384
weighted avg       0.99      0.99      0.99     19384



### Logistic Regression on scaled data

In [13]:
# Create a second Logistic Regression model (default hyperparameters) for the scaled features
classifier_scaled = LogisticRegression()
# Echo the estimator so the cell output shows how it is configured
classifier_scaled

LogisticRegression()

In [14]:
# Fit (train) our model by using the scaled training data
classifier_scaled.fit(X_train_scaled, y_train)

LogisticRegression()

In [15]:
# Validate the model by using the scaled test data
# The model was fitted on scaled features, so it must also be scored on the scaled splits.
print(f"Training Data Score: {classifier_scaled.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier_scaled.score(X_test_scaled, y_test)}")

Training Data Score: 0.9942908240473243
Testing Data Score: 0.9936545604622369


In [16]:
# Create a confusion matrix
# y_pred is overwritten here with the scaled model's predictions; the next two cells read it.
y_true = y_test
y_pred = classifier_scaled.predict(X_test_scaled)
# Rows are the true classes and columns the predicted classes (0 first, then 1)
confusion_matrix(y_true, y_pred)

array([[18652,   113],
       [   10,   609]], dtype=int64)

In [17]:
# Calculate the accuracy - the accuracy of the model on the test data is (TP + TN) / (TP + FP + TN + FN)
# ravel() flattens the 2x2 matrix row by row, which yields TN, FP, FN, TP in that order.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn) # (609 + 18652) / (609 + 113 + 18652 + 10)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9936545604622369


In [18]:
# Per-class precision, recall and F1 for the predictions currently held in y_pred
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.84      0.98      0.91       619

    accuracy                           0.99     19384
   macro avg       0.92      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384



### Train a Random Forest Classifier model and print the model score

In [19]:
# Create a Random Forest Classifier Model
# A random forest is built with random sampling, so an unseeded model gives
# slightly different scores on every run. Fixing random_state (same value as
# the train/test split) makes the results reproducible under Restart & Run All.
clf = RandomForestClassifier(random_state=1)
# Echo the estimator so the cell output shows how it is configured
clf

RandomForestClassifier()

In [20]:
# Fit (train) our model by using the training data
# This forest is trained on the unscaled features; the scaled variant is fitted separately as clf_scaled.
clf.fit(X_train, y_train)

RandomForestClassifier()

In [21]:
# Validate the model by using the test data
# score() is evaluated on both splits; a training score far above the testing score would point to overfitting.
print(f"Training Data Score: {clf.score(X_train, y_train)}")
print(f"Testing Data Score: {clf.score(X_test, y_test)}")

Training Data Score: 0.9975409272252029
Testing Data Score: 0.9914878250103177


In [22]:
# Create a confusion matrix
# y_pred is overwritten here with the random forest's predictions; the next two cells read it.
y_true = y_test
y_pred = clf.predict(X_test)
# Rows are the true classes and columns the predicted classes (0 first, then 1)
confusion_matrix(y_true, y_pred)

array([[18666,    99],
       [   66,   553]], dtype=int64)

In [23]:
# Calculate the accuracy - the accuracy of the model on the test data is (TP + TN) / (TP + FP + TN + FN)
# ravel() flattens the 2x2 matrix row by row, which yields TN, FP, FN, TP in that order.
# The numbers in the trailing comment come from the recorded run; re-running may
# give slightly different counts unless the forest is seeded.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn) # (553 + 18666) / (553 + 99 + 18666 + 66)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9914878250103177


In [24]:
# Per-class precision, recall and F1 for the predictions currently held in y_pred
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.85      0.89      0.87       619

    accuracy                           0.99     19384
   macro avg       0.92      0.94      0.93     19384
weighted avg       0.99      0.99      0.99     19384



### Random Forest Classifier on scaled data

In [25]:
# Create a Random Forest Classifier Model for the scaled features
# A random forest is built with random sampling, so an unseeded model gives
# slightly different scores on every run. Fixing random_state (same value as
# the train/test split) makes the results reproducible under Restart & Run All.
clf_scaled = RandomForestClassifier(random_state=1)
# Echo the estimator so the cell output shows how it is configured
clf_scaled

RandomForestClassifier()

In [26]:
# Fit (train) our model by using the scaled training data
clf_scaled.fit(X_train_scaled, y_train)

RandomForestClassifier()

In [27]:
# Validate the model by using the scaled test data
# The model was fitted on scaled features, so it must also be scored on the scaled splits.
print(f"Training Data Score: {clf_scaled.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {clf_scaled.score(X_test_scaled, y_test)}")

Training Data Score: 0.9975409272252029
Testing Data Score: 0.9915910028889806


In [28]:
# Create a confusion matrix
# y_pred is overwritten here with the scaled random forest's predictions; the next two cells read it.
y_true = y_test
y_pred = clf_scaled.predict(X_test_scaled)
# Rows are the true classes and columns the predicted classes (0 first, then 1)
confusion_matrix(y_true, y_pred)

array([[18666,    99],
       [   64,   555]], dtype=int64)

In [29]:
# Calculate the accuracy - the accuracy of the model on the test data is (TP + TN) / (TP + FP + TN + FN)
# ravel() flattens the 2x2 matrix row by row, which yields TN, FP, FN, TP in that order.
# The numbers in the trailing comment come from the recorded run; re-running may
# give slightly different counts unless the forest is seeded.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn) # (555 + 18666) / (555 + 99 + 18666 + 64)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9915910028889806


In [30]:
# Per-class precision, recall and F1 for the predictions currently held in y_pred
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.85      0.90      0.87       619

    accuracy                           0.99     19384
   macro avg       0.92      0.95      0.93     19384
weighted avg       0.99      0.99      0.99     19384



**Conclusion**

While the training scores of the Random Forest Classifier are slightly higher than those of the Logistic Regression model, the test scores of the two models are in the same range, with Logistic Regression doing slightly better. This holds for both the scaled and the unscaled data, and it is not in line with my prediction.

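The accuracy scores and classification reports also show that there is not much difference in overall performance between the two models; the Logistic Regression model's advantage in accuracy is almost negligible. The most visible gap in the recorded results is the recall for the high-risk class (1) on scaled data, where Logistic Regression reaches 0.98 compared with 0.90 for the Random Forest Classifier.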