# Credit Risk Evaluator.

In [1]:
# Import dependencies.
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Retrieve the Data.

In [2]:
# Read the lending dataset from the Resources folder into a DataFrame.
lending_csv_path = Path("Resources") / "lending_data.csv"
lending_data_df = pd.read_csv(lending_csv_path)
# Keep the frame as the cell's final expression so the notebook renders it.
lending_data_df

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.430740,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
...,...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1


In [3]:
# Count the missing (null/NaN) entries in every column of the dataset.
lending_data_df.isna().sum()

loan_size           0
interest_rate       0
borrower_income     0
debt_to_income      0
num_of_accounts     0
derogatory_marks    0
total_debt          0
loan_status         0
dtype: int64

# Predict Model Performance.

Prediction: The Logistic Regression model will perform better than the Random Forest Classifier model.


# Split the Data into Training and Testing Sets.

In [5]:
# Separate the target column ("loan_status") from the feature columns.
y = lending_data_df["loan_status"]
X = lending_data_df.drop(columns="loan_status")

In [6]:
# Partition the features and the target into training and testing subsets.
# random_state is pinned so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

# Create, Fit and Compare Models.

Create a Logistic Regression model, fit it to the data, and print the model's score. Do the same for a Random Forest Classifier. You may choose any starting hyperparameters you like.

Which model performed better? How does that compare to your prediction? Write down your results and thoughts in the designated markdown cell.

# Logistic Regression Model.

In [7]:
# Standardize the features ahead of fitting the Logistic Regression model.
# The scaler is fitted on the training split only, then applied to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
# Fit a Logistic Regression classifier on the scaled training data, then
# print its score on the training split and on the testing split.
model = LogisticRegression(max_iter=10000)
model.fit(X_train_scaled, y_train)
lr_train_score = model.score(X_train_scaled, y_train)
lr_test_score = model.score(X_test_scaled, y_test)
print(f"LogisticRegression Training Data Score: {lr_train_score}")
print(f"LogisticRegression Testing Data Score: {lr_test_score}")

LogisticRegression Training Data Score: 0.9942908240473243
LogisticRegression Testing Data Score: 0.9936545604622369


In [9]:
# Confusion matrix of the Logistic Regression predictions on the test split.
y_pred = model.predict(X_test_scaled)
y_true = y_test
# Final expression of the cell, so the matrix is displayed as its output.
confusion_matrix(y_true, y_pred)

array([[18652,   113],
       [   10,   609]], dtype=int64)

In [10]:
# Accuracy of the Logistic Regression model on the test data:
# (TP + TN) / (TP + FP + TN + FN).
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
correct_predictions = tp + tn
total_predictions = tp + fp + tn + fn
accuracy = correct_predictions / total_predictions
print(f"Logistic Regression Accuracy: {accuracy}")

Logistic Regression Accuracy: 0.9936545604622369


In [11]:
# Classification report (precision, recall, f1-score, support) for the
# Logistic Regression predictions on the test split.
y_pred_log = model.predict(X_test_scaled)
log_report = classification_report(y_test, y_pred_log)
print(log_report)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.84      0.98      0.91       619

    accuracy                           0.99     19384
   macro avg       0.92      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384



# Random Forest Classifier model.

In [12]:
# Standardize the features ahead of fitting the Random Forest Classifier.
# This repeats the scaling done in the Logistic Regression section on the same
# X_train / X_test, so the same scaled arrays are produced again.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Fit a Random Forest Classifier (50 trees, fixed seed) on the scaled training
# data, then print its score on the training split and on the testing split.
clf = RandomForestClassifier(n_estimators=50, random_state=1)
clf.fit(X_train_scaled, y_train)
rf_train_score = clf.score(X_train_scaled, y_train)
rf_test_score = clf.score(X_test_scaled, y_test)
print(f'RandomForestClassifier Training Score: {rf_train_score}')
print(f'RandomForestClassifier Testing Score: {rf_test_score}')

RandomForestClassifier Training Score: 0.9974893382858715
RandomForestClassifier Testing Score: 0.9910751134956666


In [14]:
# Confusion matrix of the Random Forest Classifier predictions on the test split.
y_pred = clf.predict(X_test_scaled)
y_true = y_test
# Final expression of the cell, so the matrix is displayed as its output.
confusion_matrix(y_true, y_pred)

array([[18666,    99],
       [   74,   545]], dtype=int64)

In [15]:
# Accuracy of the Random Forest Classifier on the test data:
# (TP + TN) / (TP + FP + TN + FN).
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
correct_predictions = tp + tn
total_predictions = tp + fp + tn + fn
accuracy = correct_predictions / total_predictions
print(f"Random Forest Classifier Accuracy: {accuracy}")

Random Forest Classifier Accuracy: 0.9910751134956666


In [16]:
# Classification report (precision, recall, f1-score, support) for the
# Random Forest Classifier predictions on the test split.
y_pred_tree = clf.predict(X_test_scaled)
tree_report = classification_report(y_test, y_pred_tree)
print(tree_report)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.85      0.88      0.86       619

    accuracy                           0.99     19384
   macro avg       0.92      0.94      0.93     19384
weighted avg       0.99      0.99      0.99     19384



Which model performed better? How does that compare to your prediction? Replace the text in this markdown cell with your answers to these questions.

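The testing scores of the two models are not significantly different: 99.37% for the Logistic Regression model versus 99.11% for the Random Forest Classifier model. The Random Forest Classifier's training score was slightly higher than the Logistic Regression model's (99.75% versus 99.43%), but that advantage did not carry over to the test data. The accuracy computed from each confusion matrix matches the testing score, so the Logistic Regression model again comes out ahead (99.37% versus 99.11%). Because the test set is heavily imbalanced (18,765 loans in class 0 versus 619 in class 1), the confusion matrices show a clearer difference than accuracy alone: the Logistic Regression model misclassified only 10 of the class 1 loans (recall 0.98), whereas the Random Forest Classifier misclassified 74 (recall 0.88). Therefore the Logistic Regression model performed better, with a slightly higher testing score and accuracy and a much higher recall on class 1, which agrees with the prediction.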