In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [4]:
pip install -U scikit-learn

Collecting scikit-learnNote: you may need to restart the kernel to use updated packages.
  Downloading scikit_learn-1.0.2-cp37-cp37m-win_amd64.whl (7.1 MB)
Collecting joblib>=0.11
  Downloading joblib-1.1.0-py2.py3-none-any.whl (306 kB)
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)

Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.1.0 scikit-learn-1.0.2 threadpoolctl-3.1.0


## Preprocessing dataset

In [2]:
# Import the data
# Reading in CSV 
lending_data = pd.read_csv('lending_data.csv')
lending_data.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [3]:
# Viewing shape and contents of data
# There are no missing values 
lending_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77536 entries, 0 to 77535
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   loan_size         77536 non-null  float64
 1   interest_rate     77536 non-null  float64
 2   borrower_income   77536 non-null  int64  
 3   debt_to_income    77536 non-null  float64
 4   num_of_accounts   77536 non-null  int64  
 5   derogatory_marks  77536 non-null  int64  
 6   total_debt        77536 non-null  int64  
 7   loan_status       77536 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 4.7 MB


Predicting whether a loan will be approved or not and classify the risk level of given loans. 'loan_status' = binary classification which is what we want to predict on unseen data. 

In [4]:
# set up variables 
X = lending_data.drop('loan_status', axis=1)
y = lending_data['loan_status']

In [5]:
y
#X.columns

0        0
1        0
2        0
3        0
4        0
        ..
77531    1
77532    1
77533    1
77534    1
77535    1
Name: loan_status, Length: 77536, dtype: int64

# Prediction on model performance

I believe that the random forest classifier will perform better on this data as the dataset does not consist of linear data but of categorical data.

## Logistic Regression Model: Unscaled Data

In [6]:
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
# Create classifier
logreg = LogisticRegression()

# Fit the classifier to training data 
logreg.fit(X_train, y_train)

LogisticRegression()

In [8]:
# Predict labels of the test set
y_pred = logreg.predict(X_test)

In [39]:
# Model scores
# Training data score
print(classification_report(y_test, y_pred))
print(f'Logistic Regression Model Unscaled Training Data score: {logreg.score(X_train, y_train)}')
print(f'Logistic Regression Model Unscaled Testing Data score: {logreg.score(X_test, y_test)}')

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     22564
           1       0.85      0.91      0.88       697

    accuracy                           0.99     23261
   macro avg       0.92      0.95      0.94     23261
weighted avg       0.99      0.99      0.99     23261

Logistic Regression Model Unscaled Training Data score: 0.9667802855826808
Logistic Regression Model Unscaled Testing Data score: 0.9700356820429045


  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"


## Logistic Regression Model: Scaled Data

In [30]:
# Scale the data using StandardScaler()
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [31]:
# Create classifier
logreg = LogisticRegression()

# Fit the classifier to training data 
logreg.fit(X_train_scaled, y_train)
y_pred_scaled = logreg.predict(X_test_scaled)

In [36]:
# Scores for scaled data
# Training data score
#print(classification_report(y_test, y_pred_scaled))
print(f'Logistic Regression Model Scaled Training Data score: {logreg.score(X_train_scaled, y_train)}')
print(f'Logistic Regression Model Scaled Testing Data score: {logreg.score(X_test_scaled, y_test)}')

Logistic Regression Model Scaled Training Data score: 0.994048825426071
Logistic Regression Model Scaled Testing Data score: 0.9943252654658011


## Random Forest Classifier Model: Unscaled Data

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [38]:
 # Fit a model, and then print a classification report
clf = RandomForestClassifier(random_state=1).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(f'Random Forest Classifier Unscaled Model Training Score: {clf.score(X_train, y_train)}')
print(f'Random Forest Classifier Unscaled Model Testing Score: {clf.score(X_test, y_test)}')

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     22564
           1       0.85      0.91      0.88       697

    accuracy                           0.99     23261
   macro avg       0.92      0.95      0.94     23261
weighted avg       0.99      0.99      0.99     23261

Random Forest Classifier Unscaled Model Training Score: 0.9971810225702441
Random Forest Classifier Unscaled Model Testing Score: 0.9925626585271484


## Random Forest Classifier Model: Scaled Data

In [37]:
 # Fit a model, and then print a classification report
clf = RandomForestClassifier(random_state=1).fit(X_train_scaled, y_train)
y_pred_scaled = clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred_scaled))
print(f'Random Forest Classifier Model Scaled Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Random Forest Classifier Scaled Model Testing Score: {clf.score(X_test_scaled, y_test)}')

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     22564
           1       0.85      0.91      0.88       697

    accuracy                           0.99     23261
   macro avg       0.92      0.95      0.94     23261
weighted avg       0.99      0.99      0.99     23261

Random Forest Classifier Model Scaled Training Score: 0.9971810225702441
Random Forest Classifier Scaled Model Testing Score: 0.9924766777008727


# Evaluation and comparison of model performance 

In [40]:
print('Logistic Regression Model Scores')
print('Unscaled')
print(f'Logistic Regression Model Unscaled Training Data score: {logreg.score(X_train, y_train)}')
print(f'Logistic Regression Model Unscaled Testing Data score: {logreg.score(X_test, y_test)}')
print('-----------------------------------------------')
print('Scaled')
print(f'Logistic Regression Model Scaled Training Data score: {logreg.score(X_train_scaled, y_train)}')
print(f'Logistic Regression Model Scaled Testing Data score: {logreg.score(X_test_scaled, y_test)}')
print('-----------------------------------------------')
print('-----------------------------------------------')
print('Random Forest Classifier Model Scores')
print('Unscaled')
print(f'Random Forest Classifier Unscaled Model Training Score: {clf.score(X_train, y_train)}')
print(f'Random Forest Classifier Unscaled Model Testing Score: {clf.score(X_test, y_test)}')
print('-----------------------------------------------')
print('Scaled')
print(f'Random Forest Classifier Model Scaled Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Random Forest Classifier Scaled Model Testing Score: {clf.score(X_test_scaled, y_test)}')

Logistic Regression Model Scores
Unscaled
Logistic Regression Model Unscaled Training Data score: 0.9667802855826808
Logistic Regression Model Unscaled Testing Data score: 0.9700356820429045
-----------------------------------------------
Scaled
Logistic Regression Model Scaled Training Data score: 0.994048825426071
Logistic Regression Model Scaled Testing Data score: 0.9943252654658011
-----------------------------------------------
-----------------------------------------------
Random Forest Classifier Model Scores
Unscaled


  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"


Random Forest Classifier Unscaled Model Training Score: 0.9971810225702441
Random Forest Classifier Unscaled Model Testing Score: 0.9925626585271484
-----------------------------------------------
Scaled


  "X does not have valid feature names, but"


Random Forest Classifier Model Scaled Training Score: 0.9667802855826808
Random Forest Classifier Scaled Model Testing Score: 0.9700356820429045


  "X does not have valid feature names, but"


From the above scores it seems that Logistic Regression Model performs better than the Random Forest Classifier. The Logistic Regression model also performs better with scaled data. The Random Forest Classifier seems to perfrom better on unscaled data which suggests that the training data has been overfitted to the model. The scaled data shows a slightly lower score. 