# A Comparative Study of Predicting Loan Status of a Lending Company using Various Machine Learning Algorithms

This notebook shows the code for my Final Year Project: A Comparative Study of Predicting Loan Status of a Lending Company using Various Machine Learning Algorithms

# Import libraries

In [None]:
import os
# Show the kernel's working directory; the CSV read below uses a path relative to it.
os.getcwd()

In [None]:
# import required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import shap
from xgboost import XGBRFClassifier, XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from pandas_profiling import ProfileReport
from pandas_dq import dq_report, Fix_DQ
from lazypredict.Supervised import LazyClassifier
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.dummy import DummyClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, precision_score, roc_curve, recall_score, auc, f1_score, cohen_kappa_score, matthews_corrcoef
from imblearn.over_sampling import *
from imblearn.under_sampling import *
from imblearn.combine import *
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.linear_model import *
from sklearn.discriminant_analysis import *
from sklearn.neighbors import *
from sklearn.naive_bayes import *
from sklearn.svm import *
from sklearn.neural_network import *
from sklearn.cluster import *
from sklearn.mixture import *
from sklearn.linear_model import *
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import *
from sklearn import model_selection
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn import svm,model_selection, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process

# Set options for displaying data
# Turn off pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None
# Render floats with two decimals; widen the frame display.
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 400)
# Plot styling.
# NOTE(review): seaborn/matplotlib styling is configured several times in a row here; the later
# calls (in particular the second sns.set with figure.figsize (3, 3)) presumably override parts of
# the earlier ones, including the (12, 8) figure size -- confirm which settings are meant to win.
sns.set(style = "whitegrid")
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 8)
sns.set(rc = {'figure.figsize':(3,3)})
sns.set_style('whitegrid')
import warnings
from sklearn.exceptions import DataConversionWarning
# Suppress sklearn DataConversionWarning and every FutureWarning in cell output.
warnings.filterwarnings(action = 'ignore', category = DataConversionWarning)
warnings.filterwarnings(action = 'ignore', category = FutureWarning)

# Read the data

In [None]:
# Load the accepted-loans CSV from the working directory. low_memory = False makes pandas
# parse the file in one pass instead of in chunks.
acc_data = pd.read_csv(
    "accepted_2007_to_2018Q4.csv",
    low_memory = False,
)
# `df` is the frame the rest of the notebook starts from.
df = pd.DataFrame(data = acc_data)
df

# Checking Data Issues

- Automated EDA

In [None]:
# profile = ProfileReport(df)
# profile.to_file('Terrence FYP EDA.html')

- Checking Null Values

In [None]:
# Share of missing values per column (0..1), smallest first.
missing_mask = df.isna()
nul = missing_mask.mean().sort_values(ascending = True)

nul

In [None]:
# Keep only the columns where more than 30% of the values are missing.
nul = nul.loc[nul.gt(0.3)]

nul

In [None]:
# Names of the sparse columns, most-missing first.
nul_sorted_desc = nul.sort_values(ascending = False)
nul_col = nul_sorted_desc.index

nul_col

- Dropping Columns with Missing Values > 30%

In [None]:
# Work on a copy of `df` without the sparse columns; `df` itself is left untouched.
data = df.drop(columns = nul_col)

data

In [None]:
# Columns removed by hand before modelling.
# NOTE(review): `loan_status` is dropped here although the notebook title refers to predicting
# loan status; the target used in the split below is `sub_grade` -- confirm this is intended.
# `grade` is presumably removed because it is the coarse form of `sub_grade` -- confirm.
# This cell mutates `data` in place, so running it a second time raises KeyError.
manually_dropped = ['id', 'url', 'loan_status', 'grade', 'emp_title', 'title', 'zip_code']
data.drop(columns = manually_dropped, inplace = True)

In [None]:
# Discard rows in which every remaining column is missing.
data = data.dropna(axis = 'index', how = 'all')

data

In [None]:
# Date-like columns to split into a month part and a year part.
# The slicing assumes strings shaped like "Mon-YYYY" (first 3 / last 4 characters) -- TODO confirm.
date_col = ['issue_d', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d']

for source_col in date_col:
    raw_dates = data[source_col]
    # Non-string entries (e.g. missing values) are passed through unchanged.
    data[f'{source_col}_month'] = raw_dates.map(lambda d: d[:3] if isinstance(d, str) else d)
    data[f'{source_col}_year'] = raw_dates.map(lambda d: d[-4:] if isinstance(d, str) else d)

# The original date columns are no longer needed once split.
data.drop(columns = date_col, inplace = True)

data

- Sampling to reduce work

In [None]:
# Keep a seeded 45% random subset of the rows and renumber them from 0.
# Re-running this cell samples the already reduced frame again.
row_subset = data.sample(frac = 0.45, axis = 'index', random_state = 0)
data = row_subset.reset_index(drop = True)

# Feature Engineering

- Data Splitting

In [None]:
def data_splitting(df, target_col, test_size, random_state = 0):
    """Separate features from the target and split both into train and test parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the target column.
    target_col : str
        Name of the column to predict; every other column becomes a feature.
    test_size : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default 0
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(X, y, X_train, X_test, y_train, y_test)``.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``df``.

    Notes
    -----
    The same six objects are also published as module-level globals, which is how
    earlier versions of this helper delivered their result.
    """
    global X, y, X_train, X_test, y_train, y_test

    # Fail before touching any global if the target is missing.
    if target_col not in df.columns:
        raise KeyError(target_col)

    X = df.loc[:, df.columns != target_col]
    y = df.loc[:, target_col]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = test_size, random_state = random_state
    )
    return X, y, X_train, X_test, y_train, y_test

# Assign explicitly so the cell shows where the names come from (and displays nothing).
X, y, X_train, X_test, y_train, y_test = data_splitting(data, 'sub_grade', 0.2)

- Label Encoding Before Imputation

In [None]:
# One LabelEncoder instance, shared and refitted by each of the encoding cells below.
le = LabelEncoder()

In [None]:
# X_train

# Object-dtype columns are treated as categorical.
X_train_categorical = [name for name, dtype in X_train.dtypes.items() if dtype == "O"]

# Replace each categorical column by integer codes; the shared encoder is refitted per
# column, so afterwards it only remembers the last column processed.
for col in X_train_categorical:
    train_codes = le.fit_transform(X_train[col])
    X_train[col] = train_codes


X_train

In [None]:
# X_test

# Object-dtype columns are treated as categorical.
X_test_categorical = [feature for feature in X_test.columns if X_test[feature].dtype == "O"]

for col in X_test_categorical:
    # Fit on the raw *training* values of this column, never on the test values, so a category
    # gets the same integer in X_test as in X_train. `X` still holds the unencoded data and
    # X_train.index selects the training rows (this cell must run before the imputation cell,
    # which resets that index).
    le.fit(X.loc[X_train.index, col])

    # LabelEncoder.transform raises on labels absent from the fitted classes; give those -1.
    seen_in_train = X_test[col].isin(le.classes_).to_numpy()
    test_codes = np.full(len(X_test), -1, dtype = int)
    test_codes[seen_in_train] = le.transform(X_test.loc[seen_in_train, col])
    X_test[col] = test_codes


X_test

In [None]:
# y_train

# Refit the shared encoder on the training target and replace y_train by its integer codes.
# After this cell `le` holds the target classes; the y_test cell relies on that state.
y_train = le.fit_transform(y_train)

y_train

In [None]:
# y_test

# Encode the test target with the encoder as it was last fitted. This is only correct when the
# y_train cell was the last one to fit `le`; re-running a feature-encoding cell in between
# leaves `le` fitted on a feature column instead.
y_test = le.transform(y_test)

y_test

- Checking Null Values and Perform Imputation

In [None]:
# Per-column summary of missing values
def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Prints how many columns the frame has and how many of them have missing
    values, then returns a frame indexed by column name with two columns:
    'Missing Values' (count) and '% of Total Values' (percentage of rows,
    rounded to one decimal). Columns without missing values are left out and
    the rest are ordered from most to least missing.
    """
    # Count once, derive the percentage from the count.
    n_missing = df.isnull().sum()
    pct_missing = 100 * n_missing / len(df)

    summary = pd.concat([n_missing, pct_missing], axis=1)
    summary.columns = ['Missing Values', '% of Total Values']

    # Only columns with a non-zero share are reported.
    affected = summary.loc[summary['% of Total Values'] != 0]
    affected = affected.sort_values('% of Total Values', ascending=False).round(1)

    print(
        f"Your selected dataframe has {df.shape[1]} columns.\n"
        f"There are {affected.shape[0]} columns that have missing values."
    )

    return affected

# List the training-feature columns that still contain missing values before imputation.
missing_values_table(X_train)

In [None]:
from sklearn.impute import SimpleImputer

# The imputer's output is wrapped in a fresh DataFrame below, so keep the column labels.
x_train_col = X_train.columns
x_test_col = X_test.columns

# Column means are learned on the training split only and reused for the test split.
imputer = SimpleImputer(strategy = 'mean')
train_filled = imputer.fit_transform(X_train)
test_filled = imputer.transform(X_test)

# Rebuild the frames with their original column labels (the row index restarts at 0).
X_train = pd.DataFrame(train_filled, columns = x_train_col)
X_test = pd.DataFrame(test_filled, columns = x_test_col)

# Model Comparison

- Logistic Regression

In [None]:
# Logistic regression with scikit-learn's default settings.
# NOTE(review): the features are not scaled before fitting and max_iter is left at its
# default -- check the output for convergence warnings.
lr = LogisticRegression().fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)
# Same value lr.score(X_test, y_test) reports, computed from the predictions already made.
lr_test_accuracy = accuracy_score(y_test, y_pred_lr)

print("Logistic Regression Test Accuracy:", lr_test_accuracy)
print(classification_report(y_test, y_pred_lr))

- Gaussian Naive Bayes

In [None]:
# Gaussian naive Bayes with default settings.
gnb = GaussianNB().fit(X_train, y_train)

y_pred_gnb = gnb.predict(X_test)
# Same value gnb.score(X_test, y_test) reports, computed from the predictions already made.
gnb_test_accuracy = accuracy_score(y_test, y_pred_gnb)

print("Gaussian Naive Bayes Test Accuracy:", gnb_test_accuracy)
print(classification_report(y_test, y_pred_gnb))

- Decision Tree Classifier

In [None]:
# Decision tree, seeded so the reported metrics are reproducible across runs
# (0 matches the seed used for sampling and for the train/test split).
dt = DecisionTreeClassifier(random_state = 0)

dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)
# Accuracy from the predictions already computed instead of a second predict pass via score().
dt_test_accuracy = accuracy_score(y_test, y_pred_dt)

print("Decision Tree Classifier Test Accuracy:", dt_test_accuracy)
print(classification_report(y_test, y_pred_dt))

- Random Forest Classifier

In [None]:
# Random forest, seeded so the reported metrics are reproducible across runs
# (0 matches the seed used for sampling and for the train/test split).
rf = RandomForestClassifier(random_state = 0)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
# Accuracy from the predictions already computed instead of a second predict pass via score().
rf_test_accuracy = accuracy_score(y_test, y_pred_rf)

print("Random Forest Classifier Test Accuracy:", rf_test_accuracy)
print(classification_report(y_test, y_pred_rf))

- Bagging Classifier

In [None]:
# Bagging ensemble (default base estimator), seeded so the bootstrap samples and therefore
# the reported metrics are reproducible across runs.
bag = BaggingClassifier(random_state = 0)

bag.fit(X_train, y_train)

y_pred_bag = bag.predict(X_test)
# Accuracy from the predictions already computed instead of a second predict pass via score().
bag_test_accuracy = accuracy_score(y_test, y_pred_bag)

print("Bagging Classifier Test Accuracy:", bag_test_accuracy)
print(classification_report(y_test, y_pred_bag))

- AdaBoost Classifier

In [None]:
# AdaBoost (default base estimator), seeded so the reported metrics are reproducible
# across runs.
ada = AdaBoostClassifier(random_state = 0)

ada.fit(X_train, y_train)

y_pred_ada = ada.predict(X_test)
# Accuracy from the predictions already computed instead of a second predict pass via score().
ada_test_accuracy = accuracy_score(y_test, y_pred_ada)

print("Ada Boost Classifier Test Accuracy:", ada_test_accuracy)
print(classification_report(y_test, y_pred_ada))

- Light Gradient Boosting Model

In [None]:
# LightGBM classifier with default settings.
lgb = LGBMClassifier()
lgb.fit(X_train, y_train)

y_pred_lgb = lgb.predict(X_test)
# Same value lgb.score(X_test, y_test) reports, computed from the predictions already made.
lgb_test_accuracy = accuracy_score(y_test, y_pred_lgb)

print("Light Gradient Boosting Classifier Test Accuracy:", lgb_test_accuracy)
print(classification_report(y_test, y_pred_lgb))

- Extreme Gradient Boosting Classifier

In [None]:
# XGBoost classifier with default settings.
# NOTE(review): this rebinds `xgb`, which the import cell defines as the alias of the xgboost
# module; after this cell the module is no longer reachable under that name.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

y_pred_xgb = xgb.predict(X_test)
# Same value xgb.score(X_test, y_test) reports, computed from the predictions already made.
xgb_test_accuracy = accuracy_score(y_test, y_pred_xgb)

print("Extreme Gradient Boosting Classifier Test Accuracy:", xgb_test_accuracy)
print(classification_report(y_test, y_pred_xgb))

# Feature Importance

- Logistic Regression

In [None]:
feature_names = X_train.columns

# lr.coef_ holds one row of coefficients per class for a multiclass target (a single row for
# a binary one). Rank features by the mean absolute coefficient over all rows, so every class
# contributes; with a single row this is exactly |coef_[0]|.
# NOTE(review): coefficient magnitudes depend on each feature's scale, and the features are
# not standardised before fitting -- interpret the ranking with that in mind.
coefficients = np.abs(lr.coef_).mean(axis=0)

# Positions of the features from largest to smallest score
sorted_indices = np.argsort(coefficients)[::-1]

# Limit to top 5 feature importances
top_n = 5
top_indices = sorted_indices[:top_n]
sorted_feature_importance = coefficients[top_indices]
sorted_feature_names = feature_names[top_indices]

# Plot feature importance
fig, ax = plt.subplots(figsize=(6, 3))
sns.barplot(x=sorted_feature_importance, y=sorted_feature_names, ax=ax)
ax.set_title('Logistic Regression Feature Importance')
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Features')
plt.show()

- Gaussian Naive Bayes

In [None]:
# Heuristic score: for every feature, the standard deviation across classes of the
# per-class means stored in gnb.theta_ (how far apart the class means lie).
# NOTE(review): the score is in the feature's own units, so large-valued features dominate.
feature_importance = np.std(gnb.theta_, axis=0)

feature_names = X_train.columns

# Positions of the features from largest to smallest score
sorted_indices = np.argsort(feature_importance)[::-1]

# Limit to top 5 feature importances
top_n = 5
top_indices = sorted_indices[:top_n]
sorted_feature_importance = feature_importance[top_indices]
sorted_feature_names = feature_names[top_indices]

# Plot feature importance
fig, ax = plt.subplots(figsize=(6, 3))
sns.barplot(x = sorted_feature_importance, y = sorted_feature_names, ax = ax)
ax.set_title('Gaussian Naive Bayes Feature Importance')
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Features')
plt.show()

- Decision Tree

In [None]:
# Importances as reported by the fitted decision tree
feature_importance = dt.feature_importances_

feature_names = X_train.columns

# Positions of the features from largest to smallest importance
sorted_indices = np.argsort(feature_importance)[::-1]

# Limit to top 5 feature importances
top_n = 5
top_indices = sorted_indices[:top_n]
sorted_feature_importance = feature_importance[top_indices]
sorted_feature_names = feature_names[top_indices]

# Plot feature importance
fig, ax = plt.subplots(figsize=(6, 3))
sns.barplot(x = sorted_feature_importance, y = sorted_feature_names, ax = ax)
ax.set_title('Decision Tree Feature Importance')
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Features')
plt.show()

- Random Forest

In [None]:
# Importances as reported by the fitted random forest
feature_importance = rf.feature_importances_

feature_names = X_train.columns

# Positions of the features from largest to smallest importance
sorted_indices = np.argsort(feature_importance)[::-1]

# Limit to top 5 feature importances
top_n = 5
top_indices = sorted_indices[:top_n]
sorted_feature_importance = feature_importance[top_indices]
sorted_feature_names = feature_names[top_indices]

# Plot feature importance
fig, ax = plt.subplots(figsize=(6, 3))
sns.barplot(x = sorted_feature_importance, y = sorted_feature_names, ax = ax)
ax.set_title('Random Forest Feature Importance')
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Features')
plt.show()

- Bootstrap Aggregating

In [None]:
# BaggingClassifier exposes no feature_importances_ of its own, so average the importances of
# its fitted base estimators. This assumes the base estimators provide feature_importances_
# (tree-based estimators do). Starting from zeros keeps the cell independent of whatever
# `feature_importance` held before and safe to re-run.
feature_importance = np.zeros(X_train.shape[1])
for estimator, used_columns in zip(bag.estimators_, bag.estimators_features_):
    # estimators_features_ gives the column positions each estimator was trained on;
    # np.add.at accumulates correctly even when a position occurs more than once.
    np.add.at(feature_importance, used_columns, estimator.feature_importances_)
feature_importance /= len(bag.estimators_)

feature_names = X_train.columns

# Positions of the features from largest to smallest importance
sorted_indices = np.argsort(feature_importance)[::-1]

# Limit to top 5 feature importances
top_n = 5
top_indices = sorted_indices[:top_n]
sorted_feature_importance = feature_importance[top_indices]
sorted_feature_names = feature_names[top_indices]

# Plot feature importance
fig, ax = plt.subplots(figsize=(6, 3))
sns.barplot(x = sorted_feature_importance, y = sorted_feature_names, ax = ax)
ax.set_title('Bootstrap Aggregating Feature Importance')
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Features')
plt.show()

- Adaptive Boosting

In [None]:
# Importances as reported by the fitted AdaBoost ensemble
feature_importance = ada.feature_importances_

feature_names = X_train.columns

# Positions of the features from largest to smallest importance
sorted_indices = np.argsort(feature_importance)[::-1]

# Limit to top 5 feature importances
top_n = 5
top_indices = sorted_indices[:top_n]
sorted_feature_importance = feature_importance[top_indices]
sorted_feature_names = feature_names[top_indices]

# Plot feature importance
fig, ax = plt.subplots(figsize=(6, 3))
sns.barplot(x = sorted_feature_importance, y = sorted_feature_names, ax = ax)
ax.set_title('Adaptive Boosting Feature Importance')
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Features')
plt.show()

- Light Gradient Boosting

In [None]:
# Importances as reported by the fitted LightGBM model
feature_importance = lgb.feature_importances_

feature_names = X_train.columns

# Positions of the features from largest to smallest importance
sorted_indices = np.argsort(feature_importance)[::-1]

# Limit to top 5 feature importances
top_n = 5
top_indices = sorted_indices[:top_n]
sorted_feature_importance = feature_importance[top_indices]
sorted_feature_names = feature_names[top_indices]

# Plot feature importance
fig, ax = plt.subplots(figsize=(6, 3))
sns.barplot(x = sorted_feature_importance, y = sorted_feature_names, ax = ax)
ax.set_title('Light Gradient Boosting Feature Importance')
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Features')
plt.show()

- Extreme Gradient Boosting

In [None]:
# Importances as reported by the fitted XGBoost model
feature_importance = xgb.feature_importances_

feature_names = X_train.columns

# Positions of the features from largest to smallest importance
sorted_indices = np.argsort(feature_importance)[::-1]

# Limit to top 5 feature importances
top_n = 5
top_indices = sorted_indices[:top_n]
sorted_feature_importance = feature_importance[top_indices]
sorted_feature_names = feature_names[top_indices]

# Plot feature importance
fig, ax = plt.subplots(figsize=(6, 3))
sns.barplot(x = sorted_feature_importance, y = sorted_feature_names, ax = ax)
ax.set_title('Extreme Gradient Boosting Feature Importance')
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Features')
plt.show()