# ANN Classification Ruben Serdons (Loan Approval)

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# pip install tensorflow
import tensorflow as tf
import keras
from keras import layers

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [2]:
df = pd.read_csv('./Data/Loan.csv')

In [3]:
df.head(20)

Unnamed: 0,ApplicationDate,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,MaritalStatus,...,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved,RiskScore
0,2018-01-01,45,39948,617,Employed,Master,22,13152,48,Married,...,3329.0,0.724972,11,126928,0.199652,0.22759,419.805992,0.181077,0,49.0
1,2018-01-02,38,39709,628,Employed,Associate,15,26045,48,Single,...,3309.083333,0.935132,3,43609,0.207045,0.201077,794.054238,0.389852,0,52.0
2,2018-01-03,47,40724,570,Employed,Bachelor,26,17627,36,Married,...,3393.666667,0.872241,6,5205,0.217627,0.212548,666.406688,0.462157,0,52.0
3,2018-01-04,58,69084,545,Employed,High School,34,37898,96,Single,...,5757.0,0.896155,5,99452,0.300398,0.300911,1047.50698,0.313098,0,54.0
4,2018-01-05,37,103264,594,Employed,Associate,17,9184,36,Married,...,8605.333333,0.941369,5,227019,0.197184,0.17599,330.17914,0.07021,1,36.0
5,2018-01-06,37,178310,626,Self-Employed,Master,16,15433,72,Married,...,14859.166667,0.756079,5,27071,0.217433,0.217601,385.577074,0.075211,1,44.0
6,2018-01-07,58,51250,564,Employed,High School,39,12741,48,Married,...,4270.833333,0.884275,5,21730,0.225741,0.205271,391.300352,0.170529,0,50.0
7,2018-01-08,49,97345,516,Employed,High School,23,19634,12,Divorced,...,8112.083333,0.933492,5,38621,0.226634,0.209113,1827.360055,0.260767,1,42.4
8,2018-01-09,34,116841,603,Employed,Bachelor,12,55353,60,Divorced,...,9736.75,0.728397,3,7711,0.258853,0.291539,1762.199026,0.246509,0,61.0
9,2018-01-10,46,40615,612,Employed,Associate,19,25443,12,Married,...,3384.583333,0.615323,3,116812,0.184443,0.197271,2353.577424,0.903384,0,53.0


In [4]:
df.shape

(20000, 36)

In [5]:
df.dtypes

ApplicationDate                object
Age                             int64
AnnualIncome                    int64
CreditScore                     int64
EmploymentStatus               object
EducationLevel                 object
Experience                      int64
LoanAmount                      int64
LoanDuration                    int64
MaritalStatus                  object
NumberOfDependents              int64
HomeOwnershipStatus            object
MonthlyDebtPayments             int64
CreditCardUtilizationRate     float64
NumberOfOpenCreditLines         int64
NumberOfCreditInquiries         int64
DebtToIncomeRatio             float64
BankruptcyHistory               int64
LoanPurpose                    object
PreviousLoanDefaults            int64
PaymentHistory                  int64
LengthOfCreditHistory           int64
SavingsAccountBalance           int64
CheckingAccountBalance          int64
TotalAssets                     int64
TotalLiabilities                int64
MonthlyIncom

We're going to check for the columns who's type is **object** how many unique values it has, and what the unique values are

In [6]:
for column in df.columns:
    if str(df[column].dtype) == 'object':
        print('-------------------------')
        print(df[column].value_counts())
        print("Total different values: ", str(len(df[column].unique())))

-------------------------
ApplicationDate
2018-01-01    1
2054-07-01    1
2054-07-08    1
2054-07-07    1
2054-07-06    1
             ..
2036-04-02    1
2036-04-01    1
2036-03-31    1
2036-03-30    1
2072-10-03    1
Name: count, Length: 20000, dtype: int64
Total different values:  20000
-------------------------
EmploymentStatus
Employed         17036
Self-Employed     1573
Unemployed        1391
Name: count, dtype: int64
Total different values:  3
-------------------------
EducationLevel
Bachelor       6054
High School    5908
Associate      4034
Master         3050
Doctorate       954
Name: count, dtype: int64
Total different values:  5
-------------------------
MaritalStatus
Married     10041
Single       6078
Divorced     2882
Widowed       999
Name: count, dtype: int64
Total different values:  4
-------------------------
HomeOwnershipStatus
Mortgage    7939
Rent        6087
Own         3938
Other       2036
Name: count, dtype: int64
Total different values:  4
-------------------

In [7]:
df.drop('ApplicationDate', axis=1, inplace=True)

## Encoding

We have EmploymentStatus, EducationLevel, MaritalStatus, HomeOwnershipStatus and LoanPurposes that need to be encoded.

In [8]:
from sklearn.preprocessing import LabelEncoder

variables = ['EmploymentStatus', 'EducationLevel', 'MaritalStatus', 'HomeOwnershipStatus',
            'LoanPurpose']

encoder = LabelEncoder()

df[variables] = df[variables].apply(encoder.fit_transform)


In [9]:
df.head()

Unnamed: 0,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,MaritalStatus,NumberOfDependents,...,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved,RiskScore
0,45,39948,617,0,4,22,13152,48,1,2,...,3329.0,0.724972,11,126928,0.199652,0.22759,419.805992,0.181077,0,49.0
1,38,39709,628,0,0,15,26045,48,2,1,...,3309.083333,0.935132,3,43609,0.207045,0.201077,794.054238,0.389852,0,52.0
2,47,40724,570,0,1,26,17627,36,1,2,...,3393.666667,0.872241,6,5205,0.217627,0.212548,666.406688,0.462157,0,52.0
3,58,69084,545,0,3,34,37898,96,2,1,...,5757.0,0.896155,5,99452,0.300398,0.300911,1047.50698,0.313098,0,54.0
4,37,103264,594,0,0,17,9184,36,1,1,...,8605.333333,0.941369,5,227019,0.197184,0.17599,330.17914,0.07021,1,36.0


## X and y-variables

In [10]:
X = df.drop('LoanApproved', axis=1)

y_temp = df['LoanApproved']

y = tf.keras.utils.to_categorical(y_temp)

## train/val/test - split

In [11]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.35)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5)

## Create neural network

In [12]:
model = keras.Sequential(
    [
        layers.Dense(16, activation="relu", input_shape=(len(X.columns),)),
        layers.Dense(8, activation="relu"),
        layers.Dense(32),
        layers.Dense(len(df['LoanApproved'].unique()), activation="softmax")
    ]
)

model.compile(optimizer="adam", loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
model.fit(x=X_train, y=y_train, epochs=300, validation_data=(X_val, y_val))

Epoch 1/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6782 - loss: 1330.0144 - val_accuracy: 0.7220 - val_loss: 34.3471
Epoch 2/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 880us/step - accuracy: 0.7879 - loss: 36.6384 - val_accuracy: 0.7937 - val_loss: 26.3358
Epoch 3/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 867us/step - accuracy: 0.8028 - loss: 18.7917 - val_accuracy: 0.8351 - val_loss: 3.9401
Epoch 4/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 858us/step - accuracy: 0.8186 - loss: 4.1255 - val_accuracy: 0.8220 - val_loss: 1.1079
Epoch 5/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 861us/step - accuracy: 0.7972 - loss: 1.7917 - val_accuracy: 0.7997 - val_loss: 1.1489
Epoch 6/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 851us/step - accuracy: 0.7776 - loss: 1.0588 - val_accuracy: 0.7711 - val_loss: 1.4825
Epoch

[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 844us/step - accuracy: 0.7592 - loss: 0.5313 - val_accuracy: 0.7686 - val_loss: 0.5188
Epoch 51/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 846us/step - accuracy: 0.7616 - loss: 0.5281 - val_accuracy: 0.7686 - val_loss: 0.5193
Epoch 52/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 839us/step - accuracy: 0.7562 - loss: 0.5342 - val_accuracy: 0.7686 - val_loss: 0.5191
Epoch 53/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 846us/step - accuracy: 0.7563 - loss: 0.5348 - val_accuracy: 0.7686 - val_loss: 0.5187
Epoch 54/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 852us/step - accuracy: 0.7546 - loss: 0.5369 - val_accuracy: 0.7686 - val_loss: 0.5194
Epoch 55/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 860us/step - accuracy: 0.7557 - loss: 0.5337 - val_accuracy: 0.7686 - val_loss: 0.5187
Epoch 56/300
[1m

[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 830us/step - accuracy: 0.7607 - loss: 0.5284 - val_accuracy: 0.7686 - val_loss: 0.5193
Epoch 100/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 836us/step - accuracy: 0.7555 - loss: 0.5323 - val_accuracy: 0.7686 - val_loss: 0.5191
Epoch 101/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 847us/step - accuracy: 0.7568 - loss: 0.5339 - val_accuracy: 0.7686 - val_loss: 0.5202
Epoch 102/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 850us/step - accuracy: 0.7533 - loss: 0.5364 - val_accuracy: 0.7686 - val_loss: 0.5188
Epoch 103/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 843us/step - accuracy: 0.7567 - loss: 0.5328 - val_accuracy: 0.7686 - val_loss: 0.5190
Epoch 104/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 851us/step - accuracy: 0.7571 - loss: 0.5323 - val_accuracy: 0.7686 - val_loss: 0.5202
Epoch 105/30

[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 852us/step - accuracy: 0.7578 - loss: 0.5334 - val_accuracy: 0.7686 - val_loss: 0.5187
Epoch 149/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 851us/step - accuracy: 0.7549 - loss: 0.5356 - val_accuracy: 0.7686 - val_loss: 0.5187
Epoch 150/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 837us/step - accuracy: 0.7568 - loss: 0.5337 - val_accuracy: 0.7686 - val_loss: 0.5190
Epoch 151/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 840us/step - accuracy: 0.7560 - loss: 0.5336 - val_accuracy: 0.7686 - val_loss: 0.5188
Epoch 152/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 852us/step - accuracy: 0.7541 - loss: 0.5371 - val_accuracy: 0.7686 - val_loss: 0.5185
Epoch 153/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 854us/step - accuracy: 0.7572 - loss: 0.5332 - val_accuracy: 0.7686 - val_loss: 0.5185
Epoch 154/30

[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 891us/step - accuracy: 0.7575 - loss: 0.5378 - val_accuracy: 0.7686 - val_loss: 0.5247
Epoch 198/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 859us/step - accuracy: 0.7518 - loss: 0.5446 - val_accuracy: 0.7686 - val_loss: 0.5243
Epoch 199/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 859us/step - accuracy: 0.7559 - loss: 0.5402 - val_accuracy: 0.7686 - val_loss: 0.5243
Epoch 200/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 866us/step - accuracy: 0.7552 - loss: 0.5410 - val_accuracy: 0.7686 - val_loss: 0.5246
Epoch 201/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 871us/step - accuracy: 0.7581 - loss: 0.5380 - val_accuracy: 0.7686 - val_loss: 0.5243
Epoch 202/300
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 875us/step - accuracy: 0.7555 - loss: 0.5408 - val_accuracy: 0.7686 - val_loss: 0.5247
Epoch 203/30

In [None]:
loss_df = pd.DataFrame(model.history.history)
loss_df[['loss', 'val_loss']].plot()

In [None]:
loss_df[['accuracy', 'val_accuracy']].plot()


In [None]:
# compare the final model loss/accuracy/evaluation values
# the values should again match mostly
print("Test data evaluation:")
print(model.evaluate(X_test, y_test, verbose=0))
print("\nTrain data evaluation:")
print(model.evaluate(X_train, y_train, verbose=0))

In [None]:
# get predictions and convert with argmax() to get categories 
# instead of raw probabilities
test_predictions = model.predict(X_test)
test_predictions = np.argmax(test_predictions, axis=1)

# convert also y-test -values with argmax
y_test = np.argmax(y_test, axis=1)

In [None]:
# confusion matrix
sns.heatmap(confusion_matrix(y_test, test_predictions), annot=True, fmt='g')

In [None]:
# print the classification report based on true values and predictions
print(classification_report(y_test, test_predictions))

# get overall accuracy of the model and print it
acc = accuracy_score(y_test, test_predictions)
print("\nModel overall accuracy: {:.2f}%".format(acc * 100))

In [None]:
# The AUC score is a super sensitive metric
# you often get low scores, even 0.5

# in binary classification, AUC values are often interpreted as follows:
# A binary classifier is useful only when it achieves ROC-AUC score greater than 0.5 and as near to 1 as possible. 
# If a classifier yields a score less than 0.5, it simply means that the model is performing worse 
# than a random classifier, and therefore is useless.

# In multi category classification , AUC values are often interpreted as follows: 
# 0.5-0.6 (failed)
# 0.6-0.7 (worthless)
# 0.7-0.8 (poor)
# 0.8-0.9 (good)
# > 0.9 (excellent)

# get ROC-AUC -score
roc_auc_score(y, model.predict(X), multi_class="ovr")

In [None]:
df['RiskScore'].head(10)

In [None]:
df['CreditScore'].min()

In [None]:
tester_row = {
     'Age': 45,                        # Mid-career individual with stable income
    'AnnualIncome': 180000,            # High annual income
    'CreditScore': 820,                # Excellent credit score
    'EmploymentStatus': 1,             # Employed (assuming 1 represents employment)
    'EducationLevel': 3,               # Higher education (Master's or PhD level)
    'Experience': 20,                  # 20 years of work experience
    'LoanAmount': 20000,               # Moderate loan amount
    'LoanDuration': 36,                # Loan duration of 3 years
    'MaritalStatus': 1,                # Married (usually indicates stability)
    'NumberOfDependents': 1,           # 1 dependent, moderate family size
    'HomeOwnershipStatus': 1,          # Owns a home
    'MonthlyDebtPayments': 500,        # Low monthly debt payments
    'CreditCardUtilizationRate': 0.15, # Low credit card utilization rate
    'NumberOfOpenCreditLines': 5,      # 5 open credit lines, indicating manageable credit usage
    'NumberOfCreditInquiries': 1,      # Few recent credit inquiries
    'DebtToIncomeRatio': 0.1,          # Low debt-to-income ratio
    'BankruptcyHistory': 0,            # No bankruptcy history
    'LoanPurpose': 1,                  # Purpose such as home improvement or education (lower risk)
    'PreviousLoanDefaults': 0,         # No previous loan defaults
    'PaymentHistory': 48,              # Long and positive payment history
    'LengthOfCreditHistory': 20,       # Lengthy credit history, indicating financial maturity
    'SavingsAccountBalance': 100000,   # Substantial savings account balance
    'CheckingAccountBalance': 5000,    # Decent checking account balance
    'TotalAssets': 1500000,            # Substantial total assets
    'TotalLiabilities': 100000,        # Low total liabilities
    'MonthlyIncome': 15000,            # High monthly income
    'UtilityBillsPaymentHistory': 0.95,# Excellent utility bill payment history
    'JobTenure': 15,                   # Long tenure at current job
    'NetWorth': 1400000,               # High net worth
    'BaseInterestRate': 0.08,          # Low base interest rate
    'InterestRate': 0.10,              # Low overall interest rate
    'MonthlyLoanPayment': 600,         # Moderate monthly loan payment
    'TotalDebtToIncomeRatio': 0.05,    # Very low debt-to-income ratio
    'RiskScore': 0.05                  # Very low risk score
}

tester_row = pd.DataFrame([tester_row])
result = model.predict(tester_row)[0]
# result_text = categories[np.argmax(result)]
print(f"the result is {result}")

In [None]:
print(result)

In [None]:
# Does this mean that there is a 8.59% probability that the loan will be rejected
# and a 91.4% probability that it will be approved?