<div style="color:yellow;">
<h2 style="font-weight:700">CS 4103 - Intelligent Systems</h2>
<h3>Hands-on Exercise - Normalization and Standardization</h3>
<h4>September 6, 2023</h4>
</div>

In [None]:
# import required libraries
import pandas as pd
pd.set_option('display.max_columns', None)

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

### Loan Defaults Dataset
---
Anonymised Loan Default data, including loan amount, term, Interest rate, instalment, employment length, home ownership, annual income, loan status and purpose, etc. as well as the binary repay_fail (1 or 0).

Source: https://www.kaggle.com/datasets/joebeachcapital/loan-default

# Load and Preprocess Data

In [None]:
data = pd.read_csv('04-Anonymize_Loan_Default_data.csv',encoding='latin1', index_col=0)

In [None]:
data.info()

In [None]:
data.head()

# Data Preprocessing

In [None]:
# Drop rows with missing values or duplicate IDs
columns_to_drop = ['id','member_id', 'issue_d', 'last_pymnt_d', 'last_credit_pull_d', 'earliest_cr_line', 'next_pymnt_d', 'zip_code']
data = data.drop(columns_to_drop, axis=1)

data.dropna(inplace=True)

# Convert percentage strings to numerical values
data['revol_util'] = data['revol_util'].str.strip('%').astype(float)

# Convert term to numerical values
data['term'] = data['term'].str.extract('(\d+)').astype(int)

# Convert emp_length to numerical values
data['emp_length'] = data['emp_length'].str.extract('(\d+)').astype(float)

In [None]:
data.head()

<h5 style="color:yellow">
Finish the remainder of the data preprocessing code by applying proper techniques in dealing with the rest of the categorical and numeric variables found in the dataset. For numeric values, include the column values that are converted in the cell above this. Refer to the first part of the code for the dataset's column info.
</h5>

In [None]:
# Code for rest of the data preprocessing here

# Encode categorical variables using Label Encoding
label_encoder = LabelEncoder()
categorical_cols = ['home_ownership', 'verification_status', 'purpose', 'addr_state', 'loan_status']
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

# Feature scaling
scaler = StandardScaler()
numeric_cols = ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'int_rate', 'installment', 'annual_inc', 'dti', 'delinq_2yrs',
                'inq_last_6mths', 'mths_since_last_delinq', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
                'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'last_pymnt_amnt', 'term', 'emp_length']
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

# Exploratory Data Analysis

<h5 style="color:yellow">
You may perform further analysis and data visualizations here aside from the ones given.
</h5>

In [None]:
# Split the data into features (X) and target (y)
X = data.drop('repay_fail', axis=1)
y = data['repay_fail']

# Explore correlations
plt.figure(figsize=(12, 8))
sns.heatmap(X.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

## Feature Scaling

In [None]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training and Evaluation

<h5 style="color:yellow">The code below uses Random Forest Classifier. Change the code and use neural networks instead to obtain the predictions.</h5>

In [None]:
# Initialize and train a RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print()
print(f"Classification Report:\n{classification_rep}")


In [None]:
ann = Sequential([
    Dense(units=3, activation='relu'),   # Input layer
    Dense(units=6, activation='tanh'),   # Hidden layer
    Dense(units=3, activation='relu'),   # Input layer
    Dense(units=1, activation='sigmoid') # Output layer
])
ann.compile(optimizer = 'adam', loss = 'mse', metrics = ['accuracy'])
ann.fit(X_train, y_train, batch_size = 12, epochs = 50)

y_pred = ann.predict(X_test)

In [None]:
y_pred = (y_pred > 0.5)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print()
print(f"Classification Report:\n{classification_rep}")

In [46]:
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix:')
print(cm)
print(classification_report(y_test, y_pred, target_names=["repay success","repay fail"], digits= 4))

Confusion matrix:
[[2281    0]
 [   6  469]]
               precision    recall  f1-score   support

repay success     0.9974    1.0000    0.9987      2281
   repay fail     1.0000    0.9874    0.9936       475

     accuracy                         0.9978      2756
    macro avg     0.9987    0.9937    0.9962      2756
 weighted avg     0.9978    0.9978    0.9978      2756



# Findings
StandardScaler = 0.9698
RobustScaler = 0.9698
MinMaxScaler = 0.9668

MinMax worst in accuracy.