<a href="https://colab.research.google.com/github/PaisleyZuo/Credit-Score-Classification/blob/main/Credit_Score_Classification_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
pip install imbalanced-learn



In [51]:
!pip install catboost



In [52]:
!pip install plotly
import plotly.graph_objects as go



In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
from scipy.stats.contingency import chi2_contingency
from sklearn.feature_selection import chi2, SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, RobustScaler, LabelEncoder, StandardScaler
from sklearn.metrics import recall_score, accuracy_score, confusion_matrix, precision_score, classification_report, mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight

import tensorflow as tf
from keras import models
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

from imblearn.over_sampling import SMOTE
from collections import Counter


from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier




import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:,.2f}'.format)

In [54]:
traindf = pd.read_csv('credit_score_train.csv')

**4. Test Data Preprocess**

In [55]:
traindf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 24 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Annual_Income              100000 non-null  float64
 1   Num_Bank_Accounts          100000 non-null  float64
 2   Num_Credit_Card            100000 non-null  float64
 3   Interest_Rate              100000 non-null  float64
 4   Num_of_Loan                100000 non-null  float64
 5   Delay_from_due_date        100000 non-null  int64  
 6   Num_of_Delayed_Payment     100000 non-null  float64
 7   Changed_Credit_Limit       100000 non-null  float64
 8   Num_Credit_Inquiries       100000 non-null  float64
 9   Credit_Mix                 100000 non-null  object 
 10  Outstanding_Debt           100000 non-null  float64
 11  Payment_of_Min_Amount      100000 non-null  object 
 12  Monthly_Balance            100000 non-null  float64
 13  auto_loan                  100

In [56]:
traindf['Credit_Mix'] = traindf['Credit_Mix'].map({
    "Good": 2,
    "Standard": 1,
    "Bad": 0
}).astype(int)

In [57]:
traindf['Payment_of_Min_Amount'] = traindf['Payment_of_Min_Amount'].map({
    "Yes": 1,
    "No": 0
}).astype(int)

In [58]:
hue_feature = 'Credit_Score'
traindf[hue_feature].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Credit_Score,Unnamed: 1_level_1
1,0.53
0,0.29
2,0.18


There is a imbalance issue, which we need to solve.

**SMOTE**

Since the Keras library doesn't support recall metrics and our data is unbalanced, we will use SMOTE.

In [59]:
from collections import Counter
print("Before:", Counter(y_train))
smote = SMOTE(random_state=42)

X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print("After:", Counter(y_train_resampled))

Before: Counter({1: 42539, 0: 23199, 2: 14262})
After: Counter({1: 42539, 0: 42539, 2: 42539})


In [60]:
X = traindf.drop(columns=['Credit_Score'])
y = traindf['Credit_Score']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

Modeling

In [61]:
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
cb = CatBoostClassifier(verbose=0)
xg = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')


lr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', lr)
])

# Decision Tree
dt_pipeline = Pipeline([
    ('classifier', dt)
])

# Random Forest
rf_pipeline = Pipeline([
    ('classifier', rf)
])

# CatBoost
cb_pipeline = Pipeline([
    ('classifier', cb)
])

# XGBoost
xg_pipeline = Pipeline([
    ('classifier', xg)
])

In [62]:
pipelines = {
    'Logistic Regression': lr_pipeline,
    'Decision Tree': dt_pipeline,
    'Random Forest': rf_pipeline,
    'CatBoost': cb_pipeline,
    'XGBoost': xg_pipeline
}

for name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)
    print(f"\n{name}")
    print(classification_report(y_val, y_pred))


Logistic Regression
              precision    recall  f1-score   support

           0       0.68      0.52      0.59      5799
           1       0.69      0.75      0.72     10635
           2       0.56      0.64      0.60      3566

    accuracy                           0.66     20000
   macro avg       0.64      0.63      0.63     20000
weighted avg       0.66      0.66      0.66     20000


Decision Tree
              precision    recall  f1-score   support

           0       0.75      0.74      0.75      5799
           1       0.78      0.78      0.78     10635
           2       0.72      0.70      0.71      3566

    accuracy                           0.76     20000
   macro avg       0.75      0.74      0.74     20000
weighted avg       0.76      0.76      0.76     20000


Random Forest
              precision    recall  f1-score   support

           0       0.80      0.84      0.82      5799
           1       0.84      0.82      0.83     10635
           2       0.79 

In [63]:
results = {}
for name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    results[name] = mse

for name, mse in results.items():
    print(f"{name} MSE: {mse}")

Logistic Regression MSE: 0.4129
Decision Tree MSE: 0.26135
Random Forest MSE: 0.18805
CatBoost MSE: 0.2721
XGBoost MSE: 0.26815


Comparing the performance of different models

In [64]:
baseline_prediction = y_train.mean()
baseline_mse = mean_squared_error(y_val, [baseline_prediction] * len(y_val))

for name, mse in results.items():
    print(f"{name} MSE: {mse}")

mse_values = [baseline_mse] + list(results.values())

model_names = ['Baseline'] + list(results.keys())

sorted_indices = sorted(range(len(mse_values)), key=lambda k: mse_values[k])
mse_values = [mse_values[i] for i in sorted_indices]
model_names = [model_names[i] for i in sorted_indices]

min_mse = min(mse_values)
max_mse = max(mse_values)
middle_mse = (min_mse + max_mse) / 2

colors = ['green' if mse == min_mse else 'red' if mse == max_mse else 'yellow' for mse in mse_values]

fig = go.Figure(data=[go.Bar(x=model_names, y=mse_values, marker=dict(color=colors))])
fig.update_layout(xaxis_title='Model', yaxis_title='Mean Squared Error', title='Comparison of Model Performance with Baseline')
fig.show()

Logistic Regression MSE: 0.4129
Decision Tree MSE: 0.26135
Random Forest MSE: 0.18805
CatBoost MSE: 0.2721
XGBoost MSE: 0.26815


The alternative approach: use ANN model.

**ANN model**

In [65]:
from tensorflow.keras.layers import BatchNormalization

In [66]:
model = Sequential([
    Dense(512, activation='relu'),
    BatchNormalization(),


    Dense(512, activation='relu'),
    BatchNormalization(),

    Dense(256, activation='relu'),
    BatchNormalization(),


    Dense(256, activation='relu'),
    BatchNormalization(),


    Dense(128, activation='relu'),
    BatchNormalization(),

    Dense(64, activation='relu'),
    BatchNormalization(),


    Dense(64, activation='relu'),
    BatchNormalization(),


    Dense(32, activation='relu'),
    BatchNormalization(),

    Dense(3, activation='softmax')
])


model.compile(optimizer=Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


early_stopping = EarlyStopping(monitor='val_accuracy',
                               patience=35,
                               restore_best_weights=True)

history = model.fit(x=X_train_resampled,
                    y=y_train_resampled,
                    validation_data=(X_val, y_val),
                    validation_split=0.2,
                    batch_size=4096,
                    epochs=600,
                    verbose=1,
                    callbacks=[early_stopping])

Epoch 1/600
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 178ms/step - accuracy: 0.5109 - loss: 1.0748 - val_accuracy: 0.3126 - val_loss: 2.3456
Epoch 2/600
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6052 - loss: 0.8568 - val_accuracy: 0.4056 - val_loss: 1.3250
Epoch 3/600
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6300 - loss: 0.8227 - val_accuracy: 0.4999 - val_loss: 1.0080
Epoch 4/600
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6592 - loss: 0.8004 - val_accuracy: 0.5496 - val_loss: 0.9574
Epoch 5/600
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.6709 - loss: 0.7791 - val_accuracy: 0.5903 - val_loss: 0.9084
Epoch 6/600
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6745 - loss: 0.7734 - val_accuracy: 0.5765 - val_loss: 0.8882
Epoch 7/600
[1m32/32[0m 

In [67]:
model.summary()

In [68]:
eval_metric(model, X_train_resampled,y_train_resampled, X_val, y_val)

[1m3989/3989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Test Set:
[[4242 1516   41]
 [2673 7848  114]
 [ 124 3239  203]]
              precision    recall  f1-score   support

           0       0.60      0.73      0.66      5799
           1       0.62      0.74      0.68     10635
           2       0.57      0.06      0.10      3566

    accuracy                           0.61     20000
   macro avg       0.60      0.51      0.48     20000
weighted avg       0.61      0.61      0.57     20000


Train Set:
[[31088 11202   249]
 [10873 31160   506]
 [ 1226 38881  2432]]
              precision    recall  f1-score   support

           0       0.72      0.73      0.73     42539
           1       0.38      0.73      0.50     42539
           2       0.76      0.06      0.11     42539

    accuracy                           0.51    127617
   macro avg       0.62      0.51      0.45    127617
