<a href="https://colab.research.google.com/github/PaisleyZuo/Credit-Score-Classification/blob/main/Credit_Score_Classification_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install imbalanced-learn



In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [3]:
!pip install plotly
import plotly.graph_objects as go



In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
from scipy.stats.contingency import chi2_contingency
from sklearn.feature_selection import chi2, SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, RobustScaler, LabelEncoder, StandardScaler
from sklearn.metrics import recall_score, accuracy_score, confusion_matrix, precision_score, classification_report, mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight

import tensorflow as tf
from keras import models
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

from imblearn.over_sampling import SMOTE
from collections import Counter


from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from catboost import CatBoostRegressor
from xgboost import XGBRegressor



import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:,.2f}'.format)

In [5]:
traindf = pd.read_csv('credit_score_train.csv')

**4. Test Data Preprocess**

In [6]:
hue_feature = 'Credit_Score'
traindf[hue_feature].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Credit_Score,Unnamed: 1_level_1
1,0.53
0,0.29
2,0.18


There is a imbalance issue, which we need to solve.

In [7]:
X = traindf.drop(columns=['Credit_Score'])
y = traindf['Credit_Score']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [8]:
traindf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 24 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Annual_Income              100000 non-null  float64
 1   Num_Bank_Accounts          100000 non-null  float64
 2   Num_Credit_Card            100000 non-null  float64
 3   Interest_Rate              100000 non-null  float64
 4   Num_of_Loan                100000 non-null  float64
 5   Delay_from_due_date        100000 non-null  int64  
 6   Num_of_Delayed_Payment     100000 non-null  float64
 7   Changed_Credit_Limit       100000 non-null  float64
 8   Num_Credit_Inquiries       100000 non-null  float64
 9   Credit_Mix                 100000 non-null  object 
 10  Outstanding_Debt           100000 non-null  float64
 11  Payment_of_Min_Amount      100000 non-null  object 
 12  Monthly_Balance            100000 non-null  float64
 13  auto_loan                  100

In [9]:
numerical_cols = X.select_dtypes(exclude=['object']).columns.tolist()
one_hot_cols = ["Payment_of_Min_Amount"]
ordinal_cols = ["Credit_Mix"]

ordinal_categories = [
    ['Bad', 'Standard', 'Good']
]

In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numerical_cols),
        ('one_hot_enc', OneHotEncoder(handle_unknown='ignore'), one_hot_cols),
        ('ordinal_enc', OrdinalEncoder(categories=ordinal_categories, handle_unknown="use_encoded_value", unknown_value=-1), ordinal_cols)
    ]
)

Modeling

In [11]:
lr = LogisticRegression()
dt = DecisionTreeRegressor()
rf = RandomForestRegressor()
cb = CatBoostRegressor()
xg = XGBRegressor()

pipelines = {
    'Logistic Regression': Pipeline(steps=[('preprocessor', preprocessor), ('classifier', lr)]),
    'Decision Tree Regression': Pipeline(steps=[('preprocessor', preprocessor), ('regressor', dt)]),
    'Random Forest Regression': Pipeline(steps=[('preprocessor', preprocessor), ('regressor', rf)]),
    'CatBoost Regression': Pipeline(steps=[('preprocessor', preprocessor), ('regressor', cb)]),
    'XGBoost Regression': Pipeline(steps=[('preprocessor', preprocessor), ('regressor', xg)])
}

In [12]:
results = {}
for name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    results[name] = mse

for name, mse in results.items():
    print(f"{name} MSE: {mse}")

Learning rate set to 0.081822
0:	learn: 0.6550510	total: 60.6ms	remaining: 1m
1:	learn: 0.6376168	total: 72.4ms	remaining: 36.1s
2:	learn: 0.6222502	total: 84.8ms	remaining: 28.2s
3:	learn: 0.6092036	total: 95.9ms	remaining: 23.9s
4:	learn: 0.5973858	total: 107ms	remaining: 21.4s
5:	learn: 0.5872270	total: 118ms	remaining: 19.6s
6:	learn: 0.5789549	total: 130ms	remaining: 18.4s
7:	learn: 0.5709303	total: 143ms	remaining: 17.8s
8:	learn: 0.5642769	total: 155ms	remaining: 17s
9:	learn: 0.5583909	total: 166ms	remaining: 16.5s
10:	learn: 0.5533901	total: 177ms	remaining: 15.9s
11:	learn: 0.5491454	total: 188ms	remaining: 15.5s
12:	learn: 0.5446840	total: 204ms	remaining: 15.5s
13:	learn: 0.5408969	total: 217ms	remaining: 15.3s
14:	learn: 0.5378850	total: 229ms	remaining: 15s
15:	learn: 0.5349965	total: 241ms	remaining: 14.8s
16:	learn: 0.5328122	total: 252ms	remaining: 14.6s
17:	learn: 0.5307100	total: 264ms	remaining: 14.4s
18:	learn: 0.5288603	total: 275ms	remaining: 14.2s
19:	learn: 0.5

Comparing the performance of different models

In [13]:
baseline_prediction = y_train.mean()
baseline_mse = mean_squared_error(y_val, [baseline_prediction] * len(y_val))

for name, mse in results.items():
    print(f"{name} MSE: {mse}")

mse_values = [baseline_mse] + list(results.values())

model_names = ['Baseline'] + list(results.keys())

sorted_indices = sorted(range(len(mse_values)), key=lambda k: mse_values[k])
mse_values = [mse_values[i] for i in sorted_indices]
model_names = [model_names[i] for i in sorted_indices]

min_mse = min(mse_values)
max_mse = max(mse_values)
middle_mse = (min_mse + max_mse) / 2

colors = ['green' if mse == min_mse else 'red' if mse == max_mse else 'yellow' for mse in mse_values]

fig = go.Figure(data=[go.Bar(x=model_names, y=mse_values, marker=dict(color=colors))])
fig.update_layout(xaxis_title='Model', yaxis_title='Mean Squared Error', title='Comparison of Model Performance with Baseline')
fig.show()

Logistic Regression MSE: 0.41235
Decision Tree Regression MSE: 0.269
Random Forest Regression MSE: 0.14392721
CatBoost Regression MSE: 0.20522302309938764
XGBoost Regression MSE: 0.1994592249393463


In [14]:
def eval_metric(model, X_train, y_train, X_val, y_val):
    y_train_pred = model.predict(X_train)
    y_pred = model.predict(X_val)

    print("Test Set:")
    print(confusion_matrix(y_val, y_pred))
    print(classification_report(y_val, y_pred))

    print("\nTrain Set:")
    print(confusion_matrix(y_train, y_train_pred))
    print(classification_report(y_train, y_train_pred))

In [18]:
def eval_metric(model, X_train, y_train, X_val, y_val):
    y_train_pred = model.predict(X_train)
    y_pred = model.predict(X_val)

    # Check if it's a classifier (based on step name)
    if 'classifier' in model.named_steps:
        print("📊 Classification Metrics")

        print("\n🔹 Test Set:")
        print(confusion_matrix(y_val, y_pred))
        print(classification_report(y_val, y_pred))

        print("\n🔹 Train Set:")
        print(confusion_matrix(y_train, y_train_pred))
        print(classification_report(y_train, y_train_pred))

    else:
        print("📊 Regression Metrics")

        print("\n🔹 Test Set:")
        print("RMSE:", np.sqrt(mean_squared_error(y_val, y_pred)))
        print("MAE :", mean_absolute_error(y_val, y_pred))
        print("R²  :", r2_score(y_val, y_pred))

        print("\n🔹 Train Set:")
        print("RMSE:", np.sqrt(mean_squared_error(y_train, y_train_pred)))
        print("MAE :", mean_absolute_error(y_train, y_train_pred))
        print("R²  :", r2_score(y_train, y_train_pred))


In [19]:
for name, pipeline in pipelines.items():
    print(f"\n🔧 Evaluating Model: {name}")
    pipeline.fit(X_train, y_train)
    eval_metric(pipeline, X_train, y_train, X_val, y_val)


🔧 Evaluating Model: Logistic Regression
📊 Classification Metrics

🔹 Test Set:
[[3003 2365  431]
 [1385 7928 1322]
 [  52 1243 2271]]
              precision    recall  f1-score   support

           0       0.68      0.52      0.59      5799
           1       0.69      0.75      0.72     10635
           2       0.56      0.64      0.60      3566

    accuracy                           0.66     20000
   macro avg       0.64      0.63      0.63     20000
weighted avg       0.66      0.66      0.66     20000


🔹 Train Set:
[[11958  9367  1874]
 [ 5557 31699  5283]
 [  218  4935  9109]]
              precision    recall  f1-score   support

           0       0.67      0.52      0.58     23199
           1       0.69      0.75      0.72     42539
           2       0.56      0.64      0.60     14262

    accuracy                           0.66     80000
   macro avg       0.64      0.63      0.63     80000
weighted avg       0.66      0.66      0.66     80000


🔧 Evaluating Model: Decis

The alternative approach: use SMOTE and ANN model.

**SMOTE**

Since the Keras library doesn't support recall metrics and our data is unbalanced, we will use SMOTE.

In [None]:
from collections import Counter
print("Before:", Counter(y_train))
smote = SMOTE(random_state=42)

X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print("After:", Counter(y_train_resampled))

Before: Counter({1: 42539, 0: 23199, 2: 14262})
After: Counter({1: 42539, 0: 42539, 2: 42539})


**ANN model**

In [None]:
from tensorflow.keras.layers import BatchNormalization

In [None]:
model = Sequential([
    Dense(512, activation='relu'),
    BatchNormalization(),


    Dense(512, activation='relu'),
    BatchNormalization(),

    Dense(256, activation='relu'),
    BatchNormalization(),


    Dense(256, activation='relu'),
    BatchNormalization(),


    Dense(128, activation='relu'),
    BatchNormalization(),

    Dense(64, activation='relu'),
    BatchNormalization(),


    Dense(64, activation='relu'),
    BatchNormalization(),


    Dense(32, activation='relu'),
    BatchNormalization(),

    Dense(3, activation='softmax')
])


model.compile(optimizer=Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


early_stopping = EarlyStopping(monitor='val_accuracy',
                               patience=35,
                               restore_best_weights=True)

history = model.fit(x=X_train_resampled,
                    y=y_train_resampled,
                    validation_data=(X_val, y_val),
                    validation_split=0.2,
                    batch_size=4096,
                    epochs=600,
                    verbose=1,
                    callbacks=[early_stopping])

Epoch 1/600
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 170ms/step - accuracy: 0.6297 - loss: 0.9366 - val_accuracy: 0.6291 - val_loss: 0.9846
Epoch 2/600
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.7609 - loss: 0.6091 - val_accuracy: 0.6162 - val_loss: 0.8995
Epoch 3/600
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.7989 - loss: 0.5240 - val_accuracy: 0.6080 - val_loss: 0.8308
Epoch 4/600
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.8197 - loss: 0.4699 - val_accuracy: 0.6220 - val_loss: 0.7864
Epoch 5/600
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.8388 - loss: 0.4220 - val_accuracy: 0.6074 - val_loss: 0.7812
Epoch 6/600
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.8521 - loss: 0.3916 - val_accuracy: 0.6571 - val_loss: 0.7711
Epoch 7/600
[1m32/32[0m 

In [None]:
model.summary()

In [None]:
eval_metric(model, X_train_resampled,y_train_resampled, X_val, y_val)

[1m3989/3989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Test Set:
[[4987  743   69]
 [1598 7804 1233]
 [  47  633 2886]]
              precision    recall  f1-score   support

           0       0.75      0.86      0.80      5799
           1       0.85      0.73      0.79     10635
           2       0.69      0.81      0.74      3566

    accuracy                           0.78     20000
   macro avg       0.76      0.80      0.78     20000
weighted avg       0.79      0.78      0.78     20000


Train Set:
[[41115  1301   123]
 [ 4912 34591  3036]
 [   63  1330 41146]]
              precision    recall  f1-score   support

           0       0.89      0.97      0.93     42539
           1       0.93      0.81      0.87     42539
           2       0.93      0.97      0.95     42539

    accuracy                           0.92    127617
   macro avg       0.92      0.92      0.91    127617
