#### Benchmarking classification dataset - processed credit score dataset

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import seaborn as sns
from imblearn.under_sampling import TomekLinks
from sklearn.ensemble import IsolationForest
from scipy import stats
from sklearn.preprocessing import label_binarize

In [11]:
df = pd.read_csv("housing_classification_processed.csv")

In [12]:
df

Unnamed: 0,price,bedrooms,bathrooms,floors,view,sqft_basement,grade_category
0,231300.0,2,1.00,1.0,0,0,2
1,180000.0,2,1.00,1.0,0,0,1
2,604000.0,4,3.00,1.0,0,910,2
3,510000.0,3,2.00,1.0,0,0,3
4,257500.0,3,2.25,2.0,0,0,2
...,...,...,...,...,...,...,...
16851,360000.0,3,2.50,3.0,0,0,3
16852,400000.0,4,2.50,2.0,0,0,3
16853,402101.0,2,0.75,2.0,0,0,2
16854,400000.0,3,2.50,2.0,0,0,3


In [13]:
df['grade_category'].unique()

array([2, 1, 3, 4], dtype=int64)

In [None]:
# XGBoost expects categories staring from 0 we got the following error:
# Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3], got [1 2 3 4]
# Thereby we are making it first index starting from 1 instead of 0 
df['grade_category'] = df['grade_category'] -1
df['grade_category'].unique()

array([1, 0, 2, 3], dtype=int64)

### X/y + train/test -splits 

In [15]:
# X/y split etc.
target = "grade_category"

# catboost expects data either in int or str, we convert it to integer
df['floors'] = df['floors'].astype('int64')

# we loose some decimal bathrooms but it should not be a problem
df['bathrooms'] = df['bathrooms'].astype('int64')

# categorical features need to be separated for certain algorithms, like CatBoost
categorical_features = ['bedrooms', 'floors', 'view']

X = df.drop(columns=[target])
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# SCALING => some of the algorithms require this
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Define models to be tested

In [16]:
# define our model dictionary

# With RandomizedSearch, we optimized CatBoost:
# Best Parameters: {'learning_rate': 0.06142857142857143, 'iterations': 600.0, 'depth': 5}

models = {
    #'Logistic Regression': LogisticRegression(max_iter=3000, solver='saga'),
    #'SVM': SVC(probability=True),
    'CatBoost-default': cb.CatBoostClassifier(iterations=500, verbose=0),
    'CatBoost-optimized': cb.CatBoostClassifier(iterations=600, learning_rate=0.06142857142857143, depth=5, verbose=0),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'XGBoost': xgb.XGBClassifier(enable_categorical=True, objective='multi:softprobar', num_class=len(X.columns)),
    'LightGBM': lgb.LGBMClassifier(objective='multiclass')
    
}

### this part is a bit tricky due to how probabilities are handled -> train each model and gather metrics

In [17]:
# our benchmarking code!

# this will contain later our results of the benchmark
results = []

# loop through our models above
for name, model in models.items():

    # we have to react to certain algorithms
    # because they have requirements for fit()
    print("Starting ... " + name)

    # SVM / KNN require scaled data
    if name in ['SVM', 'KNN']:
        model.fit(X_train_scaled, y_train)
        predictions = model.predict(X_test_scaled)
        probability = model.predict_proba(X_test_scaled) if hasattr(model, "predict_proba") else None
    elif name == 'CatBoost':
        # CatBoost requires
        # we specify which columns are categories, INCLUDING ORDINALS
        model.fit(X_train, y_train, cat_features=categorical_features)
        predictions = model.predict(X_test)
        probability = model.predict_proba(X_test)
    else:
        # everything else follows the same logic!
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        probability = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

    # Calculate metrics for current model in training
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average="macro")
    recall = recall_score(y_test, predictions, average="macro")
    f1 = f1_score(y_test, predictions, average="macro")

    # ROC AUC
    if probability is not None:
        y_test_bin = label_binarize(y_test, classes=[0, 1, 2])
        roc_auc = roc_auc_score(y_test, probability, multi_class='ovr')
    else:
        roc_auc = np.nan

    # save the metrics for this model into results
    results.append([name, accuracy, precision, recall, f1, roc_auc])

Starting ... CatBoost-default
Starting ... CatBoost-optimized
Starting ... Random Forest
Starting ... XGBoost
Starting ... LightGBM
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000393 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 444
[LightGBM] [Info] Number of data points in the train set: 13484, number of used features: 6
[LightGBM] [Info] Start training from score -2.221698
[LightGBM] [Info] Start training from score -0.830117
[LightGBM] [Info] Start training from score -0.930783
[LightGBM] [Info] Start training from score -2.791454


### Highlight and visualize best metrics within the models

In [18]:
metrics_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-score', 'ROC-AUC'])

light_theme = 'background: yellow'
dark_theme = 'background: goldenrod'

# you can alternate the highlight styling based on your theme
default_highlight_style = dark_theme

# helper function that highlights the best model of each metric
def highlight_best_metrics(row):
    # default styles for everything is empty in the beginning
    styles = ['' for _ in row]

    # index 1 => Accuracy (index 0 => model name)
    if row['Accuracy'] == metrics_df['Accuracy'].max():
        styles[1] = default_highlight_style

    # index 2 = > Precision
    if row['Precision'] == metrics_df['Precision'].max():
        styles[2] = default_highlight_style

    # index 3 = > Recall
    if row['Recall'] == metrics_df['Recall'].max():
        styles[3] = default_highlight_style

    # index 4 = > F1-score
    if row['F1-score'] == metrics_df['F1-score'].max():
        styles[4] = default_highlight_style

    # index 5 = > ROC-AUC
    if row['ROC-AUC'] == metrics_df['ROC-AUC'].max():
        styles[5] = default_highlight_style

    return styles


# apply the custom styles based on the min/max metrics
highlight_df = metrics_df.style.apply(highlight_best_metrics, axis=1)


highlight_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score,ROC-AUC
0,CatBoost-default,0.69484,0.683556,0.580154,0.614722,0.884293
1,CatBoost-optimized,0.69573,0.690541,0.579333,0.614954,0.886035
2,Random Forest,0.655397,0.619158,0.569043,0.589453,0.840664
3,XGBoost,0.688612,0.674767,0.574315,0.6075,0.879152
4,LightGBM,0.691281,0.681685,0.579587,0.613183,0.88188


### CatBoost-optimized is strong here!
* But Recall score is best with CatBoost-default algorithm

In [None]:
# I tried different ways and CatBoost-optimized model performs the best overall, with the highest accuracy (0.6957) and ROC-AUC (0.8860), showing a slight improvement over the default CatBoost.
# Yes we can try also XGBoost which is seems good accurecy but I found with different ittreations CatBoost would be the west option for model.