CatBoost Baseline Model

This notebook trains the baseline CatBoost model on the cleaned stroke dataset. CatBoost is selected for its ability to natively handle categorical variables, reduce overfitting through ordered boosting, and provide interpretable feature importance measures. This baseline model will be compared against the MLP model and the hybrid model in later phases.

In [1]:
import os

# Limit threads to reduce Windows/Jupyter crashes
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

# Safe plotting backend (we won't plot in Jupyter anyway)
import matplotlib
matplotlib.use("Agg")

print("Safe environment set")


Safe environment set


In [2]:
#Step 1 Importing Librariers


import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix
)

# folders
os.makedirs("../models", exist_ok=True)
os.makedirs("../figures", exist_ok=True)





In [3]:
#step 2 Load Cleaned Dataset


CLEAN_FILE = "../data/stroke_clean.csv"  

df = pd.read_csv(CLEAN_FILE)
print("Shape:", df.shape)
df.head()



Shape: (5110, 13)


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_group
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,Senior
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.1,never smoked,1,Adult
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,Senior
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,Adult
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,Senior


In [4]:
print(df.isna().sum())
print("\nTarget distribution:\n", df["stroke"].value_counts())
print("\nTarget %:\n", df["stroke"].value_counts(normalize=True) * 100)


id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
age_group            0
dtype: int64

Target distribution:
 stroke
0    4861
1     249
Name: count, dtype: int64

Target %:
 stroke
0    95.127202
1     4.872798
Name: proportion, dtype: float64


In [5]:
if "id" in df.columns:
    df = df.drop(columns=["id"])
    print("Dropped id column")

df.head()


Dropped id column


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_group
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,Senior
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.1,never smoked,1,Adult
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,Senior
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,Adult
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,Senior


In [6]:
#Step 3 Define X/y values

TARGET = "stroke"

X = df.drop(columns=[TARGET])
y = df[TARGET]

print("X:", X.shape, "y:", y.shape)


X: (5110, 11) y: (5110,)


In [7]:
#step 4 Train/Test split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

print("Train:", X_train.shape, "Test:", X_test.shape)





Train: (4088, 11) Test: (1022, 11)


In [8]:
#step - 5 Detect categorical columns

cat_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()
cat_idx = [X_train.columns.get_loc(c) for c in cat_cols]

print("Categorical cols:", cat_cols)
print("Categorical idx:", cat_idx)



Categorical cols: ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status', 'age_group']
Categorical idx: [0, 4, 5, 6, 9, 10]


In [9]:
#step6 -Train CatBoost Model

neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
scale_pos_weight = neg / max(pos, 1)

model_cb = CatBoostClassifier(
    iterations=800,
    learning_rate=0.05,
    depth=6,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    verbose=100,
    cat_features=cat_idx,
    scale_pos_weight=scale_pos_weight,
    thread_count=1
)

model_cb.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)
print("CatBoost trained")





0:	test: 0.7897119	best: 0.7897119 (0)	total: 220ms	remaining: 2m 55s
100:	test: 0.8442593	best: 0.8454115 (63)	total: 4.99s	remaining: 34.5s
200:	test: 0.8460288	best: 0.8460288 (200)	total: 10.1s	remaining: 30.2s
300:	test: 0.8338683	best: 0.8464198 (214)	total: 21.5s	remaining: 35.7s
400:	test: 0.8168724	best: 0.8464198 (214)	total: 28.3s	remaining: 28.1s
500:	test: 0.8061317	best: 0.8464198 (214)	total: 34.7s	remaining: 20.7s
600:	test: 0.8062551	best: 0.8464198 (214)	total: 42.1s	remaining: 14s
700:	test: 0.8050823	best: 0.8464198 (214)	total: 49.8s	remaining: 7.03s
799:	test: 0.8030453	best: 0.8464198 (214)	total: 56.5s	remaining: 0us

bestTest = 0.8464197531
bestIteration = 214

Shrink model to first 215 iterations.
CatBoost trained


In [10]:
#step 7 Predictions

p_test = model_cb.predict_proba(X_test)[:, 1]
y_pred = (p_test >= 0.5).astype(int)

metrics = {
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred, zero_division=0),
    "Recall": recall_score(y_test, y_pred, zero_division=0),
    "F1": f1_score(y_test, y_pred, zero_division=0),
    "ROC-AUC": roc_auc_score(y_test, p_test),
}

metrics_df = pd.DataFrame(list(metrics.items()), columns=["Metric", "Score"])
metrics_df



Unnamed: 0,Metric,Score
0,Accuracy,0.792564
1,Precision,0.15678
2,Recall,0.74
3,F1,0.258741
4,ROC-AUC,0.84642


In [11]:
#step 7 confusion Matrix


cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(
    cm,
    index=["Actual_0", "Actual_1"],
    columns=["Pred_0", "Pred_1"]
)

cm_path = "../figures/confusion_matrix_catboost.csv"
cm_df.to_csv(cm_path, index=True)

print("Saved:", cm_path)
cm_df




Saved: ../figures/confusion_matrix_catboost.csv


Unnamed: 0,Pred_0,Pred_1
Actual_0,773,199
Actual_1,13,37


In [12]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(5, 4))

# Plot base heatmap (white background)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Greys_r',
    cbar=False,
    xticklabels=['No Stroke', 'Stroke'],
    yticklabels=['No Stroke', 'Stroke'],
    linewidths=0.5,
    linecolor='white',
    ax=ax
)

# Define colours
green = "#E8F5E9"   # correct
orange = "#FDEBD0"  # incorrect

# Overlay rectangles (row, col)
cell_colours = {
    (0, 0): green,   # True Negative
    (1, 1): green,   # True Positive
    (0, 1): orange,  # False Positive
    (1, 0): orange   # False Negative
}

for (i, j), colour in cell_colours.items():
    ax.add_patch(
        plt.Rectangle(
            (j, i), 1, 1,
            fill=True,
            color=colour,
            alpha=0.8
        )
    )

plt.title("Confusion Matrix â€“ CatBoost Model")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.tight_layout()
plt.subplots_adjust(bottom=0.28)
plt.show()

In [13]:
#legend

from matplotlib.patches import Patch

legend_elements = [
    Patch(facecolor='#E8F5E9', label='Correct Prediction'),
    Patch(facecolor='#FDEBD0', label='Misclassification')
]

plt.legend(
    handles=legend_elements,
    loc='upper center',
    bbox_to_anchor=(0.5, -0.25),  # moves legend lower
    ncol=2,
    frameon=False,
    fontsize=10
)


<matplotlib.legend.Legend at 0x1bfc02d38e0>

In [14]:
plt.savefig(
    '../figures/confusion_matrix_catboost.png',
    dpi=300,
    bbox_inches='tight'
)



In [15]:
#step-8 Feature Importance Table

importance_df = pd.DataFrame({
    "Feature": X_train.columns,
    "Importance": model_cb.get_feature_importance()
}).sort_values(by="Importance", ascending=False)

fi_path = "../figures/catboost_feature_importance.csv"
importance_df.to_csv(fi_path, index=False)

print("Saved:", fi_path)
importance_df.head(15)




Saved: ../figures/catboost_feature_importance.csv


Unnamed: 0,Feature,Importance
1,age,30.380374
8,bmi,16.445042
10,age_group,11.672077
7,avg_glucose_level,11.624779
9,smoking_status,9.300623
5,work_type,7.031172
0,gender,6.130888
4,ever_married,3.568335
6,Residence_type,2.017375
2,hypertension,1.512885


In [16]:
# Save CatBoost model

model_path = "../models/catboost_baseline.cbm"
model_cb.save_model(model_path)
print("Saved model:", model_path)

# Save schema files for Streamlit input alignment
joblib.dump(list(X_train.columns), "../models/catboost_feature_names.pkl")
joblib.dump(cat_cols, "../models/catboost_categorical_cols.pkl")

print("Saved schema files:")
print(" - ../models/catboost_feature_names.pkl")
print(" - ../models/catboost_categorical_cols.pkl")




Saved model: ../models/catboost_baseline.cbm
Saved schema files:
 - ../models/catboost_feature_names.pkl
 - ../models/catboost_categorical_cols.pkl


In [17]:
print("Models folder:", os.listdir("../models"))
print("Figures folder:", os.listdir("../figures"))


Models folder: ['.ipynb_checkpoints', 'catboost_baseline.cbm', 'catboost_categorical_cols.pkl', 'catboost_feature_names.pkl', 'hybrid_config.pkl', 'mlp_bundle.pkl', 'mlp_model.pkl']
Figures folder: ['.ipynb_checkpoints', 'catboost_feature_importance.csv', 'confusion_matrix_catboost.csv', 'confusion_matrix_catboost.png', 'confusion_matrix_hybrid.csv', 'confusion_matrix_mlp.csv', 'hybrid_final_metrics.csv', 'hybrid_threshold_results.csv', 'hybrid_weight_results.csv', 'mlp_metrics.csv', 'shap_bar.png', 'shap_feature_importance_bar_catboost.png', 'shap_importance_catboost.csv', 'shap_local_meta.pkl', 'shap_summary.png', 'shap_summary_catboost.png', 'shap_waterfall_0.png', 'shap_waterfall_10.png', 'shap_waterfall_50.png', 'shap_waterfall_catboost_10.png']
