In [1]:
#safe environment

import os

os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

import matplotlib
matplotlib.use("Agg")




In [2]:
# Import all the libraries

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix
)

os.makedirs("../models", exist_ok=True)
os.makedirs("../figures", exist_ok=True)



In [3]:
# Load cleaned dataset

df = pd.read_csv("../data/stroke_clean.csv")
print("Loaded:", df.shape)

if "id" in df.columns:
    df = df.drop(columns=["id"])
  

df.head()

Loaded: (5110, 13)


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_group
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,Senior
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.1,never smoked,1,Adult
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,Senior
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,Adult
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,Senior


In [4]:
#train/test split

TARGET = "stroke"
X = df.drop(columns=[TARGET])
y = df[TARGET]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

print("Train:", X_train.shape, "Test:", X_test.shape)
print("Target %:\n", y.value_counts(normalize=True)*100)


Train: (4088, 11) Test: (1022, 11)
Target %:
 stroke
0    95.127202
1     4.872798
Name: proportion, dtype: float64


In [5]:
#loading catboost and mlp

model_cb = CatBoostClassifier()
model_cb.load_model("../models/catboost_baseline.cbm")
print("CatBoost loaded")

# CatBoost schema
cb_feature_names = joblib.load("../models/catboost_feature_names.pkl")
cb_cat_cols = joblib.load("../models/catboost_categorical_cols.pkl")

# MLP bundle
mlp_bundle = joblib.load("../models/mlp_bundle.pkl")
pre = mlp_bundle["preprocessor"]
mlp = mlp_bundle["model"]
mlp_feature_names = mlp_bundle["feature_names"]

print(" MLP bundle loaded")
print("CatBoost features:", len(cb_feature_names))
print("MLP features:", len(mlp_feature_names))
    

CatBoost loaded
 MLP bundle loaded
CatBoost features: 11
MLP features: 11


In [6]:
X_test_cb = X_test.reindex(columns=cb_feature_names, fill_value=0)
X_test_mlp = X_test.reindex(columns=mlp_feature_names, fill_value=0)

cb_cat_idx = [cb_feature_names.index(c) for c in cb_cat_cols if c in cb_feature_names]
print("Alignment done")
print("CatBoost cat idx:", cb_cat_idx)

Alignment done
CatBoost cat idx: [0, 4, 5, 6, 9, 10]


In [7]:
test_pool = Pool(X_test_cb, cat_features=cb_cat_idx)
p_cb = model_cb.predict_proba(test_pool)[:, 1]

X_test_p = pre.transform(X_test_mlp)
p_mlp = mlp.predict_proba(X_test_p)[:, 1]

print("Probabilities computed")
print("p_cb:", p_cb[:5])
print("p_mlp:", p_mlp[:5])

Probabilities computed
p_cb: [0.29437305 0.10454981 0.01056093 0.00248504 0.64506138]
p_mlp: [2.11410893e-02 2.56700108e-01 3.10647601e-04 1.29616135e-11
 7.57896391e-01]


In [8]:
#evaluation function
def evaluate_probs(y_true, probs, threshold=0.5):
    y_pred = (probs >= threshold).astype(int)
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, zero_division=0),
        "Recall": recall_score(y_true, y_pred, zero_division=0),
        "F1": f1_score(y_true, y_pred, zero_division=0),
        "ROC-AUC": roc_auc_score(y_true, probs),
        "Threshold": threshold
    }

In [9]:
weights = [0.3, 0.5, 0.7]
results = []

for w in weights:
    p_hybrid = w*p_cb + (1-w)*p_mlp
    m = evaluate_probs(y_test, p_hybrid, threshold=0.5)
    m["Weight_CB"] = w
    results.append(m)

weight_df = pd.DataFrame(results).sort_values(by="F1", ascending=False)
weight_df



Unnamed: 0,Accuracy,Precision,Recall,F1,ROC-AUC,Threshold,Weight_CB
2,0.838552,0.18232,0.66,0.285714,0.821152,0.5,0.7
1,0.850294,0.134752,0.38,0.198953,0.801029,0.5,0.5
0,0.851272,0.125,0.34,0.182796,0.783354,0.5,0.3


In [10]:

weight_path = "../figures/hybrid_weight_results.csv"
weight_df.to_csv(weight_path, index=False)
print("Saved:", weight_path)

Saved: ../figures/hybrid_weight_results.csv


In [11]:
best_w = float(weight_df.iloc[0]["Weight_CB"])
print("Best weight (by F1):", best_w)

p_hybrid_best = best_w*p_cb + (1-best_w)*p_mlp

thresholds = [0.50, 0.40, 0.30, 0.25, 0.20]
thr_results = []

for t in thresholds:
    m = evaluate_probs(y_test, p_hybrid_best, threshold=t)
    m["Weight_CB"] = best_w
    thr_results.append(m)

thr_df = pd.DataFrame(thr_results).sort_values(by="F1", ascending=False)
thr_df


Best weight (by F1): 0.7


Unnamed: 0,Accuracy,Precision,Recall,F1,ROC-AUC,Threshold,Weight_CB
0,0.838552,0.18232,0.66,0.285714,0.821152,0.5,0.7
1,0.77593,0.146245,0.74,0.244224,0.821152,0.4,0.7
2,0.704501,0.120482,0.8,0.209424,0.821152,0.3,0.7
3,0.661448,0.110526,0.84,0.195349,0.821152,0.25,0.7
4,0.622309,0.1,0.84,0.178723,0.821152,0.2,0.7


In [12]:
#save threshold tuning csv
thr_path = "../figures/hybrid_threshold_results.csv"
thr_df.to_csv(thr_path, index=False)
print("Saved:", thr_path)

Saved: ../figures/hybrid_threshold_results.csv


In [13]:
best_threshold = float(thr_df.iloc[0]["Threshold"])
print("Best threshold (by F1):", best_threshold)

final_probs = p_hybrid_best
final_pred = (final_probs >= best_threshold).astype(int)
y_pred_hybrid = final_pred

cm = confusion_matrix(y_test, final_pred)
cm_df = pd.DataFrame(cm, index=["Actual_0","Actual_1"], columns=["Pred_0","Pred_1"])

cm_path = "../figures/confusion_matrix_hybrid.csv"
cm_df.to_csv(cm_path, index=True)

print("Saved:", cm_path)
cm_df

Best threshold (by F1): 0.5
Saved: ../figures/confusion_matrix_hybrid.csv


Unnamed: 0,Pred_0,Pred_1
Actual_0,824,148
Actual_1,17,33


In [14]:
#Cinfusion Matrix visualisation
from sklearn.metrics import confusion_matrix
from matplotlib.patches import Patch

# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred_hybrid)


fig, ax = plt.subplots(figsize=(5, 4))

# Base heatmap (neutral background)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cbar=False,
    xticklabels=['No Stroke', 'Stroke'],
    yticklabels=['No Stroke', 'Stroke'],
    linewidths=0.5,
    linecolor='white',
    ax=ax
)

# Colours
green = "#E8F5E9"    # Correct prediction
orange = "#FDEBD0"  # Misclassification

# Overlay colours by correctness
cell_colours = {
    (0, 0): green,   # True Negative
    (1, 1): green,   # True Positive
    (0, 1): orange,  # False Positive
    (1, 0): orange   # False Negative
}

for (i, j), colour in cell_colours.items():
    ax.add_patch(
        plt.Rectangle((j, i), 1, 1, color=colour, alpha=0.8)
    )

# Labels and title
plt.title("Confusion Matrix â€“ Hybrid Model")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")

# Legend
legend_elements = [
    Patch(facecolor=green, label='Correct Prediction'),
    Patch(facecolor=orange, label='Misclassification')
]

plt.legend(
    handles=legend_elements,
    loc='upper center',
    bbox_to_anchor=(0.5, -0.25),
    ncol=2,
    frameon=False,
    fontsize=10
)

plt.subplots_adjust(bottom=0.28)
plt.tight_layout()

# Save figure
plt.savefig(
    '../figures/confusion_matrix_hybrid.png',
    dpi=300,
    bbox_inches='tight'
)

plt.show()


In [15]:
#save hybrid for streamlit

hybrid_config = {
    "weight_cb": best_w,
    "threshold": best_threshold,
    "cb_model_path": "../models/catboost_baseline.cbm",
    "mlp_bundle_path": "../models/mlp_bundle.pkl",
    "cb_feature_names_path": "../models/catboost_feature_names.pkl",
    "cb_categorical_cols_path": "../models/catboost_categorical_cols.pkl"
}

config_path = "../models/hybrid_config.pkl"
joblib.dump(hybrid_config, config_path)

print("Saved:", config_path)
hybrid_config


Saved: ../models/hybrid_config.pkl


{'weight_cb': 0.7,
 'threshold': 0.5,
 'cb_model_path': '../models/catboost_baseline.cbm',
 'mlp_bundle_path': '../models/mlp_bundle.pkl',
 'cb_feature_names_path': '../models/catboost_feature_names.pkl',
 'cb_categorical_cols_path': '../models/catboost_categorical_cols.pkl'}

In [16]:
#final check

print("Models:", os.listdir("../models"))
print("Figures:", os.listdir("../figures"))


Models: ['.ipynb_checkpoints', 'catboost_baseline.cbm', 'catboost_categorical_cols.pkl', 'catboost_feature_names.pkl', 'hybrid_config.pkl', 'mlp_bundle.pkl', 'mlp_model.pkl']
Figures: ['.ipynb_checkpoints', 'catboost_feature_importance.csv', 'confusion_matrix_catboost.csv', 'confusion_matrix_catboost.png', 'confusion_matrix_hybrid.csv', 'confusion_matrix_hybrid.png', 'confusion_matrix_mlp.csv', 'confusion_matrix_mlp.png', 'hybrid_final_metrics.csv', 'hybrid_threshold_results.csv', 'hybrid_weight_results.csv', 'mlp_metrics.csv', 'shap_bar.png', 'shap_feature_importance_bar_catboost.png', 'shap_importance_catboost.csv', 'shap_local_meta.pkl', 'shap_summary.png', 'shap_summary_catboost.png', 'shap_waterfall_0.png', 'shap_waterfall_10.png', 'shap_waterfall_50.png', 'shap_waterfall_catboost_10.png']


In [17]:

# Build input_df with CatBoost schema first (ensures correct overall set)
input_df = pd.DataFrame([{col: np.nan for col in cb_feature_names}])

# Example patient values (edit)
for col, val in {
    "gender": "Male",
    "age": 67,
    "hypertension": 1,
    "heart_disease": 0,
    "ever_married": "Yes",
    "work_type": "Private",
    "Residence_type": "Urban",
    "avg_glucose_level": 228.69,
    "bmi": 36.6,
    "smoking_status": "formerly smoked"
}.items():
    if col in input_df.columns:
        input_df.loc[0, col] = val

In [18]:
# Align input for CatBoost
input_cb = input_df.reindex(columns=cb_feature_names, fill_value=np.nan)

# Make CatBoost categorical columns safe: no NaN, force string
for c in cb_cat_cols:
    if c in input_cb.columns:
        input_cb[c] = input_cb[c].fillna("Unknown").astype(str)

# Make numeric columns safe too
for c in input_cb.columns:
    if c not in cb_cat_cols:
        input_cb[c] = pd.to_numeric(input_cb[c], errors="coerce").fillna(0)

pool_in = Pool(input_cb, cat_features=cb_cat_idx)
p_cb_one = model_cb.predict_proba(pool_in)[:, 1][0]


In [19]:
# ---- MLP single prediction (SAFE) ----
input_mlp = input_df.reindex(columns=mlp_feature_names, fill_value=np.nan)

# categorical to string + no NaN
for c in mlp_bundle["categorical_cols"]:
    if c in input_mlp.columns:
        input_mlp[c] = input_mlp[c].fillna("Unknown").astype(str)

# numeric to numeric + no NaN
for c in mlp_bundle["numeric_cols"]:
    if c in input_mlp.columns:
        input_mlp[c] = pd.to_numeric(input_mlp[c], errors="coerce").fillna(0)

p_mlp_one = mlp.predict_proba(pre.transform(input_mlp))[:, 1][0]
print("MLP prob:", p_mlp_one)


MLP prob: 0.939381901723823


In [20]:
p_hybrid_one = best_w*p_cb_one + (1-best_w)*p_mlp_one
pred_hybrid = int(p_hybrid_one >= best_threshold)

print("CatBoost prob:", p_cb_one)
print("Hybrid prob:", p_hybrid_one)
print("Hybrid class (0/1):", pred_hybrid)


CatBoost prob: 0.6591143808341792
Hybrid prob: 0.7431946371010723
Hybrid class (0/1): 1
