## Import Libraries

In [1]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn

## Read the Heart Disease dataset (source: https://www.kaggle.com/datasets/kamilpytlak/personal-key-indicators-of-heart-disease)

In [2]:
# Load the CSV, encode the HeartDisease target as 0/1 and drop the
# PhysicalHealth / MentalHealth columns, all in one chain.
df = (
    pd.read_csv('heart_2020_cleaned.csv')
    .assign(HeartDisease = lambda frame: frame['HeartDisease'].map({'No': 0, 'Yes': 1}))
    .drop(columns = ['PhysicalHealth', 'MentalHealth'])
)
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.60,Yes,No,No,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,0,20.34,No,No,Yes,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,0,26.58,Yes,No,No,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,0,24.21,No,No,No,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,0,23.71,No,No,No,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,27.41,Yes,No,No,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,0,29.84,Yes,No,No,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,0,24.24,No,No,No,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,0,32.81,No,No,No,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


## Check for NULLs

In [3]:
# Count missing values per column (every count is 0 for this dataset)
df.isnull().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

## Split into train and test set (90 : 10)

In [4]:
from sklearn.model_selection import train_test_split

# Separate predictors from the target, then hold out 10% of the rows.
# Stratifying on the target keeps the class ratio the same in both splits.
features = df.drop(columns = ['HeartDisease'])
target = df['HeartDisease']

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size = 0.1, stratify = target, random_state = 42
)

## Save Test Set For Streamlit Upload

In [8]:
# Rebuild the hold-out rows from the untouched `df` using y_test's index rather
# than reading X_test. The preprocessing cell overwrites X_test with the encoded
# matrix on a fresh 0..n-1 index, so if this cell runs after it, concatenating
# X_test with y_test would misalign rows and export encoded columns instead of
# the raw ones. Selecting by y_test.index gives the same file in either order.
held_out = df.loc[y_test.index]

# Keep the original layout: raw feature columns first, HeartDisease last.
test_set = pd.concat(
    [held_out.drop(columns = ['HeartDisease']), held_out['HeartDisease']],
    axis = 1
)
test_set.to_csv('HeartDisease_test.csv', index = False)

## One-hot encode categorical features and scale numerical features

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Columns that get one-hot encoded vs. columns that get standardised
categorical_cols = ['Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory', 'Race', 'Diabetic', 
                    'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer']
numeric_cols = ['BMI', 'SleepTime']

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown = "ignore"), categorical_cols),
        ("num", StandardScaler(), numeric_cols)
    ],
    # OneHotEncoder emits a sparse matrix, and ColumnTransformer only returns a
    # dense array while the stacked density stays above the default threshold
    # (0.3). Force dense output so the pd.DataFrame(...) wrapping below does not
    # depend on how many one-hot columns the data happens to produce.
    sparse_threshold = 0,
    n_jobs = -1,
)

# NOTE: this cell overwrites X_train / X_test with their encoded versions, so
# re-run the split cell before re-running this one.

# Fit on the training split only, then apply the fitted transform to the test split
X_train = preprocessor.fit_transform(X_train)
feature_names = preprocessor.get_feature_names_out()
X_train = pd.DataFrame(X_train, columns = feature_names)

X_test = preprocessor.transform(X_test)
X_test = pd.DataFrame(X_test, columns = feature_names)
X_test

Unnamed: 0,cat__Smoking_No,cat__Smoking_Yes,cat__AlcoholDrinking_No,cat__AlcoholDrinking_Yes,cat__Stroke_No,cat__Stroke_Yes,cat__DiffWalking_No,cat__DiffWalking_Yes,cat__Sex_Female,cat__Sex_Male,...,cat__GenHealth_Poor,cat__GenHealth_Very good,cat__Asthma_No,cat__Asthma_Yes,cat__KidneyDisease_No,cat__KidneyDisease_Yes,cat__SkinCancer_No,cat__SkinCancer_Yes,num__BMI,num__SleepTime
0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2.091228,-0.068535
1,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,-0.922565,-1.461858
2,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,-0.859581,0.628127
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.226896,0.628127
4,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.985855,-0.068535
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31975,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,-0.291149,0.628127
31976,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,-1.155607,-0.068535
31977,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,-0.166755,-0.765196
31978,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,-0.037637,-0.068535


## Save the preprocessor pipeline

In [12]:
# Persist the fitted ColumnTransformer so the same encoding and scaling can be reloaded later
joblib.dump(value = preprocessor, filename = 'preprocessor.pkl')

['preprocessor.pkl']

# Model Training

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef
)

In [8]:
# Define the six models and their parameters

# Negative-to-positive ratio in the training labels; passed to XGBoost as
# scale_pos_weight to compensate for class imbalance.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
pos_weight = n_negative / n_positive

# Built from (name, estimator) pairs; the order here is the training order.
models = dict([
    (
        "Logistic_Regression",
        LogisticRegression(max_iter = 1000, class_weight = "balanced"),
    ),
    (
        "Decision_Tree",
        DecisionTreeClassifier(max_depth = 7, class_weight = "balanced", random_state = 42),
    ),
    (
        "KNN",
        KNeighborsClassifier(n_neighbors = 5, weights = "distance", n_jobs = -1),
    ),
    (
        "Naive_Bayes",
        GaussianNB(),
    ),
    (
        "Random_Forest",
        RandomForestClassifier(
            n_estimators = 100,
            max_depth = 7,
            class_weight = "balanced",
            random_state = 42,
            n_jobs = -1,
        ),
    ),
    (
        "XGBoost",
        XGBClassifier(
            n_estimators = 100,
            max_depth = 7,
            scale_pos_weight = pos_weight,
            random_state = 42,
        ),
    ),
])

In [9]:
# Fit a single model, save it to disk, and score it on the train and test sets
def evaluate_model(model, model_name, X_train, y_train, X_test, y_test, model_dir = "model"):
    """Fit `model`, pickle it, and compute classification metrics on both splits.

    Parameters
    ----------
    model : estimator exposing fit, predict and predict_proba
    model_name : str
        Used as the file stem of the saved pickle (`<model_dir>/<model_name>.pkl`).
    X_train, y_train : training features and labels (the model is fitted on these)
    X_test, y_test : held-out features and labels
    model_dir : str or path-like, default "model"
        Directory the fitted model is written to; created if it does not exist.

    Returns
    -------
    dict
        {"Train": {...}, "Test": {...}}, each inner dict holding "Accuracy",
        "AUC", "Precision", "Recall", "F1" and "MCC".
    """
    model.fit(X_train, y_train)
    results = {}

    # Save Model. Create the target directory first: joblib.dump does not create
    # missing parent directories, so a fresh checkout would otherwise fail here.
    model_path = Path(model_dir) / f"{model_name}.pkl"
    model_path.parent.mkdir(parents = True, exist_ok = True)
    joblib.dump(model, model_path)

    for split, X_data, y_data in [
        ("Train", X_train, y_train),
        ("Test", X_test, y_test),
    ]:
        y_pred = model.predict(X_data)
        # Column 1 of predict_proba is the score fed to AUC
        # (assumes a binary target whose second class is the positive one)
        y_prob = model.predict_proba(X_data)[:, 1]

        results[split] = {
            "Accuracy": accuracy_score(y_data, y_pred),
            "AUC": roc_auc_score(y_data, y_prob),
            "Precision": precision_score(y_data, y_pred),
            "Recall": recall_score(y_data, y_pred),
            "F1": f1_score(y_data, y_pred),
            "MCC": matthews_corrcoef(y_data, y_pred),
        }

    return results

In [10]:
# Train every configured model in turn and collect its train/test metrics by name
all_results = dict()

for model_name, model in models.items():
    print(f"Training {model_name}...")
    all_results[model_name] = evaluate_model(
        model = model,
        model_name = model_name,
        X_train = X_train,
        y_train = y_train,
        X_test = X_test,
        y_test = y_test,
    )

Training Logistic_Regression...
Training Decision_Tree...
Training KNN...
Training Naive_Bayes...
Training Random_Forest...
Training XGBoost...


## Display Metrics

In [11]:
# Flatten {model: {split: {metric: value}}} into one record per (model, split)
rows = [
    {"Model": model_name, "Dataset": split, **metrics}
    for model_name, splits in all_results.items()
    for split, metrics in splits.items()
]

# Sort on the full-precision scores and round only afterwards. Rounding first
# makes models with the same 2-decimal F1 tie, so their order would reflect
# insertion order rather than which one actually scored higher.
results_df = (
    pd.DataFrame(rows)
    .sort_values(by = ["Dataset", "F1"], ascending = False, ignore_index = True)
    .round(2)
)
results_df

Unnamed: 0,Model,Dataset,Accuracy,AUC,Precision,Recall,F1,MCC
0,KNN,Train,0.99,1.0,1.0,0.94,0.97,0.96
1,XGBoost,Train,0.77,0.89,0.25,0.85,0.39,0.38
2,Logistic_Regression,Train,0.75,0.84,0.23,0.78,0.35,0.32
3,Random_Forest,Train,0.76,0.82,0.22,0.71,0.34,0.3
4,Naive_Bayes,Train,0.75,0.82,0.22,0.73,0.33,0.3
5,Decision_Tree,Train,0.72,0.8,0.2,0.76,0.32,0.28
6,Logistic_Regression,Test,0.75,0.84,0.22,0.77,0.34,0.31
7,Random_Forest,Test,0.76,0.82,0.22,0.71,0.34,0.3
8,XGBoost,Test,0.75,0.82,0.22,0.74,0.34,0.3
9,Naive_Bayes,Test,0.75,0.82,0.21,0.73,0.33,0.3
