# Predicting Heart Disease from Cleveland Database  

Authors: Albert C. Halim, Archer Liu, Stephanie Wu, & Ziyuan Zhao  
Date: November 18, 2024  

# Summary

In this project, we developed classification models using the Cleveland Heart Disease dataset to predict the presence of heart disease based on various clinical measurements. We evaluated the performance of four models: Support Vector Classifier (SVC), Linear Regression (adapted for classification), a Dummy Classifier (as a baseline), and a Decision Tree Classifier.


# Introduction

# Methods

## Data

## Analysis

## Results & Discussion

In [1]:
# File handling
import os
import requests
import zipfile

# Data handling
import numpy as np
import pandas as pd

# Preprocessing
from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer

# Machine Learning
from scipy.stats import expon, lognorm, loguniform, randint, uniform, norm
from sklearn.model_selection import  RandomizedSearchCV, cross_val_score, cross_validate
from sklearn.pipeline import make_pipeline

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier

# Scoring Metrics
from sklearn.metrics import roc_auc_score, average_precision_score, make_scorer, f1_score, recall_score, precision_score

In [2]:
# Create the directory if it doesn't exist
raw_dir = "../data/raw"
if not os.path.exists(raw_dir):
    os.makedirs(raw_dir)

# Download data as zip
url = "https://archive.ics.uci.edu/static/public/45/heart+disease.zip"
response = requests.get(url)

# Save the zip file to the specified directory
zip_path = os.path.join(raw_dir, "heart+disease.zip")
with open(zip_path, 'wb') as f:
    f.write(response.content)

# Extract the contents of the zip file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(raw_dir)

In [3]:
# read in data
colnames = [
    "age",       
    "sex",       
    "cp",        
    "trestbps",  
    "chol",      
    "fbs",       
    "restecg",   
    "thalach",   
    "exang",     
    "oldpeak",   
    "slope",     
    "ca",        
    "thal",      
    "num"  
]

heart_disease = pd.read_csv("../data/raw/processed.cleveland.data", names=colnames, header=None)
# Replace missing values with nan for ease of computational handling
heart_disease.replace('?', np.nan, inplace=True)
# heart_disease = heart_disease.dropna()
# Update the target variable 'num' (map values greater than 1 to 1)
heart_disease['num'] = heart_disease['num'].apply(lambda x: 1 if x > 1 else x)

In [4]:
# Scale and split into train & test
np.random.seed(522)
set_config(transform_output="pandas")

# Create the split
heart_disease_train, heart_disease_test = train_test_split(
    heart_disease, train_size=0.70, stratify=heart_disease["num"]
)
# Create the directory if it doesn't exist
processed_dir = "../data/processed"
if not os.path.exists(processed_dir):
    os.makedirs(processed_dir)
heart_disease_train.to_csv("../data/processed/heart_disease_train.csv")
heart_disease_test.to_csv("../data/processed/heart_disease_test.csv")

In [5]:
ca_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    StandardScaler()
)

thal_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(sparse_output=False)
)

heart_disease_preprocessor = make_column_transformer(
    (ca_pipeline, ['ca']),  # Apply imputation and scaling to 'ca'
    (thal_pipeline, ['thal']),  # Apply imputation and encoding to 'thal'
    (OneHotEncoder(sparse_output=False), ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope']),
    (StandardScaler(), ["age", "trestbps", "chol", "thalach", "oldpeak"]),
    remainder='passthrough',
    verbose_feature_names_out=True
)

heart_disease_preprocessor.fit(heart_disease_train)
scaled_heart_disease_train = heart_disease_preprocessor.transform(heart_disease_train)
scaled_heart_disease_test = heart_disease_preprocessor.transform(heart_disease_test)

scaled_heart_disease_train.to_csv("../data/processed/scaled_heart_disease_train.csv")
scaled_heart_disease_test.to_csv("../data/processed/scaled_heart_disease_test.csv")

In [6]:
X_train = heart_disease_train.drop(columns=['num'])
y_train = heart_disease_train['num']
X_test = heart_disease_test.drop(columns=['num'])
y_test = heart_disease_test['num']

In [23]:
def randomized_search(X_train, y_train, model, param_dist, n_iter=100, cv=5, random_state=123):
    """
    Performs RandomizedSearchCV on the specified model and returns the best model.
    
    Parameters:
    X_train : DataFrame
        Training features
    y_train : Series
        Training labels
    model : estimator
        The model to be tuned
    param_dist : dict
        Hyperparameter distribution for RandomizedSearchCV
    n_iter : int, optional, default=100
        Number of iterations for RandomizedSearchCV
    cv : int, optional, default=5
        Number of cross-validation folds
    random_state : int, optional, default=123
        Random seed for reproducibility

    Returns:
    best_model : estimator
        The best model after RandomizedSearchCV
    """
    # Perform RandomizedSearchCV
    random_search = RandomizedSearchCV(model, param_distributions=param_dist,
                                       n_iter=n_iter, cv=cv, n_jobs=-1, random_state=random_state,
                                       return_train_score=True)
    
    # Fit the model
    random_search.fit(X_train, y_train)

    # Return the best model found by RandomizedSearchCV
    return random_search.best_estimator_

In [24]:
# This function is taken from UBC 571 Course
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores.iloc[i], std_scores.iloc[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

In [25]:
results_dict = {}
models = {
    "Dummy": DummyClassifier(random_state=123),
    "SVC": SVC(random_state=123)
}

for model in models.items():
    pipe = make_pipeline(heart_disease_preprocessor, model[1])
    results_dict[model[0]] = mean_std_cross_val_scores(
        pipe, X_train, y_train, cv=5, return_train_score=True
    )

income_pred_results_df = pd.DataFrame(results_dict).T
income_pred_results_df

Unnamed: 0,fit_time,score_time,test_score,train_score
Dummy,0.018 (+/- 0.003),0.011 (+/- 0.002),0.543 (+/- 0.007),0.542 (+/- 0.002)
SVC,0.017 (+/- 0.001),0.009 (+/- 0.001),0.844 (+/- 0.043),0.929 (+/- 0.017)


In [33]:
svc_param = {
    "gamma": loguniform(1e-4, 1e2),
    "C": loguniform(1e-4, 1e2),
    "class_weight": [None, "balanced"]
}

best_svc_model = randomized_search(X_train, y_train, SVC(random_state=123), svc_param)

# Calculate the train score (accuracy on training data)
train_score = best_svc_model.score(X_train, y_train)
print("SVC Best Model Train Accuracy Score: ", train_score)

SVC Best Model Train Accuracy Score:  0.8584905660377359


# Results