In [5]:
#To install packages needed to run this code, use: pip install -r requirements.txt
from ucimlrepo import fetch_ucirepo 
import sklearn.model_selection
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np

# fetch dataset 
data = fetch_ucirepo(id=45)

#Convert to binary classification (no heart disease vs heart disease)
y = data.data.targets.to_numpy().reshape(-1) 
y = np.clip(y, 0, 1)  

#Impute missing values with mode
df = data.data.features
df = df.fillna(df.mode().iloc[0])

#Get feature values  
X = df.to_numpy() 

#Reencode categorical variables
categorical_columns       = ['cp','restecg']
categorical_indices       = [i for i,c in enumerate(df.columns) if c in categorical_columns]
non_categorical_indices   = [i for i,c in enumerate(df.columns) if c not in categorical_columns]
encoder                   = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
Xonehot                   = encoder.fit_transform(X[:,categorical_indices])
Xnew                      = np.hstack([X[:,non_categorical_indices],Xonehot])

#Split into train and test sets
Xtrain, Xtest, ytrain, ytest = sklearn.model_selection.train_test_split(Xnew, y, test_size=0.20, random_state=589,stratify=y)

#Standardize features
scaler = StandardScaler()
Xtrain= scaler.fit_transform(Xtrain)
Xtest = scaler.fit_transform(Xtest)

# Report dataset sizes and feature dimensions
print("Training cases:", Xtrain.shape[0])
print("Test cases:", Xtest.shape[0])
print("Feature dimensions:", Xtrain.shape[1])

# Train logistic regression model
lr = LogisticRegression(max_iter=1000, random_state=589)
lr.fit(Xtrain, ytrain)

# Predictions
ytrain_pred = lr.predict(Xtrain)
ytest_pred  = lr.predict(Xtest)

# Compute error rates
train_error = np.mean(ytrain_pred != ytrain)
test_error  = np.mean(ytest_pred != ytest)

print("Training error rate:", train_error)
print("Test error rate:", test_error)



Training cases: 242
Test cases: 61
Feature dimensions: 18
Training error rate: 0.12396694214876033
Test error rate: 0.16393442622950818
