In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np


In [17]:
# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
df = pd.read_csv(url, header=None)

# Rename columns
df.columns = [
    "age",
    "sex",
    "cp",
    "trestbps",
    "chol",
    "fbs",
    "restecg",
    "thalach",
    "exang",
    "oldpeak",
    "slope",
    "ca",
    "thal",
    "target",
]

# Replace missing values "?" with median
df.replace("?", np.nan, inplace=True)
df.fillna(df.median(), inplace=True)

# Drop rows with missing values
df.dropna(inplace=True)

# Convert categorical variables into numerical ones
cat_vars = ["sex", "cp", "fbs", "restecg", "exang", "slope"]
df = pd.get_dummies(df, columns=cat_vars)

# Remove collinear variables
corr_matrix = df.corr(numeric_only=False).abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]
df.drop(to_drop, axis=1, inplace=True)
print("Dropped:", to_drop)

# Map target variable to binary labels
heart_disease_df["target"] = heart_disease_df["target"].map({0: 0, 1: 1, 2: 1, 3: 1, 4: 1})

df

Dropped: ['sex_1.0', 'fbs_1.0', 'restecg_2.0', 'exang_1.0', 'slope_2.0']


Unnamed: 0,age,trestbps,chol,thalach,oldpeak,ca,thal,target,sex_0.0,cp_1.0,cp_2.0,cp_3.0,cp_4.0,fbs_0.0,restecg_0.0,restecg_1.0,exang_0.0,slope_1.0,slope_3.0
0,63.0,145.0,233.0,150.0,2.3,0.0,6.0,0,0,1,0,0,0,0,0,0,1,0,1
1,67.0,160.0,286.0,108.0,1.5,3.0,3.0,2,0,0,0,0,1,1,0,0,0,0,0
2,67.0,120.0,229.0,129.0,2.6,2.0,7.0,1,0,0,0,0,1,1,0,0,0,0,0
3,37.0,130.0,250.0,187.0,3.5,0.0,3.0,0,0,0,0,1,0,1,1,0,1,0,1
4,41.0,130.0,204.0,172.0,1.4,0.0,3.0,0,1,0,1,0,0,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,110.0,264.0,132.0,1.2,0.0,7.0,1,0,1,0,0,0,1,1,0,1,0,0
299,68.0,144.0,193.0,141.0,3.4,2.0,7.0,2,0,0,0,0,1,0,1,0,1,0,0
300,57.0,130.0,131.0,115.0,1.2,1.0,7.0,3,0,0,0,0,1,1,1,0,0,0,0
301,57.0,130.0,236.0,174.0,0.0,1.0,3.0,1,1,0,1,0,0,1,0,0,1,0,0


In [None]:
# Split data into training and testing sets
X = df.drop("target", axis=1)
y = df["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape


((237, 18), (237,), (60, 18), (60,))