# Loading and prepping


In [None]:
# Imports
import pandas as pd
from ucimlrepo import fetch_ucirepo
# Libraries imported

In [None]:
# Pull the Heart Disease dataset (repository id 45) straight from the UCI repository
heart_disease = fetch_ucirepo(id=45)
# Unpack the fetched bundle into the feature table (X) and the target table (y)
X, y = heart_disease.data.features, heart_disease.data.targets
print("Data downloaded successfully!")
# Report both shapes with a single newline-separated print call
print(f"Features shape: {X.shape}", f"Target shape: {y.shape}", sep="\n")

In [None]:
# Place the feature columns and the target column(s) side by side in one DataFrame
df = pd.concat((X, y), axis="columns")
# Preview the first rows of the combined table
print("First 5 rows of the combined dataset:")
df.head(5)

In [None]:
# Show how the raw 'num' labels are distributed, ordered by label value
print("Original 'num' (target) value counts:")
num_counts = df['num'].value_counts()
print(num_counts.sort_index())

# Collapse to a binary label: 0 = No Heart Disease | 1 = Heart Disease (any num > 0)
has_disease = df['num'].gt(0)
df['target'] = has_disease.astype(int)
# The original 'num' column is no longer needed once 'target' exists
df.drop(columns=['num'], inplace=True)

print("\nTarget cleaned and converted to binary")
print("New 'target' value counts:")
target_counts = df['target'].value_counts()
print(target_counts)

In [None]:
# Sanity check: report the table dimensions and the per-column missing-value counts
n_rows_cols = df.shape
print(f"Final dataset shape: {n_rows_cols}")
print("\nMissing values in each column:")
print(df.isna().sum())

# Write the dataset to the data folder, without the index column
output_path = '../data/heart_disease.csv'
df.to_csv(output_path, index=False)
print(f"\nCleaned dataset saved to '{output_path}'")

In [None]:
# Handle missing values in 'ca' and 'thal' by filling them with each column's median.
# NOTE(review): the median is kept to preserve the existing behavior; the UCI
# documentation appears to describe 'thal' as a coded category, so the mode may
# be the better choice there -- confirm.
#
# The filled column is assigned back to df rather than calling
# df[col].fillna(..., inplace=True): that chained in-place form acts on an
# intermediate object, raises a FutureWarning in recent pandas 2.x, and leaves
# df unchanged once Copy-on-Write is enabled.
for column in ('ca', 'thal'):
    df[column] = df[column].fillna(df[column].median())

# Verify that no missing values remain
print("Missing values after cleaning:")
print(df.isnull().sum())

# Save the FINAL cleaned dataset (overwrites any existing file at this path)
df.to_csv('../data/heart_disease.csv', index=False)
print("\nFINAL cleaned dataset saved")