In [15]:
# 01_data_preprocessing.ipynb

# 1. Imports and folder setup
from ucimlrepo import fetch_ucirepo
import pandas as pd
import os
import json

# Make sure the data and models folders exist in the project root.
# exist_ok=True keeps this cell re-runnable once the folders are there.
os.makedirs('../data', exist_ok=True)
os.makedirs('../models', exist_ok=True)

# 2. Fetch dataset from the UCI ML repository (dataset id 45)
heart_disease = fetch_ucirepo(id=45)
X = heart_disease.data.features
# Work on a copy so the rename below does not touch the fetched object.
y = heart_disease.data.targets.copy()
# Give the target a fixed name; this assignment assumes exactly one target
# column and raises a length-mismatch error otherwise.
y.columns = ['target']

# 3. Combine features and target into one DataFrame
df = pd.concat([X, y], axis=1)

# 4. Save raw dataset CSV to project data folder
df.to_csv('../data/heart_disease.csv', index=False)
print("Raw dataset saved to ../data/heart_disease.csv")

# 5. Quick data inspection
print("Shape:", df.shape)
print("Missing values per column:\n", df.isna().sum())
print("Duplicate rows:", df.duplicated().sum())
print("Target value counts:\n", df['target'].value_counts())

# 6. Identify categorical vs numerical columns (basic)
# Classify feature columns only, so the target can never end up in either
# list regardless of its dtype.
feature_df = df.drop(columns=['target'])

# 'number' covers every numeric width (int32, float32, ...), not just
# int64/float64, so no numeric feature is silently left unclassified.
num_cols = feature_df.select_dtypes(include='number').columns.tolist()

# Pick numeric columns with <=10 unique values as categorical candidates
cat_like = [c for c in num_cols if feature_df[c].nunique() <= 10]
cat_cols = cat_like + feature_df.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = [c for c in num_cols if c not in cat_cols]

print("Initial numeric columns:", num_cols)
print("Initial categorical columns:", cat_cols)

# 7. Force known categorical columns into cat_cols (domain knowledge)
for forced_cat in ['sex', 'cp', 'thal', 'slope', 'ca', 'fbs', 'restecg', 'exang']:
    # Skip names that are not present in this dataset at all.
    if forced_cat not in feature_df.columns:
        continue
    if forced_cat in num_cols:
        num_cols.remove(forced_cat)
    # Append even when the column was in neither list (e.g. a dtype that the
    # selections above did not pick up), so it is never dropped from both.
    if forced_cat not in cat_cols:
        cat_cols.append(forced_cat)

print("Final numeric columns:", num_cols)
print("Final categorical columns:", cat_cols)

# 8. Save column lists for future use
# The context manager closes (and therefore flushes) the file immediately,
# so the JSON is complete on disk before anything else reads it.
with open('../models/cols.json', 'w') as cols_file:
    json.dump({'num_cols': num_cols, 'cat_cols': cat_cols}, cols_file)

# 9. Save cleaned dataset (for now, raw but ready)
# NOTE(review): no rows or values are modified before this write, so any
# missing values reported in step 5 are still present in the "clean" file.
df.to_csv('../data/heart_disease_clean.csv', index=False)
print("Cleaned dataset saved to ../data/heart_disease_clean.csv")


Raw dataset saved to ../data/heart_disease.csv
Shape: (303, 14)
Missing values per column:
 age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
target      0
dtype: int64
Duplicate rows: 0
Target value counts:
 target
0    164
1     55
2     36
3     35
4     13
Name: count, dtype: int64
Initial numeric columns: ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
Initial categorical columns: ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
Final numeric columns: ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
Final categorical columns: ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
Cleaned dataset saved to ../data/heart_disease_clean.csv
