In [1]:
import pandas as pd
import numpy as np


In [4]:
import os
# List the raw-data directory (relative to the notebook's working directory)
# to confirm which input files are available before loading them.
os.listdir("../data/raw")



['framingham.csv', 'uci_heart.csv']

In [5]:
# Read the two raw CSV exports into separate frames: UCI first, then Framingham.
uci, framingham = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/raw/uci_heart.csv",
        "../data/raw/framingham.csv",
    )
)


In [6]:
# (rows, columns) of the raw UCI frame.
uci.shape

(920, 16)

In [7]:
# (rows, columns) of the raw Framingham frame.
framingham.shape

(4240, 16)

In [8]:
# Column names of the raw UCI frame, used to plan the schema alignment.
uci.columns

Index(['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='str')

In [9]:
# Column names of the raw Framingham frame, used to plan the schema alignment.
framingham.columns

Index(['male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],
      dtype='str')

In [10]:
# Per-column dtypes of the raw UCI frame; several columns (sex, cp, restecg,
# slope, thal, ...) are read as text rather than numbers.
uci.dtypes

id            int64
age           int64
sex             str
dataset         str
cp              str
trestbps    float64
chol        float64
fbs          object
restecg         str
thalch      float64
exang        object
oldpeak     float64
slope           str
ca          float64
thal            str
num           int64
dtype: object

In [11]:
# Per-column dtypes of the raw Framingham frame; every column is numeric.
framingham.dtypes

male                 int64
age                  int64
education          float64
currentSmoker        int64
cigsPerDay         float64
BPMeds             float64
prevalentStroke      int64
prevalentHyp         int64
diabetes             int64
totChol            float64
sysBP              float64
diaBP              float64
BMI                float64
heartRate          float64
glucose            float64
TenYearCHD           int64
dtype: object

In [12]:
# Work on independent copies so the frames loaded from disk stay untouched
# and remain available for comparison.
uci_df, framingham_df = uci.copy(), framingham.copy()


In [18]:
# Column check on the working UCI copy.
# NOTE(review): the recorded output of this cell already lacks `id` and
# `dataset`, although the cell that drops them comes after this one. The
# notebook appears to have been executed out of order; confirm the outputs
# with Restart Kernel -> Run All.
uci_df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalch',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='str')

In [19]:
# Remove the UCI row id and the `dataset` column from the working copy.
# errors="ignore" makes this a no-op when the columns are already gone.
uci_df = uci_df.drop(
    labels=["id", "dataset"],
    axis="columns",
    errors="ignore",
)


In [20]:
# Collapse the UCI `num` column into a binary `target` (1 when num > 0, else 0)
# and remove `num` afterwards.
# The membership guard makes the cell idempotent: re-running it after `num`
# has already been dropped is a no-op instead of a KeyError.
if "num" in uci_df.columns:
    uci_df = (
        uci_df
        .assign(target=lambda frame: (frame["num"] > 0).astype(int))
        .drop(columns=["num"])
    )


In [21]:
# Expose Framingham's `TenYearCHD` column under the shared name `target`
# and remove the original column.
# The membership guard makes the cell idempotent: re-running it after
# `TenYearCHD` has already been dropped is a no-op instead of a KeyError.
if "TenYearCHD" in framingham_df.columns:
    framingham_df = (
        framingham_df
        .assign(target=lambda frame: frame["TenYearCHD"])
        .drop(columns=["TenYearCHD"])
    )


In [22]:
# Class balance of the unified `target` in each cohort, shown side by side
# as a (UCI counts, Framingham counts) tuple.
uci_df["target"].value_counts(), framingham_df["target"].value_counts()


(target
 1    509
 0    411
 Name: count, dtype: int64,
 target
 0    3596
 1     644
 Name: count, dtype: int64)

In [23]:
# Bring both cohorts onto one shared column vocabulary (Framingham names win).
# `rename` silently skips keys that are absent, so no membership check is
# needed and the cell can be re-run safely.
# NOTE(review): `thalch` is folded into `heartRate` here. The UCI attribute
# documentation describes it as the maximum heart rate achieved, which may not
# be the same measurement as Framingham's `heartRate` -- confirm before
# modelling on the merged column.
uci_df = uci_df.rename(columns={
    "trestbps": "sysBP",
    "chol": "totChol",
    "thalch": "heartRate",
})
framingham_df = framingham_df.rename(columns={"male": "sex"})

# `sex` is the one shared column whose representations differ between the two
# files: text labels in UCI versus a 0/1 integer flag (1 = male) in Framingham.
# Concatenating them as-is would leave a mixed str/int column, so the UCI
# labels are normalised to the same 0/1 convention. This is representation
# alignment of a shared column, not general feature encoding.
if not pd.api.types.is_numeric_dtype(uci_df["sex"]):
    sex_labels = uci_df["sex"].str.strip().str.lower()
    sex_codes = sex_labels.map({"male": 1, "m": 1, "female": 0, "f": 0})

    # Fail loudly on labels we do not recognise instead of turning them into NaN.
    unmapped_labels = sex_labels[sex_codes.isna() & sex_labels.notna()].unique()
    if len(unmapped_labels) > 0:
        raise ValueError(f"Unrecognised UCI sex labels: {list(unmapped_labels)}")

    # to_numeric yields int64 when nothing is missing, float64 otherwise.
    uci_df["sex"] = pd.to_numeric(sex_codes)


In [24]:
# Columns present in the UCI frame but absent from Framingham after renaming.
set(uci_df.columns) - set(framingham_df.columns)


{'ca', 'cp', 'exang', 'fbs', 'oldpeak', 'restecg', 'slope', 'thal'}

In [25]:
# Columns present in the Framingham frame but absent from UCI after renaming.
set(framingham_df.columns) - set(uci_df.columns)


{'BMI',
 'BPMeds',
 'cigsPerDay',
 'currentSmoker',
 'diaBP',
 'diabetes',
 'education',
 'glucose',
 'prevalentHyp',
 'prevalentStroke'}

In [26]:
# Give both frames the same set of columns: whatever one cohort lacks is added
# as an all-NaN column, so neither side loses features when they are stacked.
uci_only = [name for name in uci_df.columns if name not in framingham_df.columns]
framingham_only = [name for name in framingham_df.columns if name not in uci_df.columns]

framingham_df = framingham_df.assign(**{name: np.nan for name in uci_only})
uci_df = uci_df.assign(**{name: np.nan for name in framingham_only})


In [27]:
# Put both frames into the same alphabetical column order so they line up
# column-for-column when concatenated.
final_columns = list(uci_df.columns)
final_columns.sort()
uci_df, framingham_df = (
    frame.loc[:, final_columns] for frame in (uci_df, framingham_df)
)


In [28]:
# Stack the aligned cohorts row-wise (UCI rows first) with a fresh 0..n-1 index.
# NOTE(review): after this step nothing marks which file a row came from, and
# the two targets were derived from different raw columns (`num > 0` vs
# `TenYearCHD`) -- consider keeping a source indicator for downstream work.
merged_df = pd.concat(
    objs=[uci_df, framingham_df],
    axis=0,
    ignore_index=True,
)


In [29]:
# Sanity check: the row count should equal the sum of the two input frames,
# and the column count the size of the combined schema.
merged_df.shape


(5160, 24)

In [31]:
# Persist the merged frame next to the raw inputs. The raw files are read from
# "../data/raw", so the processed output belongs in the sibling
# "../data/processed" rather than in a "data/" folder created inside the
# notebook's own working directory.
processed_dir = "../data/processed"
os.makedirs(processed_dir, exist_ok=True)  # no error if it already exists

merged_df.to_csv(os.path.join(processed_dir, "heart_data_clean.csv"), index=False)

# Show the directory contents to confirm the file was written.
os.listdir(processed_dir)


['heart_data_clean.csv']

In [35]:
## Milestone 1: Data Readiness & Dataset Alignment

# In this section, we prepare the raw heart-disease datasets for modeling.
# Steps performed:
# - Removed non-informative identifiers
# - Unified the target variable across datasets
# - Preserved clinically important features
# - Aligned schemas between the UCI and Framingham datasets
# - Created a clean merged dataset for downstream feature engineering
# No feature scaling or encoding is performed at this stage.
