In [6]:
# ===== Cell 1: imports & load =====
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import SimpleImputer, IterativeImputer

# Location of the raw heart-disease CSV, relative to this notebook.
# Change this path if your copy of the data lives somewhere else.
DATA_PATH = "../data/heart_disease.csv"
df = pd.read_csv(DATA_PATH)

# Preview the first rows and report the overall (rows, columns) size.
display(df.head())
print("Shape:", df.shape)

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,...,High LDL Cholesterol,Alcohol Consumption,Stress Level,Sleep Hours,Sugar Consumption,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level,Heart Disease Status
0,56.0,Male,153.0,155.0,High,Yes,Yes,No,24.991591,Yes,...,No,High,Medium,7.633228,Medium,342.0,,12.969246,12.38725,No
1,69.0,Female,146.0,286.0,High,No,Yes,Yes,25.221799,No,...,No,Medium,High,8.744034,Medium,133.0,157.0,9.355389,19.298875,No
2,46.0,Male,126.0,216.0,Low,No,No,No,29.855447,No,...,Yes,Low,Low,4.44044,Low,393.0,92.0,12.709873,11.230926,No
3,32.0,Female,122.0,293.0,High,Yes,Yes,No,24.130477,Yes,...,Yes,Low,High,5.249405,High,293.0,94.0,12.509046,5.961958,No
4,60.0,Male,166.0,242.0,Low,Yes,Yes,Yes,20.486289,Yes,...,No,Low,High,7.030971,High,263.0,154.0,10.381259,8.153887,No


Shape: (10000, 21)


In [7]:
# ===== Cell 2: quick audit =====
# Each report is printed as "header, newline, body". Using sep="\n" instead of
# embedding "\n" in the header avoids the stray space that print's default
# separator would otherwise put in front of the first row of every table.
print("Dtypes:", df.dtypes, sep="\n", end="\n\n")
print("Missing values:", df.isna().sum(), sep="\n", end="\n\n")

# dropna=False keeps missing target labels visible as their own row instead of
# silently leaving them out of the class balance.
print(
    "Target distribution:",
    df["Heart Disease Status"].value_counts(dropna=False),
    sep="\n",
)
print("\nAny duplicates?", df.duplicated().any())

Dtypes:
 Age                     float64
Gender                   object
Blood Pressure          float64
Cholesterol Level       float64
Exercise Habits          object
Smoking                  object
Family Heart Disease     object
Diabetes                 object
BMI                     float64
High Blood Pressure      object
Low HDL Cholesterol      object
High LDL Cholesterol     object
Alcohol Consumption      object
Stress Level             object
Sleep Hours             float64
Sugar Consumption        object
Triglyceride Level      float64
Fasting Blood Sugar     float64
CRP Level               float64
Homocysteine Level      float64
Heart Disease Status     object
dtype: object 

Missing values:
 Age                       29
Gender                    19
Blood Pressure            19
Cholesterol Level         30
Exercise Habits           25
Smoking                   25
Family Heart Disease      21
Diabetes                  30
BMI                       22
High Blood Pressure      

In [8]:
# ===== Cell 3: schema & split =====
TARGET_COL = "Heart Disease Status"
TARGET_LABELS = {"No": 0, "Yes": 1}

# Encode the target as binary 0/1.
# Series.map turns every value that is not a key of TARGET_LABELS (missing
# values included) into NaN, and the integer cast below would then fail with an
# opaque error. Check explicitly so the offending labels are reported.
y_encoded = df[TARGET_COL].map(TARGET_LABELS)
unmapped_mask = y_encoded.isna()
if unmapped_mask.any():
    bad_labels = df.loc[unmapped_mask, TARGET_COL].unique().tolist()
    raise ValueError(
        f"{int(unmapped_mask.sum())} row(s) in {TARGET_COL!r} have labels outside "
        f"{list(TARGET_LABELS)}: {bad_labels}"
    )
y = y_encoded.astype(int)

# Features only
X = df.drop(columns=[TARGET_COL])

# Identify column types from the dtypes pandas inferred at load time
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(include=["number"]).columns.tolist()

print("Categorical:", cat_cols)
print("Numeric:", num_cols)

# Columns matched by neither selector are not passed to either pipeline, and
# the ColumnTransformer in this notebook uses remainder="drop", so they would
# disappear without notice. Surface them here (warning only, no failure).
uncovered_cols = [c for c in X.columns if c not in cat_cols and c not in num_cols]
if uncovered_cols:
    print("WARNING: columns that are neither object nor numeric dtype:", uncovered_cols)

# Stratified split (avoid leakage: split BEFORE fitting imputers/scalers)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Shapes of both splits plus the positive-class rate in each; the rates should
# match closely because the split is stratified on y.
X_train.shape, X_test.shape, y_train.mean(), y_test.mean()

Categorical: ['Gender', 'Exercise Habits', 'Smoking', 'Family Heart Disease', 'Diabetes', 'High Blood Pressure', 'Low HDL Cholesterol', 'High LDL Cholesterol', 'Alcohol Consumption', 'Stress Level', 'Sugar Consumption']
Numeric: ['Age', 'Blood Pressure', 'Cholesterol Level', 'BMI', 'Sleep Hours', 'Triglyceride Level', 'Fasting Blood Sugar', 'CRP Level', 'Homocysteine Level']


((8000, 20), (2000, 20), np.float64(0.2), np.float64(0.2))

In [9]:
# ===== Cell 4: preprocessing pipelines =====

# Numeric branch: fill gaps with IterativeImputer, then standardize.
numeric_steps = [
    ("imputer", IterativeImputer(max_iter=15, sample_posterior=False, random_state=42)),
    ("scaler", StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode into a dense array; levels unseen during fit are ignored at transform.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Route each column group to its own branch; columns listed in neither group
# are dropped from the output.
preprocessor = ColumnTransformer(
    remainder="drop",
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols),
    ],
)

preprocessor

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,estimator,
,missing_values,
,sample_posterior,False
,max_iter,15
,tol,0.001
,n_nearest_features,
,initial_strategy,'mean'
,fill_value,
,imputation_order,'ascending'
,skip_complete,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [10]:
# ===== Cell 5: fit/transform =====

# Learn all imputation / scaling / encoding parameters from the training split
# only, so nothing about the test split leaks into preprocessing.
preprocessor.fit(X_train)

# Apply the already-fitted preprocessor to each split, train first, then test.
X_train_pre, X_test_pre = (
    preprocessor.transform(split) for split in (X_train, X_test)
)

# Both outputs should have the same number of columns.
X_train_pre.shape, X_test_pre.shape

((8000, 35), (2000, 35))

In [11]:
# ===== Cell 6: recover feature names & build DataFrames =====
def get_feature_names(preprocessor, num_cols, cat_cols):
    """Return output column names for the fitted preprocessor, numeric first.

    Parameters
    ----------
    preprocessor : fitted transformer exposing ``named_transformers_["cat"]``
        whose ``named_steps["ohe"]`` provides ``get_feature_names_out``.
    num_cols : sequence of str
        Numeric input column names; they are returned unchanged and in order.
        Any sequence is accepted (list, tuple, ``pd.Index``, ndarray).
    cat_cols : sequence of str
        Categorical input column names, forwarded to the encoder so it can
        build the expanded one-hot names.

    Returns
    -------
    list of str
        A new list: the numeric names followed by the one-hot encoded names.
    """
    # Copy into a plain list. With a pd.Index or ndarray, `+` below would be
    # element-wise string addition (or a length error) instead of
    # concatenation, and a tuple would raise TypeError.
    num_feats = list(num_cols)
    # One-hot names come from the fitted encoder inside the "cat" branch.
    ohe = preprocessor.named_transformers_["cat"].named_steps["ohe"]
    cat_feats = ohe.get_feature_names_out(cat_cols).tolist()
    return num_feats + cat_feats

# Column labels for the transformed arrays, in the order the preprocessor
# emits them (numeric columns first, then the one-hot columns).
feature_names = get_feature_names(preprocessor, num_cols, cat_cols)

# Wrap the transformed arrays in DataFrames, reusing the original row index of
# each split so rows can still be matched to y_train / y_test.
X_train_df = pd.DataFrame(data=X_train_pre, index=X_train.index, columns=feature_names)
X_test_df  = pd.DataFrame(data=X_test_pre,  index=X_test.index,  columns=feature_names)

display(X_train_df.head())
print("Train:", X_train_df.shape, " Test:", X_test_df.shape)

Unnamed: 0,Age,Blood Pressure,Cholesterol Level,BMI,Sleep Hours,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level,Gender_Female,...,High LDL Cholesterol_Yes,Alcohol Consumption_High,Alcohol Consumption_Low,Alcohol Consumption_Medium,Stress Level_High,Stress Level_Low,Stress Level_Medium,Sugar Consumption_High,Sugar Consumption_Low,Sugar Consumption_Medium
7395,-1.66241,1.436965,1.065153,-0.044429,1.341158,-0.57632,1.643176,-0.380703,1.280234,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
5642,-1.497552,-1.515204,-1.279271,0.173938,-0.018212,0.961271,0.074944,-0.985407,-1.099048,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
8866,-0.233638,0.471833,-1.233302,0.762464,1.118207,1.443203,-0.476056,1.345247,1.690009,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
8517,-1.112883,-0.379755,1.524844,-1.204864,1.635621,1.362881,0.001841,-1.567243,0.073427,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3706,-0.178685,-0.833934,-0.038105,-0.140973,0.556991,1.144865,1.21933,0.793706,1.333971,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


Train: (8000, 35)  Test: (2000, 35)


In [14]:
# ===== Cell 7: persist preprocessor (optional) =====
import joblib

# Write the fitted preprocessor to disk in the current working directory so a
# later session can reload it without refitting.
joblib.dump(value=preprocessor, filename="preprocessor.joblib")
print("Saved preprocessor → preprocessor.joblib")

Saved preprocessor → preprocessor.joblib


In [12]:
# ===== Cell 8: outputs to use in modeling =====
# Share of positive labels in each split.
print("y_train positive rate:", y_train.mean())
print("y_test  positive rate:", y_test.mean())

# NumPy versions of the preprocessed feature frames, for APIs that want arrays.
X_train_np = X_train_df.values
X_test_np  = X_test_df.values

# Sanity check: every entry must be non-NaN once imputation has run.
assert (~np.isnan(X_train_np)).all(), "NaNs found in X_train after preprocessing!"
assert (~np.isnan(X_test_np)).all(), "NaNs found in X_test after preprocessing!"

# Final shapes of the features and targets for both splits.
(X_train_np.shape, X_test_np.shape, y_train.shape, y_test.shape)

y_train positive rate: 0.2
y_test  positive rate: 0.2


((8000, 35), (2000, 35), (8000,), (2000,))

In [13]:
# Export the preprocessed features and the targets as CSV files in the current
# working directory. The row index is omitted from every file, so rows of the
# X_* and y_* files line up by position only.
csv_exports = {
    "X_train_preprocessed.csv": X_train_df,
    "X_test_preprocessed.csv": X_test_df,
    "y_train.csv": y_train,
    "y_test.csv": y_test,
}
for csv_name, table in csv_exports.items():
    table.to_csv(csv_name, index=False)