In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE, SMOTENC
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler


Collecting imbalanced-learn
  Downloading imbalanced_learn-0.14.0-py3-none-any.whl.metadata (8.8 kB)
Downloading imbalanced_learn-0.14.0-py3-none-any.whl (239 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.14.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
# pandas is already imported as `pd` in the imports cell at the top of the
# notebook, so it is not re-imported here.
path = '../data/heart_disease.csv'

# Read the raw heart-disease table into `df`.
# NOTE(review): "Alcohol Consumption" shows 2586 missing values while every
# other column has at most 30. pandas' default NA parsing turns the literal
# string "None" into NaN, so these may be a real "None" category rather than
# true gaps -- confirm against the raw CSV before relying on imputation.
df = pd.read_csv(path)

# Preview the first rows.
df.head()


Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,...,High LDL Cholesterol,Alcohol Consumption,Stress Level,Sleep Hours,Sugar Consumption,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level,Heart Disease Status
0,56.0,Male,153.0,155.0,High,Yes,Yes,No,24.991591,Yes,...,No,High,Medium,7.633228,Medium,342.0,,12.969246,12.38725,No
1,69.0,Female,146.0,286.0,High,No,Yes,Yes,25.221799,No,...,No,Medium,High,8.744034,Medium,133.0,157.0,9.355389,19.298875,No
2,46.0,Male,126.0,216.0,Low,No,No,No,29.855447,No,...,Yes,Low,Low,4.44044,Low,393.0,92.0,12.709873,11.230926,No
3,32.0,Female,122.0,293.0,High,Yes,Yes,No,24.130477,Yes,...,Yes,Low,High,5.249405,High,293.0,94.0,12.509046,5.961958,No
4,60.0,Male,166.0,242.0,Low,Yes,Yes,Yes,20.486289,Yes,...,No,Low,High,7.030971,High,263.0,154.0,10.381259,8.153887,No


Rows and columns

In [4]:

# (rows, columns) of the loaded table.
dataset_shape = df.shape
print("Shape:", dataset_shape)

Shape: (10000, 21)


Identify categorical and numerical data

In [5]:

# dtype of every column: float64 columns are numeric, object columns hold text.
column_dtypes = df.dtypes
print("\nData Types:\n", column_dtypes)


Data Types:
 Age                     float64
Gender                   object
Blood Pressure          float64
Cholesterol Level       float64
Exercise Habits          object
Smoking                  object
Family Heart Disease     object
Diabetes                 object
BMI                     float64
High Blood Pressure      object
Low HDL Cholesterol      object
High LDL Cholesterol     object
Alcohol Consumption      object
Stress Level             object
Sleep Hours             float64
Sugar Consumption        object
Triglyceride Level      float64
Fasting Blood Sugar     float64
CRP Level               float64
Homocysteine Level      float64
Heart Disease Status     object
dtype: object


Number of missing values in each column

In [45]:

# Number of NaN entries in each column.
missing_per_column = df.isnull().sum()
print("\nMissing Values:\n", missing_per_column)


Missing Values:
 Age                       29
Gender                    19
Blood Pressure            19
Cholesterol Level         30
Exercise Habits           25
Smoking                   25
Family Heart Disease      21
Diabetes                  30
BMI                       22
High Blood Pressure       26
Low HDL Cholesterol       25
High LDL Cholesterol      26
Alcohol Consumption     2586
Stress Level              22
Sleep Hours               25
Sugar Consumption         30
Triglyceride Level        26
Fasting Blood Sugar       22
CRP Level                 26
Homocysteine Level        20
Heart Disease Status       0
dtype: int64


Unique values per column

In [46]:

# Count of distinct non-null values in each column.
unique_per_column = df.nunique()
print("\nUnique values per column:\n", unique_per_column)


Unique values per column:
 Age                       63
Gender                     2
Blood Pressure            61
Cholesterol Level        151
Exercise Habits            3
Smoking                    2
Family Heart Disease       2
Diabetes                   2
BMI                     9978
High Blood Pressure        2
Low HDL Cholesterol        2
High LDL Cholesterol       2
Alcohol Consumption        3
Stress Level               3
Sleep Hours             9975
Sugar Consumption          3
Triglyceride Level       301
Fasting Blood Sugar       81
CRP Level               9974
Homocysteine Level      9980
Heart Disease Status       2
dtype: int64


How balanced is the target variable?

In [47]:

# Row count for each class of the target column.
target_counts = df['Heart Disease Status'].value_counts()
print("\nTarget Distribution:\n", target_counts)


Target Distribution:
 Heart Disease Status
No     8000
Yes    2000
Name: count, dtype: int64


Check for duplicate rows

In [48]:

# Boolean Series: True for every row that exactly repeats an earlier row.
duplicates = df.duplicated()

# Report whether at least one such row was found.
duplicate_message = "Duplicates exist" if duplicates.any() else "Duplicates do not exist"
print(duplicate_message)

Duplicates do not exist


Separate features and target

In [49]:
# Name of the label column; everything else is used as a feature.
target_col = "Heart Disease Status"

X = df.drop(columns=[target_col])
y = df[target_col]

Split dataset first (80% train, 20% test)

In [50]:
# Hold out 20% of the rows for testing. Stratifying on `y` keeps the class
# proportions of the target the same in both splits, and the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

Select numerical columns

In [51]:
# Names of the float64 / int64 feature columns (taken from the training split).
numeric_frame = X_train.select_dtypes(include=['float64', 'int64'])
num_cols = numeric_frame.columns

Create median imputer

In [52]:
# Imputer that replaces each missing numeric value with its column's median.
median_imputer = SimpleImputer(
    strategy="median",
)


Fit only on training data

In [53]:
# Learn the per-column medians from the training split and fill its gaps.
train_numeric_filled = median_imputer.fit_transform(X_train[num_cols])
X_train[num_cols] = train_numeric_filled


Apply same transformation to test data

In [54]:
# Reuse the medians learned from the training split; nothing is re-fitted here.
test_numeric_filled = median_imputer.transform(X_test[num_cols])
X_test[num_cols] = test_numeric_filled

Check missing values in numerical columns

In [55]:
# Remaining NaN count per numeric column in each split after imputation.
train_numeric_missing = X_train[num_cols].isnull().sum()
test_numeric_missing = X_test[num_cols].isnull().sum()

print("Train missing:\n", train_numeric_missing)
print("Test missing:\n", test_numeric_missing)

Train missing:
 Age                    0
Blood Pressure         0
Cholesterol Level      0
BMI                    0
Sleep Hours            0
Triglyceride Level     0
Fasting Blood Sugar    0
CRP Level              0
Homocysteine Level     0
dtype: int64
Test missing:
 Age                    0
Blood Pressure         0
Cholesterol Level      0
BMI                    0
Sleep Hours            0
Triglyceride Level     0
Fasting Blood Sugar    0
CRP Level              0
Homocysteine Level     0
dtype: int64


Select categorical columns

In [56]:
# Names of the object-dtype (text) feature columns, treated as categorical.
object_frame = X_train.select_dtypes(include=["object"])
cat_cols = object_frame.columns

Create most frequent imputer

In [57]:
# Imputer that replaces each missing categorical value with the column's
# most frequent value.
cat_imputer = SimpleImputer(
    strategy="most_frequent",
)

Fit only on training data

In [58]:
# Learn each column's most frequent value from the training split and fill its gaps.
train_categorical_filled = cat_imputer.fit_transform(X_train[cat_cols])
X_train[cat_cols] = train_categorical_filled

Apply same transformation to test data


In [59]:
# Fill the test split with the values learned from the training split.
test_categorical_filled = cat_imputer.transform(X_test[cat_cols])
X_test[cat_cols] = test_categorical_filled

Check missing values

In [60]:
# Remaining NaN count for every feature column in each split.
train_missing = X_train.isnull().sum()
test_missing = X_test.isnull().sum()

print("Train missing:\n", train_missing)
print("Test missing:\n", test_missing)

Train missing:
 Age                     0
Gender                  0
Blood Pressure          0
Cholesterol Level       0
Exercise Habits         0
Smoking                 0
Family Heart Disease    0
Diabetes                0
BMI                     0
High Blood Pressure     0
Low HDL Cholesterol     0
High LDL Cholesterol    0
Alcohol Consumption     0
Stress Level            0
Sleep Hours             0
Sugar Consumption       0
Triglyceride Level      0
Fasting Blood Sugar     0
CRP Level               0
Homocysteine Level      0
dtype: int64
Test missing:
 Age                     0
Gender                  0
Blood Pressure          0
Cholesterol Level       0
Exercise Habits         0
Smoking                 0
Family Heart Disease    0
Diabetes                0
BMI                     0
High Blood Pressure     0
Low HDL Cholesterol     0
High LDL Cholesterol    0
Alcohol Consumption     0
Stress Level            0
Sleep Hours             0
Sugar Consumption       0
Triglyceride Level  

Define the ordinal columns and the order of their categories

In [61]:
# All ordinal features share the same three levels, listed lowest to highest.
ordinal_levels = ["Low", "Medium", "High"]

# Categorical features whose values have a natural order.
ordinal_cols = [
    "Stress Level",
    "Exercise Habits",
    "Alcohol Consumption",
    "Sugar Consumption",
]

# One independent copy of the level ordering per ordinal column, in the same
# order as `ordinal_cols`.
ordinal_mapping = [list(ordinal_levels) for _ in ordinal_cols]

Apply Ordinal Encoding for ordered features

In [62]:
# Encoder that maps each ordinal column to a numeric code following the
# category order given in `ordinal_mapping`.
ord_encoder = OrdinalEncoder(categories=ordinal_mapping)

# Fit on the training split only, then apply the same mapping to both splits.
ord_encoder.fit(X_train[ordinal_cols])
X_train[ordinal_cols] = ord_encoder.transform(X_train[ordinal_cols])
X_test[ordinal_cols] = ord_encoder.transform(X_test[ordinal_cols])

Apply One-Hot Encoding for the rest of categorical columns

In [63]:
# Nominal (unordered) categoricals: every categorical column that was not
# ordinal-encoded. `cat_cols` is narrowed to this list.
cat_cols = [c for c in cat_cols if c not in ordinal_cols]

# Learn each column's category levels from the TRAINING split only and pin
# them as a CategoricalDtype. Calling get_dummies independently on each split
# lets every split derive its own levels: a level absent from the test split
# changes which level `drop_first` removes, so the same dummy column could
# mean different things in train and test. With pinned levels both splits
# produce identical columns; a level seen only in the test split becomes NaN
# and is encoded as all zeros.
nominal_dtypes = {
    col: pd.CategoricalDtype(categories=sorted(X_train[col].dropna().unique()))
    for col in cat_cols
}

X_train = pd.get_dummies(X_train.astype(nominal_dtypes), columns=cat_cols, drop_first=True)
X_test = pd.get_dummies(X_test.astype(nominal_dtypes), columns=cat_cols, drop_first=True)

Align columns between train & test

In [64]:
# Give the test split exactly the training split's columns (join="left" keeps
# the training layout); a column missing from the test split is created and
# filled with 0.
aligned_train, aligned_test = X_train.align(
    X_test,
    join="left",
    axis=1,
    fill_value=0,
)
X_train, X_test = aligned_train, aligned_test

Feature scaling

In [65]:
# Standardise the numeric columns. The scaling statistics are learned from the
# training split only and then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train[num_cols])

X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

Handle Class Imbalance

In [66]:
# Every column outside `num_cols` is an encoded categorical (ordinal code or
# one-hot dummy). Plain SMOTE interpolates all features as if they were
# continuous, which produces in-between values for such columns; SMOTENC
# interpolates only the continuous columns and assigns categorical ones from
# the neighbouring samples.
categorical_mask = [col not in num_cols for col in X_train.columns]

if any(categorical_mask):
    smote = SMOTENC(categorical_features=categorical_mask, random_state=42)
else:
    # No categorical columns to protect: plain SMOTE is sufficient.
    smote = SMOTE(random_state=42)

# Oversample the minority class on the training split only; the test split
# is left untouched.
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

In [67]:
# Class counts of the training target before and after resampling.
class_counts_before = y_train.value_counts()
class_counts_after = pd.Series(y_train_bal).value_counts()

print("Before SMOTE:", class_counts_before)
print("After SMOTE:", class_counts_after)

Before SMOTE: Heart Disease Status
No     6400
Yes    1600
Name: count, dtype: int64
After SMOTE: Heart Disease Status
No     6400
Yes    6400
Name: count, dtype: int64
