In [1]:
# Import basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# For preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

Load Dataset


In [2]:
# Read the raw heart-disease records from the project-relative CSV file
DATA_PATH = 'dataset/heart_disease.csv'
dataset = pd.read_csv(DATA_PATH)

In [3]:
# Sanity check after loading: report the table dimensions and preview the top rows
print(
    "Dataset loaded successfully!",
    f"Shape: {dataset.shape}",
    "\nFirst 5 rows:",
    dataset.head(),
    sep="\n",
)

Dataset loaded successfully!
Shape: (10000, 21)

First 5 rows:
    Age  Gender  Blood Pressure  Cholesterol Level Exercise Habits Smoking  \
0  56.0    Male           153.0              155.0            High     Yes   
1  69.0  Female           146.0              286.0            High      No   
2  46.0    Male           126.0              216.0             Low      No   
3  32.0  Female           122.0              293.0            High     Yes   
4  60.0    Male           166.0              242.0             Low     Yes   

  Family Heart Disease Diabetes        BMI High Blood Pressure  ...  \
0                  Yes       No  24.991591                 Yes  ...   
1                  Yes      Yes  25.221799                  No  ...   
2                   No       No  29.855447                  No  ...   
3                  Yes       No  24.130477                 Yes  ...   
4                  Yes      Yes  20.486289                 Yes  ...   

  High LDL Cholesterol Alcohol Consumptio

In [4]:
# Keep the raw features (every column but the last) and the raw labels
# (the last column) as arrays, for reference before any preprocessing
feature_cols = dataset.columns[:-1]
target_col = dataset.columns[-1]
X_original = dataset[feature_cols].values
y_original = dataset[target_col].values

In [5]:
# Show one raw feature row (index 1) to illustrate the mixed text/number content
print("\nOriginal X sample (row 1):", X_original[1], sep="\n")


Original X sample (row 1):
[69.0 'Female' 146.0 286.0 'High' 'No' 'Yes' 'Yes' 25.2217985244363 'No'
 'Yes' 'No' 'Medium' 'High' 8.744033968961478 'Medium' 133.0 157.0
 9.355389404894291 19.298875477603804]


In [6]:
# Show the first ten raw target labels
print(f"Original y sample: {y_original[:10]!s}")

Original y sample: ['No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No']


Data Preprocessing: Convert All to Numeric (Label Encoding)

In [7]:
# Work on an independent deep copy so the loaded `dataset` stays untouched
df = dataset.copy(deep=True)

In [8]:
# Text-valued feature columns that must be label-encoded before modelling
categorical_cols = [
    'Gender',
    'Exercise Habits',
    'Smoking',
    'Family Heart Disease',
    'Diabetes',
    'High Blood Pressure',
    'Low HDL Cholesterol',
    'High LDL Cholesterol',
    'Alcohol Consumption',
    'Stress Level',
    'Sugar Consumption',
]

In [9]:
# Replace missing entries in every categorical column with the placeholder
# label 'Unknown' (done for all listed columns in a single vectorised call)
df[categorical_cols] = df[categorical_cols].fillna('Unknown')

In [10]:
# Apply LabelEncoder to all categorical columns.
# Each fitted encoder is kept in `label_encoders` (keyed by column name) so the
# integer codes can be translated back later via `.classes_` or
# `.inverse_transform()`; previously every encoder was discarded after use.
print("Converting categorical columns to numeric using LabelEncoder...")
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    # astype(str) gives the encoder a uniform string column to fit on
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

Converting categorical columns to numeric using LabelEncoder...


In [11]:
# Encode the target column 'Heart Disease Status' as integer class codes
target_encoder = LabelEncoder()
df['Heart Disease Status'] = target_encoder.fit_transform(df['Heart Disease Status'])

In [12]:
# Fill missing values in numeric columns with the column mean.
# numeric_only=True keeps this cell working even if a text column is still
# present (recent pandas raises TypeError when averaging object columns).
# NOTE(review): the means are computed on the full dataset, before the
# train/test split, so test rows influence the imputed values; for a strict
# evaluation, impute after splitting using training-set statistics only.
print("Filling missing numeric values with column mean...")
df = df.fillna(df.mean(numeric_only=True))

Filling missing numeric values with column mean...


In [13]:
# Confirm the encoding step and preview the transformed table
print(
    "All data is now numeric!",
    "\nFirst 5 rows after encoding:",
    df.head(),
    sep="\n",
)

All data is now numeric!

First 5 rows after encoding:
    Age  Gender  Blood Pressure  Cholesterol Level  Exercise Habits  Smoking  \
0  56.0       1           153.0              155.0                0        2   
1  69.0       0           146.0              286.0                0        0   
2  46.0       1           126.0              216.0                1        0   
3  32.0       0           122.0              293.0                0        2   
4  60.0       1           166.0              242.0                1        2   

   Family Heart Disease  Diabetes        BMI  High Blood Pressure  ...  \
0                     2         0  24.991591                    2  ...   
1                     2         2  25.221799                    0  ...   
2                     0         0  29.855447                    0  ...   
3                     2         0  24.130477                    2  ...   
4                     2         2  20.486289                    2  ...   

   High LDL Cholest

Outlier Detection and Removal 

In [14]:
# Split the encoded frame into a feature matrix and a label vector (numpy arrays)
# so the outlier detection below can operate column-wise
X = df.drop(columns='Heart Disease Status').values
y = df['Heart Disease Status'].values

print("Original data shape:", X.shape)

Original data shape: (10000, 20)


In [15]:
# Detect outliers using the IQR rule (fences at Q1 - 1.5*IQR and Q3 + 1.5*IQR).
# The fences are only meaningful for continuous measurements: the columns in
# `categorical_cols` hold arbitrary LabelEncoder codes, so a rare category
# could otherwise be flagged as an "outlier". Their bounds are opened to
# (-inf, +inf) so they can never exclude a row. The bound arrays still have
# one entry per column of X, so the masking step that follows works unchanged.
feature_names = df.drop('Heart Disease Status', axis=1).columns  # same column order as X
is_continuous = ~feature_names.isin(categorical_cols)

Q1 = np.percentile(X, 25, axis=0)
Q3 = np.percentile(X, 75, axis=0)
IQR = Q3 - Q1
lower_bound = np.where(is_continuous, Q1 - 1.5 * IQR, -np.inf)
upper_bound = np.where(is_continuous, Q3 + 1.5 * IQR, np.inf)

In [16]:
# Flag every row that has at least one value outside its column's bounds,
# then keep only the rows that were not flagged
below = X < lower_bound
above = X > upper_bound
has_outlier = np.logical_or(below, above).any(axis=1)
mask = ~has_outlier
X_clean, y_clean = X[mask], y[mask]

In [17]:
# Report how many rows the outlier filter dropped
print("Outliers removed:", len(X) - len(X_clean), "rows")

Outliers removed: 0 rows


In [18]:
# Report how many rows remain after filtering
print("Clean data shape:", len(X_clean), "rows")

Clean data shape: 10000 rows


Train-Test Split + Normalization (Standardization)

In [19]:
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both partitions, and the fixed seed makes the
# split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_clean,
    y_clean,
    stratify=y_clean,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Number of rows in the training partition
print("Training set:", len(X_train), "samples")

Training set: 8000 samples


In [21]:
# Number of rows in the testing partition (label padded to align with the line above)
print("Testing set: ", len(X_test), "samples")

Testing set:  2000 samples


In [22]:
# Standardize the features (zero mean, unit variance). The scaling statistics
# are learned from the training split only and then reused for the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [23]:
# Preview the first five standardized training rows
print("X_train_scaled sample (first 5 rows):", X_train_scaled[:5], sep="\n")

X_train_scaled sample (first 5 rows):
[[-1.66241820e+00  9.90097829e-01  1.43695865e+00  1.06516751e+00
  -1.21503032e+00 -1.03259547e+00  1.00237910e+00 -1.00740505e+00
  -4.44086807e-02  9.90112804e-01  1.01334052e+00 -9.92715769e-01
  -4.70030366e-01 -1.00532642e-02  1.34115193e+00 -1.22372403e+00
  -5.76304393e-01  1.64318497e+00 -3.80708295e-01  1.28021380e+00]
 [-1.49755990e+00 -9.96056291e-01 -1.51521007e+00 -1.27925675e+00
  -1.21503032e+00 -1.03259547e+00 -9.99876276e-01 -1.00740505e+00
   1.73957975e-01 -1.01239065e+00  1.01334052e+00  1.00998910e+00
   4.25692103e-01 -1.22863074e+00  1.01789186e-04 -2.59489941e-03
   9.61286991e-01  7.49535060e-02 -9.85412505e-01 -1.09906762e+00]
 [-2.33646195e-01 -9.96056291e-01  4.71826569e-01 -1.23328765e+00
   1.20414403e+00  9.71228362e-01  1.00237910e+00  9.95886122e-01
   7.62484473e-01  9.90112804e-01 -9.89308732e-01  1.00998910e+00
   1.32141457e+00  1.20852421e+00  1.11820129e+00 -2.59489941e-03
   1.44321862e+00 -4.76046738e-01  1

Export the final clean data to an ARFF file (for Weka)

In [24]:
pip install liac-arff -q

Note: you may need to restart the kernel to use updated packages.


In [25]:
import arff
import numpy as np

In [26]:
# Rebuild a labelled table from the cleaned arrays: the feature columns keep
# their names and order from `df`, and the target is appended as the last column
feature_columns = df.columns.drop('Heart Disease Status')
df_final = pd.DataFrame(X_clean, columns=feature_columns).assign(
    **{'Heart Disease Status': y_clean}
)

In [27]:
# Convert the encoded target back to its class labels for the ARFF export.
# The labels are rebuilt from `y_clean` (the 0/1 codes) rather than from the
# column itself, so re-running this cell is safe: mapping the already-mapped
# 'No'/'Yes' strings a second time would turn every value into NaN.
df_final['Heart Disease Status'] = pd.Series(y_clean, index=df_final.index).map({0: 'No', 1: 'Yes'})

In [28]:
# Preview the table that will be written to the ARFF file
print("Final clean dataset for export:", df_final.head(), sep="\n")

Final clean dataset for export:
    Age  Gender  Blood Pressure  Cholesterol Level  Exercise Habits  Smoking  \
0  56.0     1.0           153.0              155.0              0.0      2.0   
1  69.0     0.0           146.0              286.0              0.0      0.0   
2  46.0     1.0           126.0              216.0              1.0      0.0   
3  32.0     0.0           122.0              293.0              0.0      2.0   
4  60.0     1.0           166.0              242.0              1.0      2.0   

   Family Heart Disease  Diabetes        BMI  High Blood Pressure  ...  \
0                   2.0       0.0  24.991591                  2.0  ...   
1                   2.0       2.0  25.221799                  0.0  ...   
2                   0.0       0.0  29.855447                  0.0  ...   
3                   2.0       0.0  24.130477                  2.0  ...   
4                   2.0       2.0  20.486289                  2.0  ...   

   High LDL Cholesterol  Alcohol Consumpti

In [29]:
# Row count of the export table
print("Total samples:", df_final.shape[0])

Total samples: 10000


In [30]:
# Build the ARFF attribute declarations as (name, type) pairs: the target is
# a nominal class with values 'No'/'Yes', every feature is declared NUMERIC
attributes = []
for col in df_final.columns:
    arff_type = ['No', 'Yes'] if col == 'Heart Disease Status' else 'NUMERIC'
    attributes.append((col, arff_type))

In [31]:
# Row-major list of lists holding the export table's values
data_list = df_final.to_numpy().tolist()

In [32]:
# Assemble the ARFF document structure expected by liac-arff.
# The relation name is a plain identifier: the former 'dataset/' prefix was a
# stray path fragment and disagreed with the name used when the file is written.
arff_data = {
    'relation': 'heart_disease_clean',
    'description': 'Heart Disease Dataset - Full preprocessing: Label Encoding + Outlier Removal + Ready for Weka',
    'attributes': attributes,
    'data': data_list
}

In [33]:
# Final ARFF document: relation name, free-text description, attribute
# declarations and the data rows
arff_data = dict(
    relation='heart_disease_clean',
    description='Heart Disease Dataset - Full preprocessing: Label Encoding + Outlier Removal + Ready for Weka',
    attributes=attributes,
    data=data_list,
)

# Export the .arff file
with open('heart_disease_clean.arff', 'w', encoding='utf-8') as arff_file:
    arff.dump(arff_data, arff_file)