In [89]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


In [90]:
# Load datasets
diabetes_df = pd.read_csv("../Datasets/diabetes.csv")
heart_df = pd.read_csv("../Datasets/heart_disease_uci.csv")


In [91]:
# Preview dataset
heart_df.head()


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [92]:
# Dataset structure and data types
heart_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [93]:
heart_df.isnull().sum()


id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

In [94]:
# Check missing values
heart_df.isnull().sum()


id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

In [95]:
numeric_cols = heart_df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = heart_df.select_dtypes(include=['object', 'bool']).columns


In [96]:
# Fill numeric columns with median
heart_df[numeric_cols] = heart_df[numeric_cols].fillna(
    heart_df[numeric_cols].median()
)

# Fill categorical columns with mode
for col in categorical_cols:
    heart_df[col].fillna(heart_df[col].mode()[0], inplace=True)


  heart_df[col].fillna(heart_df[col].mode()[0], inplace=True)


In [97]:
heart_df.isnull().sum()


id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64

In [98]:
# num: 0 = no disease, >0 = disease
heart_df['num'] = heart_df['num'].apply(lambda x: 1 if x > 0 else 0)


In [99]:
encoder = LabelEncoder()

for col in categorical_cols:
    heart_df[col] = encoder.fit_transform(heart_df[col])


In [100]:
# Drop non-predictive ID column
X = heart_df.drop(['num', 'id'], axis=1)
y = heart_df['num']


In [101]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [102]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [103]:
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)


Training set shape: (736, 14)
Test set shape: (184, 14)


In [104]:
# Save preprocessed dataset
heart_df.to_csv("../Datasets/heart_preprocessed.csv", index=False)



The dataset was preprocessed through systematic data cleaning and transformation. Missing values were handled using median imputation for numerical features and mode imputation for categorical features. The target variable was binarized to represent the presence or absence of heart disease. Categorical attributes were encoded numerically, and feature scaling was applied to standardize the feature space. Finally, the dataset was split into training and testing sets using stratified sampling to preserve class balance.