Load the dataset

In [20]:
import pandas as pd

# Load the dataset
df = pd.read_csv('data/heart_disease_uci.csv')

# View basic info
print(df.shape)
print(df.head())

(920, 16)
   id  age     sex    dataset               cp  trestbps   chol    fbs  \
0   1   63    Male  Cleveland   typical angina     145.0  233.0   True   
1   2   67    Male  Cleveland     asymptomatic     160.0  286.0  False   
2   3   67    Male  Cleveland     asymptomatic     120.0  229.0  False   
3   4   37    Male  Cleveland      non-anginal     130.0  250.0  False   
4   5   41  Female  Cleveland  atypical angina     130.0  204.0  False   

          restecg  thalch  exang  oldpeak        slope   ca  \
0  lv hypertrophy   150.0  False      2.3  downsloping  0.0   
1  lv hypertrophy   108.0   True      1.5         flat  3.0   
2  lv hypertrophy   129.0   True      2.6         flat  2.0   
3          normal   187.0  False      3.5  downsloping  0.0   
4  lv hypertrophy   172.0  False      1.4    upsloping  0.0   

                thal  num  
0       fixed defect    0  
1             normal    2  
2  reversable defect    1  
3             normal    0  
4             normal    0 

In [21]:
print(df.info())
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB
None
               id         age    trestbps        chol      thalch     oldpeak  \
count  920.000000  920.000000  861.000000  

In [22]:
df.rename(columns={
    'age': 'Age',
    'sex': 'Sex',
    'dataset': 'DatasetSource',
    'cp': 'ChestPainType',
    'trestbps': 'RestingBP',
    'chol': 'Cholesterol',
    'fbs': 'FastingBS',
    'restecg': 'RestingECG',
    'thalch': 'MaxHR',
    'exang': 'ExerciseAngina',
    'oldpeak': 'Oldpeak',
    'slope': 'ST_Slope',
    'ca': 'NumMajorVessels',
    'thal': 'Thalassemia',
    'num': 'HeartDisease'
}, inplace=True)

In [23]:
df.drop(['id', 'DatasetSource'], axis=1, inplace=True)

In [24]:
print(df.isnull().sum())

Age                  0
Sex                  0
ChestPainType        0
RestingBP           59
Cholesterol         30
FastingBS           90
RestingECG           2
MaxHR               55
ExerciseAngina      55
Oldpeak             62
ST_Slope           309
NumMajorVessels    611
Thalassemia        486
HeartDisease         0
dtype: int64


In [25]:
# Numerical columns -> Fill with median
num_cols = ['RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak', 'NumMajorVessels']
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

In [26]:
# Categorical columns -> Fill with mode
cat_cols = ['FastingBS', 'RestingECG', 'ExerciseAngina', 'ST_Slope', 'Thalassemia']
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

  df[col] = df[col].fillna(df[col].mode()[0])


In [27]:
# Label Encoding categorical columns
label_cols = ['Sex', 'ChestPainType', 'FastingBS', 'RestingECG', 'ExerciseAngina', 'ST_Slope', 'Thalassemia']
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in label_cols:
    df[col] = le.fit_transform(df[col])

In [28]:
# Adjust Target Column
# Sometimes 'HeartDisease' is 0, 1, 2, 3, 4 in this dataset
# 0 = Healthy, 1-4 = Heart disease -> Convert to binary 0/1
df['HeartDisease'] = df['HeartDisease'].apply(lambda x: 1 if x > 0 else 0)

In [29]:
# Save processed data
df.to_csv('data/processed_heart_data.csv', index=False)