In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


## Data Labels

1. **age**: Age in years  
2. **sex**: Sex  
   - `1` = Male  
   - `0` = Female  
3. **cp**: Chest pain type  
   - Value `1`: Typical angina  
   - Value `2`: Atypical angina  
   - Value `3`: Non-anginal pain  
   - Value `4`: Asymptomatic  
4. **trestbps**: Resting blood pressure (in mm Hg on admission to the hospital)  
5. **chol**: Serum cholesterol in mg/dl  
6. **fbs**: Fasting blood sugar (> 120 mg/dl)  
   - `1` = True  
   - `0` = False  
7. **restecg**: Resting electrocardiographic results  
   - Value `0`: Normal  
   - Value `1`: Having ST-T wave abnormality (T wave inversions and/or ST  
     elevation or depression of > 0.05 mV)  
   - Value `2`: Showing probable or definite left ventricular hypertrophy by  
     Estes' criteria  
8. **thalach**: Maximum heart rate achieved  
9. **exang**: Exercise-induced angina  
   - `1` = Yes  
   - `0` = No  
10. **oldpeak**: ST depression induced by exercise relative to rest  
11. **slope**: The slope of the peak exercise ST segment  
    - Value `1`: Upsloping  
    - Value `2`: Flat  
    - Value `3`: Downsloping  
12. **ca**: Number of major vessels (0-3) colored by fluoroscopy  
13. **thal**:  
    - `3` = Normal  
    - `6` = Fixed defect  
    - `7` = Reversible defect
14. **label**: 
    - `0` = Absence 
    - `1` = Presence 
    - `2` = Presence
    - `3` = Presence
    - `4` = Presence

In [2]:
columns = ['age','sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'label']

In [20]:
hungary_df = pd.read_csv('data/processed.hungarian.data',index_col=False, names = columns)
swiss_df = pd.read_csv('data/processed.switzerland.data',index_col=False, names = columns)
cleveland_df = pd.read_csv('data/processed.cleveland.data',index_col=False, names = columns)
va_df = pd.read_csv('data/processed.va.data',index_col=False, names = columns)

# Combine the four dataset into one consolidated set 
combined_df = pd.concat([hungary_df, swiss_df, cleveland_df, va_df], axis = 0)

# Replacing ? with NaN values 
combined_df.replace('?', np.nan, inplace = True)  


In [54]:
# Checking datatypes of each feature
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 920 entries, 0 to 199
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   age       920 non-null    float64 
 1   sex       920 non-null    category
 2   cp        920 non-null    category
 3   trestbps  861 non-null    float64 
 4   chol      890 non-null    float64 
 5   fbs       830 non-null    object  
 6   restecg   918 non-null    object  
 7   thalach   865 non-null    float64 
 8   exang     865 non-null    object  
 9   oldpeak   865 non-null    float64 
 10  slope     611 non-null    object  
 11  ca        309 non-null    object  
 12  thal      434 non-null    object  
 13  label     920 non-null    category
dtypes: category(3), float64(5), object(6)
memory usage: 89.5+ KB


In [69]:
# Casting continuous features to float64 instead of categories 
combined_df['trestbps'] = combined_df['trestbps'].astype('float64')
combined_df['chol'] = combined_df['chol'].astype('float64')
combined_df['thalach'] = combined_df['thalach'].astype('float64')
combined_df['oldpeak'] = combined_df['thalach'].astype('float64')

# Casting label as categorical 
combined_df['label'] = combined_df['label'].astype('category')

# Casting as categorical 
combined_df['cp'] = combined_df['cp'].astype('object')
combined_df['sex'] = combined_df['sex'].astype('object')

# For the following features, have to first convert dtype to number first to ensure the category labels 
# are not affected by decimals (i.e. 1.0 and 1 are not treated as different groups)
combined_df['exang'] = pd.to_numeric(combined_df['exang'], errors='coerce').astype('object')
combined_df['thal'] = pd.to_numeric(combined_df['thal'], errors='coerce').astype('object')
combined_df['fbs'] = pd.to_numeric(combined_df['fbs'], errors='coerce').astype('object')
combined_df['ca'] = pd.to_numeric(combined_df['ca'], errors='coerce').astype('object')
combined_df['slope'] = pd.to_numeric(combined_df['slope'], errors='coerce').astype('object')
combined_df['restecg'] = pd.to_numeric(combined_df['restecg'], errors='coerce').astype('object')


In [70]:
train_df, test_df = train_test_split(combined_df, test_size=0.3, random_state=123)

In [35]:
train_df.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
232,46.0,1.0,4.0,130.0,222.0,0.0,0.0,112.0,0.0,112.0,,,,1
150,54.0,1.0,2.0,120.0,246.0,0.0,0.0,110.0,0.0,110.0,,,,0
112,66.0,1.0,4.0,150.0,0.0,0.0,0.0,108.0,1.0,108.0,2.0,,7.0,3
132,53.0,0.0,2.0,140.0,216.0,0.0,0.0,142.0,1.0,142.0,2.0,,,0
9,34.0,0.0,2.0,130.0,161.0,0.0,0.0,190.0,0.0,190.0,,,,0
254,44.0,1.0,4.0,130.0,290.0,0.0,0.0,100.0,1.0,100.0,2.0,,,1
69,63.0,1.0,2.0,,217.0,1.0,1.0,,,,,,,1
82,46.0,1.0,4.0,110.0,238.0,0.0,1.0,140.0,1.0,140.0,2.0,,3.0,0
64,43.0,0.0,2.0,150.0,186.0,0.0,0.0,154.0,0.0,154.0,,,,0
144,58.0,1.0,3.0,105.0,240.0,0.0,2.0,154.0,1.0,154.0,2.0,0.0,7.0,0


In [44]:
# Check the percentage of missing values for each feature. 

missing_percentage = (train_df.isnull().sum() / len(train_df)) * 100
print(missing_percentage)

age          0.000000
sex          0.000000
cp           0.000000
trestbps     6.521739
chol         2.795031
fbs          8.850932
restecg      0.310559
thalach      6.055901
exang        6.055901
oldpeak      6.055901
slope       34.316770
ca          65.062112
thal        51.242236
label        0.000000
dtype: float64


## EDA findings 

- The group with no heart disease exhibits on average higher ST depression induced by exercise relative to rest, higher maximum heart rate and lower serum cholestorel. 

- Heart disease is more common among people over 55. 

- People with heart disease is more likely to experience asymptomatic chest pains.

- Males appear to be more susceptible to heart disease. 

- People without heart disease tend to have lower fasting blood sugar when compared to the positive group.

In [71]:
import altair_ally as aly
import altair as alt

aly.alt.data_transformers.enable('vegafusion')

aly.dist(train_df, color='label')

In [72]:
aly.dist(train_df, dtype = 'object', color = 'label')

## Features pre-processing 

In [82]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline

X_train = train_df.drop(columns=["label"])
X_test = test_df.drop(columns=["label"])
y_train = train_df["label"]
y_test = test_df["label"]

In [98]:
numeric_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak'] # standard scaling for numerical features
categorical_features = ['cp', 'restecg'] # onehot encoding for categorical features with > 2 classes
binary_features = ['sex', 'exang', 'fbs'] # simple imputing on the binary features
drop_features = ['thal', 'ca', 'slope'] # dropping features with signifcant NaN values 

In [96]:
numeric_transformer_pipe = make_pipeline(SimpleImputer(strategy = 'median'), StandardScaler())
categorical_transfomer_pipe = make_pipeline(SimpleImputer(strategy = 'most_frequent'), OneHotEncoder(drop = 'if_binary', sparse_output = False)) 
imputer = SimpleImputer(strategy = 'most_frequent')

In [99]:
preprocessor = make_column_transformer(
    (numeric_transformer_pipe, numeric_features),
    (categorical_transfomer_pipe, categorical_features),
    (imputer, binary_features),
    ("drop", drop_features)
)

In [100]:
transformed = preprocessor.fit_transform(X_train)

In [101]:
col_names = ( 
    numeric_features +
    preprocessor.named_transformers_['pipeline-2'].get_feature_names_out().tolist() + 
    binary_features
)

In [102]:
transformed_df = pd.DataFrame(transformed, columns = col_names)
transformed_df

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,cp_1.0,cp_2.0,cp_3.0,cp_4.0,restecg_0.0,restecg_1.0,restecg_2.0,sex,exang,fbs
0,-0.806021,-0.1196,0.178542,-1.048509,-1.048509,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.046673,-0.660108,0.402221,-1.12703,-1.12703,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1.325714,0.961415,-1.890497,-1.205551,-1.205551,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
3,-0.059914,0.420908,0.122622,0.129303,0.129303,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,-2.085061,-0.1196,-0.389978,2.013803,2.013803,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
639,-0.592847,-0.1196,0.392901,0.835991,0.835991,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
640,-0.379674,0.691162,-1.890497,0.011522,0.011522,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
641,0.79278,-0.1196,-1.890497,-2.422623,-2.422623,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
642,0.47302,2.042431,-1.890497,-1.323332,-1.323332,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
