## Load data

In [1]:
import pandas as pd

In [2]:
# Path to the heart-disease CSV (a file path, despite the "_DIR" suffix).
DATA_DIR = "dataset/heart.csv"

# Column groups by measurement type. The "CONTINIOUS" spelling is kept as-is
# so that any other cell referring to this name keeps working.
CONTINIOUS_ATTRIBUTES = ["age", "trestbps", "chol", "thalach", "oldpeak"]
DISCRETE_ATTRIBUTES = ["cp", "restecg", "slope", "ca", "thal"]
BINARY_ATTRIBUTES = ["sex", "fbs", "exang", "target"]

# Target dtype for every column: float64 for the continuous measurements,
# int8 for the discrete and binary codes.
COLUMN_DTYPES = {
    **dict.fromkeys(CONTINIOUS_ATTRIBUTES, "float64"),
    **dict.fromkeys(DISCRETE_ATTRIBUTES + BINARY_ATTRIBUTES, "int8"),
}

# Cast with DataFrame.astype instead of `data.loc[:, cols] = ...astype(...)`:
# a full-slice .loc assignment writes into the existing columns and, on
# pandas >= 2.0, keeps their original dtype, so the cast would be silently lost.
data = pd.read_csv(DATA_DIR).astype(COLUMN_DTYPES)

In [3]:
# Sanity check of the loaded frame: count, mean, std, min/max and quartiles per column.
data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


## Train-test split

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Hold out 100 rows as the test set. Stratifying on the label keeps the class
# ratio of `target` the same in both parts; the fixed seed makes the split
# repeatable across runs.
train_set, test_set = train_test_split(
    data,
    test_size=100,
    random_state=42,
    stratify=data["target"],
)

## Prepare data

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [7]:
def normalize(train_set, test_set):
    """Standardize two data sets with statistics learned from the first one.

    Because the model is evaluated with cross-validation, the training data
    cannot be normalized once up front; this helper is meant to be applied to
    each train/validation pair on the fly so that the scaler is always fitted
    on the training part only.

    Parameters:
        train_set: data the ``StandardScaler`` is fitted on.
        test_set: data transformed with the scaler fitted on ``train_set``.

    Returns:
        A ``(scaled_train, scaled_test)`` tuple.
    """
    fitted_scaler = StandardScaler().fit(train_set)
    scaled_train = fitted_scaler.transform(train_set)
    scaled_test = fitted_scaler.transform(test_set)
    return scaled_train, scaled_test

In [8]:
# One-hot encode the discrete attributes. The encoder learns its categories
# from the training split only, and drop="first" omits the first category of
# each attribute from the encoded output.
encoder = OneHotEncoder(categories="auto", drop="first").fit(
    train_set[DISCRETE_ATTRIBUTES]
)

# Apply the fitted encoder to both splits.
_train, _test = (
    encoder.transform(split[DISCRETE_ATTRIBUTES]) for split in (train_set, test_set)
)

In [9]:
# Categories learned per discrete attribute, in DISCRETE_ATTRIBUTES order.
encoder.categories_ 

[array([0, 1, 2, 3], dtype=int8),
 array([0, 1, 2], dtype=int8),
 array([0, 1, 2], dtype=int8),
 array([0, 1, 2, 3, 4], dtype=int8),
 array([0, 1, 2, 3], dtype=int8)]

In [10]:
# Names of the one-hot columns produced by the encoder, prefixed with the
# attribute they come from. `get_feature_names` was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer `get_feature_names_out`
# when the installed version provides it and fall back otherwise.
_get_names = getattr(encoder, "get_feature_names_out", None) or encoder.get_feature_names
_get_names(DISCRETE_ATTRIBUTES)

array(['cp_1', 'cp_2', 'cp_3', 'restecg_1', 'restecg_2', 'slope_1',
       'slope_2', 'ca_1', 'ca_2', 'ca_3', 'ca_4', 'thal_1', 'thal_2',
       'thal_3'], dtype=object)