In [228]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer

In [229]:
# Silence every warning category for the rest of the session; this filter is
# process-wide, so it also hides warnings raised by later cells.
warnings.filterwarnings("ignore")

In [230]:
# Load the processed dataset; the file is semicolon-delimited and decoded as ISO-8859-1.
df = pd.read_csv('../data/classifier/processed/df.csv', sep = ';', encoding = 'iso-8859-1')

In [231]:
# Preview the first two rows of the frame as loaded (categories are still strings).
df.head(2)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Race,Height,Weight
0,40.0,M,ATA,140.0,289.0,0,Normal,172.0,N,0.0,Up,1,White,180.233238,100.825018
1,49.0,F,NAP,160.0,180.0,0,Normal,156.0,N,1.0,Flat,0,White,173.975042,63.384922


<span style="font-family: Georgia, serif;">

# Preprocessing
</span>

In [232]:
# Age is read as float (40.0, 49.0 in the preview above); store it as int instead.
# astype(int) raises on NaN, so this relies on Age having no missing values.
df['Age'] = df['Age'].astype(int)

<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

## Encoders
</span>

<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

### Manual
</span>

In [233]:
def manual_encoder(df):
    """Replace the six categorical columns with hand-picked integer codes.

    The frame passed in is modified in place and is also the return value.
    A value that is not a key of its column's code table becomes NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Sex, ChestPainType, RestingECG,
        ExerciseAngina, ST_Slope and Race.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, with those columns mapped to integers.
    """
    # One code table per column; columns are processed in this order.
    code_tables = {
        'Sex': {'M': 0, 'F': 1},
        'ChestPainType': {'ASY': 3, 'NAP': 2, 'ATA': 1, 'TA': 0},
        'RestingECG': {'Normal': 0, 'ST': 1, 'LVH': 2},
        'ExerciseAngina': {'N': 0, 'Y': 1},
        'ST_Slope': {'Up': 0, 'Flat': 1, 'Down': 2},
        'Race': {'White': 0, 'Hispanic': 1, 'Black': 2, 'Asian': 3, 'Other': 4},
    }
    for column, codes in code_tables.items():
        df[column] = df[column].map(codes)
    return df

In [234]:
# Encode a copy so that `df` keeps its original string categories.
df2 = manual_encoder(df.copy())

In [235]:
# Preview the manually encoded frame.
df2.head(2)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Race,Height,Weight
0,40,0,1,140.0,289.0,0,0,172.0,0,0.0,0,1,0,180.233238,100.825018
1,49,1,2,160.0,180.0,0,0,156.0,0,1.0,1,0,0,173.975042,63.384922


<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

### LabelEncoder
</span>

In [236]:
def label_encoder(df, columns):
    """Replace each listed column with integer codes from ``LabelEncoder``.

    Every column is fitted independently. The frame passed in is modified in
    place and is also the return value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode.
    columns : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, with the listed columns encoded.
    """
    for column in columns:
        # A fresh encoder per column: nothing learned from one column carries over.
        df[column] = LabelEncoder().fit_transform(df[column])
    return df

# Label-encode the categorical columns on a copy, leaving `df` untouched.
df_label = label_encoder(
    df.copy(),
    ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope', 'Race'],
)

In [237]:
# Preview the label-encoded frame.
df_label.head(2)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Race,Height,Weight
0,40,1,1,140.0,289.0,0,1,172.0,0,0.0,2,1,4,180.233238,100.825018
1,49,0,2,160.0,180.0,0,1,156.0,0,1.0,1,0,4,173.975042,63.384922


<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

### OneHotEncoder
</span>

In [238]:
def onehot_encoder(df, columns):
    """Return a new frame with the listed columns replaced by one-hot dummies.

    The encoder is created with ``drop='first'``, so one category per column
    is omitted from the output. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode.
    columns : list of str
        Names of the categorical columns to expand.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``df`` followed by the dummy columns, on the
        same index as ``df``.
    """
    encoder = OneHotEncoder(sparse_output=False, drop='first')
    encoded_df = pd.DataFrame(
        encoder.fit_transform(df[columns]),
        columns=encoder.get_feature_names_out(columns),
        # Reuse the input's index: concat(axis=1) aligns on index labels, so a
        # default RangeIndex here would misalign rows (and introduce NaN) for
        # any frame that was filtered or re-indexed beforehand.
        index=df.index,
    )
    # Non-mutating drop: the caller's frame keeps its categorical columns.
    return pd.concat([df.drop(columns=columns), encoded_df], axis=1)

# One-hot encode the categorical columns, starting from a copy of `df`.
df_hot = onehot_encoder(
    df.copy(),
    ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope', 'Race'],
)

In [239]:
# Preview the one-hot frame; note that its column order differs from df2/df_label.
df_hot.head(2)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Height,Weight,Sex_M,...,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up,Race_Black,Race_Hispanic,Race_Other,Race_White
0,40,140.0,289.0,0,172.0,0.0,1,180.233238,100.825018,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,49,160.0,180.0,0,156.0,1.0,0,173.975042,63.384922,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [240]:
# Age was already cast to int on `df` before `df2` was copied from it, so in a
# top-to-bottom run this cast changes nothing.
df2['Age'] = df2['Age'].astype(int)

In [241]:
# Preview df2 again after the Age cast.
df2.head(2)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Race,Height,Weight
0,40,0,1,140.0,289.0,0,0,172.0,0,0.0,0,1,0,180.233238,100.825018
1,49,1,2,160.0,180.0,0,0,156.0,0,1.0,1,0,0,173.975042,63.384922


<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

## Separation of attributes
</span>

In [242]:
# Locate the target column by name instead of a hard-coded position, so the
# split survives a column being added or reordered. With the df2 layout shown
# above, HeartDisease sits at position 11.
target_index = df2.columns.get_loc('HeartDisease')
# Every other column position of df2 is treated as a predictor.
predictor_indices = [i for i in range(df2.shape[1]) if i != target_index]

In [243]:
# Split the manually encoded frame into a feature matrix and a target vector,
# both as NumPy arrays.
predictors = df2.iloc[:, predictor_indices].values
target = df2.iloc[:, target_index].values

In [244]:
# Same positional selection on the label-encoded frame, whose column order
# matches df2 (compare the two previews above).
predictors_label = df_label.iloc[:, predictor_indices].values

In [245]:
# df_hot does not share df2's layout: HeartDisease sits among the first columns
# and the dummy columns extend past df2's width. Reusing the positional
# predictor_indices here would keep the target as a feature and cut off the
# trailing dummy columns, so remove the target by name instead.
predictors_hot = df_hot.drop(columns=['HeartDisease']).values

<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

## Scaling
</span>

In [246]:
# Column positions of the continuous features inside the predictor matrix built
# from df2/df_label (target removed): 3 RestingBP, 4 Cholesterol, 7 MaxHR,
# 9 Oldpeak, 12 Height, 13 Weight.
# NOTE(review): df_hot orders its columns differently (see its preview), so
# these positions do not describe that layout.
continuous_indices = [3, 4, 7, 9, 12, 13]

<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

### MinMax
</span>

In [247]:
def minmax_scaler(df, continuous_indices):
    """Return a copy of a feature matrix with selected columns min-max scaled.

    Parameters
    ----------
    df : numpy.ndarray
        Two-dimensional feature matrix (despite the name, it is indexed as an
        array with ``df[:, cols]``). It is left unmodified.
    continuous_indices : list of int
        Positions of the columns to rescale; all other columns are copied as is.

    Returns
    -------
    numpy.ndarray
        A new array of the same shape as ``df``.
    """
    # Write into a copy: assigning into the argument would also change the
    # caller's array, so every scaler later applied to the same matrix would
    # start from already-scaled values and all results would alias one array.
    scaled = df.copy()
    scaled[:, continuous_indices] = MinMaxScaler().fit_transform(df[:, continuous_indices])
    return scaled

In [248]:
# Min-max scale the continuous columns of each encoded predictor matrix.
# NOTE(review): continuous_indices lists positions for the df2/df_label column
# order; df_hot orders its columns differently, so confirm the list selects the
# intended columns of predictors_hot.
predictors_mm, predictors_mm_label, predictors_mm_hot = (
    minmax_scaler(matrix, continuous_indices)
    for matrix in (predictors, predictors_label, predictors_hot)
)

<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

### Normalizer
</span>

In [249]:
def normalizer(df, continuous_indices):
    """Return a copy of a feature matrix with selected columns passed through ``Normalizer``.

    Parameters
    ----------
    df : numpy.ndarray
        Two-dimensional feature matrix (despite the name, it is indexed as an
        array with ``df[:, cols]``). It is left unmodified.
    continuous_indices : list of int
        Positions of the columns handed to the normalizer; all other columns
        are copied as is.

    Returns
    -------
    numpy.ndarray
        A new array of the same shape as ``df``.
    """
    # Write into a copy: assigning into the argument would also change the
    # caller's array, so every scaler later applied to the same matrix would
    # start from already-transformed values and all results would alias one array.
    normalized = df.copy()
    # Unlike the column-wise scalers, scikit-learn's Normalizer rescales each
    # row (sample) of the selected sub-matrix to unit norm.
    row_normalizer = Normalizer()
    normalized[:, continuous_indices] = row_normalizer.fit_transform(df[:, continuous_indices])
    return normalized

In [250]:
# Apply the Normalizer-based transform to the continuous columns of each
# encoded predictor matrix.
# NOTE(review): continuous_indices lists positions for the df2/df_label column
# order; df_hot orders its columns differently, so confirm the list selects the
# intended columns of predictors_hot.
predictors_norm, predictors_norm_label, predictors_norm_hot = (
    normalizer(matrix, continuous_indices)
    for matrix in (predictors, predictors_label, predictors_hot)
)

<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

### Standard Scaler
</span>

In [251]:
def standard_scaler(df, continuous_indices):
    """Return a copy of a feature matrix with selected columns standardized.

    Parameters
    ----------
    df : numpy.ndarray
        Two-dimensional feature matrix (despite the name, it is indexed as an
        array with ``df[:, cols]``). It is left unmodified.
    continuous_indices : list of int
        Positions of the columns to standardize with ``StandardScaler``; all
        other columns are copied as is.

    Returns
    -------
    numpy.ndarray
        A new array of the same shape as ``df``.
    """
    # Write into a copy: assigning into the argument would also change the
    # caller's array, so the result would alias the input and any earlier
    # scaling applied to that same array would leak into this one.
    standardized = df.copy()
    standardized[:, continuous_indices] = StandardScaler().fit_transform(df[:, continuous_indices])
    return standardized

In [252]:
# Standardize the continuous columns of each encoded predictor matrix.
# NOTE(review): continuous_indices lists positions for the df2/df_label column
# order; df_hot orders its columns differently, so confirm the list selects the
# intended columns of predictors_hot.
predictors_std, predictors_std_label, predictors_std_hot = (
    standard_scaler(matrix, continuous_indices)
    for matrix in (predictors, predictors_label, predictors_hot)
)