In [635]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer

In [636]:
warnings.filterwarnings("ignore")

In [637]:
df = pd.read_csv('../data/classifier/processed/df.csv', sep = ';', encoding = 'iso-8859-1')

In [638]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Race,Height,Weight
0,40.0,M,ATA,140.0,289.0,0,Normal,172.0,N,0.0,Up,1,White,180.233238,100.825018
1,49.0,F,NAP,160.0,180.0,0,Normal,156.0,N,1.0,Flat,0,White,173.975042,63.384922
2,37.0,M,ATA,130.0,283.0,0,ST,98.0,N,0.0,Up,1,Hispanic,171.904901,83.027778
3,48.0,F,ASY,138.0,214.0,0,Normal,108.0,Y,1.5,Flat,1,White,160.074206,66.458538
4,54.0,M,NAP,150.0,195.0,0,Normal,122.0,N,0.0,Up,0,White,177.775159,84.033176


<span style="font-family: Georgia, serif;">

# Preprocessing
</span>

<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

## Encoders
</span>

<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

### Manual
</span>

In [639]:
def manual_encoder(df):
    df['Sex'] = df['Sex'].map({'M': 0, 'F': 1})
    df['ChestPainType'] = df['ChestPainType'].map({'ASY': 3, 'NAP': 2, 'ATA': 1, 'TA': 0})
    df['RestingECG'] = df['RestingECG'].map({'Normal': 0, 'ST': 1, 'LVH': 2})
    df['ExerciseAngina'] = df['ExerciseAngina'].map({'N': 0, 'Y': 1})
    df['ST_Slope'] = df['ST_Slope'].map({'Up': 0, 'Flat': 1, 'Down': 2})
    df['Race'] = df['Race'].map({'White': 0, 'Hispanic': 1, 'Black': 2, 'Asian': 3, 'Other': 4})
    return df

In [640]:
df2 = df.copy()
df2 = manual_encoder(df2)

In [641]:
df2.head(2)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Race,Height,Weight
0,40.0,0,1,140.0,289.0,0,0,172.0,0,0.0,0,1,0,180.233238,100.825018
1,49.0,1,2,160.0,180.0,0,0,156.0,0,1.0,1,0,0,173.975042,63.384922


<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

### LabelEncoder
</span>

In [642]:
def label_encoder(df, columns):
    le = LabelEncoder()
    for col in columns:
        df[col] = le.fit_transform(df[col])
    return df

df_label = df.copy()
df_label = label_encoder(df_label, ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope', 'Race'])

In [643]:
df_label.head(2)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Race,Height,Weight
0,40.0,1,1,140.0,289.0,0,1,172.0,0,0.0,2,1,4,180.233238,100.825018
1,49.0,0,2,160.0,180.0,0,1,156.0,0,1.0,1,0,4,173.975042,63.384922


<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

### OneHotEncoder
</span>

In [644]:
def onehot_encoder(df, columns):
    encoder = OneHotEncoder(sparse_output=False, drop='first') 
    encoded_df = pd.DataFrame(encoder.fit_transform(df[columns]), columns=encoder.get_feature_names_out(columns))
    df.drop(columns, axis=1, inplace=True)
    df = pd.concat([df, encoded_df], axis=1)
    return df

df_hot = df.copy()
df_hot = onehot_encoder(df_hot, ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope', 'Race'])

In [645]:
df_hot.head(2)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Height,Weight,Sex_M,...,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up,Race_Black,Race_Hispanic,Race_Other,Race_White
0,40.0,140.0,289.0,0,172.0,0.0,1,180.233238,100.825018,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,49.0,160.0,180.0,0,156.0,1.0,0,173.975042,63.384922,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

## Separation of attributes
</span>

In [646]:
predictors = df2.drop(columns=['HeartDisease']).values
target = df2['HeartDisease'].values

In [647]:
predictors_label = df_label.drop(columns=['HeartDisease']).values

In [648]:
predictors_hot = df_hot.drop(columns=['HeartDisease']).values

In [649]:
predictors_label

array([[ 40.        ,   1.        ,   1.        , ...,   4.        ,
        180.23323783, 100.82501836],
       [ 49.        ,   0.        ,   2.        , ...,   4.        ,
        173.97504219,  63.38492174],
       [ 37.        ,   1.        ,   1.        , ...,   2.        ,
        171.90490114,  83.02777765],
       ...,
       [ 45.63477593,   0.        ,   2.        , ...,   4.        ,
        172.18791988, 101.65311221],
       [ 69.09697007,   1.        ,   2.        , ...,   1.        ,
        166.57696146,  58.82176318],
       [ 73.10344263,   0.        ,   0.        , ...,   4.        ,
        166.71048127,  60.77257129]])

In [650]:
predictors_hot

array([[ 40.        , 140.        , 289.        , ...,   0.        ,
          0.        ,   1.        ],
       [ 49.        , 160.        , 180.        , ...,   0.        ,
          0.        ,   1.        ],
       [ 37.        , 130.        , 283.        , ...,   1.        ,
          0.        ,   0.        ],
       ...,
       [ 45.63477593, 109.32257062, 185.26983988, ...,   0.        ,
          0.        ,   1.        ],
       [ 69.09697007, 134.9071995 , 238.96808547, ...,   0.        ,
          0.        ,   0.        ],
       [ 73.10344263, 130.03801168, 100.        , ...,   0.        ,
          0.        ,   1.        ]])

In [651]:
predictors

array([[ 40.        ,   0.        ,   1.        , ...,   0.        ,
        180.23323783, 100.82501836],
       [ 49.        ,   1.        ,   2.        , ...,   0.        ,
        173.97504219,  63.38492174],
       [ 37.        ,   0.        ,   1.        , ...,   1.        ,
        171.90490114,  83.02777765],
       ...,
       [ 45.63477593,   1.        ,   2.        , ...,   0.        ,
        172.18791988, 101.65311221],
       [ 69.09697007,   0.        ,   2.        , ...,   2.        ,
        166.57696146,  58.82176318],
       [ 73.10344263,   1.        ,   3.        , ...,   0.        ,
        166.71048127,  60.77257129]])

In [652]:
target

array([1, 0, 1, ..., 0, 1, 1], dtype=int64)

<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

## Scaling
</span>

In [653]:
continuous = ['RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak', 'Height', 'Weight']
continuous_indices = [3, 4, 7, 9, 13, 14]

<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

### MinMax
</span>

In [654]:
def minmax_scaler(df, columns):
    scaler = MinMaxScaler()
    df[columns] = scaler.fit_transform(df[columns])
    return df

In [655]:
predictors_mm = minmax_scaler(predictors, continuous_indices)

In [656]:
predictors_mm

array([[ 40.        ,   0.        ,   1.        , ...,   0.        ,
        180.23323783, 100.82501836],
       [ 49.        ,   1.        ,   2.        , ...,   0.        ,
        173.97504219,  63.38492174],
       [ 37.        ,   0.        ,   1.        , ...,   1.        ,
        171.90490114,  83.02777765],
       ...,
       [ 45.63477593,   1.        ,   2.        , ...,   0.        ,
        172.18791988, 101.65311221],
       [ 69.09697007,   0.        ,   2.        , ...,   2.        ,
        166.57696146,  58.82176318],
       [ 73.10344263,   1.        ,   3.        , ...,   0.        ,
        166.71048127,  60.77257129]])

In [657]:
predictors_mm_label = minmax_scaler(predictors_label, continuous_indices)

In [658]:
predictors_mm_hot = minmax_scaler(predictors_hot, continuous_indices)

<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

### Normalizer
</span>

In [659]:
def normalizer(df, columns):
    normalizer = Normalizer()
    df[columns] = normalizer.fit_transform(df[columns])
    return df

In [660]:
predictors_nor = normalizer(predictors, continuous_indices)

In [661]:
predictors_nor_label = normalizer(predictors_label, continuous_indices)

In [662]:
predictors_nor_hot = normalizer(predictors_hot, continuous_indices)

In [663]:
predictors_nor_hot

array([[ 40.        , 140.        , 289.        , ...,   0.        ,
          0.        ,   1.        ],
       [ 49.        , 160.        , 180.        , ...,   0.        ,
          0.        ,   1.        ],
       [ 37.        , 130.        , 283.        , ...,   1.        ,
          0.        ,   0.        ],
       ...,
       [ 45.63477593, 109.32257062, 185.26983988, ...,   0.        ,
          0.        ,   1.        ],
       [ 69.09697007, 134.9071995 , 238.96808547, ...,   0.        ,
          0.        ,   0.        ],
       [ 73.10344263, 130.03801168, 100.        , ...,   0.        ,
          0.        ,   1.        ]])

<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

### Standard Scaler
</span>

In [664]:
def standard_scaler(df, columns):
    scaler = StandardScaler()
    df[columns] = scaler.fit_transform(df[columns])
    return df

In [665]:
predictors_scal = standard_scaler(predictors, continuous_indices)

In [666]:
predictors_scal_label = standard_scaler(predictors_label, continuous_indices)

In [667]:
predictors_scal_hot = standard_scaler(predictors_hot, continuous_indices)