In [216]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer

In [217]:
warnings.filterwarnings("ignore")

In [218]:
df = pd.read_csv('../data/classifier/processed/df.csv', sep = ';', encoding = 'iso-8859-1')

In [219]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Race,Height,Weight
0,40.0,M,ATA,140.0,289.0,0,Normal,172.0,N,0.0,Up,1,White,180.233238,100.825018
1,49.0,F,NAP,160.0,180.0,0,Normal,156.0,N,1.0,Flat,0,White,173.975042,63.384922
2,37.0,M,ATA,130.0,283.0,0,ST,98.0,N,0.0,Up,1,Hispanic,171.904901,83.027778
3,48.0,F,ASY,138.0,214.0,0,Normal,108.0,Y,1.5,Flat,1,White,160.074206,66.458538
4,54.0,M,NAP,150.0,195.0,0,Normal,122.0,N,0.0,Up,0,White,177.775159,84.033176


<span style="font-family: Georgia, serif;">

# Preprocessing
</span>

<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

## Encoders
</span>

<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

### Manual
</span>

In [220]:
def manual_encoder(df):
    df['Sex'] = df['Sex'].map({'M': 0, 'F': 1})
    df['ChestPainType'] = df['ChestPainType'].map({'ASY': 3, 'NAP': 2, 'ATA': 1, 'TA': 0})
    df['RestingECG'] = df['RestingECG'].map({'Normal': 0, 'ST': 1, 'LVH': 2})
    df['ExerciseAngina'] = df['ExerciseAngina'].map({'N': 0, 'Y': 1})
    df['ST_Slope'] = df['ST_Slope'].map({'Up': 0, 'Flat': 1, 'Down': 2})
    df['Race'] = df['Race'].map({'White': 0, 'Hispanic': 1, 'Black': 2, 'Asian': 3, 'Other': 4})
    return df

In [221]:
df2 = df.copy()
df2 = manual_encoder(df2)

In [222]:
df2.head(2)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Race,Height,Weight
0,40.0,0,1,140.0,289.0,0,0,172.0,0,0.0,0,1,0,180.233238,100.825018
1,49.0,1,2,160.0,180.0,0,0,156.0,0,1.0,1,0,0,173.975042,63.384922


<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

### LabelEncoder
</span>

In [223]:
def label_encoder(df, columns):
    le = LabelEncoder()
    for col in columns:
        df[col] = le.fit_transform(df[col])
    return df

df_label = df.copy()
df_label = label_encoder(df_label, ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope', 'Race'])

In [224]:
df_label.head(2)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Race,Height,Weight
0,40.0,1,1,140.0,289.0,0,1,172.0,0,0.0,2,1,4,180.233238,100.825018
1,49.0,0,2,160.0,180.0,0,1,156.0,0,1.0,1,0,4,173.975042,63.384922


<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

### OneHotEncoder
</span>

In [225]:
def onehot_encoder(df, columns):
    encoder = OneHotEncoder(sparse_output=False, drop='first') 
    encoded_df = pd.DataFrame(encoder.fit_transform(df[columns]), columns=encoder.get_feature_names_out(columns))
    df.drop(columns, axis=1, inplace=True)
    df = pd.concat([df, encoded_df], axis=1)
    return df

df_hot = df.copy()
df_hot = onehot_encoder(df_hot, ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope', 'Race'])

In [226]:
df_hot.head(2)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Height,Weight,Sex_M,...,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up,Race_Black,Race_Hispanic,Race_Other,Race_White
0,40.0,140.0,289.0,0,172.0,0.0,1,180.233238,100.825018,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,49.0,160.0,180.0,0,156.0,1.0,0,173.975042,63.384922,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

## Separation of attributes
</span>

In [227]:
predictors = df2.drop(columns=['HeartDisease']).values
target = df2['HeartDisease'].values

In [228]:
predictors_label = df_label.drop(columns=['HeartDisease']).values

In [229]:
predictors_hot = df_hot.drop(columns=['HeartDisease']).values

In [230]:
predictors_label

array([[ 40.        ,   1.        ,   1.        , ...,   4.        ,
        180.23323783, 100.82501836],
       [ 49.        ,   0.        ,   2.        , ...,   4.        ,
        173.97504219,  63.38492174],
       [ 37.        ,   1.        ,   1.        , ...,   2.        ,
        171.90490114,  83.02777765],
       ...,
       [ 45.63477593,   0.        ,   2.        , ...,   4.        ,
        172.18791988, 101.65311221],
       [ 69.09697007,   1.        ,   2.        , ...,   1.        ,
        166.57696146,  58.82176318],
       [ 73.10344263,   0.        ,   0.        , ...,   4.        ,
        166.71048127,  60.77257129]])

In [231]:
predictors_hot

array([[ 40.        , 140.        , 289.        , ...,   0.        ,
          0.        ,   1.        ],
       [ 49.        , 160.        , 180.        , ...,   0.        ,
          0.        ,   1.        ],
       [ 37.        , 130.        , 283.        , ...,   1.        ,
          0.        ,   0.        ],
       ...,
       [ 45.63477593, 109.32257062, 185.26983988, ...,   0.        ,
          0.        ,   1.        ],
       [ 69.09697007, 134.9071995 , 238.96808547, ...,   0.        ,
          0.        ,   0.        ],
       [ 73.10344263, 130.03801168, 100.        , ...,   0.        ,
          0.        ,   1.        ]])

In [232]:
predictors

array([[ 40.        ,   0.        ,   1.        , ...,   0.        ,
        180.23323783, 100.82501836],
       [ 49.        ,   1.        ,   2.        , ...,   0.        ,
        173.97504219,  63.38492174],
       [ 37.        ,   0.        ,   1.        , ...,   1.        ,
        171.90490114,  83.02777765],
       ...,
       [ 45.63477593,   1.        ,   2.        , ...,   0.        ,
        172.18791988, 101.65311221],
       [ 69.09697007,   0.        ,   2.        , ...,   2.        ,
        166.57696146,  58.82176318],
       [ 73.10344263,   1.        ,   3.        , ...,   0.        ,
        166.71048127,  60.77257129]])

In [233]:
target

array([1, 0, 1, ..., 0, 1, 1], dtype=int64)

<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

## Scaling
</span>

In [234]:
continuous = ['RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak', 'Height', 'Weight']
continuous_indices = [df2.columns.get_loc(col) for col in continuous]

<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

### MinMax
</span>

In [235]:
def minmax_scaler(array):
    scaler = MinMaxScaler()
    return scaler.fit_transform(array)

In [236]:
predictors_continuous = predictors[:, continuous_indices]

IndexError: index 14 is out of bounds for axis 1 with size 14

In [None]:
predictors_mm_continuous = minmax_scaler(predictors_continuous)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
predictors_mm_label = minmax_scaler(predictors_label, continuous)

In [None]:
predictors_mm_hot = minmax_scaler(predictors_hot, continuous)

In [None]:
predictors_mm

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Race,Height,Weight
0,40.000000,M,ATA,0.500000,0.393822,0,Normal,0.788732,N,0.015873,Up,1,White,0.600075,0.548416
1,49.000000,F,NAP,0.666667,0.183398,0,Normal,0.676056,N,0.174603,Flat,0,White,0.524381,0.223524
2,37.000000,M,ATA,0.416667,0.382239,0,ST,0.267606,N,0.015873,Up,1,Hispanic,0.499342,0.393978
3,48.000000,F,ASY,0.483333,0.249035,0,Normal,0.338028,Y,0.253968,Flat,1,White,0.356247,0.250196
4,54.000000,M,NAP,0.583333,0.212355,0,Normal,0.436620,N,0.015873,Up,0,White,0.570344,0.402702
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8741,74.348939,M,ASY,0.436209,0.348919,1,Normal,0.337433,N,0.341368,Flat,1,White,0.480574,0.397179
8742,56.593558,F,NAP,0.196762,0.028958,1,ST,0.162982,Y,0.117487,Flat,1,Asian,0.324913,0.288997
8743,45.634776,F,NAP,0.244355,0.193571,0,ST,0.312193,Y,0.047718,Flat,0,White,0.502765,0.555602
8744,69.096970,M,NAP,0.457560,0.297236,0,Normal,0.750772,Y,0.385102,Flat,1,Black,0.434899,0.183927


In [None]:
predictors_mm_label

In [None]:
predictors_mm_hot

<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

### Normalizer
</span>

In [None]:
def normalizer(df, columns):
    normalizer = Normalizer()
    df[columns] = normalizer.fit_transform(df[columns])
    return df

In [None]:
predictors_nor = normalizer(predictors, continuous)

In [None]:
predictors_nor

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Race,Height,Weight
0,40.000000,M,ATA,0.384809,0.303093,0,Normal,0.607023,N,0.012216,Up,1,White,0.461829,0.422071
1,49.000000,F,NAP,0.586841,0.161438,0,Normal,0.595106,N,0.153696,Flat,0,White,0.461592,0.196760
2,37.000000,M,ATA,0.466974,0.428390,0,ST,0.299916,N,0.017789,Up,1,Hispanic,0.559632,0.441546
3,48.000000,F,ASY,0.593191,0.305638,0,Normal,0.414859,Y,0.311693,Flat,1,White,0.437219,0.307064
4,54.000000,M,NAP,0.565591,0.205896,0,Normal,0.423340,N,0.015390,Up,0,White,0.552997,0.390454
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8741,74.348939,M,ASY,0.452056,0.361595,1,Normal,0.349692,N,0.353770,Flat,1,White,0.498032,0.411608
8742,56.593558,F,NAP,0.379367,0.055831,1,ST,0.314237,Y,0.226520,Flat,1,Asian,0.626447,0.557200
8743,45.634776,F,NAP,0.280592,0.222277,0,ST,0.358490,Y,0.054794,Flat,0,White,0.577324,0.637997
8744,69.096970,M,NAP,0.412126,0.267721,0,Normal,0.676224,Y,0.346862,Flat,1,Black,0.391715,0.165664


<span style="font-family: Georgia, serif; font-weight: 100; letter-spacing: 0.8px;">

### Standard Scaler
</span>

In [None]:
def standard_scaler(df, columns):
    scaler = StandardScaler()
    df[columns] = scaler.fit_transform(df[columns])
    return df

In [None]:
predictors_scal = standard_scaler(predictors, continuous)

In [None]:
predictors_scal

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Race,Height,Weight
0,40.000000,M,ATA,-0.267188,0.431073,0,Normal,0.591128,N,-1.157423,Up,1,White,-0.217745,0.596599
1,49.000000,F,NAP,1.224315,-0.446026,0,Normal,0.510365,N,-0.108888,Flat,0,White,-0.219982,-1.431631
2,37.000000,M,ATA,0.339398,1.206889,0,ST,-1.490192,N,-1.116118,Up,1,Hispanic,0.705205,0.771912
3,48.000000,F,ASY,1.271197,0.446834,0,Normal,-0.711200,Y,1.062055,Flat,1,White,-0.449991,-0.438686
4,54.000000,M,NAP,1.067438,-0.170748,0,Normal,-0.653727,N,-1.133899,Up,0,White,0.642594,0.311986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8741,74.348939,M,ASY,0.229261,0.793304,1,Normal,-1.152852,N,1.373890,Flat,1,White,0.123900,0.502414
8742,56.593558,F,NAP,-0.307367,-1.099920,1,ST,-1.393137,Y,0.430819,Flat,1,Asian,1.335735,1.813013
8743,45.634776,F,NAP,-1.036574,-0.069320,0,ST,-1.093222,Y,-0.841869,Flat,0,White,0.872168,2.540340
8744,69.096970,M,NAP,-0.065521,0.212059,0,Normal,1.060112,Y,1.322700,Flat,1,Black,-0.879400,-1.711556
