# Model Training

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer , StandardScaler 
import category_encoders as ce
from sklearn.model_selection import train_test_split

In [3]:
# Load the raw data set; the path is relative to the notebook's working directory.
DATA_FILE = "Heart.csv"
df = pd.read_csv(DATA_FILE)

## Cleaning Data

In [4]:
# Discard every row that has at least one missing value (the pandas defaults,
# spelled out so the cleaning rule is explicit).
df = df.dropna(axis=0, how="any")

In [33]:
numeric_df = df.select_dtypes(include=['int64', 'float64'])
numeric_cols = numeric_df.columns

# Cap outliers with the 1.5 * IQR rule, but only on continuous measurements.
# Clipping every numeric column also hits binary flags, small integer codes and
# the label itself, which corrupts them: any column whose 25th and 75th
# percentiles coincide has IQR == 0, so all of its values would be forced to a
# single constant, and integer codes can be pushed to fractional limits.
# NOTE(review): the list below is the set of columns presumed to be continuous
# -- confirm against the data dictionary.
continuous_cols = [
    col for col in ["Age", "RestBP", "Chol", "MaxHR", "Oldpeak"]
    if col in numeric_cols
]

for col in continuous_cols:
    Q1, Q3 = df[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR

    # Values outside the fences are pulled back to the nearest fence.
    df[col] = df[col].clip(lower_limit, upper_limit)

## Handling Categorical Data 

In [6]:
# List the distinct values of each text column before choosing an encoding.
for column_name in ('ChestPain', 'Thal'):
    print(df[column_name].unique())

['typical' 'asymptomatic' 'nonanginal' 'nontypical']
['fixed' 'normal' 'reversable']


### Which Encoding types should I use?

Looking at the categories, both columns are nominal: their values have no natural order, so ordinal encoding is not appropriate. Target encoding is also risky here, because the columns are low-cardinality and it would add a chance of overfitting.

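*I'm going to use One-Hot encoding, Label encoding, Frequency encoding and Binary encoding.* Note that Label encoding assigns arbitrary integer codes, which implies an order these nominal columns do not have, so it is best read as a baseline for comparison.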

In [28]:
# Text columns that need encoding; the later encoding cells reuse this list.
categorical_cols = ['Thal', 'ChestPain']

# One 0/1 indicator column per category, named "<column>_<category>".
df_one_hot = pd.get_dummies(data=df, columns=categorical_cols, dtype=int)
df_one_hot.iloc[:3]

Unnamed: 0,Age,Sex,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Target,Thal_fixed,Thal_normal,Thal_reversable,ChestPain_asymptomatic,ChestPain_nonanginal,ChestPain_nontypical,ChestPain_typical
0,63,1,145,233,0,2,150,0,2.3,3,0,0,1,0,0,0,0,0,1
1,67,1,160,286,0,2,108,1,1.5,2,2,1,0,1,0,1,0,0,0
2,67,1,120,229,0,2,129,1,2.6,2,2,1,0,0,1,1,0,0,0


In [29]:
# Replace each categorical column with integer codes. A single encoder is
# refitted per column, so after this cell `le` holds the classes of the last
# column in `categorical_cols` only.
le = LabelEncoder()
df_label = df.assign(
    **{name: le.fit_transform(df[name]) for name in categorical_cols}
)

df_label.head(3)

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,Target
0,63,1,3,145,233,0,2,150,0,2.3,3,0,0,0
1,67,1,0,160,286,0,2,108,1,1.5,2,2,1,1
2,67,1,0,120,229,0,2,129,1,2.6,2,2,2,1


In [30]:
# Frequency encoding: each category is replaced by the number of rows in which
# it occurs.
df_freq = df.copy()

for col in categorical_cols:
    source_column = df[col]
    freqs = source_column.value_counts()
    df_freq[col] = source_column.map(freqs)

df_freq.iloc[:3]

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,Target
0,63,1,23,145,233,0,2,150,0,2.3,3,0,18,0
1,67,1,143,160,286,0,2,108,1,1.5,2,2,166,1
2,67,1,143,120,229,0,2,129,1,2.6,2,2,117,1


In [31]:
# Binary encoding: each category becomes a short run of 0/1 columns
# ("<column>_0", "<column>_1", ...). Only `categorical_cols` are encoded;
# every other column passes through unchanged.
binary_encoder = ce.BinaryEncoder(cols=categorical_cols)
binary_encoder.fit(df)
df_binary = binary_encoder.transform(df)
df_binary.iloc[:3]

Unnamed: 0,Age,Sex,ChestPain_0,ChestPain_1,ChestPain_2,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal_0,Thal_1,Target
0,63,1,0,0,1,145,233,0,2,150,0,2.3,3,0,0,1,0
1,67,1,0,1,0,160,286,0,2,108,1,1.5,2,2,1,0,1
2,67,1,0,1,0,120,229,0,2,129,1,2.6,2,2,1,1,1


Right now, we have four different datasets, one for each encoding scheme. We are going to split each of them into training and test sets to train our model.

## Splitting Datasets

In [32]:
# Keep each encoded frame next to its own name. Previously the names and the
# frames lived in two separate lists in different orders, so every entry of
# `splits` was stored under the wrong encoding name (e.g. "label" held the
# binary-encoded data).
encoded_frames = {
    "binary": df_binary,
    "freq": df_freq,
    "label": df_label,
    "onehot": df_one_hot,
}
dfs = list(encoded_frames.values())  # same list, same order as before
splits = {}

for name, data in encoded_frames.items():

    # Features are every column except the label.
    X = data.drop("Target", axis=1)
    y = data["Target"]

    # Same test_size and random_state for every encoding.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    splits[name] = {
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test
    }