# Model Training

In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder , OrdinalEncoder
from sklearn.preprocessing import StandardScaler , MinMaxScaler
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv("Heart.csv")

In [3]:
df.head()

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,Target
0,63,1,typical,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2,reversable,1
3,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,nontypical,130,204,0,2,172,0,1.4,1,0,normal,0


In [4]:
df["Oldpeak"].unique()

array([2.3, 1.5, 2.6, 3.5, 1.4, 0.8, 3.6, 0.6, 3.1, 0.4, 1.3, 0. , 0.5,
       1.6, 1. , 1.2, 0.2, 1.8, 3.2, 2.4, 2. , 2.5, 2.2, 2.8, 3. , 3.4,
       6.2, 4. , 5.6, 2.9, 0.1, 2.1, 1.9, 4.2, 0.9, 1.1, 3.8, 0.7, 0.3,
       4.4])

## Cleaning Data

In [5]:
df = df.dropna()

In [6]:
numeric_df = df.select_dtypes(include=['int64', 'float64'])
numeric_cols = numeric_df.columns 

for col in numeric_cols:
    Q1, Q3 = df[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR

    df[col] = np.clip(df[col], lower_limit, upper_limit)

## Handling Categorical Data 

In [7]:
print(df['ChestPain'].unique())
print(df['Thal'].unique())

['typical' 'asymptomatic' 'nonanginal' 'nontypical']
['fixed' 'normal' 'reversable']


### Which Encoding types should I use?

    In our dataset we can see that Thal , RestECG, Slope , Ca and ChestPain columns are categorical columns. Even though some of them can look like numerical. (like RestECG, Slope , Ca) They doesn't have meanings like numerical columns. We have to encode them. 

1. For Thal - RestECG - Chestpain I'm gonna use one_hot and label for different models.
2. Slope has a real order. So I'm gonna use ordinal encoding.
3. Ca has many categories so I decided to use Frequency encoding for this.

In [8]:
df_one_hot = pd.get_dummies(df , columns = ["Thal", "RestECG" , "ChestPain"] , dtype = int)
df_one_hot.head(3)

Unnamed: 0,Age,Sex,RestBP,Chol,Fbs,MaxHR,ExAng,Oldpeak,Slope,Ca,...,Thal_fixed,Thal_normal,Thal_reversable,RestECG_0,RestECG_1,RestECG_2,ChestPain_asymptomatic,ChestPain_nonanginal,ChestPain_nontypical,ChestPain_typical
0,63,1,145,233,0,150,0,2.3,3,0.0,...,1,0,0,0,0,1,0,0,0,1
1,67,1,160,286,0,108,1,1.5,2,2.5,...,0,1,0,0,0,1,1,0,0,0
2,67,1,120,229,0,129,1,2.6,2,2.0,...,0,0,1,0,0,1,1,0,0,0


In [9]:
le = LabelEncoder()
df_label = df.copy()

for col in ["Thal" , "RestECG" , "ChestPain" ]:
    df_label[col] = le.fit_transform(df[col])

df_label.head(3)

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,Target
0,63,1,3,145,233,0,2,150,0,2.3,3,0.0,0,0
1,67,1,0,160,286,0,2,108,1,1.5,2,2.5,1,1
2,67,1,0,120,229,0,2,129,1,2.6,2,2.0,2,1


In [21]:
ordinal_encoder = OrdinalEncoder()
datasets = {
    "label": df_label,
    "onehot": df_one_hot
}
freq_encoding = df['Ca'].value_counts(normalize = False)
for name in datasets:
    datasets[name]['Slope'] = ordinal_encoder.fit_transform(datasets[name][['Slope']])
    datasets[name]['Ca'] = datasets[name]['Ca'].map(freq_encoding)

    Right now, we have two different datasets. We are going to split both of them into training and test sets to train our model.

## Spliting Datasets

In [22]:
splits = {}
for name, data in datasets.items():

    X = data.drop("Target", axis=1)
    y = data["Target"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=111 if name=="label" else 100
    )

    splits[name] = {
        "X_train": X_train.copy(),
        "X_test": X_test.copy(),
        "y_train": y_train,
        "y_test": y_test
    }

## Feature Scaling

In [23]:
continuous_cols = ['Age', 'RestBP', 'Chol', 'MaxHR', 'Oldpeak']

standard = StandardScaler()
minmax = MinMaxScaler()

for name, data in splits.items():
    scaler = standard if name == "label" else minmax
    
    data["X_train"][continuous_cols] = scaler.fit_transform(data["X_train"][continuous_cols])
    data["X_test"][continuous_cols] = scaler.transform(data["X_test"][continuous_cols])

## Model Training 