## 2.1 Handling missing values

In [29]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold

In [19]:
hd = pd.read_csv(r"C:\Users\muzam\OneDrive\Desktop\PROJECTS\Resources\datasets\heart.csv")
hd_df = pd.DataFrame(hd)

In [20]:
hd_df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

No missing values so we proceed with the next stage

## 2.2 Categorical Encoding

In [21]:
hd_df = hd_df.select_dtypes(include="integer")
# Dataset is already fully numeric
hd_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,0,0,3,0
3,61,1,0,148,203,0,1,161,0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1,3,2,0


### OneHotEncoding 

In [23]:
hd_ohe = pd.get_dummies(hd_df)
hd_ohe

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,0,0,3,0
3,61,1,0,148,203,0,1,161,0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,2,0,2,1


## 2.3 Feature Scaling

* Prevent the model from being confused, and treat all features as equal 
* Through conversion into a common (0-1) range
* Implemented using MinMaxScaler

In [28]:
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(hd_df)
scaled_df

array([[0.47916667, 1.        , 0.        , ..., 0.5       , 1.        ,
        0.        ],
       [0.5       , 1.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.85416667, 1.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.375     , 1.        , 0.        , ..., 0.25      , 0.66666667,
        0.        ],
       [0.4375    , 0.        , 0.        , ..., 0.        , 0.66666667,
        1.        ],
       [0.52083333, 1.        , 0.        , ..., 0.25      , 1.        ,
        0.        ]])

In [32]:
var_thres = VarianceThreshold(0)
var_thres.fit(scaled_df)

In [35]:
# Boolean mask of selected features
mask = var_thres.get_support()
mask

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

ALl columns selected due to variability, none dropped as none have 0 variance. All are important