## Data Cleaning

### Imputer: `"imputer"`
- Replace missing (`NaN`) values with the column `median` (numeric features) or the `most_frequent` value (categorical features)

In [1]:
from sklearn.impute import SimpleImputer
# Imputer for numeric features: missing entries become the column median.
numeric_imputer = SimpleImputer(strategy="median")
# Imputer for categorical features: missing entries become the most common value.
categorical_imputer = SimpleImputer(strategy="most_frequent")

### OneHotEncoder: `"onehot"`
- Transform string-like (categorical) features into n binary columns of 0s and 1s, one column per category
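- `handle_unknown = 'ignore'` means that categories not seen during fitting (e.g. values that appear only in the test dataset) are ignored, i.e. encoded as all zeros, instead of raising an error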

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encoder for categorical features. With handle_unknown="ignore", a
# category that was not seen during fit is encoded as all zeros at transform
# time instead of raising an error.
ohe = OneHotEncoder(handle_unknown="ignore")

### Pipelines

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector

# Column selection
#
# The two transformers used to be given empty column lists (`[]`), so the
# ColumnTransformer applied them to no columns at all and, with the default
# `remainder='drop'`, produced an output with zero features.
# The columns are now chosen by dtype at fit time. This requires the input to
# be a pandas DataFrame. To control the choice by hand, replace a selector
# with an explicit list of column names; adjust the dtype lists if the frame
# uses other dtypes (e.g. `string` or `bool`).
numeric_features = make_column_selector(dtype_include="number")
categorical_features = make_column_selector(dtype_include=["object", "category"])

# Pipeline 1 (numeric features): impute missing values, then standardize.
# NOTE(review): the imputer step is named 'num', the same as the
# ColumnTransformer entry below, so its parameters are addressed as
# `num__num__<param>`. The name is kept unchanged in case other cells rely on it.
numeric_transformer = Pipeline(steps = [
    ('num', numeric_imputer),
    ('scaler', StandardScaler())
])

# Pipeline 2 (categorical features): impute missing values, then one-hot encode.
# NOTE(review): likewise, the imputer step here is named 'cat'
# (`cat__cat__<param>`); kept unchanged for the same reason.
cat_transformer = Pipeline(steps = [
    ('cat', categorical_imputer),
    ('onehot', ohe),
])

# Pack both pipelines into one transformer that routes each group of columns
# to its own pipeline. Columns matched by neither selector are dropped
# (ColumnTransformer's default `remainder='drop'`).
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, numeric_features),
        ('cat', cat_transformer, categorical_features)
    ]
)