In [21]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from pathlib import Path
import sys
sys.path.append("..")
from utils import ColumnSelector, ColumnDropper, DTypeTransformer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np

In [2]:
# Read the training CSV from the `data` directory that sits next to the
# notebook's working directory.
data_dir = Path(".").absolute().parent / "data"
data = pd.read_csv(data_dir / "train.csv")

# Split features from the target. `y` is deliberately kept as a one-column
# DataFrame (double brackets), not a Series.
X = data.drop(columns="Survived")
y = data[["Survived"]]

# Pin the seed so the train/test partition -- and everything fitted on it --
# is identical after Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [35]:
# Inspect the distinct Pclass values; they determine the category order
# handed to the ordinal encoder.
X["Pclass"].unique()

array([3, 1, 2])

### steps

- select columns
- set dtypes
- encode
- scale
- impute
- model

In [39]:
# Feature groups, one per preprocessing strategy.
NUM_COLS = ["Fare", "Age"]
NOM_COLS = ["Sex", "Embarked"]
ORD_COLS = ["Pclass"]
ALL_COLS = NUM_COLS + NOM_COLS + ORD_COLS

# select columns
c_selector = ColumnSelector(ALL_COLS)

# nominal pipeline: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored rather than raising.
nom_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("encode", OneHotEncoder(handle_unknown="ignore")),
])

# ordinal pipeline: impute BEFORE encoding. The encoder is given an explicit
# category list and keeps the default handle_unknown="error", so a missing
# value reaching it would fail the fit before any later imputer could run.
ord_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("encode", OrdinalEncoder(categories=[[1, 2, 3]])),
])

# numeric pipeline: scale first, then impute the median of the scaled values.
# StandardScaler disregards NaNs when fitting and passes them through, so the
# scaling statistics come from observed values only.
num_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("impute", SimpleImputer(strategy="median")),
])

# Route each feature group through its own sub-pipeline; columns not listed
# are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    [
        ("nom", nom_pipe, NOM_COLS),
        ("ord", ord_pipe, ORD_COLS),
        ("num", num_pipe, NUM_COLS),
    ],
)

# Full preprocessing chain: column selection followed by per-group transforms.
pipe = Pipeline([
    ("col_select", c_selector),
    ("preprocess", preprocessor),
])


In [37]:
?OrdinalEncoder

Init signature:
OrdinalEncoder(
    *,
    categories='auto',
    dtype=<class 'numpy.float64'>,
    handle_unknown='error',
    unknown_value=None,
)
Docstring:
Encode categorical features as an integer array.

The input to this transformer should be an array-like of integers or
strings, denoting the values taken on by categorical (discrete) features.
The features are converted to ordinal integers. This results in
a single column of integers (0 to n_categories - 1) per feature.

Read more in the :ref:`User Guide <preprocessing_categorical_features>`.

.. vers