<a href="https://colab.research.google.com/github/PaulToronto/Hands-on-Maching-Learning-Book/blob/main/2_2_Transformation_Pipelines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformation Pipelines

## Imports

In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import make_blobs

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.compose import make_column_selector

## Data

In [2]:
# Seeded generator so the random categories and the NaN positions below are
# identical on every Restart & Run All (make_blobs is already seeded via
# random_state).
rng = np.random.default_rng(42)

X, _ = make_blobs(
    n_samples=100,
    n_features=4,
    shuffle=True,
    random_state=42
)

# Append two categorical columns. Stacking strings onto the float array is
# why every column shows up as `object` in the info() output of this cell.
X = np.hstack((X, rng.choice(['Apple', 'Orange', 'Strawberry'], size=100).reshape(100, 1)))
X = np.hstack((X, rng.choice(['Carrot', 'Pea', 'Squash', 'Spinach'], size=100).reshape(100, 1)))

df = pd.DataFrame(X, columns=['Red', 'Green', 'Blue', 'Purple', 'Fruit', 'Vegetable'])

# Blank out 1-3 randomly chosen rows in every column so the imputers used
# later have missing values to fill.
for col in df.columns:
    num_nans = rng.integers(1, 4)  # upper bound is exclusive, like randint
    nan_indices = rng.choice(df.index, size=num_nans, replace=False)
    df.loc[nan_indices, col] = np.nan

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Red        98 non-null     object
 1   Green      97 non-null     object
 2   Blue       97 non-null     object
 3   Purple     99 non-null     object
 4   Fruit      99 non-null     object
 5   Vegetable  99 non-null     object
dtypes: object(6)
memory usage: 4.8+ KB


In [3]:
# The four feature columns are still `object` at this point; convert each
# one to float64 so the numeric transformers can work with them.
for numeric_col in ('Red', 'Green', 'Blue', 'Purple'):
    df[numeric_col] = df[numeric_col].astype(np.float64)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Red        98 non-null     float64
 1   Green      97 non-null     float64
 2   Blue       97 non-null     float64
 3   Purple     99 non-null     float64
 4   Fruit      99 non-null     object 
 5   Vegetable  99 non-null     object 
dtypes: float64(4), object(2)
memory usage: 4.8+ KB


In [4]:
# Numeric feature columns only.
df_num = df.loc[:, ['Red', 'Green', 'Blue', 'Purple']]
df_num.head()

Unnamed: 0,Red,Green,Blue,Purple
0,-8.90477,-6.693655,-9.500114,8.175956
1,2.119976,3.388442,-9.5638,9.896195
2,-6.619744,-6.098287,-10.075278,
3,-6.52184,-6.319325,-7.755277,8.377325
4,2.098105,3.48429,-8.61319,9.25114


In [5]:
# Categorical feature columns only.
df_cat = df.loc[:, ['Fruit', 'Vegetable']]
df_cat.head()

Unnamed: 0,Fruit,Vegetable
0,Orange,Squash
1,Orange,Pea
2,Apple,Carrot
3,Apple,Squash
4,Orange,Carrot


## `Pipeline` and `make_pipeline`

- The `Pipeline` constructor takes a list of name/estimator pairs (2-tuples)
- The names must be unique and can't contain a double underscore (`__`)
- The estimators must all be transformers (have a `fit_transform()` method), except for the last one, which can be anything, a transformer, a predictor, or any other type of estimator
- When you call the pipeline's `fit()` method, it calls `fit_transform()` on all the transformers, passing the output of each to the next transformer, until it gets to the last one where it just calls the `fit()` method
- The pipeline exposes the same methods as the final estimator

### Numeric Pipeline

In [6]:
# Explicitly named stages: median imputation followed by standardization.
numeric_steps = [
    ('impute', SimpleImputer(strategy='median')),
    ('standardize', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

num_pipeline

In [7]:
print(num_pipeline)

Pipeline(steps=[('impute', SimpleImputer(strategy='median')),
                ('standardize', StandardScaler())])


- If you don't want to name the transformers, you can use the `make_pipeline()` function instead

In [8]:
# Same two stages as the named pipeline; make_pipeline generates the step
# names from the estimator class names.
numeric_stages = (SimpleImputer(strategy='median'), StandardScaler())
num_pipeline_make = make_pipeline(*numeric_stages)

num_pipeline_make

In [9]:
print(num_pipeline_make)

Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                ('standardscaler', StandardScaler())])


In [10]:
# fit() returns the pipeline itself, so the two steps can be chained;
# only the first five transformed rows are shown.
num_pipeline.fit(df_num).transform(df_num)[:5]

array([[-1.76117018, -1.33362544, -0.7783082 ,  0.58529515],
       [ 1.24509793,  0.19248026, -0.78816717,  1.1016411 ],
       [-1.1380811 , -1.24350576, -0.86734744,  0.4450859 ],
       [-1.11138412, -1.27696387, -0.50819576,  0.64573787],
       [ 1.23913392,  0.20698858, -0.64100651,  0.90802157]])

In [11]:
num_pipeline_make.fit_transform(df_num)[:5]

array([[-1.76117018, -1.33362544, -0.7783082 ,  0.58529515],
       [ 1.24509793,  0.19248026, -0.78816717,  1.1016411 ],
       [-1.1380811 , -1.24350576, -0.86734744,  0.4450859 ],
       [-1.11138412, -1.27696387, -0.50819576,  0.64573787],
       [ 1.23913392,  0.20698858, -0.64100651,  0.90802157]])

### Categorical Pipeline

In [12]:
# Explicitly named stages: fill gaps with the most frequent category, then
# one-hot encode while dropping the first level of each feature.
categorical_steps = [
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first')),
]
cat_pipeline = Pipeline(steps=categorical_steps)

cat_pipeline

In [13]:
print(cat_pipeline)

Pipeline(steps=[('impute', SimpleImputer(strategy='most_frequent')),
                ('onehot', OneHotEncoder(drop='first'))])


In [14]:
# Same two stages as the named categorical pipeline, with auto-generated
# step names.
categorical_stages = (
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(drop='first'),
)
cat_pipeline_make = make_pipeline(*categorical_stages)

cat_pipeline_make

In [15]:
print(cat_pipeline_make)

Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')),
                ('onehotencoder', OneHotEncoder(drop='first'))])


In [16]:
cat_pipeline.fit_transform(df_cat)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 132 stored elements and shape (100, 5)>

In [17]:
cat_pipeline_make.fit_transform(df_cat)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 132 stored elements and shape (100, 5)>

### `get_feature_names_out()`

In [18]:
num_pipeline.get_feature_names_out()

array(['Red', 'Green', 'Blue', 'Purple'], dtype=object)

In [19]:
num_pipeline_make.get_feature_names_out()

array(['Red', 'Green', 'Blue', 'Purple'], dtype=object)

In [20]:
cat_pipeline.get_feature_names_out()

array(['Fruit_Orange', 'Fruit_Strawberry', 'Vegetable_Pea',
       'Vegetable_Spinach', 'Vegetable_Squash'], dtype=object)

In [21]:
cat_pipeline_make.get_feature_names_out()

array(['Fruit_Orange', 'Fruit_Strawberry', 'Vegetable_Pea',
       'Vegetable_Spinach', 'Vegetable_Squash'], dtype=object)

### Recover a `DataFrame`

In [22]:
# The pipeline returns a sparse matrix; densify it and attach the generated
# feature names plus the original row index to get a DataFrame back.
cat_encoded = cat_pipeline.fit_transform(df_cat)
df_cat_prepared = pd.DataFrame(
    data=cat_encoded.toarray(),
    index=df_cat.index,
    columns=cat_pipeline.get_feature_names_out(),
)

df_cat_prepared.head()

Unnamed: 0,Fruit_Orange,Fruit_Strawberry,Vegetable_Pea,Vegetable_Spinach,Vegetable_Squash
0,1.0,0.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0


### Pipeline indexing

In [23]:
# Integer indexing returns the estimator at that position; index 0 is the
# 'impute' step of num_pipeline.
test0 = num_pipeline[0]
test0

In [24]:
test1 = num_pipeline[1]
test1

In [25]:
# First step of the categorical pipeline (the most-frequent imputer).
test2 = cat_pipeline[0]
test2

In [26]:
# Second step of the make_pipeline variant (the one-hot encoder).
test3 = cat_pipeline_make[1]
test3

In [27]:
test2.get_feature_names_out(), test3.get_feature_names_out()

(array(['Fruit', 'Vegetable'], dtype=object),
 array(['x0_Orange', 'x0_Strawberry', 'x1_Pea', 'x1_Spinach', 'x1_Squash'],
       dtype=object))

In [28]:
# Asked on its own, the encoder step reports generic x0_/x1_ prefixes,
# whereas the full pipeline reports the Fruit_/Vegetable_ column names.
test4 = cat_pipeline[1]
test4.get_feature_names_out()

array(['x0_Orange', 'x0_Strawberry', 'x1_Pea', 'x1_Spinach', 'x1_Squash'],
      dtype=object)

In [29]:
cat_pipeline.get_feature_names_out()

array(['Fruit_Orange', 'Fruit_Strawberry', 'Vegetable_Pea',
       'Vegetable_Spinach', 'Vegetable_Squash'], dtype=object)

In [30]:
cat_pipeline_make.get_feature_names_out()

array(['Fruit_Orange', 'Fruit_Strawberry', 'Vegetable_Pea',
       'Vegetable_Spinach', 'Vegetable_Squash'], dtype=object)

In [31]:
# Slicing selects a range of steps; [:-1] is everything except the last one.
num_pipeline[:-1]

### `steps` and `named_steps`

- `steps` is a list of 2-tuples:
    - (name of the step, estimator object)
- `named_steps` is a dictionary-like object (a scikit-learn `Bunch`)
    - keys are the step names
    - values are the estimator objects

In [32]:
num_pipeline.steps

[('impute', SimpleImputer(strategy='median')),
 ('standardize', StandardScaler())]

In [33]:
test5 = num_pipeline.steps

In [34]:
type(test5)

list

In [35]:
test6 = num_pipeline.steps[0]
type(test6)

tuple

In [36]:
test6[1]

In [37]:
# named_steps gives access to the steps by name rather than by position.
test7 = num_pipeline.named_steps
type(test7)

In [38]:
test7['impute']

## `ColumnTransformer` and `make_column_transformer`

- The `ColumnTransformer` constructor takes a list of 3-tuples
    - name: must be unique and contain no double underscores
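    - transformer: an estimator (or pipeline) that supports `fit` and `transform`, or one of the strings `"passthrough"` or `"drop"`
    - columns: a list of names or indices specifying the columns the transformer is applied to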

In [39]:
df_num.columns, df_cat.columns

(Index(['Red', 'Green', 'Blue', 'Purple'], dtype='object'),
 Index(['Fruit', 'Vegetable'], dtype='object'))

In [40]:
# Column groups that each sub-pipeline is applied to.
num_attribs = ['Red', 'Green', 'Blue', 'Purple']
cat_attribs = ['Fruit', 'Vegetable']

# Each entry is a (name, transformer, columns) triple.
column_steps = [
    ('num', num_pipeline, num_attribs),
    ('cat', cat_pipeline, cat_attribs),
]
preprocessing = ColumnTransformer(transformers=column_steps)

preprocessing

In [41]:
print(preprocessing)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('impute',
                                                  SimpleImputer(strategy='median')),
                                                 ('standardize',
                                                  StandardScaler())]),
                                 ['Red', 'Green', 'Blue', 'Purple']),
                                ('cat',
                                 Pipeline(steps=[('impute',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot',
                                                  OneHotEncoder(drop='first'))]),
                                 ['Fruit', 'Vegetable'])])


In [42]:
# make_column_transformer takes (transformer, columns) pairs and generates
# the transformer names itself.
unnamed_column_steps = [
    (num_pipeline_make, num_attribs),
    (cat_pipeline_make, cat_attribs),
]
preprocessing2 = make_column_transformer(*unnamed_column_steps)

preprocessing2

In [43]:
print(preprocessing2)

ColumnTransformer(transformers=[('pipeline-1',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 ['Red', 'Green', 'Blue', 'Purple']),
                                ('pipeline-2',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehotencoder',
                                                  OneHotEncoder(drop='first'))]),
                                 ['Fruit', 'Vegetable'])])


### `"passthrough"`

In [44]:
# 'Vegetable' is handed through untouched via the special 'passthrough'
# transformer; only 'Fruit' goes through the categorical pipeline.
preprocessing3 = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_attribs),
        ('cat', cat_pipeline, ['Fruit']),
        ('pass', 'passthrough', ['Vegetable']),
    ]
)

preprocessing3

In [45]:
print(preprocessing3)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('impute',
                                                  SimpleImputer(strategy='median')),
                                                 ('standardize',
                                                  StandardScaler())]),
                                 ['Red', 'Green', 'Blue', 'Purple']),
                                ('cat',
                                 Pipeline(steps=[('impute',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot',
                                                  OneHotEncoder(drop='first'))]),
                                 ['Fruit']),
                                ('pass', 'passthrough', ['Vegetable'])])


In [46]:
# Fit on the full frame; the transformed array is not kept because only the
# fitted estimator is displayed.
preprocessing3.fit_transform(df)
preprocessing3

In [47]:
preprocessing3.get_feature_names_out()

array(['num__Red', 'num__Green', 'num__Blue', 'num__Purple',
       'cat__Fruit_Orange', 'cat__Fruit_Strawberry', 'pass__Vegetable'],
      dtype=object)

In [48]:
# Same layout as preprocessing3, but built from the make_pipeline variants.
preprocessing4 = ColumnTransformer(
    transformers=[
        ('num', num_pipeline_make, num_attribs),
        ('cat', cat_pipeline_make, ['Fruit']),
        ('pass', 'passthrough', ['Vegetable']),
    ]
)

# Fit on the full frame (the transformed output is not kept) and display
# the fitted transformer.
preprocessing4.fit_transform(df)
preprocessing4

In [49]:
preprocessing4.get_feature_names_out()

array(['num__Red', 'num__Green', 'num__Blue', 'num__Purple',
       'cat__Fruit_Orange', 'cat__Fruit_Strawberry', 'pass__Vegetable'],
      dtype=object)

In [50]:
# 'passthrough' also works with make_column_transformer; the transformer
# names are generated automatically.
passthrough_steps = [
    (num_pipeline, num_attribs),
    (cat_pipeline, ['Fruit']),
    ('passthrough', ['Vegetable']),
]
preprocessing5 = make_column_transformer(*passthrough_steps)

# Fit on the full frame (the transformed output is not kept) and display
# the fitted transformer.
preprocessing5.fit_transform(df)
preprocessing5

In [51]:
print(preprocessing5)

ColumnTransformer(transformers=[('pipeline-1',
                                 Pipeline(steps=[('impute',
                                                  SimpleImputer(strategy='median')),
                                                 ('standardize',
                                                  StandardScaler())]),
                                 ['Red', 'Green', 'Blue', 'Purple']),
                                ('pipeline-2',
                                 Pipeline(steps=[('impute',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot',
                                                  OneHotEncoder(drop='first'))]),
                                 ['Fruit']),
                                ('passthrough', 'passthrough', ['Vegetable'])])


In [52]:
preprocessing5.get_feature_names_out()

array(['pipeline-1__Red', 'pipeline-1__Green', 'pipeline-1__Blue',
       'pipeline-1__Purple', 'pipeline-2__Fruit_Orange',
       'pipeline-2__Fruit_Strawberry', 'passthrough__Vegetable'],
      dtype=object)

### `"drop"`

In [53]:
# 'drop' removes the listed columns from the output entirely.
drop_steps = [
    (num_pipeline, num_attribs),
    (cat_pipeline, ['Fruit']),
    ('drop', ['Vegetable']),
]
preprocessing6 = make_column_transformer(*drop_steps)

# Fit on the full frame (the transformed output is not kept) and display
# the fitted transformer.
preprocessing6.fit_transform(df)
preprocessing6

In [54]:
preprocessing6.get_feature_names_out()

array(['pipeline-1__Red', 'pipeline-1__Green', 'pipeline-1__Blue',
       'pipeline-1__Purple', 'pipeline-2__Fruit_Orange',
       'pipeline-2__Fruit_Strawberry'], dtype=object)

### `make_column_selector`

In [55]:
# Pick columns by dtype instead of listing them by name.
select_numeric = make_column_selector(dtype_include=np.number)
select_object = make_column_selector(dtype_include=object)

preprocessing8 = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, select_numeric),
        ('cat', cat_pipeline, select_object),
    ]
)

# Fit on the full frame (the transformed output is not kept) and display
# the fitted transformer.
preprocessing8.fit_transform(df)
preprocessing8

In [56]:
preprocessing8.get_feature_names_out()

array(['num__Red', 'num__Green', 'num__Blue', 'num__Purple',
       'cat__Fruit_Orange', 'cat__Fruit_Strawberry', 'cat__Vegetable_Pea',
       'cat__Vegetable_Spinach', 'cat__Vegetable_Squash'], dtype=object)

In [57]:
# Numeric columns go to the numeric pipeline; everything that is not
# numeric goes to the categorical pipeline.
select_numeric = make_column_selector(dtype_include=np.number)
select_non_numeric = make_column_selector(dtype_exclude=np.number)

preprocessing9 = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, select_numeric),
        ('cat', cat_pipeline, select_non_numeric),
    ]
)

# Fit on the full frame (the transformed output is not kept) and display
# the fitted transformer.
preprocessing9.fit_transform(df)
preprocessing9

In [58]:
preprocessing9.get_feature_names_out()

array(['num__Red', 'num__Green', 'num__Blue', 'num__Purple',
       'cat__Fruit_Orange', 'cat__Fruit_Strawberry', 'cat__Vegetable_Pea',
       'cat__Vegetable_Spinach', 'cat__Vegetable_Squash'], dtype=object)