<a href="https://colab.research.google.com/github/PaulToronto/Hands-on-Maching-Learning-Book/blob/main/2_2_Transformation_Pipelines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformation Pipelines

## Imports

In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import make_blobs

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.compose import make_column_selector

## Data

In [2]:
X, _ = make_blobs(
    n_samples=100,
    n_features=4,
    shuffle=True,
    random_state=42
)

X = np.hstack((X, np.random.choice(['Apple', 'Orange', 'Strawberry'], size=100).reshape(100, 1)))
X = np.hstack((X, np.random.choice(['Carrot', 'Pea', 'Squash', 'Spinach'], size=100).reshape(100, 1)))

df = pd.DataFrame(X, columns=['Red', 'Green', 'Blue', 'Purple', 'Fruit', 'Vegetable'])

for col in df.columns:
    num_nans = np.random.randint(1, 4)
    nan_indices = np.random.choice(df.index, size=num_nans, replace=False)
    df.loc[nan_indices, col] = np.nan

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Red        98 non-null     object
 1   Green      99 non-null     object
 2   Blue       99 non-null     object
 3   Purple     99 non-null     object
 4   Fruit      97 non-null     object
 5   Vegetable  97 non-null     object
dtypes: object(6)
memory usage: 4.8+ KB


In [3]:
df_num = df[['Red', 'Green', 'Blue', 'Purple']]
df_num.head()

Unnamed: 0,Red,Green,Blue,Purple
0,-8.904769777808877,-6.693655278506519,-9.5001142214044,8.175956250294927
1,2.1199763334130077,3.3884417720654425,-9.563799939825008,9.896195334485336
2,-6.619744396902846,-6.098286721498637,-10.075278467514092,6.003066302414426
3,-6.521839830802987,-6.319325066907712,-7.755276513460734,8.377324967533607
4,2.098104793057902,3.484289844408797,-8.6131903806662,9.251139661737747


In [4]:
df_cat = df[['Fruit', 'Vegetable']]
df_cat.head()

Unnamed: 0,Fruit,Vegetable
0,Orange,Pea
1,Orange,
2,Strawberry,Carrot
3,Orange,Carrot
4,Orange,Pea


## `Pipeline` and `make_pipeline`

- The `Pipeline` constructor takes a list of name/estimator pairs (2-tuples)
- The names must be unique and can't contain double undersore, __
- The estimators must all be transformers (have a `fit_transform()` method), except for the last one, which can be anything, a transformer, a predictor, or any other type of estimator
- When you call teh pipeline's `fit()` method, it calls `fit_transform()` on all the transformers, passing the output of each to the next transformer, until it gets to the last one where it just calls the `fit()` method
- The pipeline exposes the same methods as the final estimator

### Numeric Pipeline

In [5]:
num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('standardize', StandardScaler())
])

num_pipeline

In [6]:
print(num_pipeline)

Pipeline(steps=[('impute', SimpleImputer(strategy='median')),
                ('standardize', StandardScaler())])


- If you don't want to name the tranformers, you can use the `make_pipeline()` function instead

In [7]:
num_pipeline_make = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)

num_pipeline_make

In [8]:
print(num_pipeline_make)

Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                ('standardscaler', StandardScaler())])


In [9]:
num_pipeline.fit(df_num)
num_pipeline.transform(df_num)[:5]

array([[-1.76388229, -1.33415573, -0.75479866,  0.57811574],
       [ 1.24501113,  0.18481718, -0.76470406,  1.09835442],
       [-1.14024907, -1.24445726, -0.84425716, -0.07901467],
       [-1.11352878, -1.27775899, -0.48341434,  0.63901413],
       [ 1.23904191,  0.19925769, -0.61685046,  0.90327519]])

In [10]:
num_pipeline_make.fit_transform(df_num)[:5]

array([[-1.76388229, -1.33415573, -0.75479866,  0.57811574],
       [ 1.24501113,  0.18481718, -0.76470406,  1.09835442],
       [-1.14024907, -1.24445726, -0.84425716, -0.07901467],
       [-1.11352878, -1.27775899, -0.48341434,  0.63901413],
       [ 1.23904191,  0.19925769, -0.61685046,  0.90327519]])

### Categorical Pipeline

In [11]:
cat_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first'))
])

cat_pipeline

In [12]:
print(cat_pipeline)

Pipeline(steps=[('impute', SimpleImputer(strategy='most_frequent')),
                ('onehot', OneHotEncoder(drop='first'))])


In [13]:
cat_pipeline_make = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(drop='first')
)

cat_pipeline_make

In [14]:
print(cat_pipeline_make)

Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')),
                ('onehotencoder', OneHotEncoder(drop='first'))])


In [15]:
cat_pipeline.fit_transform(df_cat)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 145 stored elements and shape (100, 5)>

In [16]:
cat_pipeline_make.fit_transform(df_cat)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 145 stored elements and shape (100, 5)>

### `get_feature_names_out()`

In [17]:
num_pipeline.get_feature_names_out()

array(['Red', 'Green', 'Blue', 'Purple'], dtype=object)

In [18]:
num_pipeline_make.get_feature_names_out()

array(['Red', 'Green', 'Blue', 'Purple'], dtype=object)

In [19]:
cat_pipeline.get_feature_names_out()

array(['Fruit_Orange', 'Fruit_Strawberry', 'Vegetable_Pea',
       'Vegetable_Spinach', 'Vegetable_Squash'], dtype=object)

In [20]:
cat_pipeline_make.get_feature_names_out()

array(['Fruit_Orange', 'Fruit_Strawberry', 'Vegetable_Pea',
       'Vegetable_Spinach', 'Vegetable_Squash'], dtype=object)

### Recover a `DataFrame`

In [21]:
df_cat_prepared = cat_pipeline.fit_transform(df_cat)
df_cat_prepared = pd.DataFrame(
    df_cat_prepared.toarray(),
    columns = cat_pipeline.get_feature_names_out(),
    index=df_cat.index
)

df_cat_prepared.head()

Unnamed: 0,Fruit_Orange,Fruit_Strawberry,Vegetable_Pea,Vegetable_Spinach,Vegetable_Squash
0,1.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0


### Pipeline indexing