In [1]:
import pandas as pd
import numpy as np

# `display` is imported from the public IPython.display module; importing it
# from IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML
# Widen the notebook page container to 95% of the browser window
display(HTML("<style>.container { width:95% !important; }</style>"))

from pprint import pprint

In [2]:
# Toy training frame: a categorical column (including an 'Unknown' level),
# a numeric column with one missing value, and a city column.
df = pd.DataFrame(
    data=dict(
        gender=['Female', 'Male', 'Male', 'Female', 'Unknown'],
        age=[22, None, 33, 44, 28],
        city=['Hamburg', 'Warsaw', 'Beijin', 'Hamburg', 'Oslo'],
    )
)
# Single-row test frame whose age is missing
df_test = pd.DataFrame(
    data=dict(gender=['Female'], age=[np.nan], city=['Hamburg'])
)

In [3]:
# Inspect the training frame
df

Unnamed: 0,gender,age,city
0,Female,22.0,Hamburg
1,Male,,Warsaw
2,Male,33.0,Beijin
3,Female,44.0,Hamburg
4,Unknown,28.0,Oslo


In [4]:
# Inspect the single-row test frame
df_test

Unnamed: 0,gender,age,city
0,Female,,Hamburg


In [5]:
from sklearn.pipeline import make_pipeline, make_union, Pipeline, FeatureUnion

# Feature union

- Concatenates the results of multiple transformer objects.
- `make_union` is a convenience function for building a `FeatureUnion`.
- It combines several transformer objects into a new transformer that combines their output.
- The transformers are applied in parallel, each to the same input.


In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
# Two branches over the same input: constant imputation (fill value 1) and
# standard scaling; their outputs are concatenated column-wise.
union = make_union(
    SimpleImputer(strategy='constant', fill_value=1), StandardScaler(),
    verbose=True,
)

In [10]:
# Fit both branches on the age column, passed as an (n_rows, 1) array
union.fit_transform(df[['age']].values)

[FeatureUnion] . (step 1 of 2) Processing simpleimputer, total=   0.0s
[FeatureUnion]  (step 2 of 2) Processing standardscaler, total=   0.0s


array([[22.        , -1.20759819],
       [ 1.        ,         nan],
       [33.        ,  0.15482028],
       [44.        ,  1.51723875],
       [28.        , -0.46446084]])

In [11]:
# Apply the already-fitted union to the test row's age column
union.transform(df_test[['age']].values)

array([[ 1., nan]])

# Pipeline
- Sequentially apply a list of transforms and a final estimator
- The final estimator only needs to implement fit
- The transformers in the pipeline can be cached using the `memory` argument.
- The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.

In [19]:
from sklearn.base import TransformerMixin, BaseEstimator
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts column(s) of a DataFrame as a 2-D array.

    Parameters
    ----------
    name : column label or list of column labels
        A single label yields an ``(n_rows, 1)`` array; a list yields one
        output column per label, in the order given.
    """

    def __init__(self, name):
        self.name = name

    def fit(self, df, *args, **kwargs):
        """Nothing is learned; return ``self`` so the selector can be a pipeline step."""
        return self

    def transform(self, df, *args, **kwargs):
        """Return the selected column(s) of ``df`` as a 2-D NumPy array."""
        # Only a list means "several columns"; any other value is treated as
        # one label and wrapped so the result is always two-dimensional.
        columns = self.name if isinstance(self.name, list) else [self.name]
        return df[columns].values
    

In [29]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer, ColumnTransformer, make_column_selector

import inspect

# Directory handed to the pipelines' `memory` argument
cache_dir = './sklearn_cache'

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` flag to `sparse_output`
# and 1.4 removed the old name, so use whichever keyword this install accepts.
_dense_flag = (
    'sparse_output'
    if 'sparse_output' in inspect.signature(OneHotEncoder).parameters
    else 'sparse'
)

# Gender branch: select the column, then one-hot encode it into a dense array
gender_f = make_pipeline(
    FeatureSelector('gender'),
    OneHotEncoder(**{_dense_flag: False}),
    verbose=True,
    memory=cache_dir
)

In [30]:
# Fit the gender pipeline on the training frame and show the encoded matrix
gender_f.fit_transform(df)

[Pipeline] ..... (step 2 of 2) Processing onehotencoder, total=   0.0s


array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [31]:
# Encode the test row with the already-fitted gender pipeline
gender_f.transform(df_test)

array([[1., 0., 0.]])

In [32]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing  import StandardScaler, Normalizer
# Age branch: select the column, fill missing values with the median,
# then standardise.
age_f = make_pipeline(
    FeatureSelector('age'), SimpleImputer(strategy='median'), StandardScaler(),
    memory=cache_dir, verbose=True,
)

In [33]:
# Fit the age pipeline on the training frame and show the scaled column
age_f.fit_transform(df)

[Pipeline] ... (step 1 of 3) Processing featureselector, total=   0.0s
[Pipeline] ..... (step 2 of 3) Processing simpleimputer, total=   0.0s
[Pipeline] .... (step 3 of 3) Processing standardscaler, total=   0.0s


array([[-1.31237504],
       [-0.13814474],
       [ 0.20721711],
       [ 1.72680926],
       [-0.48350659]])

In [36]:
# Combine the gender and age pipelines into a single feature union
features = make_union(
    gender_f,
    age_f,
    verbose=True,
)

In [37]:
# Fit every branch on the training frame and show the combined feature matrix
features.fit_transform(df)

[Pipeline] ..... (step 2 of 2) Processing onehotencoder, total=   0.0s
[FeatureUnion] .... (step 1 of 2) Processing pipeline-1, total=   0.0s
[Pipeline] ..... (step 2 of 3) Processing simpleimputer, total=   0.0s
[Pipeline] .... (step 3 of 3) Processing standardscaler, total=   0.0s
[FeatureUnion] .... (step 2 of 2) Processing pipeline-2, total=   0.0s


array([[ 1.        ,  0.        ,  0.        , -1.31237504],
       [ 0.        ,  1.        ,  0.        , -0.13814474],
       [ 0.        ,  1.        ,  0.        ,  0.20721711],
       [ 1.        ,  0.        ,  0.        ,  1.72680926],
       [ 0.        ,  0.        ,  1.        , -0.48350659]])

In [42]:
from sklearn.utils import estimator_html_repr
# HTML is taken from the public IPython.display module rather than the
# internal IPython.core.display path.
from IPython.display import HTML
# Render the structure of the combined estimator as an HTML diagram
HTML(estimator_html_repr(features))

# ColumnSelector

# Scaling
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler

# Combining estimators

https://scikit-learn.org/stable/modules/compose.html#combining-estimators

In [None]:
# 