In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier as rfc

# Pipelines

In [8]:
cat_columns=[]
num_columns=[]

categorical_pipe = Pipeline([
    ('imputer', SimpleImputer()),
    ('onehot', OneHotEncoder())
])

numeric_pipe = Pipeline([
    ('imputer', SimpleImputer(fill_value=0)),
    ('scaler', StandardScaler())
])

preprocess_pipe = ColumnTransformer([
    ('cat', categorical_pipe, cat_columns),
    ('num', numeric_pipe, num_columns)])


Les pipelines forment une arborescence de methodes et de paramêtres.

In [9]:
preprocess_pipe

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('cat',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0)),
                                                 ('onehot',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                            

Les Pipelines exposent l'ensemble des parametres de chaque 'sous pipeline'. On peut utiliser set_param pour les modifier.
Chaque hyperparametre peut etre utiliser dans un gridsearch.

In [10]:
print(preprocess_pipe.get_params().keys())

dict_keys(['n_jobs', 'remainder', 'sparse_threshold', 'transformer_weights', 'transformers', 'verbose', 'cat', 'num', 'cat__memory', 'cat__steps', 'cat__verbose', 'cat__imputer', 'cat__onehot', 'cat__imputer__add_indicator', 'cat__imputer__copy', 'cat__imputer__fill_value', 'cat__imputer__missing_values', 'cat__imputer__strategy', 'cat__imputer__verbose', 'cat__onehot__categories', 'cat__onehot__drop', 'cat__onehot__dtype', 'cat__onehot__handle_unknown', 'cat__onehot__sparse', 'num__memory', 'num__steps', 'num__verbose', 'num__imputer', 'num__scaler', 'num__imputer__add_indicator', 'num__imputer__copy', 'num__imputer__fill_value', 'num__imputer__missing_values', 'num__imputer__strategy', 'num__imputer__verbose', 'num__scaler__copy', 'num__scaler__with_mean', 'num__scaler__with_std'])


On peut changer les paramatres défini par défaut

In [11]:
print(preprocess_pipe.get_params()['num__imputer__fill_value'])

0


In [12]:
preprocess_pipe.set_params(num__imputer__fill_value=1)
print(preprocess_pipe.get_params()['num__imputer__fill_value'])

1


pipeline complet avec preprocessing et modele

In [13]:
model = rfc()

full_pipe = Pipeline(
    [('pp', preprocess_pipe),
    ('rfc', model)]
)

In [14]:
print(full_pipe.get_params().keys())

dict_keys(['memory', 'steps', 'verbose', 'pp', 'rfc', 'pp__n_jobs', 'pp__remainder', 'pp__sparse_threshold', 'pp__transformer_weights', 'pp__transformers', 'pp__verbose', 'pp__cat', 'pp__num', 'pp__cat__memory', 'pp__cat__steps', 'pp__cat__verbose', 'pp__cat__imputer', 'pp__cat__onehot', 'pp__cat__imputer__add_indicator', 'pp__cat__imputer__copy', 'pp__cat__imputer__fill_value', 'pp__cat__imputer__missing_values', 'pp__cat__imputer__strategy', 'pp__cat__imputer__verbose', 'pp__cat__onehot__categories', 'pp__cat__onehot__drop', 'pp__cat__onehot__dtype', 'pp__cat__onehot__handle_unknown', 'pp__cat__onehot__sparse', 'pp__num__memory', 'pp__num__steps', 'pp__num__verbose', 'pp__num__imputer', 'pp__num__scaler', 'pp__num__imputer__add_indicator', 'pp__num__imputer__copy', 'pp__num__imputer__fill_value', 'pp__num__imputer__missing_values', 'pp__num__imputer__strategy', 'pp__num__imputer__verbose', 'pp__num__scaler__copy', 'pp__num__scaler__with_mean', 'pp__num__scaler__with_std', 'rfc__boots