# Building Pipelines

In [1]:
# import and split the dataset

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the breast-cancer dataset and carve out a held-out test split.
# random_state=0 pins the shuffle so reruns produce the same partition.
cancer = load_breast_cancer()
features, labels = cancer.data, cancer.target
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    random_state=0,
)

In [2]:
# building the pipeline

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler

# Chain the scaler and the classifier into one estimator, then fit the whole
# chain on the training split and report the score on the held-out split.
pipe = Pipeline(steps=[('scaler', MinMaxScaler()), ('svm', SVC())])
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.972027972027972

# Using Pipelines in Grid Search

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler

# Reload and split the data so this cell can be run on its own.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)

# Keys use the `<step name>__<parameter>` form, so `svm__C` and `svm__gamma`
# are routed to the pipeline step registered under the name 'svm'.
param_grid = {'svm__C': [0.001, 0.01, 0.1, 1, 10, 100], 'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100]}

# Pass the pipeline to the search *unfitted*: GridSearchCV clones the estimator
# and fits the clones itself, so fitting `pipe` beforehand would be discarded work.
pipe = Pipeline([('scaler', MinMaxScaler()), ('svm', SVC())])
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

print('Best cross-validation score: {:.2f}'.format(grid.best_score_))
print('Test set score: {:.2f}'.format(grid.score(X_test, y_test)))
print('Best parameters: {}'.format(grid.best_params_))

Best cross-validation score: 0.98
Test set score: 0.97
Best parameters: {'svm__C': 1, 'svm__gamma': 1}


# Convenient Pipeline Creation with `make_pipeline`

In [5]:
from sklearn.pipeline import make_pipeline

# make_pipeline needs no explicit step names; inspecting `.steps` shows the
# names it generated for each estimator.
scaler_step = MinMaxScaler()
svm_step = SVC(C=100)
pipe = make_pipeline(scaler_step, svm_step)
pipe.steps

[('minmaxscaler', MinMaxScaler()), ('svc', SVC(C=100))]