In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

# Algorithm Chains and Pipelines

In [2]:
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Breast-cancer data, divided into train and test partitions (seeded split)
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)

# The scaler learns its per-feature minimum and maximum from the training
# partition only; that same fitted scaler is then applied to both partitions
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the SVM on the scaled training data, then display its score on the
# scaled test data
svm = SVC()
svm.fit(X_train_scaled, y_train)
svm.score(X_test_scaled, y_test)

0.95104895104895104

### Parameter Selection with Preprocessing 

In [3]:
from sklearn.model_selection import GridSearchCV
# Illustration only -- do not reuse this pattern. X_train_scaled came from a
# scaler that was fit on all of X_train, so in each of the 5 cross-validation
# splits the held-out fold has already contributed to the scaling.
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    gamma=[0.001, 0.01, 0.1, 1, 10, 100],
)
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5)
grid.fit(X_train_scaled, y_train)
print("best cross-validation accuracy:", grid.best_score_)
print("test set score: ", grid.score(X_test_scaled, y_test))
print("best parameters: ", grid.best_params_)

best cross-validation accuracy: 0.981220657277
test set score:  0.972027972028
best parameters:  {'C': 1, 'gamma': 1}


## The scaler was fit on all of `X_train` before cross-validation, so every validation fold had already influenced the scaling (information leakage)

In [18]:
import os
os.listdir(os.getcwd())

['.gitignore',
 '.ipynb_checkpoints',
 '01 Review of Supervised Learning.ipynb',
 '02 Pipelines.ipynb',
 '03 Evaluation Metrics.ipynb',
 '04 working with text data.ipynb',
 '5 Out Of Core Learning.ipynb',
 '6 Custom Estimators.ipynb',
 'data',
 'extra - Feature engineering, feature selection.ipynb',
 'extra - feature extraction with NMF and PCA.ipynb',
 'extra - Gradient Boosting.ipynb',
 'extra - imbalanced datasets.ipynb',
 'extra - missing values.ipynb',
 'extra - Neural Networks.ipynb',
 'extra - Outlier Detection.ipynb',
 'extra - topic modelling.ipynb',
 'figures',
 'mglearn',
 'plots',
 'robust_pca.py',
 'solutions',
 'tree_plotting.py']

In [21]:
from scipy.misc import imread

ImportError: cannot import name 'imread'

In [20]:
import mglearn
# NOTE(review): the recorded output of this cell is an ImportError for
# 'imread' -- the same name the earlier `from scipy.misc import imread` cell
# fails on -- so the failure presumably comes from mglearn's own imports;
# confirm before relying on this plot.
mglearn.plots.plot_improper_processing()

ImportError: cannot import name 'imread'

### Building Pipelines

In [7]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC())])

In [8]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [9]:
pipe.score(X_test, y_test)

0.95104895104895104

### Using Pipelines in Grid-searches

In [10]:
param_grid = {'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],
              'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100]}

In [11]:
# Searching over the whole pipeline means the scaler is refit together with
# the SVM on the training portion of every cross-validation split, so the
# unscaled X_train is passed in and a held-out fold never shapes the scaling.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)
print("best cross-validation accuracy:", grid.best_score_)
print("test set score: ", grid.score(X_test, y_test))
print("best parameters: ", grid.best_params_)

best cross-validation accuracy: 0.981220657277
test set score:  0.972027972028
best parameters:  {'svm__C': 1, 'svm__gamma': 1}


In [22]:
mglearn.plots.plot_proper_processing()

NameError: name 'mglearn' is not defined

In [27]:
(myenv)

NameError: name 'myenv' is not defined

## Any step that learns from the data must be fit inside cross-validation, otherwise the scores are over-optimistic

In [23]:
# Synthetic regression problem made of pure noise: features and target are
# drawn independently from one seeded generator (features first, then target)
n_samples, n_features = 100, 10000
rnd = np.random.RandomState(0)
X = rnd.normal(size=(n_samples, n_features))
y = rnd.normal(size=n_samples)

In [None]:
from sklearn.feature_selection import SelectPercentile, f_regression

# Score every feature against y using all samples and keep the top 5 percent.
# Note that the selector is fit here, before any cross-validation split exists.
select = SelectPercentile(score_func=f_regression, percentile=5)
select.fit(X, y)
X_selected = select.transform(X)
print(X_selected.shape)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
np.mean(cross_val_score(Ridge(), X_selected, y, cv=5))

In [None]:
# Same selector and model, but wrapped in a pipeline and given the full X:
# cross_val_score fits the whole pipeline (selection included) per split
pipe = Pipeline(steps=[
    ("select", SelectPercentile(score_func=f_regression, percentile=5)),
    ("ridge", Ridge()),
])
scores = cross_val_score(pipe, X, y, cv=5)
np.mean(scores)

### The General Pipeline Interface

In [None]:
def fit(self, X, y):
    """Fit all steps in order, feeding each step the previous step's output.

    ``self.steps`` is a sequence of ``(name, estimator)`` entries. Every
    entry except the last has ``fit_transform(data, y)`` called on it and
    its return value becomes the data for the next entry. The last entry
    has ``fit(data, y)`` called on the fully transformed data.

    Parameters
    ----------
    X : input data handed to the first step
    y : targets, passed unchanged to every step

    Returns
    -------
    self
    """
    intermediate, final = self.steps[:-1], self.steps[-1]
    data = X
    for entry in intermediate:
        transformer = entry[1]
        # fit this transformer and immediately push the data through it
        data = transformer.fit_transform(data, y)
    # the final estimator only gets fit, never transformed through
    final[1].fit(data, y)
    return self

In [None]:
def predict(self, X):
    """Transform X through all but the last step, then predict with the last.

    ``self.steps`` is a sequence of ``(name, estimator)`` entries. Every
    entry except the last has ``transform(data)`` called on it; nothing is
    fit here. The return value is whatever the last entry's ``predict``
    returns for the transformed data.

    Parameters
    ----------
    X : input data handed to the first step

    Returns
    -------
    The result of the final step's ``predict`` call.
    """
    intermediate, final = self.steps[:-1], self.steps[-1]
    data = X
    for entry in intermediate:
        transformer = entry[1]
        # apply the transformer as-is (transform only)
        data = transformer.transform(data)
    # predict with the last step on the transformed data
    return final[1].predict(data)

![pipeline_illustration](figures/pipeline.svg)

### Convenient Pipeline creation with ``make_pipeline``

In [None]:
from sklearn.pipeline import make_pipeline
# standard syntax
pipe_long = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC(C=100))])
# abbreviated syntax
pipe_short = make_pipeline(MinMaxScaler(), SVC(C=100))

In [None]:
pipe_short.steps

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

pipe = make_pipeline(StandardScaler(), PCA(n_components=2), StandardScaler())
pipe.steps

#### Accessing step attributes

In [None]:
# Fit the pipeline defined above on the full cancer data (no targets passed)
pipe.fit(cancer.data)
# Look up the step registered under the name "pca" and read the component
# matrix it learned during fit
pca_step = pipe.named_steps["pca"]
components = pca_step.components_
print(components.shape)

#### Accessing attributes in grid-searched pipeline.

In [None]:
pipe.named_steps.pca.components_

In [None]:
from sklearn.linear_model import LogisticRegression

pipe = make_pipeline(StandardScaler(), LogisticRegression())

In [None]:
param_grid = {'logisticregression__C': [0.01, 0.1, 1, 10, 100]}

In [None]:
# Fresh seeded split of the cancer data, then a 5-fold grid search over the
# pipeline; the fitted search object is the cell's displayed value
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=4)
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

In [None]:
print(grid.best_estimator_)

In [None]:
print(grid.best_estimator_.named_steps["logisticregression"])

In [None]:
print(grid.best_estimator_.named_steps["logisticregression"].coef_)

### Grid-searching preprocessing steps and model parameters

In [None]:
from sklearn.datasets import load_boston
# NOTE(review): load_boston is no longer available in recent scikit-learn
# releases (deprecated in 1.0, removed in 1.2) -- confirm the pinned version.
boston = load_boston()
X_train, X_test, y_train, y_test = train_test_split(
    boston.data, boston.target, random_state=0)

from sklearn.preprocessing import PolynomialFeatures
# Three steps in order: standardize, expand into polynomial features, ridge
pipe = make_pipeline(StandardScaler(), PolynomialFeatures(), Ridge())

In [None]:
param_grid = {'polynomialfeatures__degree': [1, 2, 3],
              'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

In [None]:
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1, return_train_score=True)
grid.fit(X_train, y_train)

In [None]:
# Reshape the search results so each (degree, alpha) combination is one row
# holding its mean train and mean test score
res = pd.DataFrame(grid.cv_results_).pivot_table(
    index=['param_polynomialfeatures__degree', 'param_ridge__alpha'],
    values=['mean_train_score', 'mean_test_score'],
)

In [None]:
res['mean_train_score'].unstack()

In [None]:
print(grid.best_params_)

In [None]:
grid.score(X_test, y_test)

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Treat whole pipeline steps as searchable parameters: each scaler (or None
# for the scaler slot), each regressor, and seven alphas from 1e-3 to 1e3
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('regressor', Ridge())])
param_grid = dict(
    scaler=[StandardScaler(), MinMaxScaler(), None],
    regressor=[Ridge(), Lasso()],
    regressor__alpha=np.logspace(-3, 3, 7),
)
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)
grid.score(X_test, y_test)

In [None]:
grid.best_params_

# Exercise

Load the Boston housing dataset using ``sklearn.datasets.load_boston``. Create a pipeline using scaling, polynomial features and a linear regression model (like ridge or lasso).

Run a grid search over the polynomial-feature options together with the regularization strength of the linear model to find the best combination.