In [None]:
# Pipeline and hyperparameter tuning

In [1]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
data = load_boston()
X_train, X_test, y_train, y_test = train_test_split(data['data'], data['target'])

The Boston dataset is a small set composed of 506 samples and 13 features used for regression problems.

In [2]:
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge

The pipeline we are going to setup is composed of the following tasks:

   * Data Normalization: in this tutorial we have selected three different normalization methods, including the QuantileTransformer (check out the documentation)..
   * Dimensionality Reduction: we selected Principal Component Analysis (PCA) and a univariate feature selection algorithm as possible candidates.
   * Regression: we apply a simple regularized linear method, although the method is easily extendable to other learning algorithms.


In [7]:
# manual way (scaler, PCA, ridge regression)
scaler = StandardScaler()
pca = PCA()
ridge = Ridge()
X_train = scaler.fit_transform(X_train)
X_train = pca.fit_transform(X_train)
ridge.fit(X_train, y_train)

Ridge()

In [8]:
#simpler way
from sklearn.pipeline import Pipeline
pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('reduce_dim', PCA()),
        ('regressor', Ridge())
        ])

In [9]:
pipe = pipe.fit(X_train, y_train)
print('Testing score: ', pipe.score(X_test, y_test))

Testing score:  -7702.2625132236535


In [10]:
print(pipe.steps[1][1].explained_variance_)

[1.0026455 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455
 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455 1.0026455]


# pipeline Tuning

In [12]:
import numpy as np
n_features_to_test = np.arange(1, 11)

In [13]:
alpha_to_test = 2.0**np.arange(-6, +6)

In [14]:
params = {'reduce_dim__n_components': n_features_to_test,\
              'regressor__alpha': alpha_to_test}

In [17]:
from sklearn.model_selection import GridSearchCV
gridsearch = GridSearchCV(pipe, params, verbose=1).fit(X_train, y_train)
print('Final score is: ', gridsearch.score(X_test, y_test))

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Final score is:  -12806.499263567763
[Parallel(n_jobs=1)]: Done 600 out of 600 | elapsed:    1.6s finished


In [18]:
gridsearch.best_params_

{'reduce_dim__n_components': 10, 'regressor__alpha': 2.0}

# Pipeline Tuning (Advanced Version)

In [22]:
scalers_to_test = [StandardScaler(), RobustScaler(), QuantileTransformer()]

In [23]:
params = {'scaler': scalers_to_test,
        'reduce_dim__n_components': n_features_to_test,
        'regressor__alpha': alpha_to_test}

In [24]:
params = [
        {'scaler': scalers_to_test,
         'reduce_dim': [PCA()],
         'reduce_dim__n_components': n_features_to_test,\
         'regressor__alpha': alpha_to_test},

        {'scaler': scalers_to_test,
         'reduce_dim': [SelectKBest(f_regression)],
         'reduce_dim__k': n_features_to_test,\
         'regressor__alpha': alpha_to_test}
        ]

In [26]:
gridsearch = GridSearchCV(pipe, params, verbose=1,n_jobs=-1).fit(X_train, y_train)
print('Final score is: ', gridsearch.score(X_test, y_test))

Fitting 5 folds for each of 720 candidates, totalling 3600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:    2.1s
Final score is:  -3556.9294478860256
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed:    3.9s finished


In [27]:
gridsearch.best_params_

{'reduce_dim': SelectKBest(score_func=<function f_regression at 0x7f280f397040>),
 'reduce_dim__k': 10,
 'regressor__alpha': 8.0,
 'scaler': RobustScaler()}