In [1]:
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = data.data
y = data.target

In [3]:
from sklearn.model_selection import ShuffleSplit
ss = ShuffleSplit(n_splits=1, 
                  train_size=0.8, 
                  test_size=0.2, 
                  random_state=0)

train_index, test_index = next(ss.split(X, y))

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

In [4]:
from sklearn.decomposition import PCA

pca = PCA(whiten=True)
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca  = pca.transform(X_test)

In [5]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver='liblinear')

In [6]:
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.956140350877193

In [8]:
from sklearn.pipeline import Pipeline

estimators = [('pca', PCA(whiten=True)),
              ('clf', LogisticRegression())]
pipe = Pipeline(estimators)

In [9]:
pipe.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('pca',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=True)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [10]:
pipe.score(X_test, y_test)

0.9649122807017544

In [11]:
estimators = [('pca', PCA(whiten=True)), 
              ('clf', LogisticRegression(solver='liblinear'))]
pipe = Pipeline(estimators)

In [12]:
from sklearn.model_selection import GridSearchCV

param = {'clf__C':[1e-5, 1e-3, 1e-2, 1, 1e2, 1e5, 1e10]} # clf.C

gs = GridSearchCV(pipe, param)
gs.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=True)),
                                       ('clf',
                                        LogisticRegression(C=1.0,
                                                           class_weight=None,
                                                           dual=False,
                                                           fit_intercept=True,
                                                           intercept_scaling=1,
                                                           l1_ratio=None,
                     

In [13]:
gs.best_params_, gs.best_score_, gs.best_estimator_

({'clf__C': 1}, 0.9560439560439561, Pipeline(memory=None,
          steps=[('pca',
                  PCA(copy=True, iterated_power='auto', n_components=None,
                      random_state=None, svd_solver='auto', tol=0.0,
                      whiten=True)),
                 ('clf',
                  LogisticRegression(C=1, class_weight=None, dual=False,
                                     fit_intercept=True, intercept_scaling=1,
                                     l1_ratio=None, max_iter=100,
                                     multi_class='warn', n_jobs=None,
                                     penalty='l2', random_state=None,
                                     solver='liblinear', tol=0.0001, verbose=0,
                                     warm_start=False))],
          verbose=False))

In [14]:
gs.score(X_test, y_test)

0.9649122807017544

In [16]:
from sklearn.svm import SVC

C_range = [1e-3,1e-2,1,1e2,1e3]

param = {'clf__C':C_range,
         'clf__kernel':['linear','rbf'],
         'pca__whiten':[True, False],
          'pca__n_components':[30,20,10]}

estimators = [('pca',PCA()),
              ('clf',SVC(gamma='auto'))]

pipe = Pipeline(estimators)

from sklearn.model_selection import RandomizedSearchCV

gs = RandomizedSearchCV(pipe, param, n_jobs=-1, verbose=2)
gs.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.6min finished


RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('pca',
                                              PCA(copy=True,
                                                  iterated_power='auto',
                                                  n_components=None,
                                                  random_state=None,
                                                  svd_solver='auto', tol=0.0,
                                                  whiten=False)),
                                             ('clf',
                                              SVC(C=1.0, cache_size=200,
                                                  class_weight=None, coef0=0.0,
                                                  decision_function_shape='ovr',
                                                  degree=3, gamma='auto',
                                                  kernel

In [17]:
gs.best_params_, gs.best_score_, gs.best_estimator_

({'pca__whiten': False,
  'pca__n_components': 30,
  'clf__kernel': 'linear',
  'clf__C': 100.0},
 0.9560439560439561,
 Pipeline(memory=None,
          steps=[('pca',
                  PCA(copy=True, iterated_power='auto', n_components=30,
                      random_state=None, svd_solver='auto', tol=0.0,
                      whiten=False)),
                 ('clf',
                  SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
                      decision_function_shape='ovr', degree=3, gamma='auto',
                      kernel='linear', max_iter=-1, probability=False,
                      random_state=None, shrinking=True, tol=0.001,
                      verbose=False))],
          verbose=False))

In [18]:
gs.score(X_test, y_test)

0.956140350877193

In [19]:
gs.cv_results_

{'mean_fit_time': array([8.50722562e+01, 3.60097090e-02, 1.63327853e-02, 1.98368613e+00,
        7.66611099e-03, 5.99741936e-03, 1.99995836e-02, 7.33272235e-03,
        7.00076421e-03, 3.99963061e-03]),
 'std_fit_time': array([5.67294995e+01, 1.28447111e-02, 2.49697629e-03, 1.27165866e+00,
        9.44089487e-04, 8.15465805e-04, 3.55812096e-03, 9.42853601e-04,
        8.16242742e-04, 6.83651389e-07]),
 'mean_score_time': array([0.00099874, 0.00133435, 0.00233412, 0.0006667 , 0.00333309,
        0.00200137, 0.00133451, 0.00233324, 0.00166702, 0.00100056]),
 'std_score_time': array([2.03239311e-06, 4.71932673e-04, 4.71876126e-04, 4.71426560e-04,
        1.88598720e-03, 1.47400196e-06, 4.72831444e-04, 4.70809040e-04,
        9.42741029e-04, 2.97360213e-07]),
 'param_pca__whiten': masked_array(data=[False, False, True, False, True, True, False, True,
                    False, True],
              mask=[False, False, False, False, False, False, False, False,
                    False, Fals