In [1]:
# Render matplotlib figures inline, directly in the notebook output.
%matplotlib inline


=========================================================
Pipelining: chaining a PCA and a logistic regression
=========================================================

The PCA does an unsupervised dimensionality reduction, while the logistic
regression does the prediction.

We use GridSearchCV to select both the dimensionality of the PCA and the
regularization strength C of the logistic regression.




In [10]:
# NOTE: inside a notebook, __doc__ is not this example's description; it is
# IPython's auto-generated module docstring, so this prints a generic banner.
print(__doc__)


# Code source: Gaël Varoquaux
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause


import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Two-step pipeline: unsupervised PCA for dimensionality reduction, followed
# by a logistic regression that does the actual prediction.
logistic = linear_model.LogisticRegression()

pca = decomposition.PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target

# Fit the PCA on the whole data set to obtain its spectrum
# (pca.explained_variance_). The plot of that spectrum is currently disabled.
pca.fit(X_digits)

# plt.figure(1, figsize=(4, 3))
# plt.clf()
# plt.axes([.2, .2, .7, .7])
# plt.plot(pca.explained_variance_, linewidth=2)
# plt.axis('tight')
# plt.xlabel('n_components')
# plt.ylabel('explained_variance_')

# Prediction
n_components = [20, 40, 64]
Cs = np.logspace(-4, 4, 3)  # three values of C: 1e-4, 1 and 1e4

# Parameters of pipeline steps are addressed as '<step name>__<parameter>'.
param_grid = {
    'pca__n_components': n_components,
    'logistic__C': Cs,
}

# cv and return_train_score are pinned explicitly because their defaults
# changed between scikit-learn releases (cv: 3 -> 5 folds, return_train_score:
# True -> False). The cv_results_ table inspected later in this notebook
# relies on three splits and on the *_train_score columns being present.
estimator = GridSearchCV(pipe, param_grid, cv=3, return_train_score=True)
estimator.fit(X_digits, y_digits)

# Disabled together with the spectrum plot above: mark the selected n_components.
# plt.axvline(estimator.best_estimator_.named_steps['pca'].n_components,
#             linestyle=':', label='n_components chosen')
# plt.legend(prop=dict(size=12))
# plt.show()

Automatically created module for IPython interactive environment


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'pca__n_components': [20, 40, 64], 'logistic__C': array([  1.00000e-04,   1.00000e+00,   1.00000e+04])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [6]:
import pandas as pd
# One row per parameter combination tried by the grid search; the columns hold
# fit/score timings, per-split and aggregated scores, and the test-score rank.
pd.DataFrame.from_dict(estimator.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_logistic__C,param_pca__n_components,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.027984,0.00089,0.875904,0.918462,0.0001,20,"{'logistic__C': 0.0001, 'pca__n_components': 20}",9,0.887043,0.909623,0.874791,0.919866,0.865772,0.925895,0.005491,0.000137,0.008719,0.006717
1,0.046127,0.000648,0.88759,0.929595,0.0001,40,"{'logistic__C': 0.0001, 'pca__n_components': 40}",7,0.893688,0.924686,0.889816,0.927379,0.879195,0.936719,0.005299,2e-05,0.006122,0.005156
2,0.052713,0.000728,0.887034,0.930429,0.0001,64,"{'logistic__C': 0.0001, 'pca__n_components': 64}",8,0.892027,0.925523,0.889816,0.927379,0.879195,0.938385,0.001317,2.1e-05,0.005596,0.005676
3,0.081763,0.000704,0.916528,0.969112,1.0,20,"{'logistic__C': 1.0, 'pca__n_components': 20}",3,0.925249,0.968201,0.916528,0.966611,0.907718,0.972523,0.007141,6e-05,0.007157,0.002498
4,0.194054,0.000731,0.922649,0.99082,1.0,40,"{'logistic__C': 1.0, 'pca__n_components': 40}",1,0.910299,0.992469,0.944908,0.989983,0.912752,0.990008,0.014334,1.3e-05,0.015772,0.001166
5,0.318012,0.00098,0.917641,0.99499,1.0,64,"{'logistic__C': 1.0, 'pca__n_components': 64}",2,0.908638,0.994142,0.944908,0.994992,0.899329,0.995837,0.023731,0.000108,0.019652,0.000692
6,0.158002,0.001062,0.902615,0.98219,10000.0,20,"{'logistic__C': 10000.0, 'pca__n_components': 20}",4,0.89701,0.982427,0.90818,0.979132,0.902685,0.985012,0.014003,0.000433,0.004566,0.002407
7,0.437109,0.000868,0.900946,0.999443,10000.0,40,"{'logistic__C': 10000.0, 'pca__n_components': 40}",5,0.862126,0.999163,0.931553,0.999165,0.909396,1.0,0.015546,9.6e-05,0.028996,0.000394
8,0.738991,0.000926,0.894268,1.0,10000.0,64,"{'logistic__C': 10000.0, 'pca__n_components': 64}",6,0.865449,1.0,0.923205,1.0,0.894295,1.0,0.096165,7.2e-05,0.023609,0.0


In [9]:
# (rows, columns) of the digits feature matrix; the column count is the
# largest value in the n_components grid searched above.
X_digits.shape

(1797, 64)