# パイプラインに対するクロスバリデーション

## データの準備<a name="data"></a>

In [None]:
import numpy as np
import pandas as pd

# Load the UCI wine data. The file has no header row, so the column names
# are supplied directly to the reader.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header=None,
    names=[
        'Class label',
        'Alcohol',
        'Malic acid',
        'Ash',
        'Alcalinity of ash',
        'Magnesium',
        'Total phenols',
        'Flavanoids',
        'Nonflavanoid phenols',
        'Proanthocyanins',
        'Color intensity',
        'Hue',
        'OD280/OD315 of diluted wines',
        'Proline',
    ],
)

# Distinct target values present in the label column.
np.unique(df['Class label'])

In [None]:
# Peek at the final rows to confirm the file parsed through to the end.
df.tail()

In [None]:
# Per-column summary statistics; handy for comparing the feature ranges.
df.describe()

In [None]:
from sklearn.model_selection import train_test_split

# Column 0 is the class label; every remaining column is a feature.
X = df.iloc[:, 1:].values
y = df.iloc[:, 0].values

# Hold out 30% of the rows for the final evaluation (fixed seed for
# reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

## パイプライン構築<a name="pipeline"></a>

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.svm import SVC

# Search space for the grid search. Keys follow the '<step name>__<param>'
# convention so each value is routed to the matching pipeline step.
param_grid = {
    'poly__degree': np.arange(1, 4),
    'estimator__C': np.logspace(-2, 2, 5),
    'estimator__gamma': np.logspace(-2, 1, 4),
}

# Polynomial expansion -> standardization -> SVM.
# SVMs are not scale invariant: without standardization the expanded features
# span wildly different magnitudes and the C / gamma grid above cannot be
# searched meaningfully. Keeping the scaler inside the pipeline means it is
# re-fitted on the training folds only during cross-validation, so no
# information from the validation folds leaks into the preprocessing.
pipe = Pipeline([
    ('poly', PolynomialFeatures()),
    ('scaler', StandardScaler()),
    ('estimator', SVC(random_state=0)),
])

## クロスバリデーション実行<a name="cross-validation"></a>

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over param_grid: 5-fold cross-validation scored by
# accuracy, using all available cores and printing brief progress output.
cv = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
cv.fit(X_train, y_train)

In [None]:
# Tabulate the per-candidate results collected during the grid search so they
# can be inspected as a table.
cv_result = pd.DataFrame(data=cv.cv_results_)
cv_result

In [None]:
# Parameter combination selected as best by the grid search.
cv.best_params_

In [None]:
# Score of the best refitted pipeline on the held-out test split.
score_test = cv.best_estimator_.score(X_test, y_test)
# Best cross-validation score found by the grid search on the training split.
score_cv = cv.best_score_

# Report both side by side; a large gap hints at over-fitting to the CV folds.
print('CV: {cv:.3f}, Test: {test:.3f}'.format(test=score_test, cv=score_cv))