# アルゴリズムとパイプライン

In [1]:
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the breast-cancer dataset and hold out a test split.
# stratify= keeps the class balance the same in both splits, and the fixed
# random_state makes the split reproducible.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    stratify=cancer.target,
    random_state=0,
)

# Rescale every feature into the 0-1 range. The per-feature min/max are
# learned from the training split only; the test split is merely transformed
# with those same statistics.
scater = MinMaxScaler()
X_train_scaled = scater.fit_transform(X_train)
X_test_scaled = scater.transform(X_test)

# Train a default SVC on the scaled training data (fit() returns the estimator).
svm = SVC().fit(X_train_scaled, y_train)

# Report accuracy on the scaled held-out data.
svm_test_score = svm.score(X_test_scaled, y_test)
print("Test set score: {:.2f}".format(svm_test_score))

Test set score: 0.94


### パイプラインの構築

In [None]:
from sklearn.pipeline import Pipeline

# Chain the scaler and the classifier into one estimator. The step names
# ('scaler', 'svm') are the prefixes used to address each step's parameters.
pipe = Pipeline(steps=[('scaler', MinMaxScaler()), ('svm', SVC())])

# fit() fits the scaler on X_train, transforms X_train, then fits the SVC on
# the result; score() applies the fitted scaler to X_test before scoring.
pipe.fit(X_train, y_train)
pipe_test_score = pipe.score(X_test, y_test)
print("Test set score: {:.2f}".format(pipe_test_score))

# The pipeline is more concise than scaling by hand, and it also makes
# things such as cross-validation easy to do.

Test set score: 0.94


### パイプラインを使った交差検証

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates for the pipeline's 'svm' step. Parameters of a
# pipeline step are addressed as '<step name>__<parameter name>'.
param_grid = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}

# 5-fold grid search over the whole pipeline. Because the scaler is part of
# the estimator being searched, it is re-fit on the training portion of each
# fold rather than on all of X_train.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5).fit(X_train, y_train)

# Summarise the search: mean CV score of the best candidate, accuracy of the
# search's selected model on the held-out test split, and the winning settings.
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))
print("Best parameters: {}".format(grid.best_params_))