# パイプラインに対するクロスバリデーション

## データの準備<a name="data"></a>

In [None]:
import numpy as np
import pandas as pd

# Predict income (whether it is 50K or less) from US census data.
# http://archive.ics.uci.edu/ml/datasets/Adult
ADULT_DATA_URL = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

# The file is read with header=None, so column names are assigned explicitly.
df = pd.read_csv(ADULT_DATA_URL, header=None)
df.columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]

# Positional indices of the categorical feature columns.
categorical = [1, 3, 5, 6, 7, 8, 9, 13]

# Every remaining column except the last one ('income', the target)
# is treated as a continuous feature.
feature_count = df.columns.size - 1
continuous = sorted(set(range(feature_count)) - set(categorical))
df.tail()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical feature column plus the target
# (the last column). One fitted encoder is kept per column, in the same
# order as `categorical_columns`.
categorical_columns = categorical + [df.columns.size - 1]
label_encoders = [LabelEncoder().fit(df.iloc[:, i]) for i in categorical_columns]
for encoder, col in zip(label_encoders, categorical_columns):
    # Assign by column label so the whole column is replaced and takes the
    # integer dtype of the encoded values. `df.iloc[:, col] = ...` writes
    # into the existing object column in place on pandas >= 2.0, leaving an
    # object-dtype column that scikit-learn rejects as a classification
    # target.
    df[df.columns[col]] = encoder.transform(df.iloc[:, col])

df.tail()

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Split the frame into categorical features, continuous features and the
# target (last column).
X_categorical = df.iloc[:, categorical]
X_continuous = df.iloc[:, continuous]
y = df.iloc[:, -1]

# One-hot encode the categorical features into a dense array.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# the old name was removed in 1.4, so try the new keyword first and fall back
# to the old one on releases that do not know it yet.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)
ohe.fit(X_categorical)
X_ohe = ohe.transform(X_categorical)

# Split all three arrays together so their rows stay aligned.
X_ohe_train, X_ohe_test, X_continuous_train, X_continuous_test, y_train, y_test = train_test_split(
    X_ohe, X_continuous, y, test_size=.1, random_state=0)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_continuous_train)
X_scaled_train = scaler.transform(X_continuous_train)
X_scaled_test = scaler.transform(X_continuous_test)

# Final feature matrices: scaled continuous columns first, one-hot columns after.
X_train = np.hstack((X_scaled_train, X_ohe_train))
X_test = np.hstack((X_scaled_test, X_ohe_test))

# Print the shape and the first three rows of each array as a sanity check.
for name, arr in zip(['X_train', 'X_test', 'y_train', 'y_test'], [X_train, X_test, y_train, y_test]):
    print('{name}\n shape: {shape}\n sample: {sample}'.format(name=name, shape=arr.shape, sample=arr[:3]))

## パイプライン構築<a name="pipeline"></a>

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# Two-step pipeline: PCA for dimensionality reduction, then logistic regression.
# The step names ('PCA', 'estimator') are the prefixes used in `param_grid`.
pipe = Pipeline([
    ('PCA', PCA()),
    # The search space below includes penalty='l1'. The default 'lbfgs' solver
    # (scikit-learn >= 0.22) supports only l2/none, so every l1 candidate
    # would fail to fit. 'liblinear' supports both l1 and l2.
    ('estimator', LogisticRegression(solver='liblinear', random_state=0))
])

# Hyper-parameter search space, keyed as '<step name>__<parameter name>'.
param_grid = {
    'PCA__n_components': np.arange(50, 101, 10),  # 50, 60, ..., 100
    'estimator__penalty': ['l1', 'l2'],
    'estimator__C': np.logspace(-2, 2, 5)  # 0.01, 0.1, 1, 10, 100
}

## クロスバリデーション実行<a name="cross-validation"></a>

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search over `param_grid`: 10 sampled candidates, each scored by
# accuracy with 5-fold cross-validation, using all available cores.
search_options = dict(
    n_iter=10,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    verbose=2,
    random_state=0,
)
cv = RandomizedSearchCV(pipe, param_grid, **search_options)
cv.fit(X_train, y_train)

In [None]:
# Tabulate the per-candidate search results for inspection.
search_results = cv.cv_results_
cv_result = pd.DataFrame(search_results)
cv_result

In [None]:
# Show the parameter combination selected by the search.
cv.best_params_

In [None]:
# Compare the best cross-validation score with the score of the selected
# model on the held-out test split.
best_model = cv.best_estimator_
score_cv = cv.best_score_
score_test = best_model.score(X_test, y_test)
summary = 'CV: {cv:.3f}, Test: {test:.3f}'.format(cv=score_cv, test=score_test)
print(summary)