# Comparing GridSearchCV and Hyperopt on the Breast Cancer Wisconsin dataset

# References
hyperoptって何してんの？  
https://qiita.com/kenchin110100/items/ac3edb480d789481f134#hyperopt%E3%81%AE%E4%BD%BF%E3%81%84%E6%96%B9

hyperoptで再現性を担保するために必要なパラメータ設定について  
https://qiita.com/Jolt_power/items/c28bfb2b2e0b99509a40

Hyperparameter optimization for Neural Networks  
http://neupy.com/2016/12/17/hyperparameter_optimization_for_neural_networks.html

In [1]:
import pandas as pd
import numpy as np

# Loading the Breast Cancer Wisconsin dataset
## M = malignant, B = benign

In [2]:
# The raw file is read with header=None, so columns are labelled by integer position.
df = pd.read_csv(
    filepath_or_buffer='./rawdata/wdbc.data',
    header=None,
)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [3]:
# Class balance of the diagnosis column (column label 1).
df[1].value_counts()

B    357
M    212
Name: 1, dtype: int64

In [4]:
# Every column from label 2 onward is used as a feature; columns 0 and 1 are left out.
X = df.loc[:, 2:].to_numpy()
print(X)

[[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01]
 [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02]
 [1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02]
 ...
 [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02]
 [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01]
 [7.760e+00 2.454e+01 4.792e+01 ... 0.000e+00 2.871e-01 7.039e-02]]


In [5]:
# Target: the diagnosis column (column label 1), still as 'M' / 'B' strings here.
y = df[1]
print(y)

0      M
1      M
2      M
3      M
4      M
      ..
564    M
565    M
566    M
567    M
568    B
Name: 1, Length: 569, dtype: object


In [6]:
from sklearn.preprocessing import LabelEncoder

# Encode the string diagnosis labels as integers.
# The encoder is always fitted on the raw dataframe column rather than on `y`
# itself, so re-running this cell does not refit it on already-encoded integers
# (which would replace the original string classes in `le.classes_`).
le = LabelEncoder()
y = le.fit_transform(df.loc[:, 1])

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; stratify on y so both splits keep
# the same class ratio, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
    stratify=y,
)

## SVC with GridSearchCV

In [8]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold

In [9]:
# Shared 10-fold stratified splitter; shuffling with a fixed seed keeps the
# fold assignment reproducible across the searches below.
st_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

# Standardise the features, then classify with an SVM.
svc = SVC(random_state=1)
pipe_svc = make_pipeline(StandardScaler(), svc)

In [10]:
%%time
# Coarse search: C and gamma sweep the same logarithmic grid (1e-4 ... 1e3).
param_range = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]

# The 'svc__' prefix routes each parameter to the SVC step of the pipeline.
param_grid = [
    {
        'svc__C': param_range,
        'svc__kernel': ['rbf'],
        'svc__gamma': param_range,
    }
]

gs = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=st_kfold,
)
gs.fit(X_train, y_train)

# Best mean CV accuracy, the winning parameters, then accuracy on the test split.
print(gs.best_score_, gs.best_params_, gs.score(X_test, y_test), sep='\n')

0.9781159420289856
{'svc__C': 10, 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}
0.9736842105263158
Wall time: 7.65 s


In [11]:
%%time
# Finer, hand-picked grid of C and gamma values for a second search pass.
param_C = [5, 10, 15, 20, 30, 40, 50, 60, 70, 80]
param_gamma = [0.006, 0.008, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06]

# The 'svc__' prefix routes each parameter to the SVC step of the pipeline.
param_grid2 = [
    {
        'svc__C': param_C,
        'svc__kernel': ['rbf'],
        'svc__gamma': param_gamma,
    }
]

gs2 = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid2,
    scoring='accuracy',
    cv=st_kfold,
)
gs2.fit(X_train, y_train)

# Best mean CV accuracy, the winning parameters, then accuracy on the test split.
print(gs2.best_score_, gs2.best_params_, gs2.score(X_test, y_test), sep='\n')

0.9846376811594203
{'svc__C': 30, 'svc__gamma': 0.008, 'svc__kernel': 'rbf'}
0.9736842105263158
Wall time: 4.02 s


# Hyperopt

In [27]:
from hyperopt import hp, tpe, Trials, fmin
from sklearn.model_selection import cross_val_score

In [28]:
# Search space for hyperopt. C and gamma are drawn log-uniformly between
# 1e-4 and 1e3 (the bounds are passed in log space); the kernel is a
# single-option choice, so it is effectively fixed to 'rbf'.
hyperopt_parameters = dict(
    C=hp.loguniform('C', np.log(0.0001), np.log(1000)),
    gamma=hp.loguniform('gamma', np.log(0.0001), np.log(1000)),
    kernel=hp.choice('kernel', ['rbf']),
)

In [29]:
def objective(args):
    """Loss function for hyperopt: negated mean cross-validation score.

    Parameters
    ----------
    args : dict
        Keyword arguments forwarded to ``SVC``.

    Returns
    -------
    float
        ``-1`` times the mean of the 10-fold stratified cross-validation
        scores obtained on ``X_train`` / ``y_train`` by a
        ``StandardScaler`` + ``SVC`` pipeline. The sign is flipped so that a
        higher score yields a lower loss.
    """
    # Same splitter configuration as the grid-search cells (10 folds, seed 1).
    folds = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    model = make_pipeline(StandardScaler(), SVC(**args, random_state=1))

    fold_scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=folds)
    return -1 * np.mean(fold_scores)

In [30]:
# Object that records the history of every trial during the search
trials = Trials()
# Number of iterations (objective evaluations) to run
max_evals = 500

In [31]:
%%time
best = fmin(
    # Function that defines the value to minimise
    objective,
    # Search space: a dict or list of parameters to explore
    space=hyperopt_parameters,
    # Search algorithm to use; tpe.suggest is generally a fine default
    algo=tpe.suggest,
    max_evals=max_evals,
    trials=trials,
    # Print the progress of the trials
    verbose=1,
    # Fix the random state so the result is reproducible
    # NOTE(review): newer hyperopt releases appear to expect a
    # np.random.Generator here (e.g. np.random.default_rng(0)) instead of a
    # RandomState — confirm against the installed hyperopt version.
    rstate=np.random.RandomState(0)
)

100%|█████████████████████████████████████████████| 500/500 [00:42<00:00, 11.68trial/s, best loss: -0.9846376811594203]
Wall time: 42.8 s


In [32]:
# Best parameters found by fmin. Note that the hp.choice parameter ('kernel')
# is reported as the index of the selected option, not the option itself.
best

{'C': 26.271132480263706, 'gamma': 0.009415740834817117, 'kernel': 0}

In [33]:
# Result record of the best trial; 'loss' is the negated mean CV score
# returned by objective().
trials.best_trial['result']

{'loss': -0.9846376811594203, 'status': 'ok'}

In [34]:
# Refit on the whole training split with the C / gamma found by hyperopt.
# The kernel is written out literally rather than read from `best`.
svc_best = SVC(
    kernel='rbf',
    C=best['C'],
    gamma=best['gamma'],
    random_state=1,
)
pipe_best = make_pipeline(StandardScaler(), svc_best).fit(X_train, y_train)

# Accuracy on the held-out test split.
print(pipe_best.score(X_test, y_test))

0.9736842105263158
