## [作業重點]
了解如何使用 Sklearn 中的 hyper-parameter search 找出最佳的超參數

### 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [2]:
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

## Regression Problem

In [9]:
diabetes = datasets.load_diabetes()

x_train, x_test, y_train, y_test = train_test_split(diabetes.data, diabetes.target, test_size=0.1)

normalizer = StandardScaler()

normalizer.fit(diabetes.data)

x_train_n, x_test_n = normalizer.transform(x_train), normalizer.transform(x_test)

In [13]:
GBRG = GradientBoostingRegressor(random_state=42)

GBRG.fit(x_train_n, y_train)

y_pred = GBRG.predict(x_test_n)

MSE, R2_score = metrics.mean_squared_error(y_pred, y_test), metrics.r2_score(y_pred, y_test)

print("MSE : %.3f"%MSE)

print("R2 score : %.3f"%R2_score)

MSE : 3702.845
R2 score : -0.692


In [18]:
n_estimators = [n for n in range(10,201,20)]

max_depth = [d for d in range(1,10,2)]

learning_rate = [lr/100 for lr in range(1,10)]

min_samples_leaf =[msl for msl in range(1,5)]

random_state=[42]

param_grid = dict(n_estimators=n_estimators, max_depth=max_depth, 
                  learning_rate=learning_rate, min_samples_leaf=min_samples_leaf,random_state=random_state)

grid_search = GridSearchCV(GBRG, param_grid, scoring="neg_mean_squared_error", n_jobs=-1, verbose=1)

grid_result = grid_search.fit(x_train_n, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 1800 candidates, totalling 5400 fits


[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 1569 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 2819 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 4569 tasks      | elapsed:   31.4s
[Parallel(n_jobs=-1)]: Done 5400 out of 5400 | elapsed:   38.2s finished


In [19]:
grid_result.best_params_

{'learning_rate': 0.08,
 'max_depth': 1,
 'min_samples_leaf': 4,
 'n_estimators': 130,
 'random_state': 42}

In [20]:
n_estimators = grid_result.best_params_['n_estimators']
max_depth = grid_result.best_params_['max_depth']
min_samples_leaf = grid_result.best_params_['min_samples_leaf']
learning_rate = grid_result.best_params_['learning_rate']


GBRG_best = GradientBoostingRegressor(n_estimators=n_estimators, max_depth=max_depth,
                                      min_samples_leaf=min_samples_leaf, 
                                      learning_rate=learning_rate, random_state=42)

GBRG_best.fit(x_train_n, y_train)

y_pred = GBRG_best.predict(x_test_n)

MSE, R2_score = metrics.mean_squared_error(y_pred, y_test), metrics.r2_score(y_pred, y_test)

print("MSE : %.3f"%MSE)

print("R2 score : %.3f"%R2_score)

MSE : 3001.758
R2 score : -0.333


## Classification Problem

In [43]:
wine = datasets.load_wine()

x_train, x_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.1)

normalizer = StandardScaler()

normalizer.fit(wine.data)

x_train_n, x_test_n = normalizer.transform(x_train), normalizer.transform(x_test)

In [44]:
GBCF = GradientBoostingClassifier(random_state=1)

GBCF.fit(x_train_n, y_train)

y_pred = GBCF.predict(x_test_n)

Acc, F1_score = metrics.accuracy_score(y_pred, y_test), metrics.f1_score(y_pred, y_test, average='macro')

print("Accuracy : %.3f"%Acc)

print("F1 score : %.3f"%F1_score)

Accuracy : 0.889
F1 score : 0.915


In [37]:
n_estimators = [n for n in range(100,600,100)]

max_depth = [d for d in range(1,10,2)]

learning_rate = [lr/100 for lr in range(1,10)]

min_samples_leaf =[msl for msl in range(1,10)]

random_state=[42]

param_grid = dict(n_estimators=n_estimators, max_depth=max_depth, 
                  learning_rate=learning_rate, min_samples_leaf=min_samples_leaf,random_state=random_state)

grid_search = GridSearchCV(GBCF, param_grid, scoring="neg_mean_squared_error", n_jobs=-1, verbose=1)

grid_result = grid_search.fit(x_train_n, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 2025 candidates, totalling 6075 fits


[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 259 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done 509 tasks      | elapsed:   35.2s
[Parallel(n_jobs=-1)]: Done 859 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 1309 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1859 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2509 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 3259 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 4109 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 5059 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 6075 out of 6075 | elapsed:  5.8min finished


In [38]:
grid_result.best_params_

{'learning_rate': 0.02,
 'max_depth': 5,
 'min_samples_leaf': 9,
 'n_estimators': 400,
 'random_state': 42}

In [45]:
n_estimators = grid_result.best_params_['n_estimators']
max_depth = grid_result.best_params_['max_depth']
min_samples_leaf = grid_result.best_params_['min_samples_leaf']
learning_rate = grid_result.best_params_['learning_rate']


GBCF_best = GradientBoostingClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                      min_samples_leaf=min_samples_leaf, 
                                      learning_rate=learning_rate, random_state=42)

GBCF_best.fit(x_train_n, y_train)

y_pred = GBCF_best.predict(x_test_n)

Acc, F1_score = metrics.accuracy_score(y_pred, y_test), metrics.f1_score(y_pred, y_test, average='macro')

print("Accuracy : %.3f"%Acc)

print("F1 score : %.3f"%F1_score)

Accuracy : 1.000
F1 score : 1.000
