# [範例重點]
了解 sklearn 中，GridSearchCV 的使用方法與原理

In [30]:
import numpy as np
from sklearn import datasets, metrics
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
import warnings 

warnings.filterwarnings('ignore')

In [31]:
# Load the Boston housing data and hold out 25% of it as a test set.
# NOTE: `datasets.load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell requires scikit-learn < 1.2 to run as written.
boston = datasets.load_boston()
x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size = 0.25, random_state = 42)

# Baseline: gradient boosting with default hyper-parameters.
# The seed is fixed so the baseline error is reproducible and can be compared
# fairly against the tuned model later in the notebook.
clf = GradientBoostingRegressor(random_state = 42)
clf.fit(x_train, y_train)
pred = clf.predict(x_test)

# The printed "score" is the test-set mean squared error (lower is better).
# Arguments follow the documented (y_true, y_pred) order.
print(f' score : {metrics.mean_squared_error(y_test, pred)} ')

 score : 8.927816260948038 


In [34]:
# Candidate values for the two hyper-parameters being tuned.
n_estimators = [100,200,300]
max_depth = [1,3,5]
param_grid = dict(n_estimators = n_estimators, max_depth = max_depth)

# Exhaustively evaluate every combination in `param_grid` with cross-validation.
# A fresh, seeded estimator is used instead of whatever `clf` currently holds, so the
# search does not depend on state left behind by earlier cells and is reproducible.
# scikit-learn maximises scores, hence the *negative* MSE as the scoring function.
gs = GridSearchCV(
    GradientBoostingRegressor(random_state = 42),
    param_grid,
    scoring = 'neg_mean_squared_error',
    n_jobs = -1,
    verbose = 0,
)
gs_result = gs.fit(x_train,y_train)

# Flip the sign back so the reported value is a plain (positive) cross-validated MSE.
print(f' best score : {-1*gs_result.best_score_} , best param : {gs_result.best_params_} ' )

 best score : 12.562401404013583 , best param : {'max_depth': 3, 'n_estimators': 200} 


In [35]:
# Re-train on the full training split using the best hyper-parameters found by the
# grid search, then evaluate on the held-out test set.
# The seed is fixed so the difference from the baseline reflects the hyper-parameters
# rather than run-to-run randomness of the booster.
clf = GradientBoostingRegressor(random_state = 42, **gs_result.best_params_)
clf.fit(x_train, y_train)
pred = clf.predict(x_test)

# Test-set mean squared error (lower is better); arguments are (y_true, y_pred).
print(f' score after parameter tuning : {metrics.mean_squared_error(y_test, pred)} ')

 score after parameter tuning : 8.930458085862195 


# [作業重點]
了解如何使用 Sklearn 中的 hyper-parameter search 找出最佳的超參數

# 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [36]:
# Import the randomized hyper-parameter search so it can be compared with the grid search.
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd

wine = datasets.load_wine()

# Features as a DataFrame with readable column names.
train_df = pd.DataFrame(wine.data, columns = wine.feature_names)
# Labels as a 1-D Series rather than a one-column DataFrame: scikit-learn estimators
# expect a 1-D `y`, and a column-vector target triggers a conversion warning on fit.
train_label = pd.Series(wine.target, name = 'target')

# Quick look at the first rows of the features and the labels.
print(train_df.head(5))
print(train_label.head(5))

   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   od280/od315_of_diluted_wines  proline  
0                  

In [37]:
# Hold out 25% of the wine data as a test set.
train_x, test_x, train_y, test_y = train_test_split(train_df,train_label, test_size = 0.25, random_state = 2019)

# Baseline: gradient boosting classifier with default hyper-parameters.
# The seed is fixed so the baseline accuracy is reproducible.
clf = GradientBoostingClassifier(random_state = 2019)
# np.ravel guarantees a 1-D target whether the labels are a Series or a one-column frame.
clf.fit(train_x, np.ravel(train_y))
pred = clf.predict(test_x)

# Test-set accuracy; arguments follow the documented (y_true, y_pred) order.
print(f' Accuracy : {metrics.accuracy_score(np.ravel(test_y), pred)} ')

 Accuracy : 0.9333333333333333 


In [38]:
# Grid of hyper-parameter values to evaluate exhaustively.
params = dict(n_estimators = [100,300,500], max_depth = [3,5,7])

# A fresh, seeded estimator is searched instead of the current `clf`, so the result
# does not depend on state left by earlier cells and is reproducible across runs.
GridSearch = GridSearchCV(
    GradientBoostingClassifier(random_state = 2019),
    params,
    scoring = 'accuracy',
    n_jobs = -1,
    verbose = 0,
)
# np.ravel guarantees a 1-D target whether the labels are a Series or a one-column frame.
gsResult = GridSearch.fit(train_x, np.ravel(train_y))

# `best_score_` is the mean cross-validated accuracy of the best parameter combination.
print(f' Best accuracy : {gsResult.best_score_} , Best params : {gsResult.best_params_} ')

 Best accuracy : 0.9699248120300752 , Best params : {'max_depth': 5, 'n_estimators': 100} 


In [39]:
# Re-train with the best hyper-parameters from the grid search and evaluate on the
# held-out test set. The seed is fixed so the comparison with the baseline is not
# blurred by run-to-run randomness.
clf = GradientBoostingClassifier(random_state = 2019, **gsResult.best_params_)
# np.ravel guarantees a 1-D target whether the labels are a Series or a one-column frame.
clf.fit(train_x, np.ravel(train_y))
pred  = clf.predict(test_x)
# Test-set accuracy; arguments follow the documented (y_true, y_pred) order.
print(f' Accuracy after tuned parameters : {metrics.accuracy_score(np.ravel(test_y), pred)} ')

 Accuracy after tuned parameters : 0.9333333333333333 


In [40]:
# Candidate values to sample from: n_estimators in {50, 150, ..., 450}, max_depth in {3, 5, 7, 9}.
n_estimators = np.arange(50,550,100)
max_depth = np.arange(3,11,2)
random_params = dict(n_estimators = n_estimators, max_depth = max_depth)

# Randomized search evaluates only a random subset of the combinations (n_iter is left
# at its default). Both the sampler and the estimator are seeded so the same candidates
# are drawn and the same scores are produced on every run. A fresh estimator is used
# rather than the already-tuned `clf` left over from the previous cell.
rs = RandomizedSearchCV(
    GradientBoostingClassifier(random_state = 2019),
    random_params,
    scoring = 'accuracy',
    n_jobs = -1,
    verbose = 0,
    random_state = 2019,
)
# np.ravel guarantees a 1-D target whether the labels are a Series or a one-column frame.
rsResult = rs.fit(train_x, np.ravel(train_y))

# `best_score_` is the mean cross-validated accuracy of the best sampled combination.
print(f' Best Score : {rsResult.best_score_} , Best Param : {rsResult.best_params_} ')

 Best Score : 0.9699248120300752 , Best Param : {'n_estimators': 250, 'max_depth': 5} 


In [41]:
# Re-train with the best hyper-parameters from the randomized search and evaluate on
# the held-out test set. The seed is fixed so the result is reproducible.
clf = GradientBoostingClassifier(random_state = 2019, **rsResult.best_params_)
# np.ravel guarantees a 1-D target whether the labels are a Series or a one-column frame.
clf.fit(train_x, np.ravel(train_y))
pred = clf.predict(test_x)
# Test-set accuracy; arguments follow the documented (y_true, y_pred) order.
print(f' Accuracy after tuned parameters : {metrics.accuracy_score(np.ravel(test_y), pred)} ')

 Accuracy after tuned parameters : 0.9333333333333333 
