In [22]:
import pandas as pd
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split,GridSearchCV

csv = pd.read_csv('./sample_data/iris.csv')
x = csv.iloc[:,1:5]
y = csv.iloc[:,5]
train_data, test_data, train_label, test_label = train_test_split(x,y)


def eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    print("오차 행렬\n")
    print(confusion)
    print("\n정답률 : ", accuracy)

xg_model = XGBClassifier(learning_rate = 0.1,
                         n_estimators = 1000,
                         max_depth = 5,
                         min_child_weight=1,
                         gamma=0,
                         subsample=0.8,
                         colsample_bytree=0.8,
                         objective= 'multi:softmax',
                         nthread=4,
                         scale_pos_weight=1,
                         seed=27
                        )
param_test1 = {
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = xg_model, param_grid = param_test1, cv = 3)
gsearch1.fit(train_data,train_label)
scores_df = pd.DataFrame(gsearch1.cv_results_)
scores_df[['params','mean_test_score','rank_test_score','split0_test_score','split2_test_score']]
print(scores_df)
print("최적의 파라미터:",gsearch1.best_params_)
print("최고 정확도: {0:.4f}".format(gsearch1.best_score_))
xg_model = gsearch1.best_estimator_



    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0        0.208774      0.006582         0.001330    4.696845e-04   
1        0.186166      0.016375         0.002994    8.126421e-04   
2        0.196142      0.016774         0.003326    2.050170e-03   
3        0.197139      0.007523         0.003324    1.243773e-03   
4        0.181182      0.001697         0.002992    1.411246e-03   
5        0.175199      0.015798         0.002664    9.398802e-04   
6        0.197804      0.005421         0.002661    9.420102e-04   
7        0.181846      0.012466         0.001995    1.123916e-07   
8        0.179520      0.007052         0.001995    7.018853e-07   
9        0.187830      0.014933         0.001995    4.052337e-07   
10       0.178520      0.011486         0.002337    4.640022e-04   
11       0.181846      0.015480         0.002328    4.708646e-04   

   param_max_depth param_min_child_weight  \
0                3                      1   
1                3       



    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0        0.208774      0.006582         0.001330    4.696845e-04   
1        0.186166      0.016375         0.002994    8.126421e-04   
2        0.196142      0.016774         0.003326    2.050170e-03   
3        0.197139      0.007523         0.003324    1.243773e-03   
4        0.181182      0.001697         0.002992    1.411246e-03   
5        0.175199      0.015798         0.002664    9.398802e-04   
6        0.197804      0.005421         0.002661    9.420102e-04   
7        0.181846      0.012466         0.001995    1.123916e-07   
8        0.179520      0.007052         0.001995    7.018853e-07   
9        0.187830      0.014933         0.001995    4.052337e-07   
10       0.178520      0.011486         0.002337    4.640022e-04   
11       0.181846      0.015480         0.002328    4.708646e-04   

   param_max_depth param_min_child_weight  \
0                3                      1   
1                3       

In [None]:
# 앞, 뒤 1
param_test2 = {
    'max_depth':[2,3,4],
    'min_child_weight':[1,2,4,5,6]
}
gsearch2 = GridSearchCV(estimator = xg_model, param_grid = param_test2, cv = 3)
gsearch2.fit(train_data,train_label)
scores_df[['params','mean_test_score','rank_test_score','split0_test_score','split2_test_score']]
print(scores_df)
print("최적의 파라미터:",gsearch2.best_params_)
print("최고 정확도: {0:.4f}".format(gsearch2.best_score_))
xg_model = gsearch2.best_estimator_

In [None]:
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator = xg_model, param_grid = param_test3, cv = 3)
gsearch3.fit(train_data,train_label)
scores_df[['params','mean_test_score','rank_test_score','split0_test_score','split2_test_score']]
print(scores_df)
print("최적의 파라미터:",gsearch3.best_params_)
print("최고 정확도: {0:.4f}".format(gsearch3.best_score_))
xg_model = gsearch3.best_estimator_

In [None]:
param_test4 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch4 = GridSearchCV(estimator = xg_model, param_grid = param_test4, cv = 3)
gsearch4.fit(train_data,train_label)
scores_df[['params','mean_test_score','rank_test_score','split0_test_score','split2_test_score']]
print(scores_df)
print("최적의 파라미터:",gsearch4.best_params_)
print("최고 정확도: {0:.4f}".format(gsearch4.best_score_))
xg_model = gsearch4.best_estimator_

In [None]:
#앞뒤 0.05
param_test5 = {
 'subsample':[i/100.0 for i in range(75,90,5)],
 'colsample_bytree':[i/100.0 for i in range(75,90,5)]
}
gsearch5 = GridSearchCV(estimator = xg_model, param_grid = param_test5, cv = 3)
gsearch5.fit(train_data,train_label)
scores_df[['params','mean_test_score','rank_test_score','split0_test_score','split2_test_score']]
print(scores_df)
print("최적의 파라미터:",gsearch5.best_params_)
print("최고 정확도: {0:.4f}".format(gsearch5.best_score_))
xg_model = gsearch5.best_estimator_

In [None]:
#정규화
param_test6 = { 
 'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100] 
}
gsearch6 = GridSearchCV(estimator = xg_model, param_grid = param_test6, cv = 3)
gsearch6.fit(train_data,train_label)
scores_df[['params','mean_test_score','rank_test_score','split0_test_score','split2_test_score']]
print(scores_df)
print("최적의 파라미터:",gsearch6.best_params_)
print("최고 정확도: {0:.4f}".format(gsearch6.best_score_))
xg_model = gsearch6.best_estimator_

In [None]:
#가까운 값 설정
param_test7 = { 
 'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05] 
}
gsearch7 = GridSearchCV(estimator = xg_model, param_grid = param_test7, cv = 3)
gsearch7.fit(train_data,train_label)
scores_df[['params','mean_test_score','rank_test_score','split0_test_score','split2_test_score']]
print(scores_df)
print("최적의 파라미터:",gsearch7.best_params_)
print("최고 정확도: {0:.4f}".format(gsearch7.best_score_))
xg_model = gsearch7.best_estimator_

In [None]:
#learning_rate
param_test8 = { 
 'learning_rate': [i/100.0 for i in range(1,30,5)] 
}
gsearch8 = GridSearchCV(estimator = xg_model, param_grid = param_test8, cv = 3)
gsearch8.fit(train_data,train_label)
scores_df[['params','mean_test_score','rank_test_score','split0_test_score','split2_test_score']]
print(scores_df)
print("최적의 파라미터:",gsearch8.best_params_)
print("최고 정확도: {0:.4f}".format(gsearch8.best_score_))
xg_model = gsearch8.best_estimator_
#lambda, rate 필요하면 더 자세히

In [None]:
xg_model.fit(train_data.values,train_label.values)
xg_pred = xg_model.predict(test_data.values)

print("Extreme Gradient Boost")
print(xg_model.feature_importances_)
pyplot.bar(range(len(xg_model.feature_importances_)), xg_model.feature_importances_)
pyplot.show()
plot_importance(xg_model)
pyplot.show()
eval(test_label.values,xg_pred)