# 5장. XGBoost 소개

*아래 링크를 통해 이 노트북을 주피터 노트북 뷰어(nbviewer.org)로 보거나 구글 코랩(colab.research.google.com)에서 실행할 수 있습니다.*

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://nbviewer.org/github/rickiepark/handson-gb/blob/main/Chapter05/Advanced_XGBoost_Unveiled.ipynb"><img src="https://jupyter.org/assets/share.png" width="60" />주피터 노트북 뷰어로 보기</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/rickiepark/handson-gb/blob/main/Chapter05/Advanced_XGBoost_Unveiled.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />구글 코랩(Colab)에서 실행하기</a>
  </td>
</table>

In [None]:
# 노트북이 코랩에서 실행 중인지 체크합니다.
import sys
if 'google.colab' in sys.modules:
    !pip install -q --upgrade xgboost
    !wget -q https://raw.githubusercontent.com/rickiepark/handson-gb/main/Chapter05/atlas-higgs-challenge-2014-v2.csv.gz

[K     |████████████████████████████████| 255.9 MB 35 kB/s 
[?25h

In [13]:
# Turn off Python warnings (the filter is installed before xgboost is imported).
import warnings
warnings.filterwarnings('ignore')

import xgboost as xgb
# Set XGBoost's global verbosity to 0.
xgb.set_config(verbosity=0)

## XGBoost 모델 만들기

### 붓꽃 데이터셋

In [14]:
import pandas as pd
import numpy as np
from sklearn import datasets
# Load the iris dataset; the cells below read its 'data', 'target' and
# 'feature_names' entries.
iris = datasets.load_iris()

In [15]:
# Place the target vector next to the feature matrix as one extra column
# (np.column_stack turns the 1-D target into a column) and label the columns.
df = pd.DataFrame(
    np.column_stack((iris['data'], iris['target'])),
    columns=[*iris['feature_names'], 'target'],
)

In [16]:
df.head()

# The preview confirms that the target column has been added.

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Split the iris data into a training set and a test set; the fixed
# random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=2
)

#### XGBoost 분류 모델

In [19]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [20]:
# Train a gradient-boosted tree classifier on the iris training split and
# measure its accuracy on the test split.
# NOTE: the name `xgb` rebinds the `xgboost` module alias imported earlier;
# later cells re-import the module before using the native API.
xgb = XGBClassifier(booster='gbtree', objective='multi:softprob', 
                    max_depth=6, learning_rate=0.1, n_estimators=100, 
                    n_jobs=-1)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
# accuracy_score takes (y_true, y_pred) in that order.
score = accuracy_score(y_test, y_pred)
print('점수: ' + str(score))

점수: 0.9736842105263158


`accuracy_score()` 함수 대신 `score()` 메서드를 사용할 수 있습니다.

In [21]:
xgb.score(X_test, y_test)

# Both approaches output the same value.
# (Author's note) Until now we had been using MSE.

0.9736842105263158

XGBoost의 기본 파이썬 API를 사용하는 경우 부스터(Booster) 객체의 `predict()` 메서드는 `multi:softprob`일 때 확률을 반환하고 `multi:softmax`일 때 클래스 레이블을 반환합니다.

In [None]:
# Re-import the module: the name `xgb` was rebound to a classifier above.
import xgboost as xgb

# Wrap the training split and the first five test rows in DMatrix objects.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test[:5])

# With 'multi:softprob', predict() returns one probability per class.
param = dict(objective='multi:softprob', num_class=3)
bstr = xgb.train(param, dtrain, num_boost_round=10)
bstr.predict(dtest)

array([[0.9486482 , 0.02711029, 0.02424142],
       [0.9486482 , 0.02711029, 0.02424142],
       [0.02841366, 0.05416913, 0.9174172 ],
       [0.9486482 , 0.02711029, 0.02424142],
       [0.9486482 , 0.02711029, 0.02424142]], dtype=float32)

In [None]:
# With 'multi:softmax', predict() returns the predicted class label itself.
param = dict(objective='multi:softmax', num_class=3)
bstr = xgb.train(param, dtrain, num_boost_round=10)
bstr.predict(dtest)

array([0., 0., 2., 0., 0.], dtype=float32)

### 당뇨병 데이터셋

In [22]:
# Load the diabetes dataset directly as a (features, target) pair.
X, y = datasets.load_diabetes(return_X_y=True)

#### XGBoost 회귀 모델 (교차 검증)

In [23]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

# Gradient-boosted tree regressor; `xgb` is rebound to this estimator.
xgb = XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    n_jobs=-1,
)

# 5-fold cross-validation scored with negated mean squared error.
scores = cross_val_score(xgb, X, y, cv=5, scoring='neg_mean_squared_error')

# Undo the negation and take the square root to get the per-fold RMSE.
rmse = np.sqrt(-scores)

# Print the per-fold RMSE values.
print('RMSE:', rmse.round(3))

# Print the mean RMSE over the folds.
print('RMSE 평균: %0.3f' % np.mean(rmse))

RMSE: [63.033 59.689 64.538 63.699 64.661]
RMSE 평균: 63.124


In [24]:
# Summary statistics of the target values.
pd.DataFrame(y).describe()

Unnamed: 0,0
count,442.0
mean,152.133484
std,77.093005
min,25.0
25%,87.0
50%,140.5
75%,211.5
max,346.0


## 힉스 보손 찾기 - 사례연구

### 데이터

In [25]:
# Read only the first 250,000 rows of the gzip-compressed Higgs CSV.
df = pd.read_csv(
    'atlas-higgs-challenge-2014-v2.csv.gz',
    compression='gzip',
    nrows=250000,
)
df.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label,KaggleSet,KaggleWeight
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2.15,0.444,46.062,1.24,-2.475,113.497,0.000814,s,t,0.002653
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,0.725,1.158,-999.0,-999.0,-999.0,46.226,0.681042,b,t,2.233584
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,2.053,-2.028,-999.0,-999.0,-999.0,44.251,0.715742,b,t,2.347389
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,1.660654,b,t,5.446378
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,1.904263,b,t,6.245333


In [26]:
# Discard the original Weight column and KaggleSet, then let KaggleWeight
# take over the name "Weight".
df = df.drop(columns=['Weight', 'KaggleSet']).rename(columns={"KaggleWeight": "Weight"})
# Move Label to the last position: pop it out and re-insert it at the end.
df['Label'] = df.pop('Label')

##### 데이터 프레임의 열을 삭제하고 옮기는 방법

In [27]:
# Read the same 250,000 rows again into a separate frame (df_h).
df_h = pd.read_csv('atlas-higgs-challenge-2014-v2.csv.gz', nrows=250000, compression='gzip')

In [28]:
# Drop the unwanted columns, re-attach Label at the end, and rename
# KaggleWeight to Weight in a single chain.
# drop() returns a new object, so df_h itself stays unchanged; the reshaped
# frame is what gets bound to df_new.
df_new = (
    df_h
    .drop(columns=['Weight', 'KaggleSet', 'Label'])
    .assign(Label=df_h['Label'])
    .rename(columns={'KaggleWeight': 'Weight'})
)

In [29]:
# Preview the frame built with the chained drop/assign/rename approach.
df_new.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497,0.002653,s
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226,2.233584,b
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251,2.347389,b
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,5.446378,b
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,6.245333,b


In [30]:
# Preview df (built with del/rename) for comparison with df_new.
df.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497,0.002653,s
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226,2.233584,b
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251,2.347389,b
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,5.446378,b
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,6.245333,b


In [31]:
df.info()

# DER_ prefix: derived quantities
# PRI_ prefix: measured quantities (primitives)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 33 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   EventId                      250000 non-null  int64  
 1   DER_mass_MMC                 250000 non-null  float64
 2   DER_mass_transverse_met_lep  250000 non-null  float64
 3   DER_mass_vis                 250000 non-null  float64
 4   DER_pt_h                     250000 non-null  float64
 5   DER_deltaeta_jet_jet         250000 non-null  float64
 6   DER_mass_jet_jet             250000 non-null  float64
 7   DER_prodeta_jet_jet          250000 non-null  float64
 8   DER_deltar_tau_lep           250000 non-null  float64
 9   DER_pt_tot                   250000 non-null  float64
 10  DER_sum_pt                   250000 non-null  float64
 11  DER_pt_ratio_lep_tau         250000 non-null  float64
 12  DER_met_phi_centrality       250000 non-null  float64
 13 

In [32]:
# Encode the labels as integers: 's' -> 1, 'b' -> 0.
# The result is assigned back to the column. Calling
# replace(..., inplace=True) on the df['Label'] selection acts on an
# intermediate object and does not update df under pandas Copy-on-Write.
# The identity entries (1 -> 1, 0 -> 0) keep the cell safe to re-run; any
# unexpected label becomes NaN and makes astype() fail loudly.
df['Label'] = df['Label'].map({'s': 1, 'b': 0, 1: 1, 0: 0}).astype('int64')

In [34]:
# X: all rows, columns at positions 1 through 30 (the end index 31 is
#    exclusive); position 0 is EventId and is left out.
X = df.iloc[:, 1:31]
# y: all rows of the Label column (the target). It is selected by name
#    rather than by position -1, so the result stays correct even after
#    further columns are appended to df.
y = df['Label']
# These notes are plain comments on purpose: a bare triple-quoted string at
# the end of a cell is an expression and would be echoed as the cell output.

'\nX: 전체 행 중에, 열: 1부터 (첫번째 열 삭제, 이벤트 아이디) 31 까지 (30까지)\ny: 전체 행 중에, 열: 마지막 한 줄만! (label, target)\n'

##### 열을 선택하는 다른 방법

In [None]:
# Select by column name instead of position: every column except EventId,
# Weight and Label is kept as a feature.
X_new = df_new.drop(columns=['EventId', 'Weight', 'Label'], errors='ignore')
y_new = df_new['Label']

In [35]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the Label column, then convert the labels to integer codes.
le = LabelEncoder().fit(df_new['Label'])
y_new = le.transform(df_new['Label'])
y_new

array([1, 0, 0, ..., 1, 0, 0])

In [36]:
# Show the labels the encoder learned; a label's position in this array is
# its integer code.
print(le.classes_)

['b' 's']


### 측정 지표

In [38]:
# X_train / y_train are still the iris split created earlier, so the
# multiclass metric 'mlogloss' (multiclass log loss) is used.
# eval_metric is set on the estimator: passing it to fit() is deprecated
# since XGBoost 1.6 and is no longer accepted by 2.x.
# NOTE: `xgb` rebinds the xgboost module alias to this classifier.
xgb = XGBClassifier(n_estimators=5, eval_metric='mlogloss')
# The metric is evaluated on both the training and the test split after
# every boosting round.
xgb.fit(X_train, y_train, 
        eval_set=[(X_train, y_train), (X_test, y_test)])

[0]	validation_0-mlogloss:0.73655	validation_1-mlogloss:0.74850
[1]	validation_0-mlogloss:0.52525	validation_1-mlogloss:0.54465
[2]	validation_0-mlogloss:0.38855	validation_1-mlogloss:0.41754
[3]	validation_0-mlogloss:0.29493	validation_1-mlogloss:0.33279
[4]	validation_0-mlogloss:0.22880	validation_1-mlogloss:0.27454


XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=5, n_jobs=0,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, ...)

In [39]:
# Metric history per boosting round, one entry per eval_set item
# (validation_0, validation_1).
xgb.evals_result()

{'validation_0': OrderedDict([('mlogloss',
               [0.7365548280732972,
                0.5252466491822686,
                0.3885539788752794,
                0.2949308359197208,
                0.22880319266447])]),
 'validation_1': OrderedDict([('mlogloss',
               [0.7485030999309138,
                0.5446543285721227,
                0.4175380499739396,
                0.33279044534030716,
                0.2745441574799387])])}

### 가중치

In [40]:
# Rescale every weight by 550000 / (number of rows in y).
# NOTE(review): 550000 is presumably the size of the test set - confirm.
df['test_Weight'] = df['Weight'].mul(550000).div(len(y))

In [41]:
# Total rescaled weight of the rows labelled 1 ('s') and of the rows
# labelled 0 ('b').
s = df.loc[df['Label'] == 1, 'test_Weight'].sum()
b = df.loc[df['Label'] == 0, 'test_Weight'].sum()

In [42]:
# Ratio of the total 'b' weight to the total 's' weight; the training cells
# pass this value as scale_pos_weight.
b/s

593.9401931492318

### 모델

In [43]:
# Re-import the module: `xgb` was rebound to an estimator in an earlier cell.
import xgboost as xgb

# Build an xgboost.DMatrix from the features and labels. Values equal to
# -999.0 are treated as missing, and each row carries its rescaled weight.
xgmat = xgb.DMatrix(X, label=y, missing=-999.0, weight=df['test_Weight'])

# XGBoost parameters.
param = {
    # Only the ranking is needed, so use the raw output from before the
    # logistic function is applied.
    'objective': 'binary:logitraw',
    # Adjust the weight of the positive samples.
    'scale_pos_weight': b/s,
    'eta': 0.1,
    'max_depth': 6,
    'eval_metric': 'auc',
}

# Add the AMS metric as a second eval_metric entry.
plst = [*param.items(), ('eval_metric', 'ams@0.15')]

# Evaluate on the training matrix itself, reported under the name 'train'.
watchlist = [(xgmat, 'train')]

# Boost 120 trees.
num_round = 120

print('데이터 로딩 완료, 트리 부스팅 시작')
bst = xgb.train(plst, xgmat, num_round, evals=watchlist)
bst.save_model('higgs.model')
print('훈련 종료')

데이터 로딩 완료, 트리 부스팅 시작
[0]	train-auc:0.91091	train-ams@0.15:3.71929
[1]	train-auc:0.91535	train-ams@0.15:3.99108
[2]	train-auc:0.91800	train-ams@0.15:4.11401
[3]	train-auc:0.91953	train-ams@0.15:4.24602
[4]	train-auc:0.92050	train-ams@0.15:4.24262
[5]	train-auc:0.92133	train-ams@0.15:4.25282
[6]	train-auc:0.92226	train-ams@0.15:4.33230
[7]	train-auc:0.92338	train-ams@0.15:4.35821
[8]	train-auc:0.92389	train-ams@0.15:4.37353
[9]	train-auc:0.92427	train-ams@0.15:4.36459
[10]	train-auc:0.92484	train-ams@0.15:4.36423
[11]	train-auc:0.92543	train-ams@0.15:4.40963
[12]	train-auc:0.92584	train-ams@0.15:4.41484
[13]	train-auc:0.92645	train-ams@0.15:4.46174
[14]	train-auc:0.92688	train-ams@0.15:4.43823
[15]	train-auc:0.92738	train-ams@0.15:4.47194
[16]	train-auc:0.92805	train-ams@0.15:4.52048
[17]	train-auc:0.92848	train-ams@0.15:4.57089
[18]	train-auc:0.92903	train-ams@0.15:4.60307
[19]	train-auc:0.92937	train-ams@0.15:4.65550
[20]	train-auc:0.92989	train-ams@0.15:4.69064
[21]	train-auc:0.93018	

KeyboardInterrupt: ignored

##### 사이킷런 API로 구현하기

In [None]:
# The same training run expressed through the scikit-learn API.
# Both metrics are given to the constructor: passing eval_metric to fit() is
# deprecated since XGBoost 1.6 and is no longer accepted by 2.x.
clf = XGBClassifier(n_estimators=120, learning_rate=0.1, missing=-999.0, 
                    scale_pos_weight=b/s, eval_metric=['auc', 'ams@0.15'])

# The rescaled weights are used both for training and for the evaluation
# set, which here is the training data itself.
clf.fit(X, y, sample_weight=df['test_Weight'], 
        eval_set=[(X, y)],
        sample_weight_eval_set=[df['test_Weight']])

clf.save_model('higgs-sklearn.model')

[0]	validation_0-auc:0.91091	validation_0-ams@0.15:3.70024
[1]	validation_0-auc:0.91531	validation_0-ams@0.15:3.97742
[2]	validation_0-auc:0.91774	validation_0-ams@0.15:4.07677
[3]	validation_0-auc:0.91935	validation_0-ams@0.15:4.20546
[4]	validation_0-auc:0.92014	validation_0-ams@0.15:4.13620
[5]	validation_0-auc:0.92102	validation_0-ams@0.15:4.16953
[6]	validation_0-auc:0.92194	validation_0-ams@0.15:4.26084
[7]	validation_0-auc:0.92234	validation_0-ams@0.15:4.26244
[8]	validation_0-auc:0.92333	validation_0-ams@0.15:4.32964
[9]	validation_0-auc:0.92419	validation_0-ams@0.15:4.38172
[10]	validation_0-auc:0.92474	validation_0-ams@0.15:4.39413
[11]	validation_0-auc:0.92532	validation_0-ams@0.15:4.40828
[12]	validation_0-auc:0.92592	validation_0-ams@0.15:4.44710
[13]	validation_0-auc:0.92633	validation_0-ams@0.15:4.45121
[14]	validation_0-auc:0.92696	validation_0-ams@0.15:4.48980
[15]	validation_0-auc:0.92743	validation_0-ams@0.15:4.51729
[16]	validation_0-auc:0.92808	validation_0-ams@0.1

In [None]:
# Metric history per boosting round for the single evaluation set
# (validation_0).
clf.evals_result()

{'validation_0': OrderedDict([('auc',
               [0.910911121245328,
                0.9153075565204724,
                0.9177425637607691,
                0.9193451707006353,
                0.9201387259271532,
                0.9210228657574712,
                0.9219436673788809,
                0.9223374307608306,
                0.9233299557910364,
                0.9241862369604908,
                0.9247368586444726,
                0.9253168235329534,
                0.9259226966190566,
                0.9263338816612989,
                0.9269582907714696,
                0.9274275958592545,
                0.9280811353634686,
                0.9284680048272551,
                0.9290349645288697,
                0.9295197089339288,
                0.9300404740456479,
                0.9304065602632191,
                0.9307342640086455,
                0.9311335049352224,
                0.9314240026414097,
                0.9317187324394958,
                0.932057729

In [None]:
# Score on the same X, y the model was fitted on, so this is not a held-out
# estimate. No sample weights are passed here.
clf.score(X, y)

0.800476