# 8.5 안전 운전자 예측 경진대회 성능 개선 II : XGBoost 모델

- [안전 운전자 예측 경진대회 링크](https://www.kaggle.com/c/porto-seguro-safe-driver-prediction)
- [모델링 코드 참고 링크](https://www.kaggle.com/xiaozhouwang/2nd-place-lightgbm-solution)

### 8.5.2 하이퍼파라미터 최적화

* 데이터셋 준비

In [None]:
%config Completer.use_jedi = False

import pandas as pd

data_path = '/kaggle/input/porto-seguro-safe-driver-prediction/'

# Read the three competition CSV files, each indexed by its 'id' column.
train, test, submission = (
    pd.read_csv(data_path + file_name, index_col='id')
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)


In [None]:
# Stack train and test rows so feature engineering is applied to both at once;
# the label column is removed because only features are processed here.
all_data = pd.concat([train, test], ignore_index=True).drop('target', axis=1)

# Snapshot of the original feature names, taken before derived columns are added.
all_features = all_data.columns

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Features whose name contains 'cat' are one-hot encoded.
cat_features = [name for name in all_features if 'cat' in name]

onehot_encoder = OneHotEncoder()
categorical_frame = all_data[cat_features]
encoded_cat_matrix = onehot_encoder.fit_transform(categorical_frame)

In [None]:
# Per-row count of cells equal to -1.
all_data['num_missing'] = all_data.eq(-1).sum(axis=1)

In [None]:
# Original features that are neither 'cat' nor 'calc' features, followed by
# the derived missing-value count.
remaining_features = [
    name for name in all_features
    if not ('cat' in name or 'calc' in name)
]
remaining_features += ['num_missing']

In [None]:
ind_features = [name for name in all_features if 'ind' in name]

# Concatenate the string form of every 'ind' feature, each followed by '_',
# into one combined key per row.
mix_ind = None
for ind_feature in ind_features:
    piece = all_data[ind_feature].astype(str) + '_'
    mix_ind = piece if mix_ind is None else mix_ind + piece

if mix_ind is not None:
    all_data['mix_ind'] = mix_ind

In [None]:
# Bare expression: in a notebook this shows the 'mix_ind' column as the cell output.
all_data['mix_ind']

In [None]:
# Frequency-encode every categorical feature plus the combined 'mix_ind' key:
# each new '<feature>_count' column holds the number of rows that share that
# row's value for the feature.
cat_count_features = []
for feature in cat_features + ['mix_ind']:
    count_feature = f'{feature}_count'
    val_counts_dict = all_data[feature].value_counts().to_dict()
    # Series.map performs the dictionary lookup in one vectorized pass instead
    # of calling a Python lambda once per row.
    all_data[count_feature] = all_data[feature].map(val_counts_dict)
    cat_count_features.append(count_feature)

In [None]:
# Bare expression: in a notebook this shows the list of generated count-feature names.
cat_count_features

In [None]:
from scipy import sparse

# Columns excluded from the final feature matrix.
drop_features = [
    'ps_ind_14',
    'ps_ind_10_bin',
    'ps_ind_11_bin',
    'ps_ind_12_bin',
    'ps_ind_13_bin',
    'ps_car_14',
]

selected_columns = all_data[remaining_features + cat_count_features]
all_data_remaining = selected_columns.drop(columns=drop_features)

# Horizontally join the remaining columns with the one-hot matrix as one CSR matrix.
remaining_sparse = sparse.csr_matrix(all_data_remaining)
all_data_sprs = sparse.hstack([remaining_sparse, encoded_cat_matrix], format='csr')

In [None]:
# The first `num_train` rows of the stacked matrix are the training rows,
# the rest are the test rows.
num_train = train.shape[0]
X, X_test = all_data_sprs[:num_train], all_data_sprs[num_train:]

y = train['target'].values

In [None]:
import numpy as np

def eval_gini(y_true, y_pred):
    """Return the normalized Gini coefficient of ``y_pred`` against ``y_true``.

    Both arguments are NumPy arrays of identical shape. The score is the
    Lorenz-curve gap obtained when labels are ordered by the predictions,
    divided by the gap obtained when labels are ordered by themselves.
    """
    assert y_true.shape == y_pred.shape

    n_samples = y_true.shape[0]
    # Reference diagonal against which both Lorenz curves are compared.
    L_mid = np.linspace(1 / n_samples, 1, n_samples)

    def _lorenz_gap(order):
        # Summed distance between the diagonal and the cumulative label share
        # when samples are visited in the given order.
        ordered_labels = y_true[order]
        lorenz = np.cumsum(ordered_labels) / np.sum(ordered_labels)
        return np.sum(L_mid - lorenz)

    G_pred = _lorenz_gap(y_pred.argsort())
    G_true = _lorenz_gap(y_true.argsort())
    return G_pred / G_true
    

In [None]:
def gini(preds, dtrain):
    """Custom XGBoost evaluation metric: return ``('gini', score)``.

    The labels are read from ``dtrain`` and scored against ``preds`` with
    ``eval_gini``.
    """
    score = eval_gini(dtrain.get_label(), preds)
    return 'gini', score

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for scoring each optimization trial.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=0
)

bayes_dtrain = xgb.DMatrix(X_train, label=y_train)
bayes_dvalid = xgb.DMatrix(X_valid, label=y_valid)

* 하이퍼파라미터 범위 설정

In [None]:
# Search space for Bayesian optimization: each hyperparameter name maps to a
# (lower, upper) bound tuple; the dict is passed to the optimizer as `pbounds`.
param_bounds = {
    'max_depth': (4, 8),  # sampled as a float; eval_function rounds it to an int
    'subsample': (0.6, 0.9),
    'colsample_bytree': (0.7, 1.0),
    'min_child_weight': (5, 7),
    'gamma': (8, 11),
    'reg_alpha': (7, 9),
    'reg_lambda': (1.1, 1.5),
    'scale_pos_weight': (1.4, 1.6)
}

# Settings held constant for every trial; they are merged into the sampled
# parameters with `dict.update` before training.
fixed_params = {
    'objective': 'binary:logistic',
    'learning_rate': 0.02,
    'random_state': 1991
}

In [None]:
def eval_function(max_depth, subsample, colsample_bytree, min_child_weight, reg_alpha, gamma, reg_lambda, scale_pos_weight):
    """Train XGBoost with the given hyperparameters and return the validation Gini.

    This is the objective handed to ``BayesianOptimization``. The sampled
    values are merged with ``fixed_params``, a booster is trained on
    ``bayes_dtrain`` with early stopping monitored on ``bayes_dvalid``, and
    the predictions on ``bayes_dvalid`` are scored against ``y_valid``.

    Args:
        max_depth: Sampled tree depth; rounded to the nearest integer.
        subsample, colsample_bytree, min_child_weight, reg_alpha, gamma,
        reg_lambda, scale_pos_weight: Sampled values passed to XGBoost as-is.

    Returns:
        The normalized Gini coefficient on the validation split.
    """
    params = {
        'max_depth': int(round(max_depth)),
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'min_child_weight': min_child_weight,
        'gamma': gamma,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'scale_pos_weight': scale_pos_weight
    }

    # Fixed settings take precedence over sampled ones on a key collision.
    params.update(fixed_params)

    print('하이퍼 파라미터: ', params)

    xgb_model = xgb.train(params=params, dtrain=bayes_dtrain, num_boost_round=2000, evals=[(bayes_dvalid, 'bayes_dvalid')], maximize=True, feval=gini,
                         early_stopping_rounds=200, verbose_eval=False)

    best_iter = xgb_model.best_iteration
    # iteration_range is half-open [begin, end) and best_iteration is a
    # 0-based index, so the end must be best_iter + 1 to include the best round.
    preds = xgb_model.predict(bayes_dvalid, iteration_range=(0, best_iter + 1))

    gini_score = eval_gini(y_valid, preds)
    print(f'지니계수 : {gini_score}\n')

    return gini_score

In [None]:
from bayes_opt import BayesianOptimization

# Maximize eval_function (validation Gini) over the bounds in param_bounds.
optimizer = BayesianOptimization(
    f=eval_function,
    pbounds=param_bounds,
    random_state=0,
)

# 3 initial evaluations followed by 6 optimization iterations.
optimizer.maximize(init_points=3, n_iter=6)

In [None]:
# Hyperparameter values of the best-scoring trial found by the optimizer.
max_params = optimizer.max['params']
max_params

In [None]:
# Apply the same integer rounding to max_depth that eval_function uses, then
# merge in the fixed settings so the dict can be passed to xgb.train.
max_params.update({
    'max_depth': int(round(max_params['max_depth'])),
    **fixed_params,
})
max_params

In [None]:
from sklearn.model_selection import StratifiedKFold

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1991)

# Out-of-fold predictions for the training rows, and the fold-averaged
# predictions for the test rows.
oof_val_preds = np.zeros(X.shape[0])
oof_test_preds = np.zeros(X_test.shape[0])

# The test matrix is the same for every fold, so build its DMatrix once.
dtest = xgb.DMatrix(X_test)

for idx, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
    print('#'*40, f'폴드 {idx+1} / 폴드 {folds.n_splits}', '#'*40)

    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]

    dtrain = xgb.DMatrix(X_train, y_train)
    dvalid = xgb.DMatrix(X_valid, y_valid)

    xgb_model = xgb.train(params=max_params, dtrain=dtrain, num_boost_round=2000, evals=[(dvalid, 'valid')], maximize=True, feval=gini,
                         early_stopping_rounds=200, verbose_eval=100)

    best_iter = xgb_model.best_iteration
    # iteration_range is half-open [begin, end) and best_iteration is a
    # 0-based index, so the end must be best_iter + 1 to include the best round.
    best_range = (0, best_iter + 1)

    # Each fold contributes an equal share to the averaged test prediction.
    oof_test_preds += xgb_model.predict(dtest, iteration_range=best_range) / folds.n_splits

    oof_val_preds[valid_idx] += xgb_model.predict(dvalid, iteration_range=best_range)

    gini_score = eval_gini(y_valid, oof_val_preds[valid_idx])

    print(f'폴드 {idx+1} 지니계수: {gini_score}\n')

In [None]:
# Gini over all training rows, using each row's out-of-fold prediction.
oof_gini = eval_gini(y, oof_val_preds)
print('OOF 검증 데이터 지니계수:', oof_gini)

In [None]:
# Store the fold-averaged test predictions as the 'target' column and write
# the submission file to 'submission.csv'.
submission['target'] = oof_test_preds
submission.to_csv('submission.csv')