- [안전 운전자 예측 경진대회 링크](https://www.kaggle.com/c/porto-seguro-safe-driver-prediction)
- [모델링 코드 참고 링크](https://www.kaggle.com/xiaozhouwang/2nd-place-lightgbm-solution)

## 8.4 성능개선1: LightGBM 모델

In [1]:
%config Completer.use_jedi = False

import pandas as pd

data_path = '/kaggle/input/porto-seguro-safe-driver-prediction/'

train = pd.read_csv(data_path + 'train.csv', index_col='id')
test = pd.read_csv(data_path + 'test.csv', index_col='id')
submission = pd.read_csv(data_path + 'sample_submission.csv', index_col='id')

### 8.4.1 피처 엔지니어링

* 데이터 합치기

In [2]:
all_data = pd.concat([train, test], ignore_index=True)
all_data = all_data.drop('target',axis=1)

all_features = all_data.columns

* 명목형 피처 원-핫 인코딩

In [3]:
from sklearn.preprocessing import OneHotEncoder

cat_features = [feature for feature in all_features if 'cat' in feature]

onehot_encoder = OneHotEncoder()
encoded_cat_matrix = onehot_encoder.fit_transform(all_data[cat_features])

* 파생 피처 추가

In [4]:
# 데이터 하나당 결측값 개수를 파생 피처로 추가
all_data['num_missing'] = (all_data == -1).sum(axis=1)

In [5]:
# 명목형 피처, calc 분류의 피처를 제외한 피처
remaining_features = [feature for feature in all_features if('cat' not in feature and 'calc' not in feature)]

# num_missing을 remaining_features에 추가
remaining_features.append('num_missing')

In [6]:
# 분류가 ind인 피처
ind_features = [feature for feature in all_features if 'ind' in feature]

is_first_feature = True
for ind_feature in ind_features:
    if is_first_feature:
        all_data['mix_ind'] = all_data[ind_feature].astype(str) + '_'
        is_first_feature = False
    else:
        all_data['mix_ind'] += all_data[ind_feature].astype(str) + '_'

In [7]:
all_data['mix_ind']

0          2_2_5_1_0_0_1_0_0_0_0_0_0_0_11_0_1_0_
1           1_1_7_0_0_0_0_1_0_0_0_0_0_0_3_0_0_1_
2          5_4_9_1_0_0_0_1_0_0_0_0_0_0_12_1_0_0_
3           0_1_2_0_0_1_0_0_0_0_0_0_0_0_8_1_0_0_
4           0_2_0_1_0_1_0_0_0_0_0_0_0_0_9_1_0_0_
                           ...                  
1488023     0_1_6_0_0_0_1_0_0_0_0_0_0_0_2_0_0_1_
1488024    5_3_5_1_0_0_0_1_0_0_0_0_0_0_11_1_0_0_
1488025     0_1_5_0_0_1_0_0_0_0_0_0_0_0_5_0_0_1_
1488026    6_1_5_1_0_0_0_0_1_0_0_0_0_0_13_1_0_0_
1488027    7_1_4_1_0_0_0_0_1_0_0_0_0_0_12_1_0_0_
Name: mix_ind, Length: 1488028, dtype: object

In [8]:
all_data['ps_ind_02_cat'].value_counts()

 1    1079327
 2     309747
 3      70172
 4      28259
-1        523
Name: ps_ind_02_cat, dtype: int64

In [9]:
all_data['ps_ind_02_cat'].value_counts().to_dict()

{1: 1079327, 2: 309747, 3: 70172, 4: 28259, -1: 523}

In [10]:
cat_count_features = []
for feature in cat_features+['mix_ind']:
    val_counts_dict = all_data[feature].value_counts().to_dict()
    all_data[f'{feature}_count'] = all_data[feature].apply(lambda x: val_counts_dict[x])
    cat_count_features.append(f'{feature}_count')

In [11]:
cat_count_features

['ps_ind_02_cat_count',
 'ps_ind_04_cat_count',
 'ps_ind_05_cat_count',
 'ps_car_01_cat_count',
 'ps_car_02_cat_count',
 'ps_car_03_cat_count',
 'ps_car_04_cat_count',
 'ps_car_05_cat_count',
 'ps_car_06_cat_count',
 'ps_car_07_cat_count',
 'ps_car_08_cat_count',
 'ps_car_09_cat_count',
 'ps_car_10_cat_count',
 'ps_car_11_cat_count',
 'mix_ind_count']

* 필요 없는 피처 제거

In [12]:
from scipy import sparse

# 필요 없는 피처들
drop_features = ['ps_ind_14', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin',
                 'ps_car_14']

all_data_remaining = all_data[remaining_features + cat_count_features].drop(drop_features, axis=1)

# 데이터 합치기
all_data_sprs = sparse.hstack([sparse.csr_matrix(all_data_remaining), encoded_cat_matrix], format='csr')

* 명목형 피처에 원-핫 인코딩을 적용했다.
* 데이터 하나당 가지고 있는 결측값 개수를 새로운 피처로 만들었다.
* 모든 ind 피처 값을 연결해서 새로운 명목형 피처를 만들었다.
* 명목형 피처의 고윳값별 개수를 새로운 피처로 만들었다.
* 필요 없는 피처를 제거했다.

* 데이터 나누기

In [13]:
num_train= len(train)

X = all_data_sprs[:num_train]
X_test = all_data_sprs[num_train:]

y = train['target'].values

In [39]:
import numpy as np

def eval_gini(y_true, y_pred):
    # 실제값과 예측값의 크기가 같은지 확인 (값이 다르면 오류 발생)
    assert y_true.shape == y_pred.shape

    n_samples = y_true.shape[0]                      # 데이터 개수
    L_mid = np.linspace(1 / n_samples, 1, n_samples) # 대각선 값

    # 1) 예측값에 대한 지니계수
    pred_order = y_true[y_pred.argsort()] # y_pred 크기순으로 y_true 값 정렬
    L_pred = np.cumsum(pred_order) / np.sum(pred_order) # 로렌츠 곡선
    G_pred = np.sum(L_mid - L_pred)       # 예측 값에 대한 지니계수

    # 2) 예측이 완벽할 때 지니계수
    true_order = y_true[y_true.argsort()] # y_true 크기순으로 y_true 값 정렬
    L_true = np.cumsum(true_order) / np.sum(true_order) # 로렌츠 곡선
    G_true = np.sum(L_mid - L_true)       # 예측이 완벽할 때 지니계수

    # 정규화된 지니계수
    return G_pred / G_true

In [40]:
# LightGBM용 gini() 함수
def gini(preds, dtrain):
    labels = dtrain.get_label()
    return 'gini', eval_gini(labels, preds), True # 반환값

### 8.4.2 하이퍼파라미터 최적화

* 데이터셋 준비  
베이지안 최적화를 위한 데이터셋 준비

In [28]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

bayes_dtrain = lgb.Dataset(X_train, y_train)
bayes_dvalid = lgb.Dataset(X_valid, y_valid)

In [32]:
# 베이지안 최적화를 위한 하이퍼파라미터 범위
param_bounds = {
    'num_leaves': (30, 40),
    'lambda_l1': (0.7, 0.9),
    'lambda_l2': (0.9, 1),
    'feature_fraction': (0.6, 0.7),
    'bagging_fraction': (0.6, 0.9),
    'min_child_samples': (6, 10),
    'min_child_weight': (10, 40)
}

fixed_params = {
    'objective': 'binary',
    'learning_rate':0.005,
    'bagging_freq': 1,
    'force_row_wise': True,
    'random_state': 1991
}

In [35]:
def eval_function(num_leaves, lambda_l1, lambda_l2, feature_fraction, bagging_fraction, min_child_samples, min_child_weight):
    params ={
        'num_leaves': int(round(num_leaves)),
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction,
        'min_child_samples': int(round(min_child_samples)),
        'feature_pre_filter': False
    }
    
    params.update(fixed_params)
    
    print('하이퍼파라미터:', params)
    
    lgb_model = lgb.train(params=params,
                          train_set=bayes_dtrain,
                          num_boost_round=2500,
                          valid_sets=bayes_dvalid,
                          feval=gini,
                          early_stopping_rounds=300,
                          verbose_eval=False)
    
    preds = lgb_model.predict(X_valid)
    
    gini_score = eval_gini(y_valid, preds)
    print(f'지니계수 : {gini_score}\n')
    
    return gini_score

In [36]:
from bayes_opt import BayesianOptimization

optimzer = BayesianOptimization(f=eval_function, pbounds=param_bounds, random_state=0)

In [41]:
optimzer.maximize(init_points=3, n_iter=6)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | min_ch... | min_ch... | num_le... |
-------------------------------------------------------------------------------------------------------------
하이퍼파라미터: {'num_leaves': 39, 'lambda_l1': 0.7766883037651555, 'lambda_l2': 0.9791725038082665, 'feature_fraction': 0.6963662760501029, 'bagging_fraction': 0.867531900234624, 'min_child_samples': 8, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786
[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수 : 0.2825910724443365

| [0m 1       [0m | [0m 0.2826  [0m | [0m 0.8675  [0m | [0m 0.6964  [0m 

In [43]:
max_params = optimzer.max['params']
max_params

{'bagging_fraction': 0.6675659367709144,
 'feature_fraction': 0.6952194054160209,
 'lambda_l1': 0.829932662510154,
 'lambda_l2': 0.9309382272206734,
 'min_child_samples': 7.8802172313670225,
 'min_child_weight': 28.604068940720285,
 'num_leaves': 31.937139259095634}

In [44]:
max_params['num_leaves'] = int(round(max_params['num_leaves']))
max_params['min_child_samples'] = int(round(max_params['min_child_samples']))

In [45]:
max_params.update(fixed_params)

In [46]:
max_params

{'bagging_fraction': 0.6675659367709144,
 'feature_fraction': 0.6952194054160209,
 'lambda_l1': 0.829932662510154,
 'lambda_l2': 0.9309382272206734,
 'min_child_samples': 8,
 'min_child_weight': 28.604068940720285,
 'num_leaves': 32,
 'objective': 'binary',
 'learning_rate': 0.005,
 'bagging_freq': 1,
 'force_row_wise': True,
 'random_state': 1991}

### 8.4.3 모델 훈련 및 성능 검증

In [48]:
from sklearn.model_selection import StratifiedKFold

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1991)

oof_val_preds = np.zeros(X.shape[0])
oof_test_preds = np.zeros(X_test.shape[0])

for idx, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
    print('#'*40, f'폴드 {idx+1}/ 폴드 {folds.n_splits}', '#'*40)
    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]
    
    dtrain = lgb.Dataset(X_train, y_train)
    dvalid = lgb.Dataset(X_valid, y_valid)
    
    lgb_model = lgb.train(params=max_params,
                          train_set=dtrain,
                          num_boost_round=2500,
                          valid_sets=dvalid,
                          feval=gini,
                          early_stopping_rounds=300,
                          verbose_eval=100)
    
    oof_test_preds += lgb_model.predict(X_test)/folds.n_splits
    oof_val_preds[valid_idx] += lgb_model.predict(X_valid)
    
    gini_score = eval_gini(y_valid, oof_val_preds[valid_idx])
    print(f'폴드 {idx+1} 지니계수: {gini_score}\n')

######################################## 폴드 1/ 폴드 5 ########################################
[LightGBM] [Info] Number of positive: 17355, number of negative: 458814
[LightGBM] [Info] Total Bins 1554
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 216
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036447 -> initscore=-3.274764
[LightGBM] [Info] Start training from score -3.274764
Training until validation scores don't improve for 300 rounds
[100]	valid_0's binary_logloss: 0.15429	valid_0's gini: 0.265543
[200]	valid_0's binary_logloss: 0.153238	valid_0's gini: 0.272793
[300]	valid_0's binary_logloss: 0.152647	valid_0's gini: 0.277665
[400]	valid_0's binary_logloss: 0.152282	valid_0's gini: 0.28162
[500]	valid_0's binary_logloss: 0.152039	valid_0's gini: 0.285037
[600]	valid_0's binary_logloss: 0.15187	valid_0's gini: 0.287802
[700]	valid_0's binary_logloss: 0.151756	valid_0's gini: 0.289643
[800]	valid_0's binary_logloss: 0.151675	valid_0's 

In [49]:
print('OOF 검증 데이터 지니계수 :', eval_gini(y, oof_val_preds))

OOF 검증 데이터 지니계수 : 0.28896687232618584


### 8.4.4 예측 및 결과 제출

In [50]:
submission['target'] = oof_test_preds
submission.to_csv('submission.csv')

ValueError: Length of values (119043) does not match length of index (892816)