## import module

In [None]:
!pip install bayesian-optimization

In [None]:
!pip install eli5

In [None]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from bayes_opt import BayesianOptimization
from sklearn.metrics import log_loss, mean_squared_error

import xgboost as xgb
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

## data load

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the CSV paths used
# below all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the training table from Drive.
train_df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/contest/lg_auto_driving/data/train.csv'
)

# Split by column name: names matching the regex 'X' are the input features,
# names matching the regex 'Y' are the output targets.
train_x, train_y = (train_df.filter(regex=pattern) for pattern in ('X', 'Y'))

## feature engineering

In [None]:
# Remove the five columns that were eliminated as a result of RFECV.
train_x = train_x.drop(columns=['X_02', 'X_04', 'X_23', 'X_47', 'X_48'])

In [None]:
# Six-sigma rule: for every feature column add a companion '<col>_ct_outlier'
# column holding 1.0 where the value lies outside mean +/- 3*std of that column
# and 0.0 otherwise.
# Vectorised on purpose: the previous row-by-row `.loc[k, ...]` writes performed
# one scalar assignment per cell and used enumerate() positions as index labels,
# which is only correct for a default 0..n-1 index.
feature_means = train_x.mean()
feature_stds = train_x.std()
ucl = feature_means + 3 * feature_stds  # upper control limit, one per column
lcl = feature_means - 3 * feature_stds  # lower control limit, one per column
# Comparisons against NaN are False, so missing values are flagged 0.0 as before.
outlier_flags = (train_x.gt(ucl, axis=1) | train_x.lt(lcl, axis=1)).astype(float)
train_x = pd.concat([train_x, outlier_flags.add_suffix('_ct_outlier')], axis=1)

## modeling1 - lgbm

In [None]:
# Evaluation metric
def lg_nrmse(gt, preds):
    """Return the weighted sum of per-target NRMSE over the first 14 targets.

    For each of the first 14 columns the RMSE between ``gt`` and ``preds`` is
    divided by the mean absolute value of the ground-truth column. The first
    8 targets (Y_01 ~ Y_08) are weighted 1.2 (a 20% extra weight) and the
    remaining 6 are weighted 1.0.

    Args:
        gt: pandas DataFrame of ground-truth targets, at least 14 columns.
        preds: 2-D array-like of predictions with the same row count and at
            least 14 columns, in the same column order as ``gt``.

    Returns:
        The scalar score; lower is better.

    Raises:
        IndexError: if either input has fewer than 14 columns.
    """
    n_targets = 14
    n_weighted = 8  # the first 8 targets carry the 1.2 weight

    gt_arr = gt.iloc[:, :n_targets].to_numpy(dtype=float)
    pred_arr = np.asarray(preds, dtype=float)[:, :n_targets]
    if gt_arr.shape[1] < n_targets or pred_arr.shape[1] < n_targets:
        raise IndexError(f'lg_nrmse needs at least {n_targets} target columns')

    # RMSE is computed directly with NumPy: the `squared=False` argument of
    # sklearn's mean_squared_error is deprecated and removed in newer releases.
    rmse = np.sqrt(np.mean((gt_arr - pred_arr) ** 2, axis=0))
    nrmse = rmse / np.mean(np.abs(gt_arr), axis=0)

    score = 1.2 * np.sum(nrmse[:n_weighted]) + 1.0 * np.sum(nrmse[n_weighted:n_targets])
    return score

In [None]:
# Search space for the Bayesian hyperparameter optimisation.
# Every entry maps a parameter name to its (lower bound, upper bound).
lgbm_parameter_bounds = {
        'num_leaves': (16, 1024),          # num_leaves,       range 16 ~ 1024
        'learning_rate': (0.0001, 0.1),    # learning_rate,    range 0.0001 ~ 0.1
        'n_estimators': (16, 1024),        # n_estimators,     range 16 ~ 1024
        # LightGBM only accepts sampling fractions in (0, 1]; the optimiser can
        # probe the boundary itself, so the lower bound must be strictly positive.
        'subsample': (0.1, 1),             # subsample,        range 0.1 ~ 1
        'colsample_bytree': (0.1, 1),      # colsample_bytree, range 0.1 ~ 1
        'reg_alpha': (0, 10),              # reg_alpha,        range 0 ~ 10
        'reg_lambda': (0, 50),             # reg_lambda,       range 0 ~ 50
                      }


def lgbm_bo(num_leaves, learning_rate, n_estimators, subsample, colsample_bytree, reg_alpha, reg_lambda):
    """Objective function for Bayesian optimisation of the LightGBM model.

    Trains a multi-output LightGBM regressor with the given hyperparameters on
    an 80/20 split of the module-level ``train_x`` / ``train_y`` and scores it
    on the held-out 20% with ``lg_nrmse``.

    Args:
        num_leaves: continuous proposal, rounded to the nearest int.
        learning_rate: boosting learning rate.
        n_estimators: continuous proposal, rounded to the nearest int.
        subsample: row sampling fraction passed to LGBMRegressor.
        colsample_bytree: column sampling fraction passed to LGBMRegressor.
        reg_alpha: L1 regularisation strength.
        reg_lambda: L2 regularisation strength.

    Returns:
        The NEGATED validation NRMSE. The optimiser calls ``maximize`` on this
        function while NRMSE is an error (lower is better), so the sign is
        flipped to make "larger" mean "better".
    """
    lgbm_params = {
        'num_leaves': int(round(num_leaves)),        # must be an integer
        'learning_rate': learning_rate,
        'n_estimators': int(round(n_estimators)),    # must be an integer
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
    }

    lgbm = MultiOutputRegressor(LGBMRegressor(**lgbm_params))

    # A fixed seed keeps the split identical across evaluations, so differences
    # in the objective reflect the hyperparameters rather than split noise.
    X_train, X_valid, y_train, y_valid = train_test_split(
        train_x, train_y, test_size=0.2, random_state=42
    )

    lgbm.fit(X_train, y_train)

    score = lg_nrmse(y_valid, lgbm.predict(X_valid))
    return -score

In [None]:
# Run Bayesian optimisation of lgbm_bo over lgbm_parameter_bounds:
# 5 initial points followed by 10 optimisation iterations.
# NOTE(review): maximize() searches for the LARGEST objective value, so lgbm_bo
# must return a higher-is-better quantity (e.g. a negated error) — confirm the
# sign of its return value.
BO_lgbm = BayesianOptimization(f = lgbm_bo, pbounds = lgbm_parameter_bounds,random_state = 42)
BO_lgbm.maximize(init_points = 5, n_iter = 10)

In [None]:
# Fit the multi-output LightGBM model on the full training set with hard-coded
# hyperparameters (presumably copied from the optimisation run above — confirm).
lgbm_model = MultiOutputRegressor(LGBMRegressor(
    
    colsample_bytree=0.5924 ,
    learning_rate=0.00474, n_estimators=628, 
    num_leaves=187, reg_alpha=0.6505, 
    reg_lambda=47.44, subsample=0.9656

)).fit(train_x, train_y)

## postprocessing

In [None]:
# train_test_split is used here to pull out only a part of the data on which to
# compute feature importance (seeded with 42 for a reproducible split).
x_tr, x_val, y_tr, y_val = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

In [None]:
# Use permutation importance to find columns whose weight is 0; those columns
# are the ones to delete.
import eli5
from eli5.sklearn import PermutationImportance

# NOTE(review): lgbm_model was fitted on all of train_x, which contains the
# x_val rows, so these importances are measured on data the model has already
# seen — confirm this is acceptable.
perm = PermutationImportance(lgbm_model, scoring = "neg_mean_squared_error", random_state = 42).fit(x_val, y_val)
# Display the top 80 feature weights, labelled with the column names.
eli5.show_weights(perm, top = 80, feature_names = x_val.columns.tolist())

In [None]:
# Keep only the newly selected outlier-flag columns and delete the X_10 column.
outlier_data = train_x.filter(like='outlier')
train_x = (
    train_x
    .drop(columns=outlier_data.columns)   # remove every outlier-flag column ...
    .drop(columns='X_10')
    .join(                                 # ... then re-attach the selected six
        outlier_data[[
            'X_32_ct_outlier',
            'X_30_ct_outlier',
            'X_55_ct_outlier',
            'X_49_ct_outlier',
            'X_29_ct_outlier',
            'X_08_ct_outlier',
        ]]
    )
)

## modeling2 - lgbm

In [None]:
# Refit the multi-output LightGBM model on the reduced feature set, using the
# same hard-coded hyperparameters as the first fit.
lgbm_model = MultiOutputRegressor(LGBMRegressor(
    
    colsample_bytree=0.5924 ,
    learning_rate=0.00474, n_estimators=628, 
    num_leaves=187, reg_alpha=0.6505, 
    reg_lambda=47.44, subsample=0.9656

)).fit(train_x, train_y)

## Inference

In [None]:
# Load the test table, then discard the ID column and the same five feature
# columns that were dropped from the training features.
test_x = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/contest/lg_auto_driving/data/test.csv'
)
test_x = test_x.drop(columns=['ID']).drop(columns=['X_02', 'X_04', 'X_23', 'X_47', 'X_48'])

In [None]:
# Apply the same preprocessing to the test dataset as was applied to the train
# dataset.
# Six-sigma rule: for every feature column add a companion '<col>_ct_outlier'
# column holding 1.0 where the value lies outside mean +/- 3*std of that column
# and 0.0 otherwise.
# Vectorised on purpose: the previous row-by-row `.loc[k, ...]` writes performed
# one scalar assignment per cell and used enumerate() positions as index labels,
# which is only correct for a default 0..n-1 index.
# NOTE(review): the limits are derived from the test set's own mean/std, not
# from the training statistics — confirm that this is intended.
feature_means = test_x.mean()
feature_stds = test_x.std()
ucl = feature_means + 3 * feature_stds  # upper control limit, one per column
lcl = feature_means - 3 * feature_stds  # lower control limit, one per column
# Comparisons against NaN are False, so missing values are flagged 0.0 as before.
outlier_flags = (test_x.gt(ucl, axis=1) | test_x.lt(lcl, axis=1)).astype(float)
test_x = pd.concat([test_x, outlier_flags.add_suffix('_ct_outlier')], axis=1)

In [None]:
# Keep only the newly selected outlier-flag columns and delete the X_10 column.
outlier_data = test_x.filter(like='outlier')
test_x = (
    test_x
    .drop(columns=outlier_data.columns)   # remove every outlier-flag column ...
    .drop(columns='X_10')
    .join(                                 # ... then re-attach the selected six
        outlier_data[[
            'X_32_ct_outlier',
            'X_30_ct_outlier',
            'X_55_ct_outlier',
            'X_49_ct_outlier',
            'X_29_ct_outlier',
            'X_08_ct_outlier',
        ]]
    )
)

In [None]:
# Predict all targets for the preprocessed test features with the fitted model.
# NOTE(review): test_x must carry the same columns, in the same order, as the
# train_x the model was fitted on — confirm.
preds = lgbm_model.predict(test_x)

## submit

In [None]:
# Load the sample submission file; its columns are overwritten with predictions below.
submit = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/contest/lg_auto_driving/data/sample_submission.csv')

In [None]:
# Copy each prediction column into the matching submission column.
# Prediction columns are paired with the non-'ID' columns through their own
# running index, so the mapping does not depend on 'ID' being the first column
# of the submission file (the former `idx-1` offset silently did).
for pred_idx, col in enumerate(c for c in submit.columns if c != 'ID'):
    submit[col] = preds[:, pred_idx]
print('Done.')

In [None]:
# Write the filled submission to Drive without the DataFrame index column.
submit.to_csv('/content/drive/MyDrive/Colab Notebooks/contest/lg_auto_driving/submit_0826_5.csv', index=False)