In [27]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from bayes_opt import BayesianOptimization
from sklearn.metrics import log_loss, mean_squared_error

import xgboost as xgb
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [28]:
# Seed 고정
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
seed_everything(42) 

In [3]:
def lg_nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(0,14): # ignore 'ID'
        rmse = mean_squared_error(gt.iloc[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt.iloc[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:14])
    return score

In [4]:
train_df = pd.read_csv('./data/train.csv')

train_x = train_df.filter(regex='X') # Input : X Featrue
train_y = train_df.filter(regex='Y') # Output : Y Feature

In [4]:
# 방열재료 1,2,3의 면적 대비 무게
# 방열재료 무게/면적
train_x['bangyul_1'] = train_x['X_03']/train_x['X_07']
train_x['bangyul_2'] = train_x['X_10']/train_x['X_08']
train_x['bangyul_3'] = train_x['X_11']/train_x['X_09']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_x['bangyul_1'] = train_x['X_03']/train_x['X_07']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_x['bangyul_2'] = train_x['X_10']/train_x['X_08']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_x['bangyul_3'] = train_x['X_11']/train_x['X_09']


In [5]:
# 회전수 당 삽입깊이
# 삽입깊이/회전수
train_x['roperdep_1'] = train_x['X_30']/train_x['X_34']
train_x['roperdep_2'] = train_x['X_31']/train_x['X_35']
train_x['roperdep_3'] = train_x['X_32']/train_x['X_36']
train_x['roperdep_4'] = train_x['X_33']/train_x['X_37']

train_x['roperdep_5'] = train_x['X_19']/train_x['X_34']
train_x['roperdep_6'] = train_x['X_20']/train_x['X_35']
train_x['roperdep_7'] = train_x['X_21']/train_x['X_36']
train_x['roperdep_8'] = train_x['X_22']/train_x['X_37']

# 회전수 당 삽입깊이의 평균
train_x['m_roperdep_1'] = (train_x['roperdep_1']+train_x['roperdep_5'])/2
train_x['m_roperdep_2'] = (train_x['roperdep_2']+train_x['roperdep_6'])/2
train_x['m_roperdep_3'] = (train_x['roperdep_3']+train_x['roperdep_7'])/2
train_x['m_roperdep_4'] = (train_x['roperdep_4']+train_x['roperdep_8'])/2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_x['roperdep_1'] = train_x['X_30']/train_x['X_34']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_x['roperdep_2'] = train_x['X_31']/train_x['X_35']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_x['roperdep_3'] = train_x['X_32']/train_x['X_36']
A value is trying to be set on a copy

In [15]:
# 라이브러리를 임포트합니다.
import lightgbm
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model
from xgboost import XGBRegressor

# 선형 회귀 모델을 만듭니다.
xgb_reg = XGBRegressor()

# 재귀적으로 특성을 제거합니다.
rfecv = RFECV(estimator=xgb_reg, step=1, scoring="neg_mean_squared_error", n_jobs=-1)
rfecv.fit(train_x, train_y)

In [16]:
rfecv.transform(train_x)

array([[ 70.544     ,  67.47      , 101.892     , ...,   0.1886646 ,
          0.17532971,   0.18745189],
       [ 69.524     ,  65.17      , 101.944     , ...,   0.18570319,
          0.16912335,   0.18793503],
       [ 72.583     ,  64.07      , 103.153     , ...,   0.18376068,
          0.17365967,   0.18461538],
       ...,
       [ 68.504     ,  64.67      , 103.144     , ...,   0.18104777,
          0.17103236,   0.18345043],
       [ 66.465     ,  63.67      , 102.025     , ...,   0.17713178,
          0.17399691,   0.18350039],
       [ 66.465     ,  65.67      , 102.004     , ...,   0.1774942 ,
          0.16962906,   0.19191919]])

In [17]:
# 최선의 특성 개수
print('Optimal number of features :', rfecv.n_features_)

# 선택된 특성이 표시된 불리언 마스크
print('Best features :', train_x.columns[rfecv.support_])

# 특성의 순위 : 최고에서 최악까지
print('Ranking of features :', train_x.columns[rfecv.ranking_])

Optimal number of features : 65
Best features : Index(['X_01', 'X_03', 'X_05', 'X_06', 'X_07', 'X_08', 'X_09', 'X_10', 'X_11',
       'X_12', 'X_13', 'X_14', 'X_15', 'X_16', 'X_17', 'X_18', 'X_19', 'X_20',
       'X_21', 'X_22', 'X_24', 'X_25', 'X_26', 'X_27', 'X_28', 'X_29', 'X_30',
       'X_31', 'X_32', 'X_33', 'X_34', 'X_35', 'X_36', 'X_37', 'X_38', 'X_39',
       'X_40', 'X_41', 'X_42', 'X_43', 'X_44', 'X_45', 'X_46', 'X_49', 'X_50',
       'X_51', 'X_52', 'X_53', 'X_54', 'X_55', 'X_56', 'bangyul_1',
       'bangyul_3', 'roperdep_1', 'roperdep_2', 'roperdep_3', 'roperdep_4',
       'roperdep_5', 'roperdep_6', 'roperdep_7', 'roperdep_8', 'm_roperdep_1',
       'm_roperdep_2', 'm_roperdep_3', 'm_roperdep_4'],
      dtype='object')
Ranking of features : Index(['X_02', 'X_04', 'X_02', 'X_07', 'X_02', 'X_02', 'X_02', 'X_02', 'X_02',
       'X_02',
       ...
       'X_16', 'X_17', 'X_18', 'X_06', 'X_03', 'roperdep_1', 'roperdep_2',
       'roperdep_3', 'roperdep_4', 'roperdep_5'],
    

---------------