## 1. Load library

In [1]:
# basic library
import pandas as pd
import random
import os
import numpy as np
import warnings
import time
import datetime

# option library
warnings.filterwarnings('ignore')

# preprocessing library
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# model library
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
import lightgbm as lgb

# metrics library(평가지표)
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report

# validation library(검증)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate

# optuna library
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split

In [2]:
# Seed 고정
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) 

## 2. Load Data

In [3]:
train_df = pd.read_csv('../train.csv')
test_df = pd.read_csv('../test.csv')

* Columns
 - Y_class : 제품 품질 상태(Target)
   + 0 : 적정 기준 미달 (부적합)
   + 1 : 적합
   + 2 : 적정 기준 초과 (부적합)
 - Y_Quality : 제품 품질 관련 정량적 수치
 - TIMESTAMP : 제품이 공정에 들어간 시각
 - LINE : 제품이 들어간 공정 LINE 종류  ('T050304','T050307','T100304','T010306','T010305')
 - PRODUCT_CODE : 제품의 CODE 번호('A_31', 'T_31', 'O_31')
 - X_1 ~ X_2875 : 공정 과정에서 추출되어 비식별화된 변수

## 3. 데이터 확인

In [5]:
# train데이터 
train_df

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,TRAIN_003,2,0.537325,2022-06-13 5:39,T050307,A_31,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,TRAIN_004,1,0.531590,2022-06-13 5:47,T050304,A_31,,,,,...,38.70,41.89,46.93,33.09,76.97,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,TRAIN_593,1,0.526546,2022-09-08 14:30,T100306,T_31,2.0,95.0,0.0,45.0,...,,,,,,,,,,
594,TRAIN_594,0,0.524022,2022-09-08 22:38,T050304,A_31,,,,,...,49.47,53.07,50.89,55.10,66.49,1.0,,,,
595,TRAIN_595,0,0.521289,2022-09-08 22:47,T050304,A_31,,,,,...,,,,,,1.0,,,,
596,TRAIN_596,1,0.531375,2022-09-08 14:38,T100304,O_31,40.0,94.0,0.0,45.0,...,,,,,,,,,,


In [6]:
#test 데이터
test_df

Unnamed: 0,PRODUCT_ID,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,TEST_000,2022-09-09 2:01,T100306,T_31,2.0,94.0,0.0,45.0,10.0,0.0,...,,,,,,,,,,
1,TEST_001,2022-09-09 2:09,T100304,T_31,2.0,93.0,0.0,45.0,11.0,0.0,...,,,,,,,,,,
2,TEST_002,2022-09-09 8:42,T100304,T_31,2.0,95.0,0.0,45.0,11.0,0.0,...,,,,,,,,,,
3,TEST_003,2022-09-09 10:56,T010305,A_31,,,,,,,...,,,,,,,,,,
4,TEST_004,2022-09-09 11:04,T010306,A_31,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,TEST_305,2022-11-05 11:18,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,...,,,,,,,,,,
306,TEST_306,2022-11-05 16:39,T100304,T_31,2.0,96.0,0.0,45.0,11.0,0.0,...,,,,,,,,,,
307,TEST_307,2022-11-05 16:47,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,...,,,,,,,,,,
308,TEST_308,2022-11-05 20:53,T100306,T_31,2.0,95.0,0.0,45.0,10.0,0.0,...,,,,,,,,,,


> #### 데이터 공정라인과 제품번호 확인

In [26]:
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    print(np.unique(test_x[i]))

[0 1 2 3 4 5]
[0 1 2]


## 4. Data PreProcessing

In [56]:
# test data와 train data 에서 분류할 colums LINE,PRODUCT_CODE,X데이터만 남김
test_x = test_df.drop(columns = ['PRODUCT_ID','TIMESTAMP'])
train_x = train_df.drop(columns = ['PRODUCT_ID','TIMESTAMP','Y_Class','Y_Quality'])

In [57]:
# Train데이터에서 분류된 Y_Class train_y
train_y = train_df['Y_Class']

In [37]:
test_x

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,T100306,T_31,2.0,94.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,
1,T100304,T_31,2.0,93.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
2,T100304,T_31,2.0,95.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
3,T010305,A_31,,,,,,,,,...,,,,,,,,,,
4,T010306,A_31,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,
306,T100304,T_31,2.0,96.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
307,T100306,T_31,2.0,91.0,0.0,45.0,10.0,0.0,50.0,10.0,...,,,,,,,,,,
308,T100306,T_31,2.0,95.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,


In [38]:
train_x

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,T050304,A_31,,,,,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,T050307,A_31,,,,,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,T050304,A_31,,,,,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,T050307,A_31,,,,,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,T050304,A_31,,,,,,,,,...,38.70,41.89,46.93,33.09,76.97,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,T100306,T_31,2.0,95.0,0.0,45.0,10.0,0.0,50.0,10.0,...,,,,,,,,,,
594,T050304,A_31,,,,,,,,,...,49.47,53.07,50.89,55.10,66.49,1.0,,,,
595,T050304,A_31,,,,,,,,,...,,,,,,1.0,,,,
596,T100304,O_31,40.0,94.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,


In [39]:
train_y

0      1
1      2
2      1
3      2
4      1
      ..
593    1
594    0
595    0
596    1
597    1
Name: Y_Class, Length: 598, dtype: int64

#### Label Encoding

In [58]:
qual_col = ['LINE','PRODUCT_CODE'] # 변환 할 피처

In [59]:
for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 

- qualitative to quantitative : 정성적 데이터(비정형데이터, 문자나 언어의 텍스트 파일)을 정량적 데이터로 수치화하는 전처리 작업
- le.fit() : 학습 시 고유 피처들을 학습하여 고유번호를 지정
- le.transform() : 각 피처값을 해당하는 학습된 고유값으로 변환
- .classes_ : 고유번호 0번부터 고유값에 대한 원본 데이터 값을 가지고 있음
- np.append(대상 어레이,추가할 값, 파라미터축)

In [60]:
train_x

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,2,0,,,,,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,3,0,,,,,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,2,0,,,,,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,3,0,,,,,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,2,0,,,,,,,,,...,38.70,41.89,46.93,33.09,76.97,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,5,2,2.0,95.0,0.0,45.0,10.0,0.0,50.0,10.0,...,,,,,,,,,,
594,2,0,,,,,,,,,...,49.47,53.07,50.89,55.10,66.49,1.0,,,,
595,2,0,,,,,,,,,...,,,,,,1.0,,,,
596,4,1,40.0,94.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,


In [61]:
test_x

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,5,2,2.0,94.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,
1,4,2,2.0,93.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
2,4,2,2.0,95.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
3,0,0,,,,,,,,,...,,,,,,,,,,
4,1,0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,5,2,2.0,91.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,
306,4,2,2.0,96.0,0.0,45.0,11.0,0.0,45.0,10.0,...,,,,,,,,,,
307,5,2,2.0,91.0,0.0,45.0,10.0,0.0,50.0,10.0,...,,,,,,,,,,
308,5,2,2.0,95.0,0.0,45.0,10.0,0.0,51.0,10.0,...,,,,,,,,,,


#### 정규화

In [62]:
columns_x = train_x.columns.str.contains('X')
print(columns_x)

[False False  True ...  True  True  True]


In [63]:
x_col = train_x.columns[train_x.columns.str.contains('X')].tolist()
print(x_col[:10])

['X_1', 'X_2', 'X_3', 'X_4', 'X_5', 'X_6', 'X_7', 'X_8', 'X_9', 'X_10']


- .columns.str.contains("a") : a라는 문자가 들어있는 문자열 모두 찾음

In [64]:
train_x[x_col]

Unnamed: 0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,,,,,,,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,,,,,,,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,,,,,,,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,
3,,,,,,,,,,,...,37.74,39.17,52.17,30.58,71.78,,,,,
4,,,,,,,,,,,...,38.70,41.89,46.93,33.09,76.97,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,2.0,95.0,0.0,45.0,10.0,0.0,50.0,10.0,52.0,2.0,...,,,,,,,,,,
594,,,,,,,,,,,...,49.47,53.07,50.89,55.10,66.49,1.0,,,,
595,,,,,,,,,,,...,,,,,,1.0,,,,
596,40.0,94.0,0.0,45.0,11.0,0.0,45.0,10.0,31.0,2.0,...,,,,,,,,,,


In [65]:
scaler = MinMaxScaler()
scaler.fit(train_x[x_col])
  
train_x[x_col] = scaler.transform(train_x[x_col])
test_x[x_col] = scaler.transform(test_x[x_col])
  
train_x.head()

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,2,0,,,,,,,,,...,0.256757,0.248647,0.0,0.122283,0.890487,,,,,
1,3,0,,,,,,,,,...,0.240754,0.300866,0.407899,0.164742,0.60177,,,,,
2,2,0,,,,,,,,,...,0.251422,0.133929,0.355835,0.205163,0.922566,,,,,
3,3,0,,,,,,,,,...,0.199858,0.20211,0.704129,0.003057,0.559181,,,,,
4,2,0,,,,,,,,,...,0.233997,0.275703,0.515978,0.088315,0.846239,,,,,


#### 결측값(NaN) 평균으로 채우기

In [66]:
train_x = train_x.fillna(train_x.mean())# mean 연산시 nan값은 분석 대상에서 제외
test_x = test_x.fillna(train_x.mean())
# train_x = train_x.fillna(0)# mean 연산시 nan값은 분석 대상에서 제외
# test_x = test_x.fillna(0)

In [49]:
train_x

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,2,0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.256757,0.248647,0.000000,0.122283,0.890487,0.0,0.0,0.0,0.0,0.0
1,3,0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.240754,0.300866,0.407899,0.164742,0.601770,0.0,0.0,0.0,0.0,0.0
2,2,0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.251422,0.133929,0.355835,0.205163,0.922566,0.0,0.0,0.0,0.0,0.0
3,3,0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.199858,0.202110,0.704129,0.003057,0.559181,0.0,0.0,0.0,0.0,0.0
4,2,0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.233997,0.275703,0.515978,0.088315,0.846239,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,5,2,0.009804,0.533333,0.0,0.0,0.0,0.0,0.294118,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
594,2,0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.616999,0.578193,0.658169,0.835938,0.266593,0.0,0.0,0.0,0.0,0.0
595,2,0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
596,4,1,0.382353,0.466667,0.0,0.0,1.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0


#### 전체 결측값(Null)인 X변수 Drop하기

In [20]:
# 결측값 확인
train_x.isnull()

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,True,True
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,True,True
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,True,True
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,True,True
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,True,True
594,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,True,True
595,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,True,True
596,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,True,True


In [21]:
train_x.isnull().sum()

LINE              0
PRODUCT_CODE      0
X_1               0
X_2               0
X_3               0
               ... 
X_2871            0
X_2872          598
X_2873          598
X_2874          598
X_2875          598
Length: 2877, dtype: int64

In [67]:
# 결측값있는 열 전체 삭제
train_x = train_x.dropna(axis =1)
test_x = test_x.dropna(axis=1)

In [51]:
train_x

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,2,0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.256757,0.248647,0.000000,0.122283,0.890487,0.0,0.0,0.0,0.0,0.0
1,3,0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.240754,0.300866,0.407899,0.164742,0.601770,0.0,0.0,0.0,0.0,0.0
2,2,0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.251422,0.133929,0.355835,0.205163,0.922566,0.0,0.0,0.0,0.0,0.0
3,3,0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.199858,0.202110,0.704129,0.003057,0.559181,0.0,0.0,0.0,0.0,0.0
4,2,0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.233997,0.275703,0.515978,0.088315,0.846239,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,5,2,0.009804,0.533333,0.0,0.0,0.0,0.0,0.294118,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
594,2,0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.616999,0.578193,0.658169,0.835938,0.266593,0.0,0.0,0.0,0.0,0.0
595,2,0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
596,4,1,0.382353,0.466667,0.0,0.0,1.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0


In [24]:
test_x

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871
0,5,2,0.009804,0.466667,0.0,0.0,0.00000,0.0,0.352941,0.000000,...,0.545584,0.505582,0.520356,0.500698,0.664555,0.592741,0.612072,0.719083,0.275426,0.0
1,4,2,0.009804,0.400000,0.0,0.0,1.00000,0.0,0.000000,0.000000,...,0.545584,0.505582,0.520356,0.500698,0.664555,0.592741,0.612072,0.719083,0.275426,0.0
2,4,2,0.009804,0.533333,0.0,0.0,1.00000,0.0,0.000000,0.000000,...,0.545584,0.505582,0.520356,0.500698,0.664555,0.592741,0.612072,0.719083,0.275426,0.0
3,0,0,0.013821,0.541547,0.0,0.0,0.39255,0.0,0.223664,0.048711,...,0.843251,0.940000,0.930103,0.880435,0.664555,0.592741,0.612072,0.719083,0.275426,0.0
4,1,0,0.013821,0.541547,0.0,0.0,0.39255,0.0,0.223664,0.048711,...,0.783745,0.920000,0.919557,0.880435,0.664555,0.592741,0.612072,0.719083,0.275426,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,5,2,0.009804,0.266667,0.0,0.0,0.00000,0.0,0.352941,0.000000,...,0.545584,0.505582,0.520356,0.500698,0.664555,0.592741,0.612072,0.719083,0.275426,0.0
306,4,2,0.009804,0.600000,0.0,0.0,1.00000,0.0,0.000000,0.000000,...,0.545584,0.505582,0.520356,0.500698,0.664555,0.592741,0.612072,0.719083,0.275426,0.0
307,5,2,0.009804,0.266667,0.0,0.0,0.00000,0.0,0.294118,0.000000,...,0.545584,0.505582,0.520356,0.500698,0.664555,0.592741,0.612072,0.719083,0.275426,0.0
308,5,2,0.009804,0.533333,0.0,0.0,0.00000,0.0,0.352941,0.000000,...,0.545584,0.505582,0.520356,0.500698,0.664555,0.592741,0.612072,0.719083,0.275426,0.0


## 5. Modeling

In [25]:
train_x.shape, train_y.shape, test_x.shape

((598, 2795), (598,), (310, 2795))

In [71]:
rf_train_xx,rf_valid_xx,rf_train_yy,rf_valid_yy = train_test_split(train_x,train_y,test_size = 0.2, shuffle = True, random_state=42)

In [90]:
params = {'max_depth': 5, 'min_samples_leaf': 7, 'min_samples_split': 6, 'n_estimators': 2000}
rf_cls = RandomForestClassifier(max_depth = 5,n_estimators=500, min_samples_leaf=5,oob_score = True, min_samples_split=7, max_features=2795).fit(rf_train_xx,rf_train_yy)

In [86]:
rf_cls.predict(rf_valid_xx)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,
       1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2,
       1, 1, 1, 0, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1], dtype=int64)

In [91]:
print(rf_cls.oob_score_)

0.7531380753138075


In [88]:
print(rf_cls.score(rf_valid_xx,rf_valid_yy))

0.75


In [106]:
rf_cls = RandomForestClassifier(max_depth = 4,n_estimators=500, min_samples_leaf=5,oob_score = True, min_samples_split=7, max_features=2795,random_state=42).fit(rf_train_xx,rf_train_yy)
print(rf_cls.predict(rf_valid_xx))
print(rf_cls.oob_score_,rf_cls.score(rf_valid_xx,rf_valid_yy))

[1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 0 1 0 1 1 1 1 1 1 2 1 1 1 0
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 0 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 2 1 0 1
 1 1 1 0 1 0 1 1 1]
0.7531380753138075 0.7166666666666667


In [107]:
rf_cls = RandomForestClassifier(max_depth = 5,n_estimators=500, min_samples_leaf=4,oob_score = True, min_samples_split=7, max_features=2795,random_state=42).fit(rf_train_xx,rf_train_yy)
print(rf_cls.predict(rf_valid_xx))
print(rf_cls.oob_score_,rf_cls.score(rf_valid_xx,rf_valid_yy))

[1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 0 1 0 1 1 1 1 1 1 2 1 1 1 0
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 0 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 2 1 0 1
 1 1 1 0 1 0 1 1 1]
0.7594142259414226 0.725


In [108]:
rf_cls = RandomForestClassifier(max_depth = 5,n_estimators=2000, min_samples_leaf=5,oob_score = True, min_samples_split=7, max_features=2795,random_state=42).fit(rf_train_xx,rf_train_yy)
print(rf_cls.predict(rf_valid_xx))
print(rf_cls.oob_score_,rf_cls.score(rf_valid_xx,rf_valid_yy))

[1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 0 1 0 1 1 1 1 1 1 2 1 1 1 0
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 0 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 0 1 0 1 1 1]
0.7573221757322176 0.7333333333333333


In [110]:
rf_cls = RandomForestClassifier(max_depth = 5,n_estimators= 2000, min_samples_leaf=4,oob_score = True, min_samples_split=7, max_features=2795,random_state=42).fit(rf_train_xx,rf_train_yy)
print(rf_cls.predict(rf_valid_xx))
print(rf_cls.oob_score_,rf_cls.score(rf_valid_xx,rf_valid_yy))

[1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 0 1 0 1 1 1 1 1 1 2 1 1 1 0
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 0 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 2 1 0 1
 1 1 1 0 1 0 1 1 1]
0.7594142259414226 0.725


In [111]:
rf_cls = RandomForestClassifier(max_depth = 5,n_estimators=1000, min_samples_leaf=5,oob_score = True, min_samples_split=7, max_features=2795,random_state=42).fit(rf_train_xx,rf_train_yy)
print(rf_cls.predict(rf_valid_xx))
print(rf_cls.oob_score_,rf_cls.score(rf_valid_xx,rf_valid_yy))

[1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 0 1 0 1 1 1 1 1 1 2 1 1 1 0
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 0 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 0 1 0 1 1 1]
0.7510460251046025 0.7333333333333333


In [113]:
rf_cls = RandomForestClassifier(n_estimators=1000, min_samples_leaf=5,oob_score = True, min_samples_split=7, max_features=2795,random_state=42).fit(rf_train_xx,rf_train_yy)
print(rf_cls.predict(rf_valid_xx))
print(rf_cls.oob_score_,rf_cls.score(rf_valid_xx,rf_valid_yy))

[1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 0 1 0 1 1 1 1 1 1 2 1 1 1 0
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 0 1 2 1 1 1 2 1 1 2 1 1 1 1 1 1 2 1 0 1
 1 1 1 0 1 0 1 1 1]
0.7510460251046025 0.7166666666666667


In [112]:
rf_cls = RandomForestClassifier(max_depth = 15,n_estimators=1000, min_samples_leaf=5,oob_score = True, min_samples_split=9, max_features=2795,random_state=42).fit(rf_train_xx,rf_train_yy)
print(rf_cls.predict(rf_valid_xx))
print(rf_cls.oob_score_,rf_cls.score(rf_valid_xx,rf_valid_yy))

[1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 0 1 0 1 1 1 1 1 1 2 1 1 1 0
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 0 1 2 1 1 1 2 1 1 2 1 1 1 1 1 1 2 1 0 1
 1 1 1 0 1 0 1 1 1]
0.7552301255230126 0.7166666666666667


In [105]:
rf_cls = RandomForestClassifier(max_depth = 5,n_estimators=500, min_samples_leaf=5,oob_score = True, min_samples_split=7, max_features=2795,random_state=42).fit(rf_train_xx,rf_train_yy)
print(rf_cls.predict(rf_valid_xx))
print(rf_cls.oob_score_,rf_cls.score(rf_valid_xx,rf_valid_yy))

[1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 0 1 0 1 1 1 1 1 1 2 1 1 1 0
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 0 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 2 1 0 1
 1 1 1 0 1 0 1 1 1]
0.7594142259414226 0.7166666666666667


* * *

In [114]:
rf_cls = RandomForestClassifier(max_depth = 5,n_estimators=1000, min_samples_leaf=5,oob_score = True, min_samples_split=7, max_features=2795,random_state=42).fit(rf_train_xx,rf_train_yy)
print(rf_cls.predict(rf_valid_xx))
print(rf_cls.oob_score_,rf_cls.score(rf_valid_xx,rf_valid_yy))

[1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 0 1 0 1 1 1 1 1 1 2 1 1 1 0
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 0 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 0 1 0 1 1 1]
0.7510460251046025 0.7333333333333333


In [115]:
rf_pred=rf_cls.predict(test_x)

In [116]:
submit = pd.read_csv('../sample_submission.csv')
submit['Y_Class'] = rf_pred
submit.to_csv('./ffinal_rf_pred.csv',index=False)

#### RandomForestClassifier

In [77]:
start = time.time() # 시작
params = {
    'n_estimators':[500, 1000, 1500, 2000, 2500, 3300],
    'max_depth' : [5, 6, 8, 10, 12], 
    'min_samples_leaf' : [5, 6, 7, 8, 9],
    'min_samples_split' : [6, 7, 8, 9, 10, 12]
}
rf_cls = RandomForestClassifier(**params, oob_score= True,,random_state = 37, n_jobs = 1, verbose =2)
grid_cv = GridSearchCV(rf_cls, param_grid = params, cv = 2, scoring = 'accuracy',n_jobs=1, verbose =2 )
grid_cv.fit(train_x, train_y)

# 최적의 파라미터 모델을 이용하여 예측값 생성
grid_estimator = grid_cv.best_estimator_
rf_pred = grid_estimator.predict(test_x)
# 후보 파라미터 성능 검증 결과 출력
print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))
sec = time.time()-start # 종료 - 시작 (걸린 시간)
times = str(datetime.timedelta(seconds=sec))
short = times.split(".")[0] # 초 단위 까지만
print(f"{times} sec")
print(f"{short} sec")

Fitting 2 folds for each of 900 candidates, totalling 1800 fits
[CV] END max_depth=5, min_samples_leaf=5, min_samples_split=6, n_estimators=500; total time=   1.0s
[CV] END max_depth=5, min_samples_leaf=5, min_samples_split=6, n_estimators=500; total time=   1.1s
[CV] END max_depth=5, min_samples_leaf=5, min_samples_split=6, n_estimators=1000; total time=   1.9s
[CV] END max_depth=5, min_samples_leaf=5, min_samples_split=6, n_estimators=1000; total time=   2.2s
[CV] END max_depth=5, min_samples_leaf=5, min_samples_split=6, n_estimators=1500; total time=   3.0s
[CV] END max_depth=5, min_samples_leaf=5, min_samples_split=6, n_estimators=1500; total time=   3.2s
[CV] END max_depth=5, min_samples_leaf=5, min_samples_split=6, n_estimators=2000; total time=   4.1s
[CV] END max_depth=5, min_samples_leaf=5, min_samples_split=6, n_estimators=2000; total time=   4.0s
[CV] END max_depth=5, min_samples_leaf=5, min_samples_split=6, n_estimators=2500; total time=   5.8s
[CV] END max_depth=5, min_sam

[CV] END max_depth=5, min_samples_leaf=6, min_samples_split=6, n_estimators=2500; total time=   4.9s
[CV] END max_depth=5, min_samples_leaf=6, min_samples_split=6, n_estimators=3300; total time=   6.6s
[CV] END max_depth=5, min_samples_leaf=6, min_samples_split=6, n_estimators=3300; total time=   7.4s
[CV] END max_depth=5, min_samples_leaf=6, min_samples_split=7, n_estimators=500; total time=   1.0s
[CV] END max_depth=5, min_samples_leaf=6, min_samples_split=7, n_estimators=500; total time=   1.0s
[CV] END max_depth=5, min_samples_leaf=6, min_samples_split=7, n_estimators=1000; total time=   2.0s
[CV] END max_depth=5, min_samples_leaf=6, min_samples_split=7, n_estimators=1000; total time=   1.9s
[CV] END max_depth=5, min_samples_leaf=6, min_samples_split=7, n_estimators=1500; total time=   2.9s
[CV] END max_depth=5, min_samples_leaf=6, min_samples_split=7, n_estimators=1500; total time=   3.1s
[CV] END max_depth=5, min_samples_leaf=6, min_samples_split=7, n_estimators=2000; total time=

[CV] END max_depth=5, min_samples_leaf=7, min_samples_split=7, n_estimators=2000; total time=   4.7s
[CV] END max_depth=5, min_samples_leaf=7, min_samples_split=7, n_estimators=2500; total time=   5.2s
[CV] END max_depth=5, min_samples_leaf=7, min_samples_split=7, n_estimators=2500; total time=   5.1s
[CV] END max_depth=5, min_samples_leaf=7, min_samples_split=7, n_estimators=3300; total time=   6.4s
[CV] END max_depth=5, min_samples_leaf=7, min_samples_split=7, n_estimators=3300; total time=   6.6s
[CV] END max_depth=5, min_samples_leaf=7, min_samples_split=8, n_estimators=500; total time=   0.9s
[CV] END max_depth=5, min_samples_leaf=7, min_samples_split=8, n_estimators=500; total time=   1.0s
[CV] END max_depth=5, min_samples_leaf=7, min_samples_split=8, n_estimators=1000; total time=   2.0s
[CV] END max_depth=5, min_samples_leaf=7, min_samples_split=8, n_estimators=1000; total time=   2.3s
[CV] END max_depth=5, min_samples_leaf=7, min_samples_split=8, n_estimators=1500; total time=

[CV] END max_depth=5, min_samples_leaf=8, min_samples_split=8, n_estimators=1500; total time=   2.9s
[CV] END max_depth=5, min_samples_leaf=8, min_samples_split=8, n_estimators=2000; total time=   3.8s
[CV] END max_depth=5, min_samples_leaf=8, min_samples_split=8, n_estimators=2000; total time=   4.0s
[CV] END max_depth=5, min_samples_leaf=8, min_samples_split=8, n_estimators=2500; total time=   4.8s
[CV] END max_depth=5, min_samples_leaf=8, min_samples_split=8, n_estimators=2500; total time=   5.0s
[CV] END max_depth=5, min_samples_leaf=8, min_samples_split=8, n_estimators=3300; total time=   6.3s
[CV] END max_depth=5, min_samples_leaf=8, min_samples_split=8, n_estimators=3300; total time=   6.6s
[CV] END max_depth=5, min_samples_leaf=8, min_samples_split=9, n_estimators=500; total time=   1.0s
[CV] END max_depth=5, min_samples_leaf=8, min_samples_split=9, n_estimators=500; total time=   1.0s
[CV] END max_depth=5, min_samples_leaf=8, min_samples_split=9, n_estimators=1000; total time=

[CV] END max_depth=5, min_samples_leaf=9, min_samples_split=9, n_estimators=1000; total time=   2.0s
[CV] END max_depth=5, min_samples_leaf=9, min_samples_split=9, n_estimators=1500; total time=   2.8s
[CV] END max_depth=5, min_samples_leaf=9, min_samples_split=9, n_estimators=1500; total time=   3.0s
[CV] END max_depth=5, min_samples_leaf=9, min_samples_split=9, n_estimators=2000; total time=   3.8s
[CV] END max_depth=5, min_samples_leaf=9, min_samples_split=9, n_estimators=2000; total time=   4.0s
[CV] END max_depth=5, min_samples_leaf=9, min_samples_split=9, n_estimators=2500; total time=   4.8s
[CV] END max_depth=5, min_samples_leaf=9, min_samples_split=9, n_estimators=2500; total time=   4.9s
[CV] END max_depth=5, min_samples_leaf=9, min_samples_split=9, n_estimators=3300; total time=   6.3s
[CV] END max_depth=5, min_samples_leaf=9, min_samples_split=9, n_estimators=3300; total time=   6.7s
[CV] END max_depth=5, min_samples_leaf=9, min_samples_split=10, n_estimators=500; total tim

[CV] END max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=500; total time=   1.0s
[CV] END max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=500; total time=   1.0s
[CV] END max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=1000; total time=   2.1s
[CV] END max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=1000; total time=   2.3s
[CV] END max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=1500; total time=   4.0s
[CV] END max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=1500; total time=   4.6s
[CV] END max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=2000; total time=   4.8s
[CV] END max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=2000; total time=   5.5s
[CV] END max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=2500; total time=   7.4s
[CV] END max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=2500; t

[CV] END max_depth=6, min_samples_leaf=6, min_samples_split=10, n_estimators=2500; total time=   5.2s
[CV] END max_depth=6, min_samples_leaf=6, min_samples_split=10, n_estimators=3300; total time=   6.8s
[CV] END max_depth=6, min_samples_leaf=6, min_samples_split=10, n_estimators=3300; total time=   6.9s
[CV] END max_depth=6, min_samples_leaf=6, min_samples_split=12, n_estimators=500; total time=   1.1s
[CV] END max_depth=6, min_samples_leaf=6, min_samples_split=12, n_estimators=500; total time=   1.0s
[CV] END max_depth=6, min_samples_leaf=6, min_samples_split=12, n_estimators=1000; total time=   2.0s
[CV] END max_depth=6, min_samples_leaf=6, min_samples_split=12, n_estimators=1000; total time=   2.1s
[CV] END max_depth=6, min_samples_leaf=6, min_samples_split=12, n_estimators=1500; total time=   3.0s
[CV] END max_depth=6, min_samples_leaf=6, min_samples_split=12, n_estimators=1500; total time=   3.4s
[CV] END max_depth=6, min_samples_leaf=6, min_samples_split=12, n_estimators=2000; t

[CV] END max_depth=6, min_samples_leaf=7, min_samples_split=12, n_estimators=2000; total time=   4.1s
[CV] END max_depth=6, min_samples_leaf=7, min_samples_split=12, n_estimators=2000; total time=   4.2s
[CV] END max_depth=6, min_samples_leaf=7, min_samples_split=12, n_estimators=2500; total time=   5.3s
[CV] END max_depth=6, min_samples_leaf=7, min_samples_split=12, n_estimators=2500; total time=   5.0s
[CV] END max_depth=6, min_samples_leaf=7, min_samples_split=12, n_estimators=3300; total time=   6.5s
[CV] END max_depth=6, min_samples_leaf=7, min_samples_split=12, n_estimators=3300; total time=   7.5s
[CV] END max_depth=6, min_samples_leaf=8, min_samples_split=6, n_estimators=500; total time=   1.0s
[CV] END max_depth=6, min_samples_leaf=8, min_samples_split=6, n_estimators=500; total time=   1.1s
[CV] END max_depth=6, min_samples_leaf=8, min_samples_split=6, n_estimators=1000; total time=   2.1s
[CV] END max_depth=6, min_samples_leaf=8, min_samples_split=6, n_estimators=1000; total

[CV] END max_depth=6, min_samples_leaf=9, min_samples_split=6, n_estimators=1000; total time=   2.0s
[CV] END max_depth=6, min_samples_leaf=9, min_samples_split=6, n_estimators=1500; total time=   3.0s
[CV] END max_depth=6, min_samples_leaf=9, min_samples_split=6, n_estimators=1500; total time=   3.0s
[CV] END max_depth=6, min_samples_leaf=9, min_samples_split=6, n_estimators=2000; total time=   4.0s
[CV] END max_depth=6, min_samples_leaf=9, min_samples_split=6, n_estimators=2000; total time=   4.1s
[CV] END max_depth=6, min_samples_leaf=9, min_samples_split=6, n_estimators=2500; total time=   5.0s
[CV] END max_depth=6, min_samples_leaf=9, min_samples_split=6, n_estimators=2500; total time=   6.5s
[CV] END max_depth=6, min_samples_leaf=9, min_samples_split=6, n_estimators=3300; total time=   6.7s
[CV] END max_depth=6, min_samples_leaf=9, min_samples_split=6, n_estimators=3300; total time=   6.8s
[CV] END max_depth=6, min_samples_leaf=9, min_samples_split=7, n_estimators=500; total time

[CV] END max_depth=8, min_samples_leaf=5, min_samples_split=7, n_estimators=500; total time=   1.1s
[CV] END max_depth=8, min_samples_leaf=5, min_samples_split=7, n_estimators=500; total time=   1.1s
[CV] END max_depth=8, min_samples_leaf=5, min_samples_split=7, n_estimators=1000; total time=   2.2s
[CV] END max_depth=8, min_samples_leaf=5, min_samples_split=7, n_estimators=1000; total time=   2.2s
[CV] END max_depth=8, min_samples_leaf=5, min_samples_split=7, n_estimators=1500; total time=   3.3s
[CV] END max_depth=8, min_samples_leaf=5, min_samples_split=7, n_estimators=1500; total time=   3.3s
[CV] END max_depth=8, min_samples_leaf=5, min_samples_split=7, n_estimators=2000; total time=   4.6s
[CV] END max_depth=8, min_samples_leaf=5, min_samples_split=7, n_estimators=2000; total time=   4.6s
[CV] END max_depth=8, min_samples_leaf=5, min_samples_split=7, n_estimators=2500; total time=   5.8s
[CV] END max_depth=8, min_samples_leaf=5, min_samples_split=7, n_estimators=2500; total time=

[CV] END max_depth=8, min_samples_leaf=6, min_samples_split=7, n_estimators=3300; total time=   7.5s
[CV] END max_depth=8, min_samples_leaf=6, min_samples_split=7, n_estimators=3300; total time=   8.0s
[CV] END max_depth=8, min_samples_leaf=6, min_samples_split=8, n_estimators=500; total time=   1.1s
[CV] END max_depth=8, min_samples_leaf=6, min_samples_split=8, n_estimators=500; total time=   1.1s
[CV] END max_depth=8, min_samples_leaf=6, min_samples_split=8, n_estimators=1000; total time=   2.3s
[CV] END max_depth=8, min_samples_leaf=6, min_samples_split=8, n_estimators=1000; total time=   2.2s
[CV] END max_depth=8, min_samples_leaf=6, min_samples_split=8, n_estimators=1500; total time=   3.5s
[CV] END max_depth=8, min_samples_leaf=6, min_samples_split=8, n_estimators=1500; total time=   3.5s
[CV] END max_depth=8, min_samples_leaf=6, min_samples_split=8, n_estimators=2000; total time=   4.6s
[CV] END max_depth=8, min_samples_leaf=6, min_samples_split=8, n_estimators=2000; total time=

[CV] END max_depth=8, min_samples_leaf=7, min_samples_split=8, n_estimators=2500; total time=   5.5s
[CV] END max_depth=8, min_samples_leaf=7, min_samples_split=8, n_estimators=2500; total time=   5.5s
[CV] END max_depth=8, min_samples_leaf=7, min_samples_split=8, n_estimators=3300; total time=   7.2s
[CV] END max_depth=8, min_samples_leaf=7, min_samples_split=8, n_estimators=3300; total time=   8.0s
[CV] END max_depth=8, min_samples_leaf=7, min_samples_split=9, n_estimators=500; total time=   1.2s
[CV] END max_depth=8, min_samples_leaf=7, min_samples_split=9, n_estimators=500; total time=   1.0s
[CV] END max_depth=8, min_samples_leaf=7, min_samples_split=9, n_estimators=1000; total time=   2.6s
[CV] END max_depth=8, min_samples_leaf=7, min_samples_split=9, n_estimators=1000; total time=   2.2s
[CV] END max_depth=8, min_samples_leaf=7, min_samples_split=9, n_estimators=1500; total time=   3.1s
[CV] END max_depth=8, min_samples_leaf=7, min_samples_split=9, n_estimators=1500; total time=

[CV] END max_depth=8, min_samples_leaf=8, min_samples_split=9, n_estimators=2000; total time=   4.3s
[CV] END max_depth=8, min_samples_leaf=8, min_samples_split=9, n_estimators=2000; total time=   4.2s
[CV] END max_depth=8, min_samples_leaf=8, min_samples_split=9, n_estimators=2500; total time=   5.3s
[CV] END max_depth=8, min_samples_leaf=8, min_samples_split=9, n_estimators=2500; total time=   5.7s
[CV] END max_depth=8, min_samples_leaf=8, min_samples_split=9, n_estimators=3300; total time=   7.6s
[CV] END max_depth=8, min_samples_leaf=8, min_samples_split=9, n_estimators=3300; total time=   6.9s
[CV] END max_depth=8, min_samples_leaf=8, min_samples_split=10, n_estimators=500; total time=   1.0s
[CV] END max_depth=8, min_samples_leaf=8, min_samples_split=10, n_estimators=500; total time=   1.0s
[CV] END max_depth=8, min_samples_leaf=8, min_samples_split=10, n_estimators=1000; total time=   2.0s
[CV] END max_depth=8, min_samples_leaf=8, min_samples_split=10, n_estimators=1000; total t

[CV] END max_depth=8, min_samples_leaf=9, min_samples_split=10, n_estimators=1000; total time=   2.1s
[CV] END max_depth=8, min_samples_leaf=9, min_samples_split=10, n_estimators=1500; total time=   3.1s
[CV] END max_depth=8, min_samples_leaf=9, min_samples_split=10, n_estimators=1500; total time=   3.1s
[CV] END max_depth=8, min_samples_leaf=9, min_samples_split=10, n_estimators=2000; total time=   4.5s
[CV] END max_depth=8, min_samples_leaf=9, min_samples_split=10, n_estimators=2000; total time=   4.3s
[CV] END max_depth=8, min_samples_leaf=9, min_samples_split=10, n_estimators=2500; total time=   5.4s
[CV] END max_depth=8, min_samples_leaf=9, min_samples_split=10, n_estimators=2500; total time=   5.6s
[CV] END max_depth=8, min_samples_leaf=9, min_samples_split=10, n_estimators=3300; total time=   7.3s
[CV] END max_depth=8, min_samples_leaf=9, min_samples_split=10, n_estimators=3300; total time=   7.2s
[CV] END max_depth=8, min_samples_leaf=9, min_samples_split=12, n_estimators=500; 

[CV] END max_depth=10, min_samples_leaf=5, min_samples_split=12, n_estimators=500; total time=   1.2s
[CV] END max_depth=10, min_samples_leaf=5, min_samples_split=12, n_estimators=500; total time=   1.2s
[CV] END max_depth=10, min_samples_leaf=5, min_samples_split=12, n_estimators=1000; total time=   2.6s
[CV] END max_depth=10, min_samples_leaf=5, min_samples_split=12, n_estimators=1000; total time=   2.6s
[CV] END max_depth=10, min_samples_leaf=5, min_samples_split=12, n_estimators=1500; total time=   3.7s
[CV] END max_depth=10, min_samples_leaf=5, min_samples_split=12, n_estimators=1500; total time=   3.4s
[CV] END max_depth=10, min_samples_leaf=5, min_samples_split=12, n_estimators=2000; total time=   5.3s
[CV] END max_depth=10, min_samples_leaf=5, min_samples_split=12, n_estimators=2000; total time=   4.8s
[CV] END max_depth=10, min_samples_leaf=5, min_samples_split=12, n_estimators=2500; total time=   5.8s
[CV] END max_depth=10, min_samples_leaf=5, min_samples_split=12, n_estimato

[CV] END max_depth=10, min_samples_leaf=6, min_samples_split=12, n_estimators=2500; total time=   6.0s
[CV] END max_depth=10, min_samples_leaf=6, min_samples_split=12, n_estimators=3300; total time=   7.8s
[CV] END max_depth=10, min_samples_leaf=6, min_samples_split=12, n_estimators=3300; total time=   8.0s
[CV] END max_depth=10, min_samples_leaf=7, min_samples_split=6, n_estimators=500; total time=   1.0s
[CV] END max_depth=10, min_samples_leaf=7, min_samples_split=6, n_estimators=500; total time=   1.1s
[CV] END max_depth=10, min_samples_leaf=7, min_samples_split=6, n_estimators=1000; total time=   2.3s
[CV] END max_depth=10, min_samples_leaf=7, min_samples_split=6, n_estimators=1000; total time=   2.4s
[CV] END max_depth=10, min_samples_leaf=7, min_samples_split=6, n_estimators=1500; total time=   3.4s
[CV] END max_depth=10, min_samples_leaf=7, min_samples_split=6, n_estimators=1500; total time=   3.4s
[CV] END max_depth=10, min_samples_leaf=7, min_samples_split=6, n_estimators=2000

[CV] END max_depth=10, min_samples_leaf=8, min_samples_split=6, n_estimators=2000; total time=   4.3s
[CV] END max_depth=10, min_samples_leaf=8, min_samples_split=6, n_estimators=2000; total time=   4.5s
[CV] END max_depth=10, min_samples_leaf=8, min_samples_split=6, n_estimators=2500; total time=   5.5s
[CV] END max_depth=10, min_samples_leaf=8, min_samples_split=6, n_estimators=2500; total time=   5.4s
[CV] END max_depth=10, min_samples_leaf=8, min_samples_split=6, n_estimators=3300; total time=   7.3s
[CV] END max_depth=10, min_samples_leaf=8, min_samples_split=6, n_estimators=3300; total time=   7.5s
[CV] END max_depth=10, min_samples_leaf=8, min_samples_split=7, n_estimators=500; total time=   1.1s
[CV] END max_depth=10, min_samples_leaf=8, min_samples_split=7, n_estimators=500; total time=   1.0s
[CV] END max_depth=10, min_samples_leaf=8, min_samples_split=7, n_estimators=1000; total time=   2.1s
[CV] END max_depth=10, min_samples_leaf=8, min_samples_split=7, n_estimators=1000; t

[CV] END max_depth=10, min_samples_leaf=9, min_samples_split=7, n_estimators=1000; total time=   2.2s
[CV] END max_depth=10, min_samples_leaf=9, min_samples_split=7, n_estimators=1500; total time=   3.1s
[CV] END max_depth=10, min_samples_leaf=9, min_samples_split=7, n_estimators=1500; total time=   3.2s
[CV] END max_depth=10, min_samples_leaf=9, min_samples_split=7, n_estimators=2000; total time=   4.1s
[CV] END max_depth=10, min_samples_leaf=9, min_samples_split=7, n_estimators=2000; total time=   4.3s
[CV] END max_depth=10, min_samples_leaf=9, min_samples_split=7, n_estimators=2500; total time=   5.5s
[CV] END max_depth=10, min_samples_leaf=9, min_samples_split=7, n_estimators=2500; total time=   5.8s
[CV] END max_depth=10, min_samples_leaf=9, min_samples_split=7, n_estimators=3300; total time=   6.7s
[CV] END max_depth=10, min_samples_leaf=9, min_samples_split=7, n_estimators=3300; total time=   6.7s
[CV] END max_depth=10, min_samples_leaf=9, min_samples_split=8, n_estimators=500; 

[CV] END max_depth=12, min_samples_leaf=5, min_samples_split=8, n_estimators=500; total time=   1.1s
[CV] END max_depth=12, min_samples_leaf=5, min_samples_split=8, n_estimators=500; total time=   1.1s
[CV] END max_depth=12, min_samples_leaf=5, min_samples_split=8, n_estimators=1000; total time=   2.4s
[CV] END max_depth=12, min_samples_leaf=5, min_samples_split=8, n_estimators=1000; total time=   2.4s
[CV] END max_depth=12, min_samples_leaf=5, min_samples_split=8, n_estimators=1500; total time=   3.7s
[CV] END max_depth=12, min_samples_leaf=5, min_samples_split=8, n_estimators=1500; total time=   3.5s
[CV] END max_depth=12, min_samples_leaf=5, min_samples_split=8, n_estimators=2000; total time=   4.5s
[CV] END max_depth=12, min_samples_leaf=5, min_samples_split=8, n_estimators=2000; total time=   4.4s
[CV] END max_depth=12, min_samples_leaf=5, min_samples_split=8, n_estimators=2500; total time=   5.5s
[CV] END max_depth=12, min_samples_leaf=5, min_samples_split=8, n_estimators=2500; t

[CV] END max_depth=12, min_samples_leaf=6, min_samples_split=8, n_estimators=2500; total time=   5.8s
[CV] END max_depth=12, min_samples_leaf=6, min_samples_split=8, n_estimators=3300; total time=   7.6s
[CV] END max_depth=12, min_samples_leaf=6, min_samples_split=8, n_estimators=3300; total time=   7.5s
[CV] END max_depth=12, min_samples_leaf=6, min_samples_split=9, n_estimators=500; total time=   1.2s
[CV] END max_depth=12, min_samples_leaf=6, min_samples_split=9, n_estimators=500; total time=   1.1s
[CV] END max_depth=12, min_samples_leaf=6, min_samples_split=9, n_estimators=1000; total time=   2.6s
[CV] END max_depth=12, min_samples_leaf=6, min_samples_split=9, n_estimators=1000; total time=   2.3s
[CV] END max_depth=12, min_samples_leaf=6, min_samples_split=9, n_estimators=1500; total time=   3.5s
[CV] END max_depth=12, min_samples_leaf=6, min_samples_split=9, n_estimators=1500; total time=   3.5s
[CV] END max_depth=12, min_samples_leaf=6, min_samples_split=9, n_estimators=2000; t

[CV] END max_depth=12, min_samples_leaf=7, min_samples_split=9, n_estimators=2000; total time=   5.9s
[CV] END max_depth=12, min_samples_leaf=7, min_samples_split=9, n_estimators=2000; total time=   6.2s
[CV] END max_depth=12, min_samples_leaf=7, min_samples_split=9, n_estimators=2500; total time=   8.8s
[CV] END max_depth=12, min_samples_leaf=7, min_samples_split=9, n_estimators=2500; total time=   6.9s
[CV] END max_depth=12, min_samples_leaf=7, min_samples_split=9, n_estimators=3300; total time=   8.6s
[CV] END max_depth=12, min_samples_leaf=7, min_samples_split=9, n_estimators=3300; total time=   8.6s
[CV] END max_depth=12, min_samples_leaf=7, min_samples_split=10, n_estimators=500; total time=   1.3s
[CV] END max_depth=12, min_samples_leaf=7, min_samples_split=10, n_estimators=500; total time=   1.2s
[CV] END max_depth=12, min_samples_leaf=7, min_samples_split=10, n_estimators=1000; total time=   2.5s
[CV] END max_depth=12, min_samples_leaf=7, min_samples_split=10, n_estimators=100

[CV] END max_depth=12, min_samples_leaf=8, min_samples_split=10, n_estimators=1000; total time=   3.0s
[CV] END max_depth=12, min_samples_leaf=8, min_samples_split=10, n_estimators=1500; total time=   4.5s
[CV] END max_depth=12, min_samples_leaf=8, min_samples_split=10, n_estimators=1500; total time=   4.5s
[CV] END max_depth=12, min_samples_leaf=8, min_samples_split=10, n_estimators=2000; total time=   6.0s
[CV] END max_depth=12, min_samples_leaf=8, min_samples_split=10, n_estimators=2000; total time=   6.1s
[CV] END max_depth=12, min_samples_leaf=8, min_samples_split=10, n_estimators=2500; total time=   7.5s
[CV] END max_depth=12, min_samples_leaf=8, min_samples_split=10, n_estimators=2500; total time=   7.6s
[CV] END max_depth=12, min_samples_leaf=8, min_samples_split=10, n_estimators=3300; total time=   9.9s
[CV] END max_depth=12, min_samples_leaf=8, min_samples_split=10, n_estimators=3300; total time=  10.3s
[CV] END max_depth=12, min_samples_leaf=8, min_samples_split=12, n_estima

[CV] END max_depth=12, min_samples_leaf=9, min_samples_split=12, n_estimators=500; total time=   1.5s
[CV] END max_depth=12, min_samples_leaf=9, min_samples_split=12, n_estimators=500; total time=   1.4s
[CV] END max_depth=12, min_samples_leaf=9, min_samples_split=12, n_estimators=1000; total time=   2.9s
[CV] END max_depth=12, min_samples_leaf=9, min_samples_split=12, n_estimators=1000; total time=   2.9s
[CV] END max_depth=12, min_samples_leaf=9, min_samples_split=12, n_estimators=1500; total time=   4.4s
[CV] END max_depth=12, min_samples_leaf=9, min_samples_split=12, n_estimators=1500; total time=   4.5s
[CV] END max_depth=12, min_samples_leaf=9, min_samples_split=12, n_estimators=2000; total time=   5.7s
[CV] END max_depth=12, min_samples_leaf=9, min_samples_split=12, n_estimators=2000; total time=   5.8s
[CV] END max_depth=12, min_samples_leaf=9, min_samples_split=12, n_estimators=2500; total time=   7.3s
[CV] END max_depth=12, min_samples_leaf=9, min_samples_split=12, n_estimato

- n_jobs : 모델 학습 및 예측에 사용되는 병렬 작업의 수 -1의 경우 cpu의 모든 코어 사용
- n_estimators : 생성할 트리의 개수(default = 100)
- min_samples_leaf : 노드를 분할하기 위해 리프 노드가 가져야 할 최소한의 샘플 개수 
  > 값을 작게 설정하면 모델 복잡성 증가:과적합 위험 값을 증가 시키면 모델의 편향은 증가하고 분산은 감소. 모델의 복잡도가 낮아지지만 일부 손실 발생
- min_samples_split : 노드를 분할 하기 위해 필요한 최소한의 샘플 개수
  > 샘플 수가 이보다 작아지면 분할하지 않고 리프 노드로 설정 // 값이 작을 수록 복잡성 증가. 데이터 작은 변동에 민감반응 과적합위험.
- max_features : 노드를 분할 할 때 고려 할 특성의 무작위 하위 집합의 크기
  > 값이 작으면 분산 감소, 편향증가, 복잡도 감소 , 모델 성능 감소
- max_depth = 트리의 최대 깊이(defualt = none) : 깊이가 제한된 트리는 각각의 결정 경계를 더 간단하게 만듬. 모델 과적합 방지
  > 값이 작으면 모델의 단순성을 증가. 분산감소 편향 증가 => 데이터의 다양한 패턴 잡아내지 못하고 일반화 능력 향상됨
- feaure_importances_ : 랜덤포레스트는 다수의 의사결정 트리 앙상블하여 모델 구성. 각 피처들을 사용하여 분할 수행하고 이 과정에서 피처들의 중요도 계산

- best_estimator : gridsearchCV 객체에서 최적의 추정기(estimator)를 반환하는 속성
