In [274]:
import pandas as pd
import numpy as np
import seaborn as sns
import random
import os
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.utils import get_file, plot_model


from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Data Load

In [275]:
train_df = pd.read_csv('Dacon_data/train.csv')
test_df = pd.read_csv('Dacon_data/test.csv')

In [276]:
train_df.head(3)

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,


# Data Pre-processing

In [277]:
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)
train_df.head()

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,0.0,0.0,0.0,0.0,...,39.34,40.89,32.56,34.09,77.77,0.0,0.0,0.0,0.0,0.0
1,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,0.0,0.0,0.0,0.0,...,38.89,42.82,43.92,35.34,72.55,0.0,0.0,0.0,0.0,0.0
2,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,0.0,0.0,0.0,0.0,...,39.19,36.65,42.47,36.53,78.35,0.0,0.0,0.0,0.0,0.0
3,TRAIN_003,2,0.537325,2022-06-13 5:39,T050307,A_31,0.0,0.0,0.0,0.0,...,37.74,39.17,52.17,30.58,71.78,0.0,0.0,0.0,0.0,0.0
4,TRAIN_004,1,0.53159,2022-06-13 5:47,T050304,A_31,0.0,0.0,0.0,0.0,...,38.7,41.89,46.93,33.09,76.97,0.0,0.0,0.0,0.0,0.0


In [278]:
train_x = train_df.drop(columns=['PRODUCT_ID', 'Y_Class', 'Y_Quality'])
train_y = train_df['Y_Class']

test_x = test_df.drop(columns=['PRODUCT_ID'])
train_x.head(3)

Unnamed: 0,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,2022-06-13 5:14,T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.34,40.89,32.56,34.09,77.77,0.0,0.0,0.0,0.0,0.0
1,2022-06-13 5:22,T050307,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,38.89,42.82,43.92,35.34,72.55,0.0,0.0,0.0,0.0,0.0
2,2022-06-13 5:30,T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.19,36.65,42.47,36.53,78.35,0.0,0.0,0.0,0.0,0.0


In [279]:
# TIMESTAMP를 월별로 저장
train_x['TIMESTAMP'] = pd.to_datetime(train_x['TIMESTAMP']).dt.strftime('%m').astype('int64')
test_x['TIMESTAMP'] = pd.to_datetime(test_x['TIMESTAMP']).dt.strftime('%m').astype('int64')
train_x.head(3)

Unnamed: 0,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,6,T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.34,40.89,32.56,34.09,77.77,0.0,0.0,0.0,0.0,0.0
1,6,T050307,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,38.89,42.82,43.92,35.34,72.55,0.0,0.0,0.0,0.0,0.0
2,6,T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.19,36.65,42.47,36.53,78.35,0.0,0.0,0.0,0.0,0.0


In [281]:
train_y

0      1
1      2
2      1
3      2
4      1
      ..
593    1
594    0
595    0
596    1
597    1
Name: Y_Class, Length: 598, dtype: int64

### 원핫인코딩

In [282]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct_1 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(drop='first'),[1])],remainder='passthrough')
# 원핫인코더에서 다중공선성 문제를 해결하고자 drop='first'
# 어떤데이터를 가지고 원핫할건지를 선택 나는 LINE을가지고 원핫하고자하기에 [1]
# 나머지칼럼들은 그대로 두기위해 remainder='passthrough'
train_x = ct_1.fit_transform(train_x)

pd.DataFrame(train_x).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2872,2873,2874,2875,2876,2877,2878,2879,2880,2881
0,0.0,1.0,0.0,0.0,0.0,6,A_31,0.0,0.0,0.0,...,39.34,40.89,32.56,34.09,77.77,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,6,A_31,0.0,0.0,0.0,...,38.89,42.82,43.92,35.34,72.55,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,6,A_31,0.0,0.0,0.0,...,39.19,36.65,42.47,36.53,78.35,0.0,0.0,0.0,0.0,0.0


In [283]:
ct_2 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(drop='first'),[6])],remainder='passthrough')
# CODE를을가지고 원핫하고자하기에 [6]
train_x = ct_2.fit_transform(train_x)
pd.DataFrame(train_x).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2873,2874,2875,2876,2877,2878,2879,2880,2881,2882
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,6,0.0,0.0,...,39.34,40.89,32.56,34.09,77.77,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6,0.0,0.0,...,38.89,42.82,43.92,35.34,72.55,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,6,0.0,0.0,...,39.19,36.65,42.47,36.53,78.35,0.0,0.0,0.0,0.0,0.0


In [284]:
pd.DataFrame(train_x)[7].unique()

array([6, 7, 8, 9], dtype=object)

In [285]:
ct_3 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(drop='first'),[7])],remainder='passthrough')
# TIME을가지고 원핫하고자하기에 [7]
train_x = ct_3.fit_transform(train_x)
pd.DataFrame(train_x).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2875,2876,2877,2878,2879,2880,2881,2882,2883,2884
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,39.34,40.89,32.56,34.09,77.77,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,38.89,42.82,43.92,35.34,72.55,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,39.19,36.65,42.47,36.53,78.35,0.0,0.0,0.0,0.0,0.0


In [286]:
ct_1 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(drop='first'),[1])],remainder='passthrough')
# 원핫인코더에서 다중공선성 문제를 해결하고자 drop='first'
# 어떤데이터를 가지고 원핫할건지를 선택 나는 LINE을가지고 원핫하고자하기에 [1]
# 나머지칼럼들은 그대로 두기위해 remainder='passthrough'
test_x = ct_1.fit_transform(test_x)

pd.DataFrame(test_x).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2872,2873,2874,2875,2876,2877,2878,2879,2880,2881
0,0.0,0.0,0.0,0.0,1.0,9,T_31,2.0,94.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,9,T_31,2.0,93.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,9,T_31,2.0,95.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [287]:
ct_2 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(drop='first'),[6])],remainder='passthrough')
# CODE를을가지고 원핫하고자하기에 [6]
test_x = ct_2.fit_transform(test_x)
pd.DataFrame(test_x).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2873,2874,2875,2876,2877,2878,2879,2880,2881,2882
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,9,2.0,94.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,9,2.0,93.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,9,2.0,95.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [288]:
pd.DataFrame(test_x)[7].unique()

array([9, 10, 11], dtype=object)

In [289]:
ct_3 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(drop='first'),[7])],remainder='passthrough')
# TIME을가지고 원핫하고자하기에 [7]
test_x = ct_3.fit_transform(test_x)
pd.DataFrame(test_x).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2874,2875,2876,2877,2878,2879,2880,2881,2882,2883
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [290]:
train_x = pd.DataFrame(train_x)
test_x = pd.DataFrame(test_x)

In [291]:
train_x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2875,2876,2877,2878,2879,2880,2881,2882,2883,2884
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,39.34,40.89,32.56,34.09,77.77,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,38.89,42.82,43.92,35.34,72.55,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,39.19,36.65,42.47,36.53,78.35,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,37.74,39.17,52.17,30.58,71.78,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,38.7,41.89,46.93,33.09,76.97,0.0,0.0,0.0,0.0,0.0


In [292]:
train_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598 entries, 0 to 597
Columns: 2885 entries, 0 to 2884
dtypes: object(2885)
memory usage: 13.2+ MB


In [293]:
train_x = train_x.astype('float')
test_x = test_x.astype('float')
pd.DataFrame(train_x).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598 entries, 0 to 597
Columns: 2885 entries, 0 to 2884
dtypes: float64(2885)
memory usage: 13.2 MB


## 정규화

In [295]:
train_x_stats = train_x.describe()
train_x_stats = train_x_stats.transpose()
train_x_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,598.0,0.408027,0.491880,0.0,0.0,0.0,1.0,1.0
1,598.0,0.396321,0.489542,0.0,0.0,0.0,1.0,1.0
2,598.0,0.063545,0.244145,0.0,0.0,0.0,0.0,1.0
3,598.0,0.010033,0.099747,0.0,0.0,0.0,0.0,1.0
4,598.0,0.573579,0.494971,0.0,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...
2880,598.0,0.165552,0.371989,0.0,0.0,0.0,0.0,1.0
2881,598.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
2882,598.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
2883,598.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0


In [296]:
def normalizarion(x):
    return (x - train_x_stats['mean']) / train_x_stats['std']

normed_train_x = normalizarion(train_x)
normed_test_x = normalizarion(test_x)

In [297]:
normed_train_x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2875,2876,2877,2878,2879,2880,2881,2882,2883,2884
0,-0.829526,-0.809575,-0.260276,-0.100589,-1.158813,-0.363805,2.579829,-0.274615,-0.642666,-0.640071,...,1.607322,1.573125,1.29375,1.296366,2.669139,-0.445045,,,,
1,-0.829526,-0.809575,-0.260276,-0.100589,-1.158813,-0.363805,-0.386974,3.635376,-0.642666,-0.640071,...,1.583872,1.668226,1.899446,1.360035,2.460013,-0.445045,,,,
2,-0.829526,-0.809575,-0.260276,-0.100589,-1.158813,-0.363805,2.579829,-0.274615,-0.642666,-0.640071,...,1.599505,1.3642,1.822134,1.420648,2.692375,-0.445045,,,,
3,-0.829526,-0.809575,-0.260276,-0.100589,-1.158813,-0.363805,-0.386974,3.635376,-0.642666,-0.640071,...,1.523944,1.488372,2.339322,1.117583,2.429165,-0.445045,,,,
4,-0.829526,-0.809575,-0.260276,-0.100589,-1.158813,-0.363805,2.579829,-0.274615,-0.642666,-0.640071,...,1.573971,1.6224,2.059934,1.245431,2.637089,-0.445045,,,,


In [298]:
pd.DataFrame(train_x).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598 entries, 0 to 597
Columns: 2885 entries, 0 to 2884
dtypes: float64(2885)
memory usage: 13.2 MB


In [300]:
train_y

0      1
1      2
2      1
3      2
4      1
      ..
593    1
594    0
595    0
596    1
597    1
Name: Y_Class, Length: 598, dtype: int64

In [301]:
def build_model():
    input  = Input(shape=(len(train_x.keys())), name = 'input')
    hidden1 = Dense(64, activation='relu', name='dense1')(input)
    hidden2 = Dense(64, activation='relu', name='dense2')(hidden1)
    output = Dense(1, name='output')(hidden2)
    
    model = Model(inputs=[input], outputs=output)
    
    model.compile(loss='mse',
                 optimizer=Adam(0.001), # lr=0.001
                 metrics=['mae', 'mse'])
    return model

In [302]:
model = build_model()

In [303]:
model.summary()

Model: "model_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 2885)]            0         
                                                                 
 dense1 (Dense)              (None, 64)                184704    
                                                                 
 dense2 (Dense)              (None, 64)                4160      
                                                                 
 output (Dense)              (None, 3)                 195       
                                                                 
Total params: 189,059
Trainable params: 189,059
Non-trainable params: 0
_________________________________________________________________


In [304]:
epochs = 100

history = model.fit(normed_train_x, train_y, epochs=epochs, validation_split=0.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

# loss가 nan이 나오는이유로
* 인풋에 nan이 있음
* lr이 너무 큼
* log(0)이 되어서

In [201]:
history.history.keys()

dict_keys(['loss', 'mae', 'mse', 'val_loss', 'val_mae', 'val_mse'])

In [202]:
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist

Unnamed: 0,loss,mae,mse,val_loss,val_mae,val_mse,epoch
0,,,,,,,0
1,,,,,,,1
2,,,,,,,2
3,,,,,,,3
4,,,,,,,4
...,...,...,...,...,...,...,...
995,,,,,,,995
996,,,,,,,996
997,,,,,,,997
998,,,,,,,998


In [None]:
def plot_history(history):
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    
    plt.figure(figsize=(12,6))
    
    plt.subplot(1, 2, 1)
    plt.xlabel('Epoch')
    plt.ylabel('MAE')
    plt.plot(hist['epoch'], hist['mae'], label=[Train Error])
    plt.plot(hist['epoch'], hist['val_mae'], label=[Val Error])
    plt.ylim([0,5])
    plt.legend()
    
        plt.subplot(1, 2, 2)
    plt.xlabel('Epoch')
    plt.ylabel('MSE')
    plt.plot(hist['epoch'], hist['mse'], label=[Train Error])
    plt.plot(hist['epoch'], hist['val_mse'], label=[Val Error])
    plt.ylim([0,20])
    plt.legend()
    
plt_history(history)

# Model Fit
### lgb는 트레이닝과 사이킷럿 두가지 API가 존재하는데 트레이닝은 lgb패키지를 불러올때 기본적으로 제공하는 문법을 사용하는것, 사이킷런은 대부분 사용하는데 이 API는 트레이닝의 .train과 다르게 사이킷럿과 문법체계가 같은 .fit을 사용한다.

In [36]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.2, random_state=0)
# 훈련 80 테스트 20으로 분리.

In [37]:
import lightgbm as lgb

model = lgb.LGBMRegressor(random_state=0, n_estimators=100)
# n_estimators는 lgb가 부스팅모델의 일종이라 순차적으로 모델을 만든다.
# 따라서 몇개의 모델을 반복해 순차적으로 만들것인지를 의미. 

In [38]:
model.fit(x_train, y_train)

LGBMRegressor(random_state=0)

In [39]:
model.score(x_train, y_train)

0.9930791909151178

In [40]:
model.score(x_test, y_test) # 테스트 스코어가 상당히 낮다

0.28634782540202586

## KFold를 해보자

In [41]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
# StratifiedKFold는 일반적으로 분류에서의 교차 검증, KFold는 연속된 숫자값인 회귀문제에서 용이

In [42]:
sk_fold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

In [43]:
x_train = pd.DataFrame(x_train)
y_train = pd.DataFrame(y_train)
x_train.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2873,2874,2875,2876,2877,2878,2879,2880,2881,2882
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,8.0,0.127626,0.755056,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,8.0,0.127626,0.840168,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,9.0,0.557601,0.946559,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0


In [46]:
model = lgb.LGBMRegressor(random_state=0, n_estimators=1000)
models = []

cnt_iter = 0
cv_accuracy = []
for train_idx, ts_idx in sk_fold.split(x_train, y_train):
    # train_idx에는 테스트할 데이터들의 index 값이 list로 저장되고 ts_idx에는 검증할 데이터들의 index값이 list 형태로 저장된다.
    # x_train에는 features 중에서 train_index 값의 인덱스들이 x_train에 저장되는 식이다.
    # train_index = [0,1,2]면 X_train에는 [featuers[0],features[1],features[2]]가 저장됨
    
    x_tr, x_ts = x_train.iloc[train_idx], x_train.iloc[ts_idx]
    y_tr, y_ts = y_train.iloc[train_idx], y_train.iloc[ts_idx]
    model.fit(x_train, y_train)
    # x_train, y_train 을 이용해 학습시킨다.
    
    pred = model.predict(x_test)
    # 검증 데이터를 이용해 예측
    
    cnt_iter += 1
    
    # 정답과 비교하여 정확도 저장
    accuracy = np.round(accuracy_score(y_ts, pred), 4)
    train_size = x_tr.shape[0]
    test_size = x_ts.shape[0]
    print('\n#{0} 교차 검증 정확도 : {1}, 학습데이터 크기: {2}, 검증데이터 크기: {3}'.format(cnt_iter, accuracy, train_size, test_size))
    print('#{0} 검증 세트 인덱스: {1}'.format(cnt_iter, ts_index))
    # 정확도 결과값을 cv_accuracy에 저장
    cv_accuracy.append(accuracy)
# 정확도 들의 평균으로 정확도 평가
print('\n## 평균 검증 정확도:', np.mean(cv_accuracy))
#     models.append(model.fit(x_t, y_t, eval_set=(x_val, y_val), early_stopping_rounds=100, verbose=100))

NameError: name 'accuracy_score' is not defined

In [None]:
model.score(x_train, y_train)

In [None]:
model.score(x_test, y_test) # kfold하니 더떨어짐...

### 하이퍼파라미터 튜닝 Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
model = lgb.LGBMRegressor(random_state=0, n_estimators=1000)

params = {
    'learning rate' : [0.1, 0.01, 0.003],
    'min_child_sample' : [20,30]}
gs = GridSearchCV(estimator=model,
                 param_grid=params,
                 scoring='neg_mean_squared_error',
                 cv=k_fold)

In [None]:
gs.fit(x_train, y_train)

In [None]:
gs.best_params_

### 위의 lr과 min-child_sample의 베스트값을 넣고 다시 모델링

In [None]:
model = lgb.LGBMRegressor(random_state=0, n_estimators=1000, learning_rate=0.1, min_child_sample=20)
models = []

for train_idx, val_idx in k_fold.split(x_train):
    x_t, x_val = x_train.iloc[train_idx], x_train.iloc[val_idx]
    y_t, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    models.append(model.fit(x_t, y_t, eval_set=(x_val, y_val), early_stopping_rounds=100, verbose=100))

In [None]:
model.score(x_train, y_train)

In [None]:
model.score(x_test, y_test) # 에휴..