In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import random
import os
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Data Load

In [20]:
train_df = pd.read_csv('Dacon_data/train.csv')
test_df = pd.read_csv('Dacon_data/test.csv')

In [21]:
train_df.head(3)

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,,,,,...,39.34,40.89,32.56,34.09,77.77,,,,,
1,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,,,,,...,38.89,42.82,43.92,35.34,72.55,,,,,
2,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,,,,,...,39.19,36.65,42.47,36.53,78.35,,,,,


# Data Pre-processing

In [22]:
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)
train_df.head()

Unnamed: 0,PRODUCT_ID,Y_Class,Y_Quality,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,TRAIN_000,1,0.533433,2022-06-13 5:14,T050304,A_31,0.0,0.0,0.0,0.0,...,39.34,40.89,32.56,34.09,77.77,0.0,0.0,0.0,0.0,0.0
1,TRAIN_001,2,0.541819,2022-06-13 5:22,T050307,A_31,0.0,0.0,0.0,0.0,...,38.89,42.82,43.92,35.34,72.55,0.0,0.0,0.0,0.0,0.0
2,TRAIN_002,1,0.531267,2022-06-13 5:30,T050304,A_31,0.0,0.0,0.0,0.0,...,39.19,36.65,42.47,36.53,78.35,0.0,0.0,0.0,0.0,0.0
3,TRAIN_003,2,0.537325,2022-06-13 5:39,T050307,A_31,0.0,0.0,0.0,0.0,...,37.74,39.17,52.17,30.58,71.78,0.0,0.0,0.0,0.0,0.0
4,TRAIN_004,1,0.53159,2022-06-13 5:47,T050304,A_31,0.0,0.0,0.0,0.0,...,38.7,41.89,46.93,33.09,76.97,0.0,0.0,0.0,0.0,0.0


In [23]:
train_x = train_df.drop(columns=['PRODUCT_ID', 'Y_Class', 'Y_Quality'])
train_y = train_df['Y_Class']

test_x = test_df.drop(columns=['PRODUCT_ID'])
train_x.head(3)

Unnamed: 0,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,2022-06-13 5:14,T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.34,40.89,32.56,34.09,77.77,0.0,0.0,0.0,0.0,0.0
1,2022-06-13 5:22,T050307,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,38.89,42.82,43.92,35.34,72.55,0.0,0.0,0.0,0.0,0.0
2,2022-06-13 5:30,T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.19,36.65,42.47,36.53,78.35,0.0,0.0,0.0,0.0,0.0


In [24]:
# TIMESTAMP를 월별로 저장
train_x['TIMESTAMP'] = pd.to_datetime(train_x['TIMESTAMP']).dt.strftime('%m').astype('int64')
test_x['TIMESTAMP'] = pd.to_datetime(test_x['TIMESTAMP']).dt.strftime('%m').astype('int64')
train_x.head(3)

Unnamed: 0,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,6,T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.34,40.89,32.56,34.09,77.77,0.0,0.0,0.0,0.0,0.0
1,6,T050307,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,38.89,42.82,43.92,35.34,72.55,0.0,0.0,0.0,0.0,0.0
2,6,T050304,A_31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.19,36.65,42.47,36.53,78.35,0.0,0.0,0.0,0.0,0.0


### 정규화

In [25]:
from sklearn.preprocessing import StandardScaler

# 정규화를 위해 StandardScaler.
# 신경망 학습시 데이터셋의 값이 들쑥날쑥하거나, 매우 큰 경우에는 cost의 값이 발산하여 정상적인 학습이 이루어지지 않아
sc = StandardScaler().fit(train_x[train_x.columns[3:]])

# train_x에 있는 데이터에 맞춰 정규화를 진행. 
train_x[train_x.columns[3:]] = sc.fit_transform(train_x[train_x.columns[3:]])

test_x[test_x.columns[3:]] = sc.transform(test_x[test_x.columns[3:]])

test_x

Unnamed: 0,TIMESTAMP,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,9,T100306,T_31,0.127626,0.818890,0.0,0.844670,0.766018,0.0,0.927147,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
1,9,T100304,T_31,0.127626,0.797612,0.0,0.844670,0.960697,0.0,0.680110,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
2,9,T100304,T_31,0.127626,0.840168,0.0,0.844670,0.960697,0.0,0.680110,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
3,9,T010305,A_31,-0.302348,-1.181260,0.0,-1.183895,-1.180768,0.0,-1.172670,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
4,9,T010306,A_31,-0.302348,-1.181260,0.0,-1.183895,-1.180768,0.0,-1.172670,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,11,T100306,T_31,0.127626,0.755056,0.0,0.844670,0.766018,0.0,0.927147,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
306,11,T100304,T_31,0.127626,0.861447,0.0,0.844670,0.960697,0.0,0.680110,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
307,11,T100306,T_31,0.127626,0.755056,0.0,0.844670,0.766018,0.0,0.885975,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
308,11,T100306,T_31,0.127626,0.840168,0.0,0.844670,0.766018,0.0,0.927147,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0


### 원핫인코딩

In [28]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct_1 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(drop='first'),[1])],remainder='passthrough')
# 원핫인코더에서 다중공선성 문제를 해결하고자 drop='first'
# 어떤데이터를 가지고 원핫할건지를 선택 나는 LINE을가지고 원핫하고자하기에 [1]
# 나머지칼럼들은 그대로 두기위해 remainder='passthrough'
train_x = ct_1.fit_transform(train_x)

pd.DataFrame(train_x)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2873,2874,2875,2876,2877,2878,2879,2880,2881,2882
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,6,-0.302348,-1.18126,...,1.608668,1.574442,1.294833,1.297452,2.671373,-0.445418,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6,-0.302348,-1.18126,...,1.585198,1.669622,1.901036,1.361174,2.462073,-0.445418,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,6,-0.302348,-1.18126,...,1.600845,1.365342,1.82366,1.421838,2.694629,-0.445418,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6,-0.302348,-1.18126,...,1.525219,1.489618,2.34128,1.118519,2.431199,-0.445418,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,6,-0.302348,-1.18126,...,1.575288,1.623758,2.061658,1.246474,2.639297,-0.445418,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,1.0,0.0,0.0,0.0,0.0,0.0,1.0,9,0.127626,0.840168,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
594,0.0,0.0,0.0,1.0,0.0,0.0,0.0,9,-0.302348,-1.18126,...,2.137001,2.175113,2.272976,2.368498,2.219091,2.245084,0.0,0.0,0.0,0.0
595,0.0,0.0,0.0,1.0,0.0,0.0,0.0,9,-0.302348,-1.18126,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,2.245084,0.0,0.0,0.0,0.0
596,0.0,1.0,0.0,0.0,0.0,1.0,0.0,9,8.297142,0.81889,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0


In [29]:
ct_2 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(drop='first'),[6])],remainder='passthrough')
# CODE를을가지고 원핫하고자하기에 [6]
train_x = ct_2.fit_transform(train_x)
pd.DataFrame(train_x)
# 아래데이터를 보면 0,1 column은 CODE를 원핫, 2,3,4,5,6은 LINE을 원핫 한것이다.

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2873,2874,2875,2876,2877,2878,2879,2880,2881,2882
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6,-0.302348,-1.18126,...,1.608668,1.574442,1.294833,1.297452,2.671373,-0.445418,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6,-0.302348,-1.18126,...,1.585198,1.669622,1.901036,1.361174,2.462073,-0.445418,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6,-0.302348,-1.18126,...,1.600845,1.365342,1.82366,1.421838,2.694629,-0.445418,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6,-0.302348,-1.18126,...,1.525219,1.489618,2.34128,1.118519,2.431199,-0.445418,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6,-0.302348,-1.18126,...,1.575288,1.623758,2.061658,1.246474,2.639297,-0.445418,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,1.0,1.0,0.0,0.0,0.0,0.0,0.0,9,0.127626,0.840168,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
594,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9,-0.302348,-1.18126,...,2.137001,2.175113,2.272976,2.368498,2.219091,2.245084,0.0,0.0,0.0,0.0
595,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9,-0.302348,-1.18126,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,2.245084,0.0,0.0,0.0,0.0
596,0.0,0.0,1.0,0.0,0.0,0.0,1.0,9,8.297142,0.81889,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0


In [30]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct_1 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(drop='first'),[1])],remainder='passthrough')
# 원핫인코더에서 다중공선성 문제를 해결하고자 drop='first'
# 어떤데이터를 가지고 원핫할건지를 선택 나는 LINE을가지고 원핫하고자하기에 [1]
# 나머지칼럼들은 그대로 두기위해 remainder='passthrough'
test_x = ct_1.fit_transform(test_x)

pd.DataFrame(test_x)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2872,2873,2874,2875,2876,2877,2878,2879,2880,2881
0,0.0,0.0,0.0,0.0,1.0,9,T_31,0.127626,0.81889,0.0,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,9,T_31,0.127626,0.797612,0.0,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,9,T_31,0.127626,0.840168,0.0,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,9,A_31,-0.302348,-1.18126,0.0,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,9,A_31,-0.302348,-1.18126,0.0,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,0.0,0.0,0.0,0.0,1.0,11,T_31,0.127626,0.755056,0.0,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
306,0.0,0.0,0.0,1.0,0.0,11,T_31,0.127626,0.861447,0.0,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
307,0.0,0.0,0.0,0.0,1.0,11,T_31,0.127626,0.755056,0.0,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
308,0.0,0.0,0.0,0.0,1.0,11,T_31,0.127626,0.840168,0.0,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0


In [31]:
ct_2 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(drop='first'),[6])],remainder='passthrough')
# CODE를을가지고 원핫하고자하기에 [6]
test_x = ct_2.fit_transform(test_x)
pd.DataFrame(test_x)
# 아래데이터를 보면 0,1 column은 CODE를 원핫, 2,3,4,5,6은 LINE을 원핫 한것이다.

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2873,2874,2875,2876,2877,2878,2879,2880,2881,2882
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,9,0.127626,0.81889,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,9,0.127626,0.797612,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,9,0.127626,0.840168,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9,-0.302348,-1.18126,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,9,-0.302348,-1.18126,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,0.0,1.0,0.0,0.0,0.0,0.0,1.0,11,0.127626,0.755056,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
306,0.0,1.0,0.0,0.0,0.0,1.0,0.0,11,0.127626,0.861447,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
307,0.0,1.0,0.0,0.0,0.0,0.0,1.0,11,0.127626,0.755056,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
308,0.0,1.0,0.0,0.0,0.0,0.0,1.0,11,0.127626,0.840168,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0


In [32]:
pd.DataFrame(train_x).dtypes

0       object
1       object
2       object
3       object
4       object
         ...  
2878    object
2879    object
2880    object
2881    object
2882    object
Length: 2883, dtype: object

In [33]:
train_x = train_x.astype('float')
pd.DataFrame(train_x).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598 entries, 0 to 597
Columns: 2883 entries, 0 to 2882
dtypes: float64(2883)
memory usage: 13.2 MB


# Model Fit
### lgb는 트레이닝과 사이킷럿 두가지 API가 존재하는데 트레이닝은 lgb패키지를 불러올때 기본적으로 제공하는 문법을 사용하는것, 사이킷런은 대부분 사용하는데 이 API는 트레이닝의 .train과 다르게 사이킷럿과 문법체계가 같은 .fit을 사용한다.

In [34]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.2, random_state=0)
# 훈련 80 테스트 20으로 분리.

In [35]:
import lightgbm as lgb

model = lgb.LGBMRegressor(random_state=0, n_estimators=100)
# n_estimators는 lgb가 부스팅모델의 일종이라 순차적으로 모델을 만든다.
# 따라서 몇개의 모델을 반복해 순차적으로 만들것인지를 의미. 

In [36]:
model.fit(x_train, y_train)

LGBMRegressor(random_state=0)

In [37]:
model.score(x_train, y_train)

0.9930791909151178

In [38]:
model.score(x_test, y_test) # 테스트 스코어가 상당히 낮다

0.28634782540202586

## KFold를 해보자

In [39]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
# StratifiedKFold는 일반적으로 분류에서의 교차 검증, KFold는 연속된 숫자값인 회귀문제에서 용이

In [40]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=0)

In [41]:
x_train = pd.DataFrame(x_train)
y_train = pd.DataFrame(y_train)
x_train.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2873,2874,2875,2876,2877,2878,2879,2880,2881,2882
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,8.0,0.127626,0.755056,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,8.0,0.127626,0.840168,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,9.0,0.557601,0.946559,...,-0.443122,-0.442094,-0.442664,-0.440386,-0.446886,-0.445418,0.0,0.0,0.0,0.0


In [42]:
model = lgb.LGBMRegressor(random_state=0, n_estimators=1000)
models = []

for train_idx, val_idx in k_fold.split(x_train):
    x_t, x_val = x_train.iloc[train_idx], x_train.iloc[val_idx]
    y_t, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    models.append(model.fit(x_t, y_t, eval_set=(x_val, y_val), early_stopping_rounds=100, verbose=100))



[100]	valid_0's l2: 0.319548
[100]	valid_0's l2: 0.225382
[100]	valid_0's l2: 0.223172
[100]	valid_0's l2: 0.259836
[100]	valid_0's l2: 0.226513


In [43]:
model.score(x_train, y_train)

0.8511364969788134

In [44]:
model.score(x_test, y_test) # kfold하니 더떨어짐...

0.2521664321713575

### 하이퍼파라미터 튜닝 Grid Search

In [45]:
from sklearn.model_selection import GridSearchCV

In [46]:
model = lgb.LGBMRegressor(random_state=0, n_estimators=1000)

params = {
    'learning rate' : [0.1, 0.01, 0.003],
    'min_child_sample' : [20,30]}
gs = GridSearchCV(estimator=model,
                 param_grid=params,
                 scoring='neg_mean_squared_error',
                 cv=k_fold)

In [47]:
gs.fit(x_train, y_train)



GridSearchCV(cv=KFold(n_splits=5, random_state=0, shuffle=True),
             estimator=LGBMRegressor(n_estimators=1000, random_state=0),
             param_grid={'learning rate': [0.1, 0.01, 0.003],
                         'min_child_sample': [20, 30]},
             scoring='neg_mean_squared_error')

In [48]:
gs.best_params_

{'learning rate': 0.1, 'min_child_sample': 20}

### 위의 lr과 min-child_sample의 베스트값을 넣고 다시 모델링

In [49]:
model = lgb.LGBMRegressor(random_state=0, n_estimators=1000, learning_rate=0.1, min_child_sample=20)
models = []

for train_idx, val_idx in k_fold.split(x_train):
    x_t, x_val = x_train.iloc[train_idx], x_train.iloc[val_idx]
    y_t, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    models.append(model.fit(x_t, y_t, eval_set=(x_val, y_val), early_stopping_rounds=100, verbose=100))



[100]	valid_0's l2: 0.319548
[100]	valid_0's l2: 0.225382
[100]	valid_0's l2: 0.223172
[100]	valid_0's l2: 0.259836
[100]	valid_0's l2: 0.226513


In [50]:
model.score(x_train, y_train)

0.8511364969788134

In [51]:
model.score(x_test, y_test) # 에휴..

0.2521664321713575