### Import library

In [91]:
# visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
fe = fm.FontEntry(
    fname=r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf', # ttf 파일이 저장되어 있는 경로
    name='NanumBarunGothic')                        # 이 폰트의 원하는 이름 설정
fm.fontManager.ttflist.insert(0, fe)              # Matplotlib에 폰트 추가
plt.rcParams.update({'font.size': 10, 'font.family': 'NanumBarunGothic'}) # 폰트 설정
plt.rc('font', family='NanumBarunGothic')
import seaborn as sns

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')
import gdown
import joblib

# Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import lightgbm as lgb

import eli5
from eli5.sklearn import PermutationImportance

import os
import glob

# 코드 셀 실행 후 경고를 무시
# import warnings
# warnings.filterwarnings(action='ignore')

### Data Load

In [92]:
train_path = '/root/train.csv'
test_path  = '/root/test.csv'
dt = pd.read_csv(train_path)
dt_test = pd.read_csv(test_path)

In [93]:
dt['구'] = dt['시군구'].map(lambda x : x.split()[1])
dt['동'] = dt['시군구'].map(lambda x : x.split()[2])

dt_test['구'] = dt_test['시군구'].map(lambda x : x.split()[1])
dt_test['동'] = dt_test['시군구'].map(lambda x : x.split()[2])

In [94]:
mean_val_by_gu = dt.groupby("구")["target"].mean().sort_values(ascending=False)
mean_val_by_dong = dt.groupby("동")["target"].mean().sort_values(ascending=False)
std_val_by_gu = dt.groupby("구")["target"].std().sort_values(ascending=False)
std_val_by_dong = dt.groupby("동")["target"].std().sort_values(ascending=False)

### Data preprocessing

In [95]:
# train/test 구분을 위한 칼럼을 하나 만들어 줍니다.
dt['is_test'] = 0
dt_test['is_test'] = 1
df = pd.concat([dt, dt_test])     # 하나의 데이터로 만들어줍니다.

In [96]:
# 필요 없어 보이는 columns 제거
drop_col = ['부번', 'k-전화번호', 'k-팩스번호', 'k-관리방식', 'k-복도유형', 'k-시행사', 'k-사용검사일-사용승인일', 'k-홈페이지', 'k-등록일자', 'k-수정일자', '고용보험관리번호', '경비비관리형태', '세대전기계약방법', '청소비관리형태', '기타/의무/임대/임의=1/2/3/4', '단지승인일', '사용허가여부', '관리비 업로드', '단지신청일', 'k-관리비부과면적', '주차대수', '건축면적', '해제사유발생일', '단지소개기존clob', 'k-135㎡초과', '중개사소재지', '등기신청일자', '거래유형']
df.drop(drop_col, axis=1, inplace=True)

In [97]:
# 시군구 feaeture 처리
# 구와 동에 대한 Feature 수정
# df['구'] = df['시군구'].apply(lambda x:x.split()[1])
# df['동'] = df['시군구'].apply(lambda x:x.split()[2])
omg = ['용산구', '강남구', '서초구', '송파구', '성동구', '종로구']
is_omg = []
for x in df['구'].tolist():
    if x in omg:
        is_omg.append(1)
    else:
        is_omg.append(0)
df['개비싸'] = is_omg
df.loc[~df['구'].isin(omg), '동'] = 'Unknown' 

# del df['시군구']

In [98]:
# df['계약년'] = df['계약년월'].astype('str').map(lambda x : x[:4])
# df['계약월'] = df['계약년월'].astype('str').map(lambda x : x[4:])

In [99]:
df['계약년'] = df['계약년월'].map(lambda x : x // 100)
df['계약월'] = df['계약년월'].map(lambda x : x % 100)

df["date"] = pd.to_datetime(df['계약년'].astype(str) + '-' + df['계약월'].astype(str) + '-' + df['계약일'].astype(str))

In [100]:
df['is_public'] = df["k-전용면적별세대현황(60㎡이하)"].notnull()

In [101]:
df.columns

Index(['시군구', '번지', '본번', '아파트명', '전용면적(㎡)', '계약년월', '계약일', '층', '건축년도', '도로명',
       'k-단지분류(아파트,주상복합등등)', 'k-세대타입(분양형태)', 'k-난방방식', 'k-전체동수', 'k-전체세대수',
       'k-건설사(시공사)', 'k-연면적', 'k-주거전용면적', 'k-전용면적별세대현황(60㎡이하)',
       'k-전용면적별세대현황(60㎡~85㎡이하)', 'k-85㎡~135㎡이하', '좌표X', '좌표Y', 'target', '구',
       '동', 'is_test', '개비싸', '계약년', '계약월', 'date', 'is_public'],
      dtype='object')

In [102]:
drop_cols_k = ['k-단지분류(아파트,주상복합등등)', 'k-세대타입(분양형태)',  'k-난방방식', 'k-전체동수', 'k-전체세대수', 'k-건설사(시공사)', 'k-연면적']

df.drop(columns=drop_cols_k, inplace=True)

In [103]:
# 본번, 부번의 경우 float로 되어있지만 범주형 변수의 의미를 가지므로 object(string) 형태로 바꾸어주고 아래 작업을 진행하겠습니다.
df['본번'] = df['본번'].astype('str')

In [104]:
order1_mean_gu = mean_val_by_gu.index.tolist()
order2_mean_dong = mean_val_by_dong.index.tolist()
order3_std_gu = std_val_by_gu.index.tolist()
order4_std_dong = std_val_by_dong.index.tolist()

In [105]:
# Create a dictionary to map district names to their corresponding label encoded values
gu_mapping1 = {district: label for label, district in enumerate(order1_mean_gu)}
gu_mapping2 = {district: label for label, district in enumerate(order3_std_gu)}
dong_mapping1 = {dong: label for label, dong in enumerate(order2_mean_dong)}
dong_mapping2 = {dong: label for label, dong in enumerate(order4_std_dong)}

df['구_encoded'] = df["구"].map(gu_mapping1)
df['구_std'] = df["구"].map(gu_mapping2)
df['동_encoded'] = df["동"].map(dong_mapping1)
df['동_std'] = df["동"].map(dong_mapping2)

In [106]:
seoul_bds = ["강남구", "영등포구", "용산구"]

df["has_bds"] = df["구"].apply(lambda x : 1 if x in seoul_bds else 0)

In [107]:
df['yrs_diff_built_contract'] = df['계약년'] - df['건축년도']
df['built_in3yrs'] = df['yrs_diff_built_contract'].apply(lambda x : 1 if x <= 3 else 0)
df['built_in5yrs'] = df['yrs_diff_built_contract'].apply(lambda x : 1 if x <= 5 else 0)
df['built_over30yrs'] = df['yrs_diff_built_contract'].apply(lambda x : 1 if x >= 30 else 0)
df['built_over35yrs'] = df['yrs_diff_built_contract'].apply(lambda x : 1 if x >= 35 else 0)
df['built_over40yrs'] = df['yrs_diff_built_contract'].apply(lambda x : 1 if x >= 40 else 0)

In [108]:
df['road'] = df['도로명'].apply(lambda x : x.split()[0] if len(x.split()) > 0 else x)
df['on_main_st'] = df['road'].apply(lambda x: not any(char.isdigit() for char in x))
del df['road']

#### extra data

##### 구별 특성

- 시군구 단위 용도지역 현황 (2022)

In [109]:
df_zones_original = pd.read_csv("./extra_data/2022_zoning_seoul.csv", encoding='euc-kr')

In [110]:
df_zones_original['주거지역_전용주거_비율'] = df_zones_original['주거지역_전용주거_소계'] / df_zones_original['도시지역 면적']
df_zones_original['주거지역_일반주거_비율'] = df_zones_original['주거지역_일반주거지역_소계'] / df_zones_original['도시지역 면적']
df_zones_original['주거지역_준주거_비율'] = df_zones_original['주거지역_준주거지역'] / df_zones_original['도시지역 면적']
df_zones_original['중심상업_비율'] = df_zones_original['상업지역_중심상업'] / df_zones_original['도시지역 면적']
df_zones_original['일반상업_비율'] = df_zones_original['상업지역_일반상업'] / df_zones_original['도시지역 면적']
df_zones_original['근린상업_비율'] = df_zones_original['상업지역_근린상업'] / df_zones_original['도시지역 면적']

In [111]:
df_zone_ratio = df_zones_original[['시군구', '주거지역_비율', '상업지역_비율', '공업지역_비율', '녹지지역_비율', '주거지역_전용주거_비율', '주거지역_일반주거_비율', '주거지역_준주거_비율',
       '중심상업_비율', '일반상업_비율', '근린상업_비율']]

In [112]:
df_zone_ratio.rename(columns={'시군구': '구'}, inplace=True)

In [113]:
df = pd.merge(left=df, right=df_zone_ratio, on='구', how='left')

- 2022_학생_1만명당_사설학원수

In [114]:
df_academies = pd.read_csv("extra_data/2022_학생_1만명당_사설학원수_20240326124719.csv", encoding='euc-kr')

In [115]:
df_academies.rename(columns={"자치구":"구"}, inplace=True)
df_academies.drop(columns=['초중고_학생수(명)', '사설학원수(개소)'], inplace=True)

In [116]:
df = pd.merge(left=df, right=df_academies, on='구', how='left')

#### 경제지표

- 기준금리

In [117]:
df_interest_rate = pd.read_csv("./extra_data/base_rate.csv")

In [118]:
df_interest_rate.rename(columns={'날짜': 'date'}, inplace=True)
df_interest_rate['date'] = pd.to_datetime(df_interest_rate['date'])

In [119]:
df = pd.merge(df, df_interest_rate, on = 'date', how='left')

- KOSPI

In [120]:
# Use glob to find all files that match the pattern
file_paths = glob.glob("./extra_data/KOSPI_*.csv")

# Initialize an empty list to store the dataframes
dfs = []

# Iterate over the file paths and read each CSV file into a dataframe
for file_path in file_paths:
    kospi_df_yr = pd.read_csv(file_path, encoding="euc-kr")
    dfs.append(kospi_df_yr)

In [121]:
kospi_df = pd.concat(dfs, ignore_index=True)
kospi_df = kospi_df.drop(columns=['시가', '고가', '저가'])

In [122]:
kospi_df.rename(columns={"일자":"date", "종가":"Close"}, inplace=True)
kospi_df["date"] = kospi_df['date'].str.replace('/', '-')
kospi_df["date"] = pd.to_datetime(kospi_df["date"])
kospi_df.sort_values(by='date', ascending=True, inplace=True)

In [123]:
kospi_df["KOSPI_MA5d"] = kospi_df["Close"].transform(lambda x: x.rolling(window=5).mean())
kospi_df["KOSPI_MA20d"] = kospi_df["Close"].transform(lambda x: x.rolling(window=20).mean())
kospi_df["KOSPI_MA60d"] = kospi_df["Close"].transform(lambda x: x.rolling(window=60).mean())
kospi_df["KOSPI_MA120d"] = kospi_df["Close"].transform(lambda x: x.rolling(window=120).mean())

In [124]:
kospi_df["Close_delay120d"] = kospi_df["Close"].shift(120)

In [125]:
df = pd.merge(left=df, right=kospi_df, on='date', how='left')

In [126]:
kospi_df.drop(columns='date', inplace=True)

In [127]:
df[kospi_df.columns.tolist()] = df[kospi_df.columns.tolist()].interpolate(method='linear', axis=0)

- unempl

In [128]:
unempl_rate = pd.read_csv("./extra_data/서울시_월별실업률_2007_2024.csv", encoding="euc-kr")

In [129]:
unempl_rate["시점"] = unempl_rate["시점"]*100
unempl_rate["시점"] = unempl_rate["시점"].astype(int)
unempl_rate.rename(columns={"계" : "unempl_rate_total", "남자": "unempl_male", "여자":"unempl_female"}, inplace=True)

In [130]:
df = pd.merge(left=df, right=unempl_rate, left_on="계약년월", right_on="시점", how="left")
df.drop(columns=['시점'], inplace=True)

#### del cols before modeling

In [131]:
df.columns

Index(['시군구', '번지', '본번', '아파트명', '전용면적(㎡)', '계약년월', '계약일', '층', '건축년도', '도로명',
       'k-주거전용면적', 'k-전용면적별세대현황(60㎡이하)', 'k-전용면적별세대현황(60㎡~85㎡이하)',
       'k-85㎡~135㎡이하', '좌표X', '좌표Y', 'target', '구', '동', 'is_test', '개비싸',
       '계약년', '계약월', 'date', 'is_public', '구_encoded', '구_std', '동_encoded',
       '동_std', 'has_bds', 'yrs_diff_built_contract', 'built_in3yrs',
       'built_in5yrs', 'built_over30yrs', 'built_over35yrs', 'built_over40yrs',
       'on_main_st', '주거지역_비율', '상업지역_비율', '공업지역_비율', '녹지지역_비율',
       '주거지역_전용주거_비율', '주거지역_일반주거_비율', '주거지역_준주거_비율', '중심상업_비율', '일반상업_비율',
       '근린상업_비율', '학생1만명당_사설학원수(개소)', '기준금리', 'Close', '대비', '등락률', '거래량',
       '거래대금', '상장시가총액', 'KOSPI_MA5d', 'KOSPI_MA20d', 'KOSPI_MA60d',
       'KOSPI_MA120d', 'Close_delay120d', 'unempl_rate_total', 'unempl_male',
       'unempl_female'],
      dtype='object')

In [132]:
drop_cols_before_train = ['date', '좌표X', '좌표Y', '구', '동', '계약월', '계약일', '계약년']

df.drop(columns=drop_cols_before_train, inplace=True)

df.columns = df.columns.str.replace('[^\w\s]', '')
df.columns = df.columns.str.replace(' ', '_')

In [133]:
df['k전용면적별세대현황60이하'] = df['k전용면적별세대현황60이하'].fillna(0)
df['k전용면적별세대현황6085이하'] = df['k전용면적별세대현황6085이하'].fillna(0)
df['k85135이하'] = df['k85135이하'].fillna(0)

In [134]:
# 'time_col' 데이터를 문자열 형태로 변환
df['계약년월'] = df['계약년월'].astype(str)
# 문자열 형태 데이터를 datetime 형태로 변환
df['계약년월'] = pd.to_datetime(df['계약년월'], format='%Y%m')
# 변환 확인
print(df['계약년월'].head())

0   2017-12-01
1   2017-12-01
2   2017-12-01
3   2018-01-01
4   2018-01-01
Name: 계약년월, dtype: datetime64[ns]


In [141]:
# 먼저, 연속형 변수와 범주형 변수를 위 info에 따라 분리해주겠습니다.
continuous_columns = []
categorical_columns = []

for column in df.columns:
    if pd.api.types.is_numeric_dtype(df[column]):
        continuous_columns.append(column)
    else:
        categorical_columns.append(column)

print("연속형 변수:", continuous_columns)
print("범주형 변수:", categorical_columns)

# 수치형 데이터를 어떻게 채워야 될지 모르겠음 -> 걍 빼.
# 좌표X, 좌표Y 를 리니어로 채우는건 혼동을 줄 수 있는 데이터임
df.drop(columns=['k주거전용면적'], inplace=True)

# 범주형 변수에 대한 보간
df[categorical_columns] = df[categorical_columns].fillna('NULL')

연속형 변수: ['전용면적', '층', '건축년도', 'k주거전용면적', 'k전용면적별세대현황60이하', 'k전용면적별세대현황6085이하', 'k85135이하', 'target', 'is_test', '개비싸', 'is_public', '구_encoded', '구_std', '동_encoded', '동_std', 'has_bds', 'yrs_diff_built_contract', 'built_in3yrs', 'built_in5yrs', 'built_over30yrs', 'built_over35yrs', 'built_over40yrs', 'on_main_st', '주거지역_비율', '상업지역_비율', '공업지역_비율', '녹지지역_비율', '주거지역_전용주거_비율', '주거지역_일반주거_비율', '주거지역_준주거_비율', '중심상업_비율', '일반상업_비율', '근린상업_비율', '학생1만명당_사설학원수개소', '기준금리', 'Close', '대비', '등락률', '거래량', '거래대금', '상장시가총액', 'KOSPI_MA5d', 'KOSPI_MA20d', 'KOSPI_MA60d', 'KOSPI_MA120d', 'Close_delay120d', 'unempl_rate_total', 'unempl_male', 'unempl_female']
범주형 변수: ['시군구', '번지', '본번', '아파트명', '계약년월', '도로명']


# Train

In [142]:
df_train = df.loc[df['is_test']==0, :]
df_test = df.loc[df['is_test']==1, :]

df_train.drop(['is_test'], axis=1, inplace=True)
df_test.drop(['is_test'], axis=1, inplace=True)
print(df_train.shape, df_test.shape)

(1118822, 53) (9272, 53)


In [143]:
# dt_test의 target은 일단 0으로 임의로 채워주도록 하겠습니다.
df_test['target'] = 0

In [144]:
# 변수 삭제 및 파생변수 제작으로 추가된 변수들이 존재하기에, 다시한번 연속형과 범주형 칼럼을 분리해주겠습니다.
continuous_columns_v2 = []
categorical_columns_v2 = []

for column in df_train.columns:
    if column == '계약년월':
        continue
    if pd.api.types.is_numeric_dtype(df_train[column]):
        continuous_columns_v2.append(column)
    else:
        categorical_columns_v2.append(column)

print("연속형 변수:", continuous_columns_v2)
print("범주형 변수:", categorical_columns_v2)

연속형 변수: ['전용면적', '층', '건축년도', 'k전용면적별세대현황60이하', 'k전용면적별세대현황6085이하', 'k85135이하', 'target', '개비싸', 'is_public', '구_encoded', '구_std', '동_encoded', '동_std', 'has_bds', 'yrs_diff_built_contract', 'built_in3yrs', 'built_in5yrs', 'built_over30yrs', 'built_over35yrs', 'built_over40yrs', 'on_main_st', '주거지역_비율', '상업지역_비율', '공업지역_비율', '녹지지역_비율', '주거지역_전용주거_비율', '주거지역_일반주거_비율', '주거지역_준주거_비율', '중심상업_비율', '일반상업_비율', '근린상업_비율', '학생1만명당_사설학원수개소', '기준금리', 'Close', '대비', '등락률', '거래량', '거래대금', '상장시가총액', 'KOSPI_MA5d', 'KOSPI_MA20d', 'KOSPI_MA60d', 'KOSPI_MA120d', 'Close_delay120d', 'unempl_rate_total', 'unempl_male', 'unempl_female']
범주형 변수: ['시군구', '번지', '본번', '아파트명', '도로명']


In [145]:
# 아래에서 범주형 변수들을 대상으로 레이블인코딩을 진행해 주겠습니다.

# 각 변수에 대한 LabelEncoder를 저장할 딕셔너리
label_encoders = {}

# Implement Label Encoding
for col in tqdm( categorical_columns_v2 ):
    lbl = LabelEncoder()

    # Label-Encoding을 fit
    lbl.fit( df_train[col].astype(str) )
    df_train[col] = lbl.transform(df_train[col].astype(str))
    label_encoders[col] = lbl           # 나중에 후처리를 위해 레이블인코더를 저장해주겠습니다.

    # Test 데이터에만 존재하는 새로 출현한 데이터를 신규 클래스로 추가해줍니다.
    for label in np.unique(df_test[col]):
      if label not in lbl.classes_: # unseen label 데이터인 경우
        lbl.classes_ = np.append(lbl.classes_, label) # 미처리 시 ValueError발생하니 주의하세요!

    df_test[col] = lbl.transform(df_test[col].astype(str))

100%|██████████| 5/5 [00:01<00:00,  3.02it/s]


In [146]:
def preprocess_feature_name(feature_name):
  """특수 문자를 제거하고 소문자로 변환합니다."""
  feature_name = feature_name.replace("-", "_")
  feature_name = feature_name.replace(",", "_")
  feature_name = feature_name.replace(".", "_")
  feature_name = feature_name.replace("(", "_")
  feature_name = feature_name.replace(")", "_")
  feature_name = feature_name.lower()
  return feature_name

def apply_preprocessed_feature_names(df_train):
  """데이터 프레임의 feature 이름을 수정합니다."""
  df_train.columns = [preprocess_feature_name(feature) for feature in df_train.columns]
  return df_train

# 데이터 프레임에 적용
df_train = apply_preprocessed_feature_names(df_train.copy())
df_test = apply_preprocessed_feature_names(df_test.copy())

In [147]:
df_train

Unnamed: 0,시군구,번지,본번,아파트명,전용면적,계약년월,층,건축년도,도로명,k전용면적별세대현황60이하,...,거래대금,상장시가총액,kospi_ma5d,kospi_ma20d,kospi_ma60d,kospi_ma120d,close_delay120d,unempl_rate_total,unempl_male,unempl_female
0,0,4732,1149,328,79.97,2017-12-01,3,1987,6176,20.0,...,5484859.0,1.602767e+09,2482.428,2511.16000,2474.657667,2429.492500,2372.640,4.4,4.3,4.5
1,0,4732,1149,328,79.97,2017-12-01,4,1987,6176,20.0,...,4801820.0,1.587460e+09,2460.630,2478.32800,2488.367667,2437.279667,2382.560,4.4,4.3,4.5
2,0,4732,1149,328,54.98,2017-12-01,5,1987,6176,20.0,...,4274352.0,1.605821e+09,2440.374,2468.15800,2491.502167,2438.526083,2394.480,4.4,4.3,4.5
3,0,4732,1149,328,79.97,2018-01-01,4,1987,6176,20.0,...,6019622.0,1.617634e+09,2459.500,2468.86900,2495.173667,2440.168833,2388.350,4.4,4.3,4.6
4,0,4732,1149,328,79.97,2018-01-01,2,1987,6176,20.0,...,6404886.0,1.634778e+09,2488.652,2468.42400,2498.354000,2442.897833,2382.100,4.4,4.3,4.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118817,258,3082,822,215,59.94,2007-07-01,11,1998,4816,171.0,...,7069439.0,9.458545e+08,1887.768,1808.68100,1684.931000,1560.047917,1383.210,3.9,4.5,3.0
1118818,258,3082,822,215,59.94,2007-08-01,10,1998,4816,171.0,...,6651635.0,9.389746e+08,1861.231,1809.86350,1758.822500,1623.958208,1403.395,3.9,4.6,2.8
1118819,258,3082,822,215,84.83,2007-08-01,20,1998,4816,171.0,...,6233831.0,9.320947e+08,1834.694,1811.04600,1832.714000,1687.868500,1423.580,3.9,4.6,2.8
1118820,258,3082,822,215,84.83,2007-09-01,8,1998,4816,171.0,...,4572966.5,7.990292e+08,1605.746,1610.28675,1615.687417,1525.040417,1339.355,3.8,4.6,2.7


### Holdout Using optuna

In [148]:
y_train = df_train['target']
X_train = df_train.drop(['target', '계약년월'], axis=1)

# Split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=2023)

In [151]:
import optuna

def objective(trial):
    # Define the hyperparameter search space
    n_estimators = trial.suggest_int('n_estimators', 1000, 5000)
    max_depth = trial.suggest_int('max_depth', 10, 50)
    num_leaves = trial.suggest_int('num_leaves', 100, 1000)
    min_child_samples = trial.suggest_int('min_child_samples', 50, 500)
    feature_fraction = trial.suggest_float('feature_fraction', 0.5, 1.0)
    bagging_fraction = trial.suggest_float('bagging_fraction', 0.5, 1.0)
    lambda_l1 = trial.suggest_float('lambda_l1', 1e-8, 1.0, log=True)  # Increase L1 regularization
    lambda_l2 = trial.suggest_float('lambda_l2', 1e-8, 1.0, log=True)  # Increase L2 regularization

    # Create the LGBMRegressor model with the suggested hyperparameters
    gbm = lgb.LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth, num_leaves=num_leaves,
                            min_child_samples=min_child_samples, feature_fraction=feature_fraction,
                            bagging_fraction=bagging_fraction, lambda_l1=lambda_l1, lambda_l2=lambda_l2, verbosity=-1)

    # Train the model
    gbm.fit(
        X_train, y_train, 
        eval_set=[(X_train, y_train), (X_val, y_val)], 
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(stopping_rounds=50),
                   lgb.log_evaluation(period=100, show_stdv=True)]
        )
    
    # Compute the validation RMSE
    val_rmse = gbm.best_score_['valid_1']['rmse']
    return val_rmse

def optimize_hyperparameters(n_trials=100, early_stopping=50, log_evaluation=100):
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    # Get the best hyperparameters
    best_params = study.best_trial.params

    # Create the final model with the best hyperparameters
    best_model = lgb.LGBMRegressor(**best_params)

    # Train the final model with early stopping and evaluation logging
    best_model.fit(X_train, y_train,
                   eval_set=[(X_train, y_train), (X_val, y_val)],
                   eval_metric='rmse',
                   callbacks=[lgb.early_stopping(stopping_rounds=early_stopping),
                              lgb.log_evaluation(period=log_evaluation, show_stdv=True)])

    return best_model

In [152]:
best_model = optimize_hyperparameters(n_trials=5)

[I 2024-03-28 06:39:47,039] A new study created in memory with name: no-name-4f9df222-deed-4eb0-bb6c-5116906de528


Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 8477.71	training's l2: 7.18716e+07	valid_1's rmse: 9114.28	valid_1's l2: 8.30702e+07
[200]	training's rmse: 7195.62	training's l2: 5.17769e+07	valid_1's rmse: 8046.67	valid_1's l2: 6.47489e+07
[300]	training's rmse: 6552.6	training's l2: 4.29365e+07	valid_1's rmse: 7568.75	valid_1's l2: 5.72859e+07
[400]	training's rmse: 6162.64	training's l2: 3.79781e+07	valid_1's rmse: 7334.3	valid_1's l2: 5.3792e+07
[500]	training's rmse: 5880.9	training's l2: 3.4585e+07	valid_1's rmse: 7181.08	valid_1's l2: 5.15679e+07
[600]	training's rmse: 5636.31	training's l2: 3.1768e+07	valid_1's rmse: 7055.12	valid_1's l2: 4.97747e+07
[700]	training's rmse: 5438.65	training's l2: 2.95789e+07	valid_1's rmse: 6963.52	valid_1's l2: 4.84906e+07
[800]	training's rmse: 5274.44	training's l2: 2.78197e+07	valid_1's rmse: 6898.03	valid_1's l2: 4.75828e+07
[900]	training's rmse: 5114.92	training's l2: 2.61624e+07	valid_1's rmse: 6837.54

[I 2024-03-28 06:41:21,995] Trial 0 finished with value: 6485.449358486862 and parameters: {'n_estimators': 2433, 'max_depth': 20, 'num_leaves': 358, 'min_child_samples': 323, 'feature_fraction': 0.5759342713633693, 'bagging_fraction': 0.8965251740522262, 'lambda_l1': 0.05785980606010088, 'lambda_l2': 1.0294531525585964e-05}. Best is trial 0 with value: 6485.449358486862.


Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 7341.18	training's l2: 5.38929e+07	valid_1's rmse: 8291.87	valid_1's l2: 6.8755e+07
[200]	training's rmse: 6284.54	training's l2: 3.94954e+07	valid_1's rmse: 7512.67	valid_1's l2: 5.64402e+07
[300]	training's rmse: 5796.53	training's l2: 3.35998e+07	valid_1's rmse: 7219.22	valid_1's l2: 5.21171e+07
[400]	training's rmse: 5470.05	training's l2: 2.99214e+07	valid_1's rmse: 7036.9	valid_1's l2: 4.95179e+07
[500]	training's rmse: 5197.57	training's l2: 2.70147e+07	valid_1's rmse: 6910.93	valid_1's l2: 4.77609e+07
[600]	training's rmse: 4994.16	training's l2: 2.49416e+07	valid_1's rmse: 6822.79	valid_1's l2: 4.65505e+07
[700]	training's rmse: 4781.42	training's l2: 2.2862e+07	valid_1's rmse: 6736.05	valid_1's l2: 4.53743e+07
[800]	training's rmse: 4621.99	training's l2: 2.13628e+07	valid_1's rmse: 6687.37	valid_1's l2: 4.47209e+07
[900]	training's rmse: 4470.1	training's l2: 1.99818e+07	valid_1's rmse: 6632.

[I 2024-03-28 06:43:13,717] Trial 1 finished with value: 6364.59395661742 and parameters: {'n_estimators': 4356, 'max_depth': 13, 'num_leaves': 755, 'min_child_samples': 122, 'feature_fraction': 0.8954270282670834, 'bagging_fraction': 0.8697700254324433, 'lambda_l1': 0.0008132011408533237, 'lambda_l2': 2.8166205283122015e-05}. Best is trial 1 with value: 6364.59395661742.


Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 7784.06	training's l2: 6.05916e+07	valid_1's rmse: 8597.06	valid_1's l2: 7.39094e+07
[200]	training's rmse: 6593.41	training's l2: 4.34731e+07	valid_1's rmse: 7681.2	valid_1's l2: 5.90008e+07
[300]	training's rmse: 6026	training's l2: 3.63127e+07	valid_1's rmse: 7339.24	valid_1's l2: 5.38645e+07
[400]	training's rmse: 5639.71	training's l2: 3.18063e+07	valid_1's rmse: 7148.56	valid_1's l2: 5.11019e+07
[500]	training's rmse: 5346.22	training's l2: 2.85821e+07	valid_1's rmse: 7029.74	valid_1's l2: 4.94172e+07
[600]	training's rmse: 5101.1	training's l2: 2.60213e+07	valid_1's rmse: 6934.19	valid_1's l2: 4.8083e+07
[700]	training's rmse: 4895.64	training's l2: 2.39673e+07	valid_1's rmse: 6868.06	valid_1's l2: 4.71703e+07
[800]	training's rmse: 4709.41	training's l2: 2.21785e+07	valid_1's rmse: 6810.63	valid_1's l2: 4.63846e+07
[900]	training's rmse: 4557.11	training's l2: 2.07673e+07	valid_1's rmse: 6766.93

[I 2024-03-28 06:45:16,363] Trial 2 finished with value: 6553.692509352966 and parameters: {'n_estimators': 2562, 'max_depth': 29, 'num_leaves': 670, 'min_child_samples': 310, 'feature_fraction': 0.7742859654134888, 'bagging_fraction': 0.6557540009505841, 'lambda_l1': 0.00014995691079037161, 'lambda_l2': 1.4001819993029724e-08}. Best is trial 1 with value: 6364.59395661742.


Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 7825.17	training's l2: 6.12333e+07	valid_1's rmse: 8603.64	valid_1's l2: 7.40226e+07
[200]	training's rmse: 6646.27	training's l2: 4.41729e+07	valid_1's rmse: 7688.34	valid_1's l2: 5.91105e+07
[300]	training's rmse: 6088.3	training's l2: 3.70673e+07	valid_1's rmse: 7334.15	valid_1's l2: 5.37897e+07
[400]	training's rmse: 5710.36	training's l2: 3.26082e+07	valid_1's rmse: 7137.24	valid_1's l2: 5.09402e+07
[500]	training's rmse: 5411.46	training's l2: 2.92839e+07	valid_1's rmse: 6997.73	valid_1's l2: 4.89682e+07
[600]	training's rmse: 5193.36	training's l2: 2.6971e+07	valid_1's rmse: 6916.59	valid_1's l2: 4.78393e+07
[700]	training's rmse: 5002.21	training's l2: 2.50221e+07	valid_1's rmse: 6847.26	valid_1's l2: 4.6885e+07
[800]	training's rmse: 4830.12	training's l2: 2.33301e+07	valid_1's rmse: 6787.55	valid_1's l2: 4.60709e+07
[900]	training's rmse: 4676.76	training's l2: 2.18721e+07	valid_1's rmse: 6747

[I 2024-03-28 06:47:56,560] Trial 3 finished with value: 6510.777502230608 and parameters: {'n_estimators': 3435, 'max_depth': 27, 'num_leaves': 667, 'min_child_samples': 313, 'feature_fraction': 0.8268430365901533, 'bagging_fraction': 0.9194302441841035, 'lambda_l1': 0.1281863344746334, 'lambda_l2': 0.00933909430747634}. Best is trial 1 with value: 6364.59395661742.


Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 8519.74	training's l2: 7.2586e+07	valid_1's rmse: 9172.11	valid_1's l2: 8.41276e+07
[200]	training's rmse: 7328.38	training's l2: 5.37052e+07	valid_1's rmse: 8176.03	valid_1's l2: 6.68475e+07
[300]	training's rmse: 6778.44	training's l2: 4.59472e+07	valid_1's rmse: 7777.03	valid_1's l2: 6.04821e+07
[400]	training's rmse: 6406.73	training's l2: 4.10462e+07	valid_1's rmse: 7538.44	valid_1's l2: 5.68281e+07
[500]	training's rmse: 6130.27	training's l2: 3.75802e+07	valid_1's rmse: 7381	valid_1's l2: 5.44791e+07
[600]	training's rmse: 5907.49	training's l2: 3.48984e+07	valid_1's rmse: 7265.52	valid_1's l2: 5.27878e+07
[700]	training's rmse: 5730.45	training's l2: 3.2838e+07	valid_1's rmse: 7182.08	valid_1's l2: 5.15823e+07
[800]	training's rmse: 5564.33	training's l2: 3.09618e+07	valid_1's rmse: 7105	valid_1's l2: 5.0481e+07
[900]	training's rmse: 5416.97	training's l2: 2.93436e+07	valid_1's rmse: 7042.03	va

[I 2024-03-28 06:50:10,612] Trial 4 finished with value: 6636.410582997472 and parameters: {'n_estimators': 3007, 'max_depth': 30, 'num_leaves': 491, 'min_child_samples': 486, 'feature_fraction': 0.9469740532746139, 'bagging_fraction': 0.56747212391, 'lambda_l1': 1.587831839352549e-08, 'lambda_l2': 0.21814311158632324}. Best is trial 1 with value: 6364.59395661742.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051143 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5838
[LightGBM] [Info] Number of data points in the train set: 895057, number of used features: 51
[LightGBM] [Info] Start training from score 58000.483999
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 7341.18	training's l2: 5.38929e+07	valid_1's rmse: 8291.87	valid_1's l2: 6.8755e+07
[200]	training's rmse: 6284.54	training's l2: 3.94954e+07	valid_1's rmse: 7512.67	valid_1's l2: 5.64402e+07
[300]	training's rmse: 5796.53	training's l2: 3.35998e+07	valid_1's rmse: 7219.22	valid_1's l2: 5.21171e+07
[400]	training's rmse: 5470.05	training's l2: 2.99214e+07	valid_1's rmse: 7036.9	valid_1's l2: 4.95179e+07
[500]	training's rmse: 5197.57	training's l2: 2.70147e+07	valid_1's rmse: 6910.93	valid_1's l2: 4.

In [153]:
best_model.best_score_

defaultdict(collections.OrderedDict,
            {'training': OrderedDict([('rmse', 2579.562193534013),
                          ('l2', 6654141.110310011)]),
             'valid_1': OrderedDict([('rmse', 6364.593956615157),
                          ('l2', 40508056.23258218)])})

In [154]:
X_test = df_test.drop(['target', '계약년월'], axis=1)
real_test_pred = best_model.predict(X_test)
preds_df = pd.DataFrame(real_test_pred.astype(int), columns=["target"])
preds_df.to_csv('holdout_optuna_output.csv', index=False)



In [155]:
preds_df.median()

target    92986.5
dtype: float64

### holdout

In [156]:
def holdout_lgb(X, y, test=pd.DataFrame, test_size=0.2, random_state=2023 ,gbm = lgb.LGBMRegressor(n_estimators=1000), early_stopping=10, log_evaluation=10):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=random_state)
    gbm.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            eval_metric='rmse',
            callbacks=[lgb.early_stopping(stopping_rounds=early_stopping),
                   lgb.log_evaluation(period=log_evaluation, show_stdv=True)])
    
    if not test.empty:
        real_test_pred = gbm.predict(test)
        preds_df = pd.DataFrame(real_test_pred.astype(int), columns=["target"])
        preds_df.to_csv('holdout_output.csv', index=False)

        # return preds_df

    return gbm
        
    

In [157]:
y_train = df_train['target']
X_train = df_train.drop(['target', '계약년월'], axis=1)
X_test = df_test.drop(['target', '계약년월'], axis=1)
gbm = lgb.LGBMRegressor(n_estimators=2000, max_depth=20, num_leaves=100,
                            min_child_samples=60, feature_fraction=0.8,
                            bagging_fraction=0.8)
gbm = holdout_lgb(X_train,y_train, test=X_test, gbm=gbm)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.117658 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5838
[LightGBM] [Info] Number of data points in the train set: 895057, number of used features: 51
[LightGBM] [Info] Start training from score 58000.483999
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 23703	training's l2: 5.61832e+08	valid_1's rmse: 24032.5	valid_1's l2: 5.7756e+08
[20]	training's rmse: 16515.1	training's l2: 2.72748e+08	valid_1's rmse: 16861	valid_1's l2: 2.84292e+08
[30]	training's rmse: 13832.4	training's l2: 1.91334e+08	valid_1's rmse: 14211.7	valid_1's l2: 2.01972e+08
[40]	training's rmse: 12443.4	training's l2: 1.54839e+08	valid_1's rmse: 12831.4	valid_1's l2: 1.64645e+08
[50]	training's rmse: 11575.8	training's l2: 1.34e+08	valid_1's rmse: 11977.6	valid_1's l2: 1.43464e+08
[60]	training's rmse: 10873.5	training's l2: 1.18232e+08	valid_1'

In [158]:
gbm.best_score_['valid_1']['rmse']

6368.889805103728

## 데이터 분리

In [159]:
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GroupKFold, TimeSeriesSplit

### K-Fold

In [160]:
def k_fold_lgb(X, y, test=pd.DataFrame, n_splits=5, gbm=lgb.LGBMRegressor(n_estimators=1000), early_stopping=10, log_evaluation=10):
    # Kfold 함수를 선언합니다.
    kf = KFold(n_splits=n_splits)
    # 학습 데이터를 Kfold로 나눕니다.
    train_folds = kf.split(X, y)

    fold_save_files = []

    for fold_idx, (train_idx, valid_idx) in enumerate(train_folds):
        print(f"--------{fold_idx}번째 fold의 학습을 시작합니다.--------")

        # index를 통해 fold의 학습세트를 가져옵니다.
        X_train_fold = X.iloc[train_idx, :]
        Y_train_fold = y[train_idx]

        # index를 통해 fold의 평가세트를 가져옵니다.
        X_valid_fold = X.iloc[valid_idx, :]
        Y_valid_fold = y[valid_idx]

        # fold의 데이터로 학습을 진행합니다.
        gbm.fit(X_train_fold, Y_train_fold,                                               # 학습 데이터를 입력합니다.
            eval_set=[(X_train_fold, Y_train_fold), (X_valid_fold, Y_valid_fold)], # 평가셋을 지정합니다.
            eval_metric ='rmse',                                                               # 평가과정에서 사용할 평가함수를 지정합니다.
            callbacks=[lgb.early_stopping(stopping_rounds=early_stopping),                                  # 10번의 성능향상이 없을 경우, 학습을 멈춥니다.
                    lgb.log_evaluation(period=log_evaluation, show_stdv=True)]                           # 매 iteration마다 학습결과를 출력합니다.
        )

        # 각 fold별 학습한 모델을 저장합니다.
        file_name = f"kfold{fold_idx}_gbm.pkl"
        joblib.dump(gbm, file_name)
        print(f"--------{fold_idx}번째 fold는 {file_name}에 저장되었습니다.--------\n\n")
        fold_save_files.append(file_name)

    # 저장한 학습모델들을 불러와, Testset에 대한 추론을 진행합니다.
    # 각 fold의 예측결과를 평균을 취하는 방식으로 진행합니다.
    if not test.empty:
        total_predicts = np.zeros(len(X_test))

        for file_name in fold_save_files:
            gbm_trained = joblib.load(file_name)
            fold_predicts = gbm_trained.predict(X_test)

            total_predicts += fold_predicts / len(fold_save_files)
        
        # 앞서 예측한 예측값들을 저장합니다.
        preds_df = pd.DataFrame(total_predicts.astype(int), columns=["target"])
        preds_df.to_csv('k-fold_output.csv', index=False)

        return preds_df

In [161]:
k_fold_lgb(X_train, y_train, X_test)

--------0번째 fold의 학습을 시작합니다.--------
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051924 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5815
[LightGBM] [Info] Number of data points in the train set: 895057, number of used features: 51
[LightGBM] [Info] Start training from score 57132.425269
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 25736.7	training's l2: 6.62376e+08	valid_1's rmse: 26119	valid_1's l2: 6.82204e+08
[20]	training's rmse: 19137.5	training's l2: 3.66245e+08	valid_1's rmse: 22297.8	valid_1's l2: 4.97192e+08
[30]	training's rmse: 16350.4	training's l2: 2.67337e+08	valid_1's rmse: 21327.7	valid_1's l2: 4.54872e+08
[40]	training's rmse: 14931.9	training's l2: 2.22962e+08	valid_1's rmse: 20775.8	valid_1's l2: 4.31632e+08
[50]	training's rmse: 14002.5	training's l2: 1.9607e+08	valid_1's

Unnamed: 0,target
0,221036
1,320124
2,322847
3,281652
4,230702
...,...
9267,83854
9268,84728
9269,92842
9270,85030


### Best 3 K-Fold

In [162]:
top_3_files = ["kfold2_gbm.pkl", "kfold3_gbm.pkl", "kfold4_gbm.pkl"]
total_predicts = np.zeros(len(X_test))

for file_name in top_3_files:
    gbm_trained = joblib.load(file_name)
    fold_predicts = gbm_trained.predict(X_test)

    total_predicts += fold_predicts / len(top_3_files)

# 앞서 예측한 예측값들을 저장합니다.
preds_df = pd.DataFrame(total_predicts.astype(int), columns=["target"])
preds_df.to_csv('k-fold_output.csv', index=False)


### Time series Split

In [163]:
def time_series_lgb(X, y, test=pd.DataFrame, n_splits=5, gbm=lgb.LGBMRegressor(n_estimators=1000), early_stopping=10, log_evaluation=10):
    # TimeSeriesSplit 함수를 선언합니다.
    kf = TimeSeriesSplit(n_splits=n_splits)
    train_folds = kf.split(X, y)

    fold_save_files = []

    for fold_idx, (train_idx, valid_idx) in enumerate(train_folds):
        display(f"--------{fold_idx}번째 fold의 학습을 시작합니다.--------")

        # index를 통해 fold의 학습세트를 가져옵니다.
        X_train_fold = X.iloc[train_idx, :]
        Y_train_fold = y[train_idx]

        # index를 통해 fold의 평가세트를 가져옵니다.
        X_valid_fold = X.iloc[valid_idx, :]
        Y_valid_fold = y[valid_idx]

        # fold의 데이터로 학습을 진행합니다.
        gbm = lgb.LGBMRegressor(n_estimators=1000)
        gbm.fit(X_train_fold, Y_train_fold,                                               # 학습 데이터를 입력합니다.
            eval_set=[(X_train_fold, Y_train_fold), (X_valid_fold, Y_valid_fold)], # 평가셋을 지정합니다.
            eval_metric ='rmse',                                                               # 평가과정에서 사용할 평가함수를 지정합니다.
            callbacks=[lgb.early_stopping(stopping_rounds=10),                                  # 10번의 성능향상이 없을 경우, 학습을 멈춥니다.
                    lgb.log_evaluation(period=10, show_stdv=True)]                           # 매 iteration마다 학습결과를 출력합니다.
        )

        # 각 fold별 학습한 모델을 저장합니다.
        file_name = f"timeseries_fold{fold_idx}_gbm.pkl"
        joblib.dump(gbm, file_name)
        display(f"--------{fold_idx}번째 fold는 {file_name}에 저장되었습니다.--------\n\n")
        fold_save_files.append(file_name)

    # 저장한 학습모델들을 불러와, Testset에 대한 추론을 진행합니다.
    # 각 fold의 예측결과를 평균을 취하는 방식으로 진행합니다.
    if not test.empty:
        total_predicts = np.zeros(len(X_test))

        for file_name in fold_save_files:
            gbm_trained = joblib.load(file_name)
            fold_predicts = gbm_trained.predict(X_test)

            total_predicts += fold_predicts / len(fold_save_files)
        
        # 앞서 예측한 예측값들을 저장합니다.
        preds_df = pd.DataFrame(total_predicts.astype(int), columns=["target"])
        preds_df.to_csv('time-series_output.csv', index=False)

        return preds_df

In [164]:
X_train = df_train.drop(['target'], axis=1)
Y_train = df_train['target']

X_train = X_train.sort_values(by='계약년월') # 시간순으로 정렬합니다.
Y_train = Y_train.reindex(X_train.index) # 정렬된 X_train의 인덱스에 맞추어 Y_train도 정렬해줍니다.

X_train = X_train.reset_index(drop=True) # 인덱스를 재정렬 해줍니다.
Y_train = Y_train.reset_index(drop=True)

del X_train['계약년월'] # 시간에 대한 정보를 지웁니다.

X_test = df_test.drop(['target', '계약년월'], axis=1)

time_series_lgb(X_train, Y_train, X_test)

'--------0번째 fold의 학습을 시작합니다.--------'

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011591 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5612
[LightGBM] [Info] Number of data points in the train set: 186472, number of used features: 51
[LightGBM] [Info] Start training from score 41193.008543
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 16038.2	training's l2: 2.57222e+08	valid_1's rmse: 16887.1	valid_1's l2: 2.85174e+08
[20]	training's rmse: 11916.3	training's l2: 1.41999e+08	valid_1's rmse: 12934.1	valid_1's l2: 1.67291e+08
[30]	training's rmse: 10398.2	training's l2: 1.08122e+08	valid_1's rmse: 11861.9	valid_1's l2: 1.40705e+08
[40]	training's rmse: 9598.42	training's l2: 9.21296e+07	valid_1's rmse: 11494.3	valid_1's l2: 1.32118e+08
[50]	training's rmse: 9068.77	training's l2: 8.22425e+07	valid_1's rmse: 11284.1	valid_1's l2: 1.273

'--------0번째 fold는 timeseries_fold0_gbm.pkl에 저장되었습니다.--------\n\n'

'--------1번째 fold의 학습을 시작합니다.--------'

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014357 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5672
[LightGBM] [Info] Number of data points in the train set: 372942, number of used features: 51
[LightGBM] [Info] Start training from score 43548.686345
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 16444.7	training's l2: 2.70427e+08	valid_1's rmse: 15577.3	valid_1's l2: 2.42652e+08
[20]	training's rmse: 12229.2	training's l2: 1.49554e+08	valid_1's rmse: 11783.2	valid_1's l2: 1.38844e+08
[30]	training's rmse: 10696.6	training's l2: 1.14416e+08	valid_1's rmse: 10723.2	valid_1's l2: 1.14987e+08
[40]	training's rmse: 9888.36	training's l2: 9.77796e+07	valid_1's rmse: 10153.7	valid_1's l2: 1.03098e+08
[50]	training's rmse: 9376.89	training's l2: 8.7926e+07	valid_1's rmse: 9803.24	valid_1's l2: 9.6103

'--------1번째 fold는 timeseries_fold1_gbm.pkl에 저장되었습니다.--------\n\n'

'--------2번째 fold의 학습을 시작합니다.--------'

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035599 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5724
[LightGBM] [Info] Number of data points in the train set: 559412, number of used features: 51
[LightGBM] [Info] Start training from score 44352.484621
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 16205.1	training's l2: 2.62605e+08	valid_1's rmse: 21648	valid_1's l2: 4.68638e+08
[20]	training's rmse: 11983.1	training's l2: 1.43594e+08	valid_1's rmse: 16922.5	valid_1's l2: 2.86371e+08
[30]	training's rmse: 10427.5	training's l2: 1.08733e+08	valid_1's rmse: 15052.6	valid_1's l2: 2.26582e+08
[40]	training's rmse: 9632.89	training's l2: 9.27925e+07	valid_1's rmse: 13910.6	valid_1's l2: 1.93505e+08
[50]	training's rmse: 9166.99	training's l2: 8.40337e+07	valid_1's rmse: 13342	valid_1's l2: 1.7801e+0

'--------2번째 fold는 timeseries_fold2_gbm.pkl에 저장되었습니다.--------\n\n'

'--------3번째 fold의 학습을 시작합니다.--------'

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.044835 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5742
[LightGBM] [Info] Number of data points in the train set: 745882, number of used features: 51
[LightGBM] [Info] Start training from score 46766.465732
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 17261.3	training's l2: 2.97951e+08	valid_1's rmse: 38963.1	valid_1's l2: 1.51812e+09
[20]	training's rmse: 12726.4	training's l2: 1.6196e+08	valid_1's rmse: 33142	valid_1's l2: 1.09839e+09
[30]	training's rmse: 10974.7	training's l2: 1.20444e+08	valid_1's rmse: 30683.7	valid_1's l2: 9.41492e+08
[40]	training's rmse: 10072	training's l2: 1.01445e+08	valid_1's rmse: 28811.2	valid_1's l2: 8.30083e+08
[50]	training's rmse: 9490.87	training's l2: 9.00766e+07	valid_1's rmse: 27756.2	valid_1's l2: 7.70406e+0

'--------3번째 fold는 timeseries_fold3_gbm.pkl에 저장되었습니다.--------\n\n'

'--------4번째 fold의 학습을 시작합니다.--------'

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.057650 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5776
[LightGBM] [Info] Number of data points in the train set: 932352, number of used features: 51
[LightGBM] [Info] Start training from score 51316.276133
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 20674.6	training's l2: 4.27439e+08	valid_1's rmse: 59279	valid_1's l2: 3.514e+09
[20]	training's rmse: 15350.7	training's l2: 2.35645e+08	valid_1's rmse: 52275.6	valid_1's l2: 2.73273e+09
[30]	training's rmse: 13179.8	training's l2: 1.73708e+08	valid_1's rmse: 48700.6	valid_1's l2: 2.37175e+09
[40]	training's rmse: 12019.8	training's l2: 1.44475e+08	valid_1's rmse: 46626.4	valid_1's l2: 2.17402e+09
[50]	training's rmse: 11240.8	training's l2: 1.26355e+08	valid_1's rmse: 45692.4	valid_1's l2: 2.0878e+0

'--------4번째 fold는 timeseries_fold4_gbm.pkl에 저장되었습니다.--------\n\n'

Unnamed: 0,target
0,93920
1,122350
2,157235
3,133352
4,100429
...,...
9267,44139
9268,44222
9269,53594
9270,45427
