### Import library

In [68]:
# visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
fe = fm.FontEntry(
    fname=r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf', # ttf 파일이 저장되어 있는 경로
    name='NanumBarunGothic')                        # 이 폰트의 원하는 이름 설정
fm.fontManager.ttflist.insert(0, fe)              # Matplotlib에 폰트 추가
plt.rcParams.update({'font.size': 10, 'font.family': 'NanumBarunGothic'}) # 폰트 설정
plt.rc('font', family='NanumBarunGothic')
import seaborn as sns

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')
import gdown
import joblib

# Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import lightgbm as lgb

import eli5
from eli5.sklearn import PermutationImportance

# 코드 셀 실행 후 경고를 무시
# import warnings
# warnings.filterwarnings(action='ignore')

### Data Load

In [69]:
train_path = '/data/ephemeral/home/train.csv'
test_path  = '/data/ephemeral/home/test.csv'
dt = pd.read_csv(train_path)
dt_test = pd.read_csv(test_path)
va1 = pd.read_csv('/data/ephemeral/home/upstage-ml-regression-3/eonseon/2023년_공동주택_공시가격_정보.csv',encoding='cp949')
df_bus = pd.read_csv("/data/ephemeral/home/bus_feature.csv")
df_metro = pd.read_csv("/data/ephemeral/home/subway_feature.csv")
coords = pd.read_csv('/data/ephemeral/home/upstage-ml-regression-3/eonseon/coords.csv')

### Data preprocessing

In [70]:
# train/test 구분을 위한 칼럼을 하나 만들어 줍니다.
dt['is_test'] = 0
dt_test['is_test'] = 1
df = pd.concat([dt, dt_test])     # 하나의 데이터로 만들어줍니다.

In [71]:
# 필요 없어 보이는 columns 제거
drop_col = ['부번', '계약일', 'k-전화번호', 'k-팩스번호', 'k-관리방식', 'k-복도유형', 'k-시행사', 'k-사용검사일-사용승인일', 'k-홈페이지', 'k-등록일자', 'k-수정일자', '고용보험관리번호', '경비비관리형태', '세대전기계약방법', '청소비관리형태', '기타/의무/임대/임의=1/2/3/4', '단지승인일', '사용허가여부', '관리비 업로드', '단지신청일', 'k-관리비부과면적', '주차대수', '건축면적', '해제사유발생일', '단지소개기존clob', 'k-135㎡초과', '중개사소재지', '등기신청일자', '거래유형']
df.drop(drop_col, axis=1, inplace=True)

In [72]:
# 시군구 feaeture 처리
# 구와 동에 대한 Feature 수정
df['구'] = df['시군구'].apply(lambda x:x.split()[1])
df['동'] = df['시군구'].apply(lambda x:x.split()[2])
omg = ['용산구', '강남구', '서초구', '송파구', '성동구', '종로구']
is_omg = []
for x in df['구'].tolist():
    if x in omg:
        is_omg.append(1)
    else:
        is_omg.append(0)
df['개비싸'] = is_omg
df.loc[~df['구'].isin(omg), '동'] = 'Unknown' 

del df['시군구']

In [73]:
# 본번, 부번의 경우 float로 되어있지만 범주형 변수의 의미를 가지므로 object(string) 형태로 바꾸어주고 아래 작업을 진행하겠습니다.
df['본번'] = df['본번'].astype('str')

In [74]:
df['계약년'] = df['계약년월'].astype('str').map(lambda x : x[:4])
df['계약월'] = df['계약년월'].astype('str').map(lambda x : x[4:])

In [75]:
# 'time_col' 데이터를 문자열 형태로 변환
df['계약년월'] = df['계약년월'].astype(str)
# 문자열 형태 데이터를 datetime 형태로 변환
df['계약년월'] = pd.to_datetime(df['계약년월'], format='%Y%m')
# 변환 확인
print(df['계약년월'].head())

0   2017-12-01
1   2017-12-01
2   2017-12-01
3   2018-01-01
4   2018-01-01
Name: 계약년월, dtype: datetime64[ns]


In [76]:
# 먼저, 연속형 변수와 범주형 변수를 위 info에 따라 분리해주겠습니다.
continuous_columns = []
categorical_columns = []

for column in df.columns:
    if pd.api.types.is_numeric_dtype(df[column]):
        continuous_columns.append(column)
    else:
        categorical_columns.append(column)

print("연속형 변수:", continuous_columns)
print("범주형 변수:", categorical_columns)

# 수치형 데이터를 어떻게 채워야 될지 모르겠음 -> 걍 빼.
# 좌표X, 좌표Y 를 리니어로 채우는건 혼동을 줄 수 있는 데이터임
df.drop(columns=['k-전체동수', 'k-전체세대수', 'k-연면적', 'k-주거전용면적', 'k-전용면적별세대현황(60㎡이하)', 'k-전용면적별세대현황(60㎡~85㎡이하)', 'k-85㎡~135㎡이하',], inplace=True)

# 범주형 변수에 대한 보간
df[categorical_columns] = df[categorical_columns].fillna('NULL')

연속형 변수: ['전용면적(㎡)', '층', '건축년도', 'k-전체동수', 'k-전체세대수', 'k-연면적', 'k-주거전용면적', 'k-전용면적별세대현황(60㎡이하)', 'k-전용면적별세대현황(60㎡~85㎡이하)', 'k-85㎡~135㎡이하', '좌표X', '좌표Y', 'target', 'is_test', '개비싸']
범주형 변수: ['번지', '본번', '아파트명', '계약년월', '도로명', 'k-단지분류(아파트,주상복합등등)', 'k-세대타입(분양형태)', 'k-난방방식', 'k-건설사(시공사)', '구', '동', '계약년', '계약월']


In [77]:
va1['도로명주소']= va1['도로명주소'].apply(lambda address: address.split()[2]+' '+address.split()[-1] )
va1.rename(columns={'도로명주소': '도로명',
                     '단지명':'아파트명'}, inplace=True)

In [78]:
va1 = va1[va1['시도']=='서울특별시']
va1 = va1.drop(columns={'기준월','법정동코드','읍면','특수지코드','특수지명','단지코드','동코드','호코드',})
va1['공시가격']=va1['공시가격']/10000

In [79]:
avg_price_by_road = va1.groupby('도로명')['공시가격'].mean().reset_index()
avg_price_by_road.columns = ['도로명', '평균공시가격']  # 열 이름 변경

df = pd.merge(df, avg_price_by_road, on='도로명', how='left')
df


Unnamed: 0,번지,본번,아파트명,전용면적(㎡),계약년월,층,건축년도,도로명,"k-단지분류(아파트,주상복합등등)",k-세대타입(분양형태),...,좌표X,좌표Y,target,is_test,구,동,개비싸,계약년,계약월,평균공시가격
0,658-1,658.0,개포6차우성,79.97,2017-12-01,3,1987,언주로 3,아파트,분양,...,127.05721,37.476763,124000.0,0,강남구,개포동,1,2017,12,120960.000000
1,658-1,658.0,개포6차우성,79.97,2017-12-01,4,1987,언주로 3,아파트,분양,...,127.05721,37.476763,123500.0,0,강남구,개포동,1,2017,12,120960.000000
2,658-1,658.0,개포6차우성,54.98,2017-12-01,5,1987,언주로 3,아파트,분양,...,127.05721,37.476763,91500.0,0,강남구,개포동,1,2017,12,120960.000000
3,658-1,658.0,개포6차우성,79.97,2018-01-01,4,1987,언주로 3,아파트,분양,...,127.05721,37.476763,130000.0,0,강남구,개포동,1,2018,01,120960.000000
4,658-1,658.0,개포6차우성,79.97,2018-01-01,2,1987,언주로 3,아파트,분양,...,127.05721,37.476763,117000.0,0,강남구,개포동,1,2018,01,120960.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1128089,816,816.0,신내우디안1단지,84.65,2023-07-01,13,2014,신내역로1길 85,아파트,기타,...,127.10672,37.618870,,1,중랑구,Unknown,0,2023,07,39021.326676
1128090,816,816.0,신내우디안1단지,84.62,2023-07-01,12,2014,신내역로1길 85,아파트,기타,...,127.10672,37.618870,,1,중랑구,Unknown,0,2023,07,39021.326676
1128091,816,816.0,신내우디안1단지,101.65,2023-08-01,12,2014,신내역로1길 85,아파트,기타,...,127.10672,37.618870,,1,중랑구,Unknown,0,2023,08,39021.326676
1128092,816,816.0,신내우디안1단지,84.94,2023-09-01,18,2014,신내역로1길 85,아파트,기타,...,127.10672,37.618870,,1,중랑구,Unknown,0,2023,09,39021.326676


In [80]:
grouped_prices = va1.groupby('도로명')['공시가격'].apply(list).reset_index()
grouped_prices.columns = ['도로명', '공시가격리스트']  # 열 이름 변경

# 기존 데이터프레임에 공시가격리스트 열 추가
df = pd.merge(df, grouped_prices, on='도로명', how='left')
df

Unnamed: 0,번지,본번,아파트명,전용면적(㎡),계약년월,층,건축년도,도로명,"k-단지분류(아파트,주상복합등등)",k-세대타입(분양형태),...,좌표Y,target,is_test,구,동,개비싸,계약년,계약월,평균공시가격,공시가격리스트
0,658-1,658.0,개포6차우성,79.97,2017-12-01,3,1987,언주로 3,아파트,분양,...,37.476763,124000.0,0,강남구,개포동,1,2017,12,120960.000000,"[125300.0, 125300.0, 125300.0, 125300.0, 10390..."
1,658-1,658.0,개포6차우성,79.97,2017-12-01,4,1987,언주로 3,아파트,분양,...,37.476763,123500.0,0,강남구,개포동,1,2017,12,120960.000000,"[125300.0, 125300.0, 125300.0, 125300.0, 10390..."
2,658-1,658.0,개포6차우성,54.98,2017-12-01,5,1987,언주로 3,아파트,분양,...,37.476763,91500.0,0,강남구,개포동,1,2017,12,120960.000000,"[125300.0, 125300.0, 125300.0, 125300.0, 10390..."
3,658-1,658.0,개포6차우성,79.97,2018-01-01,4,1987,언주로 3,아파트,분양,...,37.476763,130000.0,0,강남구,개포동,1,2018,01,120960.000000,"[125300.0, 125300.0, 125300.0, 125300.0, 10390..."
4,658-1,658.0,개포6차우성,79.97,2018-01-01,2,1987,언주로 3,아파트,분양,...,37.476763,117000.0,0,강남구,개포동,1,2018,01,120960.000000,"[125300.0, 125300.0, 125300.0, 125300.0, 10390..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1128089,816,816.0,신내우디안1단지,84.65,2023-07-01,13,2014,신내역로1길 85,아파트,기타,...,37.618870,,1,중랑구,Unknown,0,2023,07,39021.326676,"[42800.0, 41800.0, 44000.0, 43100.0, 44700.0, ..."
1128090,816,816.0,신내우디안1단지,84.62,2023-07-01,12,2014,신내역로1길 85,아파트,기타,...,37.618870,,1,중랑구,Unknown,0,2023,07,39021.326676,"[42800.0, 41800.0, 44000.0, 43100.0, 44700.0, ..."
1128091,816,816.0,신내우디안1단지,101.65,2023-08-01,12,2014,신내역로1길 85,아파트,기타,...,37.618870,,1,중랑구,Unknown,0,2023,08,39021.326676,"[42800.0, 41800.0, 44000.0, 43100.0, 44700.0, ..."
1128092,816,816.0,신내우디안1단지,84.94,2023-09-01,18,2014,신내역로1길 85,아파트,기타,...,37.618870,,1,중랑구,Unknown,0,2023,09,39021.326676,"[42800.0, 41800.0, 44000.0, 43100.0, 44700.0, ..."


In [81]:

# 좌표 CSV 파일에서 'street'을 인덱스로 설정
coords.set_index('street', inplace=True)

# dt 데이터프레임에서 null 값을 좌표 CSV 파일의 값으로 대체
for index, row in dt.iterrows():
    if pd.isnull(row['좌표X']) or pd.isnull(row['좌표Y']):
        street = row['도로명']
        if street in coords.index:
            dt.at[index, '좌표X'] = coords.loc[street, '좌표X']
            dt.at[index, '좌표Y'] = coords.loc[street, '좌표Y']

In [82]:
from math import radians, cos, sin, sqrt, atan2
from scipy.spatial import cKDTree
# Vincenty 공식을 사용하여 두 점 간의 거리를 계산하는 함수
def calculate_distance(lat1, lon1, lat2, lon2):
    # 지구의 반경 (단위: m)
    R = 6371e3
    
    # 위도 및 경도를 라디안으로 변환
    lat1_rad = radians(lat1)
    lon1_rad = radians(lon1)
    lat2_rad = radians(lat2)
    lon2_rad = radians(lon2)
    
    # 두 점 간의 차이를 계산
    delta_lon = lon2_rad - lon1_rad
    
    # Vincenty 공식 계산
    a = sin((lat2_rad - lat1_rad) / 2)**2 + cos(lat1_rad) * cos(lat2_rad) * sin(delta_lon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    distance = R * c
    
    return distance
def nearest_metro_station_distance(dt_row, kdtree, df_metro):
    dt_coords = (dt_row['좌표Y'], dt_row['좌표X'])
    if np.isnan(dt_coords[0]) or np.isnan(dt_coords[1]):  # 좌표값이 null인 경우
        return np.nan  # NaN으로 반환하여 해당 위치에 대한 거리를 없음으로 표시
    else:
        nearest_idx = kdtree.query(dt_coords)[1]
        nearest_coords = (df_metro.loc[nearest_idx, '위도'], df_metro.loc[nearest_idx, '경도'])
        distance = calculate_distance(dt_coords[1], dt_coords[0], nearest_coords[1], nearest_coords[0])
        return distance

# 지하철 역세권 데이터프레임에 역세권 여부를 판별하는 열 추가하는 함수
def add_metro_station_proximity_column(dt, df_metro, proximity_threshold):
    # 지하철 역 데이터에서 좌표를 추출하여 KD 트리 인덱스 생성
    metro_station_coords = df_metro[['위도', '경도']].dropna().values  # null 값을 제외하고 좌표 추출
    kdtree_metro_station = cKDTree(metro_station_coords)
    
    # dt 데이터프레임에 역세권 여부를 판별하는 열 추가
    dt['가장가까운_지하철역_거리'] = dt.apply(nearest_metro_station_distance, args=(kdtree_metro_station, df_metro,), axis=1)
    
    # 최근접 지하철 역과의 거리가 기준 거리 이내인지 판별하여 역세권 여부를 나타내는 열 추가
    dt['지하철역세권'] = dt['가장가까운_지하철역_거리'] <= proximity_threshold

# 지하철 역세권을 판별하기 위한 지하철 역과의 최대 거리 설정 (예: 115m)
proximity_threshold_metro_station = 500

# 역세권 여부를 나타내는 열 추가
add_metro_station_proximity_column(df, df_metro, proximity_threshold_metro_station)

In [83]:
from math import radians, cos, sin, sqrt, atan2
from scipy.spatial import cKDTree
# Vincenty 공식을 사용하여 두 점 간의 거리를 계산하는 함수
def calculate_distance(lat1, lon1, lat2, lon2):
    # 지구의 반경 (단위: m)
    R = 6371e3
    
    # 위도 및 경도를 라디안으로 변환
    lat1_rad = radians(lat1)
    lon1_rad = radians(lon1)
    lat2_rad = radians(lat2)
    lon2_rad = radians(lon2)
    
    # 두 점 간의 차이를 계산
    delta_lon = lon2_rad - lon1_rad
    
    # Vincenty 공식 계산
    a = sin((lat2_rad - lat1_rad) / 2)**2 + cos(lat1_rad) * cos(lat2_rad) * sin(delta_lon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    distance = R * c
    
    return distance

# 최근접 버스 정류장과의 거리 계산 함수
def nearest_bus_stop_distance(dt_row, kdtree, df_bus):
    dt_coords = (dt_row['좌표Y'], dt_row['좌표X'])
    if np.isnan(dt_coords[0]) or np.isnan(dt_coords[1]):  # 좌표값이 null인 경우
        return np.nan  # NaN으로 반환하여 해당 위치에 대한 거리를 없음으로 표시
    else:
        nearest_idx = kdtree.query(dt_coords)[1]
        nearest_coords = (df_bus.loc[nearest_idx, 'Y좌표'], df_bus.loc[nearest_idx, 'X좌표'])
        distance = calculate_distance(dt_coords[1], dt_coords[0], nearest_coords[1], nearest_coords[0])
        return distance

# 버스 정류장 세권 데이터프레임에 역세권 여부를 판별하는 열 추가하는 함수
def add_bus_stop_proximity_column(dt, df_bus, proximity_threshold):
    # 버스 정류장 데이터에서 좌표를 추출하여 KD 트리 인덱스 생성
    bus_stop_coords = df_bus[['Y좌표', 'X좌표']].dropna().values  # null 값을 제외하고 좌표 추출
    kdtree_bus_stop = cKDTree(bus_stop_coords)
    
    # dt 데이터프레임에 역세권 여부를 판별하는 열 추가
    dt['가장가까운_버스정류장_거리'] = dt.apply(nearest_bus_stop_distance, args=(kdtree_bus_stop, df_bus,), axis=1)
    
    # 최근접 버스 정류장과의 거리가 기준 거리 이내인지 판별하여 역세권 여부를 나타내는 열 추가
    dt['버스정류장세권'] = dt['가장가까운_버스정류장_거리'] <= proximity_threshold

# 버스 정류장 세권을 판별하기 위한 버스 정류장과의 최대 거리 설정 (예: 115m)
proximity_threshold_bus_stop = 100

# 역세권 여부를 나타내는 열 추가
add_bus_stop_proximity_column(df, df_bus, proximity_threshold_bus_stop)

In [84]:
df = df.drop(columns={'좌표X','좌표Y','가장가까운_지하철역_거리','가장가까운_버스정류장_거리'})
df['공시가격리스트']= df['공시가격리스트'].astype(str)

In [85]:
# '평균공시가격'이 null인 행을 구합니다.
null_rows = df[df['평균공시가격'].isnull()]

# '구'별 '평균공시가격'의 평균값을 구합니다.
avg_prices_by_district = df.groupby('구')['평균공시가격'].mean()

# '평균공시가격'이 null인 행의 '구' 값을 기준으로 평균값을 가져와서 채웁니다.
for index, row in null_rows.iterrows():
    district = row['구']
    avg_price = avg_prices_by_district[district]
    df.at[index, '평균공시가격'] = avg_price


In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1128094 entries, 0 to 1128093
Data columns (total 23 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   번지                  1128094 non-null  object        
 1   본번                  1128094 non-null  object        
 2   아파트명                1128094 non-null  object        
 3   전용면적(㎡)             1128094 non-null  float64       
 4   계약년월                1128094 non-null  datetime64[ns]
 5   층                   1128094 non-null  int64         
 6   건축년도                1128094 non-null  int64         
 7   도로명                 1128094 non-null  object        
 8   k-단지분류(아파트,주상복합등등)  1128094 non-null  object        
 9   k-세대타입(분양형태)        1128094 non-null  object        
 10  k-난방방식              1128094 non-null  object        
 11  k-건설사(시공사)          1128094 non-null  object        
 12  target              1118822 non-null  float64       
 13  is_test     

# Train

In [87]:
df_train = df.loc[df['is_test']==0, :]
df_test = df.loc[df['is_test']==1, :]

df_train.drop(['is_test'], axis=1, inplace=True)
df_test.drop(['is_test'], axis=1, inplace=True)
print(df_train.shape, df_test.shape)

(1118822, 22) (9272, 22)


In [88]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1118822 entries, 0 to 1118821
Data columns (total 22 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   번지                  1118822 non-null  object        
 1   본번                  1118822 non-null  object        
 2   아파트명                1118822 non-null  object        
 3   전용면적(㎡)             1118822 non-null  float64       
 4   계약년월                1118822 non-null  datetime64[ns]
 5   층                   1118822 non-null  int64         
 6   건축년도                1118822 non-null  int64         
 7   도로명                 1118822 non-null  object        
 8   k-단지분류(아파트,주상복합등등)  1118822 non-null  object        
 9   k-세대타입(분양형태)        1118822 non-null  object        
 10  k-난방방식              1118822 non-null  object        
 11  k-건설사(시공사)          1118822 non-null  object        
 12  target              1118822 non-null  float64       
 13  구           

In [89]:
# dt_test의 target은 일단 0으로 임의로 채워주도록 하겠습니다.
df_test['target'] = 0

In [90]:
# 변수 삭제 및 파생변수 제작으로 추가된 변수들이 존재하기에, 다시한번 연속형과 범주형 칼럼을 분리해주겠습니다.
continuous_columns_v2 = []
categorical_columns_v2 = []

for column in df_train.columns:
    if column == '계약년월':
        continue
    if pd.api.types.is_numeric_dtype(df_train[column]):
        continuous_columns_v2.append(column)
    else:
        categorical_columns_v2.append(column)

print("연속형 변수:", continuous_columns_v2)
print("범주형 변수:", categorical_columns_v2)

연속형 변수: ['전용면적(㎡)', '층', '건축년도', 'target', '개비싸', '평균공시가격', '지하철역세권', '버스정류장세권']
범주형 변수: ['번지', '본번', '아파트명', '도로명', 'k-단지분류(아파트,주상복합등등)', 'k-세대타입(분양형태)', 'k-난방방식', 'k-건설사(시공사)', '구', '동', '계약년', '계약월', '공시가격리스트']


In [91]:
# 아래에서 범주형 변수들을 대상으로 레이블인코딩을 진행해 주겠습니다.

# 각 변수에 대한 LabelEncoder를 저장할 딕셔너리
label_encoders = {}

# Implement Label Encoding
for col in tqdm( categorical_columns_v2 ):
    lbl = LabelEncoder()

    # Label-Encoding을 fit
    lbl.fit( df_train[col].astype(str) )
    df_train[col] = lbl.transform(df_train[col].astype(str))
    label_encoders[col] = lbl           # 나중에 후처리를 위해 레이블인코더를 저장해주겠습니다.

    # Test 데이터에만 존재하는 새로 출현한 데이터를 신규 클래스로 추가해줍니다.
    for label in np.unique(df_test[col]):
      if label not in lbl.classes_: # unseen label 데이터인 경우
        lbl.classes_ = np.append(lbl.classes_, label) # 미처리 시 ValueError발생하니 주의하세요!

    df_test[col] = lbl.transform(df_test[col].astype(str))

100%|██████████| 13/13 [00:08<00:00,  1.59it/s]


In [92]:
def preprocess_feature_name(feature_name):
  """특수 문자를 제거하고 소문자로 변환합니다."""
  feature_name = feature_name.replace("-", "_")
  feature_name = feature_name.replace(",", "_")
  feature_name = feature_name.replace(".", "_")
  feature_name = feature_name.replace("(", "_")
  feature_name = feature_name.replace(")", "_")
  feature_name = feature_name.lower()
  return feature_name

def apply_preprocessed_feature_names(df_train):
  """데이터 프레임의 feature 이름을 수정합니다."""
  df_train.columns = [preprocess_feature_name(feature) for feature in df_train.columns]
  return df_train

# 데이터 프레임에 적용
df_train = apply_preprocessed_feature_names(df_train.copy())
df_test = apply_preprocessed_feature_names(df_test.copy())

### Holdout Using optuna

In [93]:
y_train = df_train['target']
X_train = df_train.drop(['target', '계약년월'], axis=1)

# Split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=2023)

In [96]:
import optuna

def objective(trial):
    # Define the hyperparameter search space
    n_estimators = trial.suggest_int('n_estimators', 1000, 5000)
    max_depth = trial.suggest_int('max_depth', 10, 50)
    num_leaves = trial.suggest_int('num_leaves', 100, 1000)
    min_child_samples = trial.suggest_int('min_child_samples', 50, 500)
    feature_fraction = trial.suggest_float('feature_fraction', 0.5, 1.0)
    bagging_fraction = trial.suggest_float('bagging_fraction', 0.5, 1.0)
    lambda_l1 = trial.suggest_float('lambda_l1', 1e-8, 1.0, log=True)  # Increase L1 regularization
    lambda_l2 = trial.suggest_float('lambda_l2', 1e-8, 1.0, log=True)  # Increase L2 regularization

    # Create the LGBMRegressor model with the suggested hyperparameters
    gbm = lgb.LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth, num_leaves=num_leaves,
                            min_child_samples=min_child_samples, feature_fraction=feature_fraction,
                            bagging_fraction=bagging_fraction, lambda_l1=lambda_l1, lambda_l2=lambda_l2, verbosity=-1)

    # Train the model
    gbm.fit(
        X_train, y_train, 
        eval_set=[(X_train, y_train), (X_val, y_val)], 
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(stopping_rounds=50),
                   lgb.log_evaluation(period=100, show_stdv=True)]
        )
    
    # Compute the validation RMSE
    val_rmse = gbm.best_score_['valid_1']['rmse']
    return val_rmse

def optimize_hyperparameters(n_trials=100, early_stopping=50, log_evaluation=100):
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    # Get the best hyperparameters
    best_params = study.best_trial.params

    # Create the final model with the best hyperparameters
    best_model = lgb.LGBMRegressor(**best_params)

    # Train the final model with early stopping and evaluation logging
    best_model.fit(X_train, y_train,
                   eval_set=[(X_train, y_train), (X_val, y_val)],
                   eval_metric='rmse',
                   callbacks=[lgb.early_stopping(stopping_rounds=early_stopping),
                              lgb.log_evaluation(period=log_evaluation, show_stdv=True)])

    return best_model

In [97]:
best_model = optimize_hyperparameters(n_trials=5)

[I 2024-03-28 06:52:19,901] A new study created in memory with name: no-name-33aea7dd-7c19-4973-bcd3-148302dcb114


Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 6568.57	training's l2: 4.31461e+07	valid_1's rmse: 7019.71	valid_1's l2: 4.92764e+07
[200]	training's rmse: 5776.76	training's l2: 3.33709e+07	valid_1's rmse: 6359.98	valid_1's l2: 4.04493e+07
[300]	training's rmse: 5407.7	training's l2: 2.92432e+07	valid_1's rmse: 6113.91	valid_1's l2: 3.73799e+07
[400]	training's rmse: 5157.69	training's l2: 2.66017e+07	valid_1's rmse: 5974.83	valid_1's l2: 3.56986e+07
[500]	training's rmse: 4965.95	training's l2: 2.46606e+07	valid_1's rmse: 5882.54	valid_1's l2: 3.46043e+07
[600]	training's rmse: 4812.64	training's l2: 2.31615e+07	valid_1's rmse: 5812.9	valid_1's l2: 3.37898e+07
[700]	training's rmse: 4681.33	training's l2: 2.19148e+07	valid_1's rmse: 5761.49	valid_1's l2: 3.31948e+07
[800]	training's rmse: 4566.66	training's l2: 2.08544e+07	valid_1's rmse: 5719.21	valid_1's l2: 3.27093e+07
[900]	training's rmse: 4465.33	training's l2: 1.99392e+07	valid_1's rmse: 568

[I 2024-03-28 06:53:47,110] Trial 0 finished with value: 5532.3973767279285 and parameters: {'n_estimators': 2242, 'max_depth': 41, 'num_leaves': 607, 'min_child_samples': 290, 'feature_fraction': 0.9411590916872326, 'bagging_fraction': 0.8670462305906751, 'lambda_l1': 3.5514856830315364e-05, 'lambda_l2': 2.9015577495842256e-05}. Best is trial 0 with value: 5532.3973767279285.


Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 7009.32	training's l2: 4.91305e+07	valid_1's rmse: 7402.34	valid_1's l2: 5.47947e+07
[200]	training's rmse: 6147.48	training's l2: 3.77915e+07	valid_1's rmse: 6649.42	valid_1's l2: 4.42147e+07
[300]	training's rmse: 5750.86	training's l2: 3.30724e+07	valid_1's rmse: 6331.9	valid_1's l2: 4.0093e+07
[400]	training's rmse: 5490.93	training's l2: 3.01504e+07	valid_1's rmse: 6151.6	valid_1's l2: 3.78422e+07
[500]	training's rmse: 5295.13	training's l2: 2.80384e+07	valid_1's rmse: 6030.05	valid_1's l2: 3.63615e+07
[600]	training's rmse: 5143.09	training's l2: 2.64514e+07	valid_1's rmse: 5941.7	valid_1's l2: 3.53038e+07
[700]	training's rmse: 5014.37	training's l2: 2.51439e+07	valid_1's rmse: 5872.16	valid_1's l2: 3.44823e+07
[800]	training's rmse: 4907.94	training's l2: 2.40879e+07	valid_1's rmse: 5821.04	valid_1's l2: 3.38845e+07
[900]	training's rmse: 4807.25	training's l2: 2.31097e+07	valid_1's rmse: 5775.

[I 2024-03-28 06:54:43,075] Trial 1 finished with value: 5534.3089809300955 and parameters: {'n_estimators': 2669, 'max_depth': 30, 'num_leaves': 265, 'min_child_samples': 276, 'feature_fraction': 0.7414567311622655, 'bagging_fraction': 0.6264071013362288, 'lambda_l1': 0.00012583640793611741, 'lambda_l2': 2.3031097991089302e-06}. Best is trial 0 with value: 5532.3973767279285.


Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 6616.12	training's l2: 4.3773e+07	valid_1's rmse: 7028.23	valid_1's l2: 4.9396e+07
[200]	training's rmse: 5821.74	training's l2: 3.38927e+07	valid_1's rmse: 6350.54	valid_1's l2: 4.03293e+07
[300]	training's rmse: 5441.38	training's l2: 2.96087e+07	valid_1's rmse: 6087.45	valid_1's l2: 3.7057e+07
[400]	training's rmse: 5191.96	training's l2: 2.69565e+07	valid_1's rmse: 5949.69	valid_1's l2: 3.53988e+07
[500]	training's rmse: 5005.2	training's l2: 2.5052e+07	valid_1's rmse: 5853.69	valid_1's l2: 3.42657e+07
[600]	training's rmse: 4847.24	training's l2: 2.34958e+07	valid_1's rmse: 5780.23	valid_1's l2: 3.34111e+07
[700]	training's rmse: 4713.93	training's l2: 2.22212e+07	valid_1's rmse: 5727.75	valid_1's l2: 3.28072e+07
[800]	training's rmse: 4598.73	training's l2: 2.11483e+07	valid_1's rmse: 5681.67	valid_1's l2: 3.22814e+07
[900]	training's rmse: 4500.13	training's l2: 2.02512e+07	valid_1's rmse: 5645.2

[I 2024-03-28 06:55:58,440] Trial 2 finished with value: 5483.72587112714 and parameters: {'n_estimators': 4079, 'max_depth': 42, 'num_leaves': 458, 'min_child_samples': 260, 'feature_fraction': 0.9849706347046054, 'bagging_fraction': 0.6855713592285373, 'lambda_l1': 0.594773996820749, 'lambda_l2': 2.051265596848302e-07}. Best is trial 2 with value: 5483.72587112714.


Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 7157.95	training's l2: 5.12362e+07	valid_1's rmse: 7559.87	valid_1's l2: 5.71517e+07
[200]	training's rmse: 6324.75	training's l2: 4.00025e+07	valid_1's rmse: 6813.12	valid_1's l2: 4.64186e+07
[300]	training's rmse: 5927.33	training's l2: 3.51333e+07	valid_1's rmse: 6488.14	valid_1's l2: 4.20959e+07
[400]	training's rmse: 5668.58	training's l2: 3.21328e+07	valid_1's rmse: 6295.68	valid_1's l2: 3.96356e+07
[500]	training's rmse: 5479.04	training's l2: 3.00199e+07	valid_1's rmse: 6170.51	valid_1's l2: 3.80752e+07
[600]	training's rmse: 5321.91	training's l2: 2.83227e+07	valid_1's rmse: 6074.92	valid_1's l2: 3.69046e+07
[700]	training's rmse: 5193.8	training's l2: 2.69756e+07	valid_1's rmse: 6005.08	valid_1's l2: 3.60609e+07
[800]	training's rmse: 5080.27	training's l2: 2.58091e+07	valid_1's rmse: 5943.21	valid_1's l2: 3.53218e+07
[900]	training's rmse: 4976.52	training's l2: 2.47658e+07	valid_1's rmse: 58

[I 2024-03-28 06:58:10,953] Trial 3 finished with value: 5575.765720612968 and parameters: {'n_estimators': 2743, 'max_depth': 49, 'num_leaves': 807, 'min_child_samples': 489, 'feature_fraction': 0.9227132380790266, 'bagging_fraction': 0.6726790430553049, 'lambda_l1': 1.0030235921977392e-06, 'lambda_l2': 0.10098303459175997}. Best is trial 2 with value: 5483.72587112714.


Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 6410.28	training's l2: 4.10917e+07	valid_1's rmse: 6875.5	valid_1's l2: 4.72725e+07
[200]	training's rmse: 5578.83	training's l2: 3.11234e+07	valid_1's rmse: 6211.55	valid_1's l2: 3.85833e+07
[300]	training's rmse: 5208.55	training's l2: 2.7129e+07	valid_1's rmse: 5965.96	valid_1's l2: 3.55927e+07
[400]	training's rmse: 4944.84	training's l2: 2.44515e+07	valid_1's rmse: 5825.79	valid_1's l2: 3.39398e+07
[500]	training's rmse: 4750.21	training's l2: 2.25645e+07	valid_1's rmse: 5738.91	valid_1's l2: 3.29351e+07
[600]	training's rmse: 4596.89	training's l2: 2.11314e+07	valid_1's rmse: 5675.69	valid_1's l2: 3.22135e+07
[700]	training's rmse: 4464.6	training's l2: 1.99326e+07	valid_1's rmse: 5628.58	valid_1's l2: 3.16809e+07
[800]	training's rmse: 4352.68	training's l2: 1.89458e+07	valid_1's rmse: 5596.12	valid_1's l2: 3.13166e+07
[900]	training's rmse: 4248.11	training's l2: 1.80464e+07	valid_1's rmse: 5567

[I 2024-03-28 06:59:25,268] Trial 4 finished with value: 5474.765664234993 and parameters: {'n_estimators': 3107, 'max_depth': 41, 'num_leaves': 618, 'min_child_samples': 225, 'feature_fraction': 0.5953590970583491, 'bagging_fraction': 0.774386775191464, 'lambda_l1': 3.231529211808769e-08, 'lambda_l2': 0.0008944864308569731}. Best is trial 4 with value: 5474.765664234993.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021920 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2270
[LightGBM] [Info] Number of data points in the train set: 895057, number of used features: 20
[LightGBM] [Info] Start training from score 58000.483999
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 6410.28	training's l2: 4.10917e+07	valid_1's rmse: 6875.5	valid_1's l2: 4.72725e+07
[200]	training's rmse: 5578.83	training's l2: 3.11234e+07	valid_1's rmse: 6211.55	valid_1's l2: 3.85833e+07
[300]	training's rmse: 5208.55	training's l2: 2.7129e+07	valid_1's rmse: 5965.96	valid_1's l2: 3.55927e+07
[400]	training's rmse: 4944.84	training's l2: 2.44515e+07	valid_1's rmse: 5825.79	valid_1's l2: 3.39398e+07
[500]	training's rmse: 4750.21	training's l2: 2.25645e+07	valid_1's rmse: 5738.91	valid_1's l2: 3.

In [98]:
best_model.best_score_

defaultdict(collections.OrderedDict,
            {'training': OrderedDict([('rmse', 3670.875564257328),
                          ('l2', 13475327.40826155)]),
             'valid_1': OrderedDict([('rmse', 5474.765664234993),
                          ('l2', 29973059.07828642)])})

In [99]:
X_test = df_test.drop(['target', '계약년월'], axis=1)
real_test_pred = best_model.predict(X_test)
preds_df = pd.DataFrame(real_test_pred.astype(int), columns=["target"])
preds_df.to_csv('holdout_optuna_output.csv', index=False)



In [100]:
preds_df.median()

target    84819.0
dtype: float64

### holdout

In [19]:
def holdout_lgb(X, y, test=pd.DataFrame, test_size=0.2, random_state=2023 ,gbm = lgb.LGBMRegressor(n_estimators=1000), early_stopping=10, log_evaluation=10):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=random_state)
    gbm.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            eval_metric='rmse',
            callbacks=[lgb.early_stopping(stopping_rounds=early_stopping),
                   lgb.log_evaluation(period=log_evaluation, show_stdv=True)])
    
    if not test.empty:
        real_test_pred = gbm.predict(test)
        preds_df = pd.DataFrame(real_test_pred.astype(int), columns=["target"])
        preds_df.to_csv('holdout_output.csv', index=False)

        # return preds_df

    return gbm
        
    

In [20]:
y_train = df_train['target']
X_train = df_train.drop(['target', '계약년월'], axis=1)
X_test = df_test.drop(['target', '계약년월'], axis=1)
gbm = lgb.LGBMRegressor(n_estimators=2000, max_depth=20, num_leaves=100,
                            min_child_samples=60, feature_fraction=0.8,
                            bagging_fraction=0.8)
gbm = holdout_lgb(X_train,y_train, test=X_test, gbm=gbm)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1756
[LightGBM] [Info] Number of data points in the train set: 895057, number of used features: 16
[LightGBM] [Info] Start training from score 58000.483999
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 25444.2	training's l2: 6.47406e+08	valid_1's rmse: 25742.9	valid_1's l2: 6.62699e+08
[20]	training's rmse: 17611.3	training's l2: 3.10159e+08	valid_1's rmse: 17923.7	valid_1's l2: 3.2126e+08
[30]	training's rmse: 14501.5	training's l2: 2.10295e+08	valid_1's rmse: 14855.7	valid_1's l2: 2.20693e+08
[40]	training's rmse: 13018.5	training's l2: 1.69481e+08	valid_1's rmse: 13411.8	valid_1's l2: 1.79876e+08
[50]	training's rmse: 12110.1	training's l2: 1.46655e+08	valid_1's rmse: 12526.3	valid_1's l2: 1.56907e+08
[60]	training's rmse: 11507	training's l2: 1.32412e+08	valid_1's rmse: 11933.8	valid_1's l2: 1.424

In [21]:
gbm.best_score_['valid_1']['rmse']

6059.92645528357

## 데이터 분리

In [33]:
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GroupKFold, TimeSeriesSplit

### K-Fold

In [57]:
def k_fold_lgb(X, y, test=pd.DataFrame, n_splits=5, gbm=lgb.LGBMRegressor(n_estimators=1000), early_stopping=10, log_evaluation=10):
    # Kfold 함수를 선언합니다.
    kf = KFold(n_splits=n_splits)
    # 학습 데이터를 Kfold로 나눕니다.
    train_folds = kf.split(X, y)

    fold_save_files = []

    for fold_idx, (train_idx, valid_idx) in enumerate(train_folds):
        print(f"--------{fold_idx}번째 fold의 학습을 시작합니다.--------")

        # index를 통해 fold의 학습세트를 가져옵니다.
        X_train_fold = X.iloc[train_idx, :]
        Y_train_fold = y[train_idx]

        # index를 통해 fold의 평가세트를 가져옵니다.
        X_valid_fold = X.iloc[valid_idx, :]
        Y_valid_fold = y[valid_idx]

        # fold의 데이터로 학습을 진행합니다.
        gbm.fit(X_train_fold, Y_train_fold,                                               # 학습 데이터를 입력합니다.
            eval_set=[(X_train_fold, Y_train_fold), (X_valid_fold, Y_valid_fold)], # 평가셋을 지정합니다.
            eval_metric ='rmse',                                                               # 평가과정에서 사용할 평가함수를 지정합니다.
            callbacks=[lgb.early_stopping(stopping_rounds=early_stopping),                                  # 10번의 성능향상이 없을 경우, 학습을 멈춥니다.
                    lgb.log_evaluation(period=log_evaluation, show_stdv=True)]                           # 매 iteration마다 학습결과를 출력합니다.
        )

        # 각 fold별 학습한 모델을 저장합니다.
        file_name = f"kfold{fold_idx}_gbm.pkl"
        joblib.dump(gbm, file_name)
        print(f"--------{fold_idx}번째 fold는 {file_name}에 저장되었습니다.--------\n\n")
        fold_save_files.append(file_name)

    # 저장한 학습모델들을 불러와, Testset에 대한 추론을 진행합니다.
    # 각 fold의 예측결과를 평균을 취하는 방식으로 진행합니다.
    if not test.empty:
        total_predicts = np.zeros(len(X_test))

        for file_name in fold_save_files:
            gbm_trained = joblib.load(file_name)
            fold_predicts = gbm_trained.predict(X_test)

            total_predicts += fold_predicts / len(fold_save_files)
        
        # 앞서 예측한 예측값들을 저장합니다.
        preds_df = pd.DataFrame(total_predicts.astype(int), columns=["target"])
        preds_df.to_csv('k-fold_output.csv', index=False)

        return preds_df

In [58]:
k_fold_lgb(X_train, y_train, X_test)

--------0번째 fold의 학습을 시작합니다.--------
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1754
[LightGBM] [Info] Number of data points in the train set: 895057, number of used features: 16
[LightGBM] [Info] Start training from score 57132.425269
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 26721.6	training's l2: 7.14045e+08	valid_1's rmse: 28235.3	valid_1's l2: 7.97234e+08
[20]	training's rmse: 20413.5	training's l2: 4.1671e+08	valid_1's rmse: 21702.8	valid_1's l2: 4.71013e+08
[30]	training's rmse: 17537.8	training's l2: 3.07573e+08	valid_1's rmse: 18893.4	valid_1's l2: 3.56962e+08
[40]	training's rmse: 16009	training's l2: 2.56289e+08	valid_1's rmse: 17498.4	valid_1's l2: 3.06193e+08
[50]	training's rmse: 14981.8	training's l2: 2.24453e+08	valid_1's rmse: 16665.1	valid_1's l2: 2.77725e+08
[60]	training's rmse: 14294.3	training's l2: 2.04326e+08	valid_

Unnamed: 0,target
0,206857
1,266861
2,332427
3,265358
4,221849
...,...
9267,86171
9268,85626
9269,91401
9270,76948


### Best 3 K-Fold

In [63]:
top_3_files = ["kfold2_gbm.pkl", "kfold3_gbm.pkl", "kfold4_gbm.pkl"]
total_predicts = np.zeros(len(X_test))

for file_name in top_3_files:
    gbm_trained = joblib.load(file_name)
    fold_predicts = gbm_trained.predict(X_test)

    total_predicts += fold_predicts / len(top_3_files)

# 앞서 예측한 예측값들을 저장합니다.
preds_df = pd.DataFrame(total_predicts.astype(int), columns=["target"])
preds_df.to_csv('k-fold_output.csv', index=False)


### Time series Split

In [69]:
def time_series_lgb(X, y, test=pd.DataFrame, n_splits=5, gbm=lgb.LGBMRegressor(n_estimators=1000), early_stopping=10, log_evaluation=10):
    # TimeSeriesSplit 함수를 선언합니다.
    kf = TimeSeriesSplit(n_splits=n_splits)
    train_folds = kf.split(X, y)

    fold_save_files = []

    for fold_idx, (train_idx, valid_idx) in enumerate(train_folds):
        display(f"--------{fold_idx}번째 fold의 학습을 시작합니다.--------")

        # index를 통해 fold의 학습세트를 가져옵니다.
        X_train_fold = X.iloc[train_idx, :]
        Y_train_fold = y[train_idx]

        # index를 통해 fold의 평가세트를 가져옵니다.
        X_valid_fold = X.iloc[valid_idx, :]
        Y_valid_fold = y[valid_idx]

        # fold의 데이터로 학습을 진행합니다.
        gbm = lgb.LGBMRegressor(n_estimators=1000)
        gbm.fit(X_train_fold, Y_train_fold,                                               # 학습 데이터를 입력합니다.
            eval_set=[(X_train_fold, Y_train_fold), (X_valid_fold, Y_valid_fold)], # 평가셋을 지정합니다.
            eval_metric ='rmse',                                                               # 평가과정에서 사용할 평가함수를 지정합니다.
            callbacks=[lgb.early_stopping(stopping_rounds=10),                                  # 10번의 성능향상이 없을 경우, 학습을 멈춥니다.
                    lgb.log_evaluation(period=10, show_stdv=True)]                           # 매 iteration마다 학습결과를 출력합니다.
        )

        # 각 fold별 학습한 모델을 저장합니다.
        file_name = f"timeseries_fold{fold_idx}_gbm.pkl"
        joblib.dump(gbm, file_name)
        display(f"--------{fold_idx}번째 fold는 {file_name}에 저장되었습니다.--------\n\n")
        fold_save_files.append(file_name)

    # 저장한 학습모델들을 불러와, Testset에 대한 추론을 진행합니다.
    # 각 fold의 예측결과를 평균을 취하는 방식으로 진행합니다.
    if not test.empty:
        total_predicts = np.zeros(len(X_test))

        for file_name in fold_save_files:
            gbm_trained = joblib.load(file_name)
            fold_predicts = gbm_trained.predict(X_test)

            total_predicts += fold_predicts / len(fold_save_files)
        
        # 앞서 예측한 예측값들을 저장합니다.
        preds_df = pd.DataFrame(total_predicts.astype(int), columns=["target"])
        preds_df.to_csv('time-series_output.csv', index=False)

        return preds_df

In [70]:
X_train = df_train.drop(['target'], axis=1)
Y_train = df_train['target']

X_train = X_train.sort_values(by='계약년월') # 시간순으로 정렬합니다.
Y_train = Y_train.reindex(X_train.index) # 정렬된 X_train의 인덱스에 맞추어 Y_train도 정렬해줍니다.

X_train = X_train.reset_index(drop=True) # 인덱스를 재정렬 해줍니다.
Y_train = Y_train.reset_index(drop=True)

del X_train['계약년월'] # 시간에 대한 정보를 지웁니다.

X_test = df_test.drop(['target', '계약년월'], axis=1)

time_series_lgb(X_train, Y_train, X_test)

'--------0번째 fold의 학습을 시작합니다.--------'

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1790
[LightGBM] [Info] Number of data points in the train set: 186472, number of used features: 16
[LightGBM] [Info] Start training from score 41193.008543
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 16785	training's l2: 2.81736e+08	valid_1's rmse: 17566.5	valid_1's l2: 3.08581e+08
[20]	training's rmse: 12765.5	training's l2: 1.62958e+08	valid_1's rmse: 13712.8	valid_1's l2: 1.88041e+08
[30]	training's rmse: 11191.3	training's l2: 1.25245e+08	valid_1's rmse: 12559.3	valid_1's l2: 1.57735e+08
[40]	training's rmse: 10345.2	training's l2: 1.07023e+08	valid_1's rmse: 12086	valid_1's l2: 1.4607e+08
[50]	training's rmse: 9750.8	training's l2: 9.50781e+07	valid_1's rmse: 11829.6	valid_1's l2: 1.39941e+08
[60]	training's rmse: 9352.37	training's l2: 8.74668e+07	valid_1's rmse: 11688.2	valid_1's l2: 1.36614e

'--------0번째 fold는 timeseries_fold0_gbm.pkl에 저장되었습니다.--------\n\n'

'--------1번째 fold의 학습을 시작합니다.--------'

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1710
[LightGBM] [Info] Number of data points in the train set: 372942, number of used features: 16
[LightGBM] [Info] Start training from score 43548.686345
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 17310.4	training's l2: 2.99649e+08	valid_1's rmse: 16689.2	valid_1's l2: 2.7853e+08
[20]	training's rmse: 13052.2	training's l2: 1.7036e+08	valid_1's rmse: 12913	valid_1's l2: 1.66745e+08
[30]	training's rmse: 11448.1	training's l2: 1.31059e+08	valid_1's rmse: 11785.5	valid_1's l2: 1.38899e+08
[40]	training's rmse: 10545.3	training's l2: 1.11203e+08	valid_1's rmse: 11200.3	valid_1's l2: 1.25447e+08
[50]	training's rmse: 9931.93	training's l2: 9.86433e+07	valid_1's rmse: 10747.6	valid_1's l2: 1.15511e+08
[60]	training's rmse: 9532.99	training's l2: 9.08778e+07	valid_1's rmse: 10427.6	valid_1's l2: 1.0873

'--------1번째 fold는 timeseries_fold1_gbm.pkl에 저장되었습니다.--------\n\n'

'--------2번째 fold의 학습을 시작합니다.--------'

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1729
[LightGBM] [Info] Number of data points in the train set: 559412, number of used features: 16
[LightGBM] [Info] Start training from score 44352.484621
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 17186.2	training's l2: 2.95365e+08	valid_1's rmse: 22757.8	valid_1's l2: 5.17919e+08
[20]	training's rmse: 13028.6	training's l2: 1.69745e+08	valid_1's rmse: 18161.8	valid_1's l2: 3.2985e+08
[30]	training's rmse: 11360.7	training's l2: 1.29066e+08	valid_1's rmse: 16391.2	valid_1's l2: 2.68673e+08
[40]	training's rmse: 10417.4	training's l2: 1.08522e+08	valid_1's rmse: 15265.1	valid_1's l2: 2.33024e+08
[50]	training's rmse: 9802.39	training's l2: 9.60868e+07	valid_1's rmse: 14442.5	valid_1's l2: 2.08587e+08
[60]	training's rmse: 9392.69	training's l2: 8.82227e+07	valid_1's rmse: 13892.3	valid_1's l2: 1.9

'--------2번째 fold는 timeseries_fold2_gbm.pkl에 저장되었습니다.--------\n\n'

'--------3번째 fold의 학습을 시작합니다.--------'

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1733
[LightGBM] [Info] Number of data points in the train set: 745882, number of used features: 16
[LightGBM] [Info] Start training from score 46766.465732
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 18240.4	training's l2: 3.32713e+08	valid_1's rmse: 39607.6	valid_1's l2: 1.56876e+09
[20]	training's rmse: 13842.7	training's l2: 1.91619e+08	valid_1's rmse: 33651.5	valid_1's l2: 1.13242e+09
[30]	training's rmse: 12073	training's l2: 1.45757e+08	valid_1's rmse: 30134	valid_1's l2: 9.08059e+08
[40]	training's rmse: 11030.9	training's l2: 1.2168e+08	valid_1's rmse: 27874.9	valid_1's l2: 7.77007e+08
[50]	training's rmse: 10395.4	training's l2: 1.08065e+08	valid_1's rmse: 26598.5	valid_1's l2: 7.07478e+08
[60]	training's rmse: 9939.16	training's l2: 9.87869e+07	valid_1's rmse: 25692.8	valid_1's l2: 6.60122

'--------3번째 fold는 timeseries_fold3_gbm.pkl에 저장되었습니다.--------\n\n'

'--------4번째 fold의 학습을 시작합니다.--------'

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1747
[LightGBM] [Info] Number of data points in the train set: 932352, number of used features: 16
[LightGBM] [Info] Start training from score 51316.276133
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 21811.6	training's l2: 4.75744e+08	valid_1's rmse: 56580.1	valid_1's l2: 3.20131e+09
[20]	training's rmse: 16559.2	training's l2: 2.74207e+08	valid_1's rmse: 47350.4	valid_1's l2: 2.24206e+09
[30]	training's rmse: 14330.8	training's l2: 2.05371e+08	valid_1's rmse: 42334.4	valid_1's l2: 1.7922e+09
[40]	training's rmse: 13001.9	training's l2: 1.6905e+08	valid_1's rmse: 39504.8	valid_1's l2: 1.56063e+09
[50]	training's rmse: 12143.6	training's l2: 1.47466e+08	valid_1's rmse: 37816.4	valid_1's l2: 1.43008e+09
[60]	training's rmse: 11614.3	training's l2: 1.34891e+08	valid_1's rmse: 36871.5	valid_1's l2: 1.35

'--------4번째 fold는 timeseries_fold4_gbm.pkl에 저장되었습니다.--------\n\n'

Unnamed: 0,target
0,101047
1,127130
2,162975
3,137952
4,113692
...,...
9267,45284
9268,45050
9269,55072
9270,49209
