# 2023 전력사용량 예측 AI 경진대회 - cgahn0323

## Python & Library version check

In [1]:
# Install the third-party packages used by this notebook.
# lightgbm is pinned to 3.2.1: the training cells below pass `verbose_eval` and
# `early_stopping_rounds` directly to `lgb.train`.
!pip install sktime
!pip install lightgbm==3.2.1
!pip install bayesian-optimization

Collecting sktime
  Downloading sktime-0.22.0-py3-none-any.whl (17.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.5/17.5 MB[0m [31m53.3 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-base<0.6.0 (from sktime)
  Downloading scikit_base-0.5.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.7/118.7 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-base, sktime
Successfully installed scikit-base-0.5.1 sktime-0.22.0
Collecting lightgbm==3.2.1
  Downloading lightgbm-3.2.1-py3-none-manylinux1_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: lightgbm
  Attempting uninstall: lightgbm
    Found existing installation: lightgbm 4.0.0
    Uninstalling lightgbm-4.0.0:
      Successfully uninstalled lightgbm-4.0.0
Successfully installed lightgbm-3.2.1
Collecting bayesian-optimi

In [2]:
import os
import sys
import time
import pandas as pd
import numpy as np
import json
import zipfile
import random
import gc

from bayes_opt import BayesianOptimization
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events
from bayes_opt.util import load_logs

import warnings
warnings.filterwarnings('ignore')

from sktime.utils.plotting import plot_series
import lightgbm as lgb

import gdown

In [3]:
# Record the interpreter version next to the results.
print(f"Python version: {sys.version}")

Python version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0]


In [4]:
import pkg_resources

def list_installed_packages():
    """Print every installed distribution as ``key==version``, one per line, sorted."""
    pins = []
    for dist in pkg_resources.working_set:
        pins.append("%s==%s" % (dist.key, dist.version))
    pins.sort()
    for pin in pins:
        print(pin)

list_installed_packages()

absl-py==1.4.0
aiohttp==3.8.5
aiosignal==1.3.1
alabaster==0.7.13
albumentations==1.3.1
altair==4.2.2
annotated-types==0.5.0
anyio==3.7.1
appdirs==1.4.4
argon2-cffi-bindings==21.2.0
argon2-cffi==23.1.0
array-record==0.4.1
arviz==0.15.1
astropy==5.3.2
astunparse==1.6.3
async-timeout==4.0.3
attrs==23.1.0
audioread==3.0.0
autograd==1.6.2
babel==2.12.1
backcall==0.2.0
beautifulsoup4==4.11.2
bleach==6.0.0
blinker==1.4
blis==0.7.10
blosc2==2.0.0
bokeh==3.2.2
branca==0.6.0
build==0.10.0
cachecontrol==0.13.1
cachetools==5.3.1
catalogue==2.0.9
certifi==2023.7.22
cffi==1.15.1
chardet==5.2.0
charset-normalizer==3.2.0
chex==0.1.7
click-plugins==1.1.1
click==8.1.7
cligj==0.7.2
cloudpickle==2.2.1
cmake==3.27.2
cmdstanpy==1.1.0
colorcet==3.0.1
colorlover==0.3.0
community==1.0.0b1
confection==0.1.1
cons==0.4.6
contextlib2==21.6.0
contourpy==1.1.0
convertdate==2.4.0
cryptography==41.0.3
cufflinks==0.17.3
cvxopt==1.3.2
cvxpy==1.3.2
cycler==0.11.0
cymem==2.0.7
cython==0.29.36
dask==2023.8.1
datascience==0

# Function Definitions

In [None]:
# Missing-value imputation helper
def fillna_with_group_neighbor_mean(group):
    """Fill NaNs with the mean of the immediately preceding and following values.

    Written for use with ``groupby(...).transform``. A NaN whose previous or
    next neighbour is itself NaN (or that sits at either end of the series)
    stays NaN, because the neighbour mean is NaN there as well.

    Parameters
    ----------
    group : pandas.Series
        Values of one group, in row order.

    Returns
    -------
    pandas.Series
        A new Series; ``group`` itself is left untouched (mutating the object
        handed to a groupby UDF is not supported by pandas).
    """
    neighbor_mean = (group.shift(1) + group.shift(-1)) / 2
    # Keep every existing value; only NaN positions take the neighbour mean.
    return group.where(group.notna(), neighbor_mean)

# Outlier correction helper
def fill_outlier_with_group_neighbor_ratio(group):
    """Raise abnormally low values to ``median - 3 * std`` of the group.

    Despite the name, no neighbour ratio is involved: values below the lower
    bound are simply clipped to it. High values are never changed.

    Parameters
    ----------
    group : pandas.Series
        Values of one group (used with ``groupby(...).transform``).

    Returns
    -------
    pandas.Series
        A new, clipped Series; ``group`` itself is not modified.
    """
    lower_bound = np.median(group) - 3 * np.std(group)
    # A NaN bound (e.g. NaN in the data) means "no clipping", matching the
    # previous mask-based behaviour where the comparison was False everywhere.
    return group.clip(lower=lower_bound)

# Pre-processing / feature-engineering function
def preprocess_and_create_features(train, test, month=-1, valid_cut_day=-1, test_cut_day=-1, lag=0, befor_815=True, for_Bayesian_Optimization=False):
    """Stack train and test, engineer features, and split into train/valid/test frames.

    Column order of the result matters: other cells in this notebook pick
    features by positional index into ``df.columns`` (the first 12 columns are
    always used, the remaining ones are switched on/off), so columns must not be
    added, removed or reordered without updating those cells.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that are concatenated row-wise. The code reads the columns
        'num_date_time', '건물번호', '일시', '기온(C)', '강수량(mm)', '풍속(m/s)',
        '습도(%)', '일조(hr)', '일사(MJ/m2)' and '전력소비량(kWh)'.
    month, valid_cut_day, test_cut_day : int
        Used only when all three are positive and ``for_Bayesian_Optimization``
        is False: rows of ``month`` with ``valid_cut_day <= day < test_cut_day``
        form the validation split, rows with ``day >= test_cut_day`` the test
        split, and rows with ``month < month`` or ``day < valid_cut_day`` the
        training split.
    lag : int
        Extra offset added to the target shift features ('shift_d7*',
        'shift_time_d*').
    befor_815 : bool
        If True, the per-building target statistics are computed only from rows
        with ``month < 8`` or ``day < 15``; otherwise from all rows.
    for_Bayesian_Optimization : bool
        If True, low target values are clipped per building and the
        validation and test splits are the same rows (August, day >= 11,
        excluding day 15).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df, train_, valid_, test_)``: the full engineered frame and its three
        splits. ``train_`` additionally has rows containing NaN dropped.
    """

    # merge train+test
    df = pd.concat([train,test],axis=0, ignore_index=True)

    # Handle missing values: wind speed and humidity are interpolated from their
    # neighbours per building; every NaN that remains in ANY column (including a
    # NaN target passed in by the caller) is then replaced with 0.
    df['풍속(m/s)'] = df.groupby(['건물번호'])['풍속(m/s)'].transform(fillna_with_group_neighbor_mean)
    df['습도(%)'] = df.groupby(['건물번호'])['습도(%)'].transform(fillna_with_group_neighbor_mean)
    df.fillna(0,inplace=True)

    # Handle outliers (only in Bayesian-optimization mode)
    if for_Bayesian_Optimization:
        df['전력소비량(kWh)'] = df.groupby(['건물번호'])['전력소비량(kWh)'].transform(fill_outlier_with_group_neighbor_ratio)

    #####################################################################################################

    # datetime feature
    df['일시'] = pd.to_datetime(df['일시'], format='%Y%m%d %H')
    df['wday'] = df['일시'].dt.weekday
    df["weekend"] = np.where(df["wday"] < 5, 0, 1)
    df['month'] = df['일시'].dt.month
    df['day'] = df['일시'].dt.day
    df['time'] = df['일시'].dt.hour
    # week-of-month index: days 1-7 -> 1, 8-14 -> 2, ...
    df['wm'] = df['day'].apply(lambda x: np.ceil(x/7))

    #####################################################################################################

    # drop rows with month <= 6 and day <= 6 (keeps month > 6 or day > 6)
    df = df.loc[((df['month']>6) | (df['day']>6))].copy()

    # sort & drop
    df.sort_values('일시', inplace=True)
    df.drop(['num_date_time','일시'],axis=1,inplace=True)

    #####################################################################################################

    # Statistics feature
    # Per-building mean/min/max/std of the target over several calendar groupings,
    # merged back onto every row of df.

    if befor_815:
        power_cal_df = df.loc[((df['month']<8) | (df['day']<15))].copy()
    else:
        power_cal_df = df

    time_wday_mean = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'time', 'wday'], aggfunc = np.mean).reset_index().rename(columns={'전력소비량(kWh)':'time_wday_mean'})
    time_wday_min = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'time', 'wday'], aggfunc = np.min).reset_index().rename(columns={'전력소비량(kWh)':'time_wday_min'})
    time_wday_max = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'time', 'wday'], aggfunc = np.max).reset_index().rename(columns={'전력소비량(kWh)':'time_wday_max'})
    time_wday_std = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'time', 'wday'], aggfunc = np.std).reset_index().rename(columns={'전력소비량(kWh)':'time_wday_std'})
    df = pd.merge(df,time_wday_mean, on=['건물번호', 'time', 'wday'], how='left')
    df = pd.merge(df,time_wday_min, on=['건물번호', 'time', 'wday'], how='left')
    df = pd.merge(df,time_wday_max, on=['건물번호', 'time', 'wday'], how='left')
    df = pd.merge(df,time_wday_std, on=['건물번호', 'time', 'wday'], how='left')

    time_weekend_mean = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'time', 'weekend'], aggfunc = np.mean).reset_index().rename(columns={'전력소비량(kWh)':'time_weekend_mean'})
    time_weekend_min = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'time', 'weekend'], aggfunc = np.min).reset_index().rename(columns={'전력소비량(kWh)':'time_weekend_min'})
    time_weekend_max = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'time', 'weekend'], aggfunc = np.max).reset_index().rename(columns={'전력소비량(kWh)':'time_weekend_max'})
    time_weekend_std = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'time', 'weekend'], aggfunc = np.std).reset_index().rename(columns={'전력소비량(kWh)':'time_weekend_std'})
    df = pd.merge(df,time_weekend_mean, on=['건물번호', 'time', 'weekend'], how='left')
    df = pd.merge(df,time_weekend_min, on=['건물번호', 'time', 'weekend'], how='left')
    df = pd.merge(df,time_weekend_max, on=['건물번호', 'time', 'weekend'], how='left')
    df = pd.merge(df,time_weekend_std, on=['건물번호', 'time', 'weekend'], how='left')

    time_mean = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'time'], aggfunc = np.mean).reset_index().rename(columns={'전력소비량(kWh)':'time_mean'})
    time_min = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'time'], aggfunc = np.min).reset_index().rename(columns={'전력소비량(kWh)':'time_min'})
    time_max = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'time'], aggfunc = np.max).reset_index().rename(columns={'전력소비량(kWh)':'time_max'})
    time_std = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'time'], aggfunc = np.std).reset_index().rename(columns={'전력소비량(kWh)':'time_std'})
    df = pd.merge(df,time_mean, on=['건물번호', 'time'], how='left')
    df = pd.merge(df,time_min, on=['건물번호', 'time'], how='left')
    df = pd.merge(df,time_max, on=['건물번호', 'time'], how='left')
    df = pd.merge(df,time_std, on=['건물번호', 'time'], how='left')

    weekend_mean = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'weekend'], aggfunc = np.mean).reset_index().rename(columns={'전력소비량(kWh)':'weekend_mean'})
    weekend_min = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'weekend'], aggfunc = np.min).reset_index().rename(columns={'전력소비량(kWh)':'weekend_min'})
    weekend_max = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'weekend'], aggfunc = np.max).reset_index().rename(columns={'전력소비량(kWh)':'weekend_max'})
    weekend_std = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'weekend'], aggfunc = np.std).reset_index().rename(columns={'전력소비량(kWh)':'weekend_std'})
    df = pd.merge(df,weekend_mean, on=['건물번호', 'weekend'], how='left')
    df = pd.merge(df,weekend_min, on=['건물번호', 'weekend'], how='left')
    df = pd.merge(df,weekend_max, on=['건물번호', 'weekend'], how='left')
    df = pd.merge(df,weekend_std, on=['건물번호', 'weekend'], how='left')

    wday_mean = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'wday'], aggfunc = np.mean).reset_index().rename(columns={'전력소비량(kWh)':'wday_mean'})
    wday_min = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'wday'], aggfunc = np.min).reset_index().rename(columns={'전력소비량(kWh)':'wday_min'})
    wday_max = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'wday'], aggfunc = np.max).reset_index().rename(columns={'전력소비량(kWh)':'wday_max'})
    wday_std = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'wday'], aggfunc = np.std).reset_index().rename(columns={'전력소비량(kWh)':'wday_std'})
    df = pd.merge(df,wday_mean, on=['건물번호', 'wday'], how='left')
    df = pd.merge(df,wday_min, on=['건물번호', 'wday'], how='left')
    df = pd.merge(df,wday_max, on=['건물번호', 'wday'], how='left')
    df = pd.merge(df,wday_std, on=['건물번호', 'wday'], how='left')

    wm_mean = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'wm'], aggfunc = np.mean).reset_index().rename(columns={'전력소비량(kWh)':'wm_mean'})
    wm_min = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'wm'], aggfunc = np.min).reset_index().rename(columns={'전력소비량(kWh)':'wm_min'})
    wm_max = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'wm'], aggfunc = np.max).reset_index().rename(columns={'전력소비량(kWh)':'wm_max'})
    wm_std = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'wm'], aggfunc = np.std).reset_index().rename(columns={'전력소비량(kWh)':'wm_std'})
    df = pd.merge(df,wm_mean, on=['건물번호', 'wm'], how='left')
    df = pd.merge(df,wm_min, on=['건물번호', 'wm'], how='left')
    df = pd.merge(df,wm_max, on=['건물번호', 'wm'], how='left')
    df = pd.merge(df,wm_std, on=['건물번호', 'wm'], how='left')

    month_mean = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'month'], aggfunc = np.mean).reset_index().rename(columns={'전력소비량(kWh)':'month_mean'})
    month_min = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'month'], aggfunc = np.min).reset_index().rename(columns={'전력소비량(kWh)':'month_min'})
    month_max = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'month'], aggfunc = np.max).reset_index().rename(columns={'전력소비량(kWh)':'month_max'})
    month_std = pd.pivot_table(power_cal_df, values = '전력소비량(kWh)', index = ['건물번호', 'month'], aggfunc = np.std).reset_index().rename(columns={'전력소비량(kWh)':'month_std'})
    df = pd.merge(df,month_mean, on=['건물번호', 'month'], how='left')
    df = pd.merge(df,month_min, on=['건물번호', 'month'], how='left')
    df = pd.merge(df,month_max, on=['건물번호', 'month'], how='left')
    df = pd.merge(df,month_std, on=['건물번호', 'month'], how='left')

    #####################################################################################################

    # Temperature / humidity / precipitation features
    # (group statistics computed over all rows of df, train and test alike)
    df["기온_mean"] = df.groupby(["건물번호"])["기온(C)"].transform('mean')
    df["기온_std"] = df.groupby(["건물번호"])["기온(C)"].transform('std')

    df["기온_time_mean"] = df.groupby(["건물번호","time"])["기온(C)"].transform('mean')
    df["기온_time_std"] = df.groupby(["건물번호","time"])["기온(C)"].transform('std')

    df["기온_wday_time_mean"] = df.groupby(["건물번호","wday","time"])["기온(C)"].transform('mean')
    df["기온_wday_time_std"] = df.groupby(["건물번호","wday","time"])["기온(C)"].transform('std')

    df["습도_mean"] = df.groupby(["건물번호"])["습도(%)"].transform('mean')
    df["습도_std"] = df.groupby(["건물번호"])["습도(%)"].transform('std')

    df["습도_time_mean"] = df.groupby(["건물번호","time"])["습도(%)"].transform('mean')
    df["습도_time_std"] = df.groupby(["건물번호","time"])["습도(%)"].transform('std')

    df["습도_wday_time_mean"] = df.groupby(["건물번호","wday","time"])["습도(%)"].transform('mean')
    df["습도_wday_time_std"] = df.groupby(["건물번호","wday","time"])["습도(%)"].transform('std')

    df["강수량_mean"] = df.groupby(["건물번호"])["강수량(mm)"].transform('mean')
    df["강수량_std"] = df.groupby(["건물번호"])["강수량(mm)"].transform('std')

    df["강수량_time_mean"] = df.groupby(["건물번호","time"])["강수량(mm)"].transform('mean')
    df["강수량_time_std"] = df.groupby(["건물번호","time"])["강수량(mm)"].transform('std')

    df["강수량_wday_time_mean"] = df.groupby(["건물번호","wday","time"])["강수량(mm)"].transform('mean')
    df["강수량_wday_time_std"] = df.groupby(["건물번호","wday","time"])["강수량(mm)"].transform('std')

    # 24-row rolling mean of precipitation per building, and the same value 24 rows earlier
    df["강수량_rolling_24"] = df.groupby(["건물번호"])["강수량(mm)"].transform(lambda x: x.rolling(24, min_periods=1).mean())
    df["강수량_rolling_24_shift_24"] = df.groupby(["건물번호"])["강수량_rolling_24"].transform(lambda x: x.shift(24))

    #####################################################################################################

    # THI feature
    df['THI'] = 9/5*df["기온(C)"] - 0.55*(1-df["습도(%)"]/100)*(9/5*df["습도(%)"]-26)+32

    # CDH feature
    # Temperature kept only above each threshold (else 0), then averaged over 12- and
    # 24-row rolling windows per building; the raw thresholded columns are dropped.
    df['CDH_temp>20'] = [temp if temp > 20 else 0 for temp in df["기온(C)"]]
    df['CDH_temp>23'] = [temp if temp > 23 else 0 for temp in df["기온(C)"]]
    df['CDH_temp>26'] = [temp if temp > 26 else 0 for temp in df["기온(C)"]]
    df['CDH_temp>29'] = [temp if temp > 29 else 0 for temp in df["기온(C)"]]
    df['CDH_temp>20_rolling=12'] = df.groupby(["건물번호"])["CDH_temp>20"].transform(lambda x: x.rolling(12, min_periods=1).mean())
    df['CDH_temp>20_rolling=24'] = df.groupby(["건물번호"])["CDH_temp>20"].transform(lambda x: x.rolling(24, min_periods=1).mean())
    df['CDH_temp>23_rolling=12'] = df.groupby(["건물번호"])["CDH_temp>23"].transform(lambda x: x.rolling(12, min_periods=1).mean())
    df['CDH_temp>23_rolling=24'] = df.groupby(["건물번호"])["CDH_temp>23"].transform(lambda x: x.rolling(24, min_periods=1).mean())
    df['CDH_temp>26_rolling=12'] = df.groupby(["건물번호"])["CDH_temp>26"].transform(lambda x: x.rolling(12, min_periods=1).mean())
    df['CDH_temp>26_rolling=24'] = df.groupby(["건물번호"])["CDH_temp>26"].transform(lambda x: x.rolling(24, min_periods=1).mean())
    df['CDH_temp>29_rolling=12'] = df.groupby(["건물번호"])["CDH_temp>29"].transform(lambda x: x.rolling(12, min_periods=1).mean())
    df['CDH_temp>29_rolling=24'] = df.groupby(["건물번호"])["CDH_temp>29"].transform(lambda x: x.rolling(24, min_periods=1).mean())
    df.drop(['CDH_temp>20','CDH_temp>23','CDH_temp>26','CDH_temp>29'], axis=1, inplace=True)

    #####################################################################################################

    # Shift & Recursive feature
    # Target shifted by (1+lag)*24 rows (+1, +2) within each building.
    # NOTE(review): the columns are named 'd7' but the shift is (1+lag)*24 rows — confirm the intended horizon.
    df['shift_d7']   = df.groupby(['건물번호'])['전력소비량(kWh)'].transform(lambda x: x.shift((1+lag)*24))
    df['shift_d7.1']   = df.groupby(['건물번호'])['전력소비량(kWh)'].transform(lambda x: x.shift((1+lag)*24+1))
    df['shift_d7.2']   = df.groupby(['건물번호'])['전력소비량(kWh)'].transform(lambda x: x.shift((1+lag)*24+2))

    # Target shifted by (k+lag) rows within each (building, hour) group.
    df['shift_time_d1']   = df.groupby(['건물번호','time'])['전력소비량(kWh)'].transform(lambda x: x.shift((1+lag)))
    df['shift_time_d2']   = df.groupby(['건물번호','time'])['전력소비량(kWh)'].transform(lambda x: x.shift((2+lag)))
    df['shift_time_d3']   = df.groupby(['건물번호','time'])['전력소비량(kWh)'].transform(lambda x: x.shift((3+lag)))

    # NOTE(review): these four columns are named after temperature (기온) and humidity (습도)
    # but are computed from the power-consumption column, and they ignore `lag`. This looks
    # like a copy-paste slip; confirm before changing, since any previously stored tuning
    # results were presumably produced with this definition.
    df['기온_time_shift_d1']   = df.groupby(['건물번호','time'])['전력소비량(kWh)'].transform(lambda x: x.shift(1))
    df['습도_time_shift_d1']   = df.groupby(['건물번호','time'])['전력소비량(kWh)'].transform(lambda x: x.shift(1))
    df['기온_time_shift_d2']   = df.groupby(['건물번호','time'])['전력소비량(kWh)'].transform(lambda x: x.shift(2))
    df['습도_time_shift_d2']   = df.groupby(['건물번호','time'])['전력소비량(kWh)'].transform(lambda x: x.shift(2))

    #####################################################################################################

    # Drop the solar features
    df.drop(['일조(hr)','일사(MJ/m2)'], axis=1, inplace=True)

    #####################################################################################################

    # cyclical encoding
    df['sin_time'] = np.sin(2*np.pi*df['time']/24)
    df['cos_time'] = np.cos(2*np.pi*df['time']/24)

    #####################################################################################################

    if for_Bayesian_Optimization:
        # train: before Aug 11; valid and test are the SAME rows (Aug 11 onward, Aug 15 excluded)
        train_ = df.loc[(df['month']<8) | (df['day']<11)].dropna()
        valid_ = df.loc[(df['month']==8) & (df['day']>=11) & ~((df['month']==8) & (df['day']==15))]
        test_ = df.loc[(df['month']==8) & (df['day']>=11) & ~((df['month']==8) & (df['day']==15))]

    elif month > 0 and valid_cut_day > 0 and test_cut_day > 0:
        # caller-specified cut days within `month`
        train_ = df[(df['month']<month) | (df['day']<valid_cut_day)].dropna()
        valid_ = df[(df['month']==month) & (df['day']>=valid_cut_day) & (df['day']<test_cut_day)]
        test_ = df[(df['month']==month) & (df['day']>=test_cut_day)]
    else:
        # default: validate on Aug 4-10, test on Aug 25 onward, train on everything else before Aug 25
        train_ = df[ (((df['month']<8) | (df['day']<4)) | ((df['month']>8) | (df['day']>=11))) & ((df['month']<8) | (df['day']<25)) ].dropna()
        valid_ = df[(df['month']==8) & (df['day']>=4) & (df['day']<11)]
        test_ = df[(df['month']==8) & (df['day']>=25)]

    return df, train_, valid_, test_

def extract_zip(input_path, output_path):
    """Unpack every member of the zip archive at ``input_path`` into ``output_path``."""
    print("Start extracting zip file.")
    with zipfile.ZipFile(input_path, mode='r') as archive:
        archive.extractall(path=output_path)

def lgbm_smape(y_pred, dataset):
    """Custom LightGBM ``feval``: symmetric mean absolute percentage error, in percent.

    Parameters
    ----------
    y_pred : array-like
        Predictions for the rows of ``dataset``.
    dataset : object with ``get_label()``
        Evaluation dataset; its labels are the ground truth.

    Returns
    -------
    tuple
        ``('SMAPE', value, False)`` — metric name, metric value, and
        ``is_higher_better=False``.
    """
    y_true = np.asarray(dataset.get_label(), dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    numerator = 2.0 * np.abs(y_pred - y_true)
    denominator = np.abs(y_pred) + np.abs(y_true)
    # When prediction and label are both 0 the term is 0/0. That is a perfect
    # prediction, so count it as 0 error instead of letting NaN poison the mean.
    ratio = np.divide(numerator, denominator,
                      out=np.zeros_like(denominator), where=denominator != 0)
    smape_val = np.mean(ratio) * 100
    return 'SMAPE', smape_val, False

def hamming_distance(a, b):
    """Count the positions at which ``a`` and ``b`` differ.

    Only the overlapping prefix is compared when the lengths differ.
    """
    mismatches = 0
    for left, right in zip(a, b):
        mismatches += (left != right)
    return mismatches

def read_params(b_type, candidate_num=20, model_num=10):
    """Pick a diverse set of tuned configurations for one building from its optimization log.

    The log ``{root_path}/b_num_{b_type}_parmas.json`` is read as JSON lines,
    each with a ``'target'`` score and a ``'params'`` dict. Entries are ranked
    by target (highest first) and the top ``candidate_num`` are kept as
    candidates. Starting from the best one, candidates are added greedily:
    each step takes the candidate whose feature on/off mask has the largest
    summed Hamming distance to the masks already selected.

    Parameters
    ----------
    b_type : int or str
        Building identifier used in the log file name.
    candidate_num : int
        Number of top-ranked log entries to consider.
    model_num : int
        Number of configurations to return. Capped at the number of available
        candidates; at least one configuration is always returned.

    Returns
    -------
    tuple
        ``(feature_marker_matrix, selected_params)`` — for every selected
        configuration, the list of column positions to use (positions 0-11 are
        always included, position ``12 + i`` when feature flag ``f{i+1}``
        rounds to 1) and the matching ``params`` dict from the log.

    Raises
    ------
    ValueError
        If the log file contains no entries.
    """
    with open(f"{root_path}/b_num_{b_type}_parmas.json") as json_file:
        # Tolerate blank lines (e.g. a trailing newline) in the JSON-lines log.
        records = [json.loads(line) for line in json_file if line.strip()]

    if not records:
        raise ValueError(f"No optimization results found for b_type {b_type}")

    # Highest target first; sorted() is stable, so ties keep their log order.
    temp_list = sorted(((rec['target'], rec['params']) for rec in records),
                       key=lambda x: x[0], reverse=True)

    def _feature_flags(best_params):
        # Keys 'f1'..'f69' are the feature switches; the length check keeps out
        # other keys starting with 'f' such as 'feature_fraction'.
        flags = sorted((int(key[1:]), round(best_params[key]))
                       for key in best_params if key[0] == 'f' and len(key) < 4)
        return np.array([flag for _, flag in flags])

    matrix = [_feature_flags(best_params) for _, best_params in temp_list[:candidate_num]]

    # Never ask for more models than there are candidates; otherwise the greedy
    # search below would run out of rows to pick from.
    model_num = min(model_num, len(matrix))

    selected_indexs = [0]
    while len(selected_indexs) < model_num:
        target_i, max_hamming_sum = None, -1
        for i, row in enumerate(matrix):
            if i in selected_indexs:
                continue
            current_hamming_sum = sum(hamming_distance(row, matrix[j]) for j in selected_indexs)
            # Strict '>' keeps the better-ranked candidate on ties.
            if current_hamming_sum > max_hamming_sum:
                target_i, max_hamming_sum = i, current_hamming_sum
        selected_indexs.append(target_i)

    n_base_columns = 12  # leading columns that are always used
    feature_marker_matrix = []
    selected_params = []
    for index in selected_indexs:
        row = matrix[index]
        feature_marker_matrix.append(
            list(range(n_base_columns)) + [_ + n_base_columns for _ in range(len(row)) if row[_] == 1]
        )
        selected_params.append(temp_list[index][-1])

    return feature_marker_matrix, selected_params

# Set Root_Path & Unzip DataSet

In [None]:
# Create root_path, the working directory for downloaded data and tuning logs.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no check-then-create
# race, and it fails loudly if the path exists but is not a directory.
root_path = './data/'
os.makedirs(root_path, exist_ok=True)

In [None]:
# Download open.zip from Google Drive into root_path
# (presumably the competition data: later cells read train.csv, test.csv and
# sample_submission.csv from root_path).
google_path = 'https://drive.google.com/uc?id='
file_id = '1TrPq66cNYpc6SxGy5MYCRzA4MbT-XehU'
output_path = f"{root_path}open.zip"
gdown.download(google_path+file_id,output_path,quiet=False)

# Unzip files into root_path
extract_zip(output_path,root_path)

Downloading...
From: https://drive.google.com/uc?id=1TrPq66cNYpc6SxGy5MYCRzA4MbT-XehU
To: /content/data/open.zip
100%|██████████| 2.80M/2.80M [00:00<00:00, 192MB/s]

Start extracting zip file.





# Bayesian Optimization for Feature Selection & Hyperparameter Tuning

In [None]:
# Load the CSVs from root_path; the test frame gets an all-NaN target column.
sample_submission = pd.read_csv(root_path + "sample_submission.csv")
train = pd.read_csv(root_path + "train.csv")
test = pd.read_csv(root_path + "test.csv")
test['전력소비량(kWh)'] = np.nan

In [None]:
# Hold-out split for Bayesian optimization: rows of train whose '일시' string sorts
# after '20220811' become the pseudo test set (their true targets are kept in
# test_ans); rows sorting before it remain as train.
# Note: this cell overwrites train/test, so re-run the "Read DataSet" cell first
# if it has to be executed again.
test = train.loc[train['일시'].gt('20220811'), list(test.columns)]
test_ans = test.loc[:, ['num_date_time','전력소비량(kWh)']].copy()
train = train.loc[train['일시'].lt('20220811')]

In [None]:
# Pre-Processing Target DataSet for Bayesian_Optimization
# df_0 is the full engineered frame; train_0 / valid_0 / test_0 are its splits
# (in this mode valid_0 and test_0 select the same rows).
df_0, train_0, valid_0, test_0 = preprocess_and_create_features(train, test, for_Bayesian_Optimization=True)

In [None]:
def LGB(num,
        f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,
        f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,
        f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,f58,f59,f60,
        f61,f62,f63,f64,f65,f66,f67,f68,f69,
        subsample,
        subsample_freq,
        learning_rate,
        num_leaves,
        min_data_in_leaf,
        feature_fraction,
        max_bin
        ):
    """Objective for Bayesian optimization: train one LightGBM model for building ``num``.

    Reads the module-level frames ``df_0``, ``train_0``, ``valid_0``, ``test_0``
    and ``test_ans``.

    Parameters
    ----------
    num : int
        Building number ('건물번호') to model.
    f1 ... f69 : float
        Feature switches. Each is rounded to 0/1 and controls column
        ``12 + (i - 1)`` of ``df_0``; the first 12 columns are always used.
    subsample, feature_fraction : float
        Passed to LightGBM unchanged.
    subsample_freq : float
        Truncated to int.
    learning_rate : float
        Base-10 exponent: the model uses ``10 ** learning_rate``.
    num_leaves, min_data_in_leaf : float
        Base-2 exponents: the model uses ``int(2 ** value - 1)``.
    max_bin : float
        Base-2 exponent: the model uses ``int(2 ** max_bin)``.

    Returns
    -------
    float
        Negative SMAPE (percent) of the predictions on the test split against
        the true values in ``test_ans`` (negative because the optimizer
        maximizes).
    """
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'mape',
        'subsample': subsample, # 1
        'subsample_freq': int(subsample_freq), # 1
        'learning_rate': 10**learning_rate, # 0.1
        'num_leaves': int(2**num_leaves-1), # 6
        'min_data_in_leaf': int(2**min_data_in_leaf-1), # 6
        'feature_fraction': feature_fraction, # 0.5
        'max_bin': int(2**max_bin), # 100,
        'n_estimators': 30000,
        'boost_from_average': False,
        'verbose': -1,
        'seed' : 777
    }

    functions_marker = [f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,
                        f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,
                        f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,f58,f59,f60,
                        f61,f62,f63,f64,f65,f66,f67,f68,f69]
    # The first 12 columns are always on; the rest follow the rounded switches.
    functions_marker = [1 for _ in range(12)] + list(map(lambda a: round(a),functions_marker))

    # Modeling
    full_columns_list = df_0.columns
    features_preprocessing = [full_columns_list[_] for _ in range(len(functions_marker)) if functions_marker[_] == 1]
    # Build each row mask from the frame being filtered instead of relying on
    # index alignment with df_0.
    train_ = train_0.loc[train_0['건물번호']==num, features_preprocessing].copy()
    valid_ = valid_0.loc[valid_0['건물번호']==num, features_preprocessing].copy()
    test_ = test_0.loc[test_0['건물번호']==num, features_preprocessing].copy()

    X_train, y_train = train_.drop('전력소비량(kWh)',axis=1), train_['전력소비량(kWh)']
    X_valid, y_valid = valid_.drop('전력소비량(kWh)',axis=1), valid_['전력소비량(kWh)']
    X_test = test_.drop("전력소비량(kWh)", axis=1)

    # Create model
    model = lgb.train(params=params,
            train_set=lgb.Dataset(X_train, y_train),
            valid_sets=lgb.Dataset(X_valid, y_valid),
            feval=lgbm_smape,
            verbose_eval=500,
            early_stopping_rounds=100)

    # Test predict
    pred_eva = model.predict(X_test)
    ans = X_test[['건물번호','month','day','time']].copy()
    ans['answer'] = pred_eva
    ans = ans.sort_values(['건물번호','month','day','time']).reset_index(drop=True)
    ans['num_date_time'] = ans['건물번호'].astype(str) + "_2022" + ans['month'].map(lambda x: str(x).zfill(2)) + ans['day'].map(lambda x: str(x).zfill(2)) + ' ' + ans['time'].map(lambda x: str(x).zfill(2))

    # Pair each prediction with its true value by key rather than by row
    # position; this also avoids concatenating the full train/test frames on
    # every objective evaluation.
    scored = ans.merge(test_ans, on='num_date_time', how='inner')
    pred, real = scored['answer'], scored['전력소비량(kWh)']
    smape_val = np.mean(2 * abs(pred - real) / (abs(pred) + abs(real))) * 100

    return -smape_val

In [None]:
def Bayesian_Opt(num, init_points_=25, n_iter_=100):
    """Run Bayesian optimization of ``LGB`` for building ``num``, logging every step to JSON.

    Parameters
    ----------
    num : int
        Building number forwarded to ``LGB``.
    init_points_ : int
        Number of initial random evaluations.
    n_iter_ : int
        Number of optimization iterations after the initial points.

    Returns
    -------
    BayesianOptimization
        The optimizer after ``maximize`` has finished.
    """
    # Search space: 69 feature switches in [0, 1] plus the LightGBM
    # hyper-parameters in the scales expected by LGB.
    search_space = {f"f{idx}": (0, 1) for idx in range(1, 70)}
    search_space.update({
        'subsample': (0.1, 1),
        'subsample_freq': (1, 10),
        'learning_rate': (-2, -0.5),
        'num_leaves': (2, 12),
        'min_data_in_leaf': (2, 12),
        'feature_fraction': (0.1, 1),
        'max_bin': (1,12),
    })

    # Create the optimizer; the lambda pins the building number so that only
    # the searched parameters remain as keyword arguments.
    optimizer = BayesianOptimization(
        f=lambda **kwargs: LGB(num=num, **kwargs),
        pbounds=search_space,
        verbose=2,
        random_state = 777,
    )

    # Write one JSON record per optimization step to the per-building log file.
    step_logger = JSONLogger(path=f"{root_path}b_num_{num}_parmas.json")
    optimizer.subscribe(Events.OPTIMIZATION_STEP, step_logger)

    # Maximize the objective (LGB returns negative SMAPE).
    optimizer.maximize(init_points=init_points_, n_iter=n_iter_)

    return optimizer

In [None]:
# True: download models.zip and extract it into root_path to reproduce the private
# score (presumably the archive holds the saved per-building optimization logs).
# False: run Bayesian optimization here for every building that has no log file yet.
recovery_private_result = True
if not recovery_private_result:
    b_num_list = sorted(df_0['건물번호'].unique())
    for num in b_num_list:
        now_file_list = os.listdir(root_path)
        if f"b_num_{num}_parmas.json" in now_file_list:
            # a log for this building already exists; skip it
            continue
        bo = Bayesian_Opt(num)
else:
    google_path = 'https://drive.google.com/uc?id='
    file_id = '1Tsixe39VwvqcgkQrTV0dL8oE24cgNrBW'
    output_path = f"{root_path}models.zip"
    gdown.download(google_path+file_id,output_path,quiet=False)
    extract_zip(output_path,root_path)

Downloading...
From: https://drive.google.com/uc?id=1Tsixe39VwvqcgkQrTV0dL8oE24cgNrBW
To: /content/data/models.zip
100%|██████████| 4.16M/4.16M [00:00<00:00, 87.3MB/s]


Start extracting zip file.


# 8/15 Target Modeling

In [None]:
# Reload the CSVs from root_path (the optimization section overwrote train/test);
# the test frame gets an all-NaN target column.
sample_submission = pd.read_csv(root_path + "sample_submission.csv")
train = pd.read_csv(root_path + "train.csv")
test = pd.read_csv(root_path + "test.csv")
test['전력소비량(kWh)'] = np.nan

In [None]:
# Rebuild the data set for the Aug-15 model: rows of train whose '일시' string sorts
# after '20220815' become the test set (true targets saved in test_ans, then blanked
# to NaN); rows sorting before it remain as train.
test_split = train.loc[train['일시'].gt('20220815'), list(test.columns)]
test_ans = test_split.loc[:, ['num_date_time','전력소비량(kWh)']].copy()
test_split['전력소비량(kWh)'] = np.nan
test = test_split.copy()
train = train.loc[train['일시'].lt('20220815')]

In [None]:
# Baseline LightGBM parameters; the training loop copies this dict and overrides
# selected entries with the tuned values for each building.
ref_params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='mape',
    subsample=1,
    subsample_freq=1,
    learning_rate=0.01,
    num_leaves=31,          # 2**5 - 1
    min_data_in_leaf=255,   # 2**8 - 1
    feature_fraction=1,
    max_bin=1024,
    n_estimators=50000,
    boost_from_average=False,
    verbose=-1,
    seed=777,
)

In [None]:
# 815 Target Modeling
# For every building: train one model per selected tuning configuration, keep the
# `final_ensemble_cnt` models with the lowest validation SMAPE, and use the median of
# their predictions for Aug 15 as the answer, which is also written back into `test`.
candidate_num = 48
model_num = 20
final_ensemble_cnt = 16
test_cut_day = 15

start_time = time.time()
Recursive_ans_list = []
seed_list = [777]
for seed in seed_list:
    print("--------------------------------")
    print(f"[seed: {seed}]\n")
    ref_params['seed'] = seed

    val_pred_list, val_real_list = [], []
    ans_list = []
    model_list = {}
    test['전력소비량(kWh)'] = float('nan')
    for lag in range(1):
        df_0, train_0, valid_0, test_0 = preprocess_and_create_features(train, test, month=8, valid_cut_day=8, test_cut_day=15, lag=lag)
        b_type_list = sorted(list(df_0['건물번호'].unique()))
        for b_type in b_type_list:
            params = ref_params.copy()
            ans_temp_list = []
            feature_marker_matrix, selected_params = read_params(b_type, candidate_num=candidate_num, model_num=model_num)
            for num in range(len(feature_marker_matrix)):
                try:
                    feature_marker_list, best_params = feature_marker_matrix[num], selected_params[num]
                    feature_list = [df_0.columns[_] for _ in feature_marker_list]
                    df_b_type = df_0.loc[df_0['건물번호']==b_type, feature_list]
                    train_b_type = train_0.loc[train_0['건물번호']==b_type, feature_list]
                    valid_b_type = valid_0.loc[valid_0['건물번호']==b_type, feature_list]
                    test_b_type = test_0.loc[test_0['건물번호']==b_type, feature_list]

                    params['subsample'] = best_params['subsample']
                    params['subsample_freq'] =  int(best_params['subsample_freq'])
                    #params['learning_rate'] =  10**best_params['learning_rate']
                    params['num_leaves'] =  int(2**best_params['num_leaves']-1)
                    params['min_data_in_leaf'] =  int(2**best_params['min_data_in_leaf']-1)
                    params['feature_fraction'] =  best_params['feature_fraction']
                    params['max_bin'] =  int(2**best_params['max_bin'])
                except Exception as err:
                    # Best-effort fallback to all features, but without swallowing
                    # KeyboardInterrupt/SystemExit and without hiding the real cause.
                    print(f"No param json b_type {b_type}...")
                    print(f"  reason: {err!r}")
                    df_b_type = df_0[df_0['건물번호']==b_type]
                    train_b_type = train_0[train_0['건물번호']==b_type]
                    valid_b_type = valid_0[valid_0['건물번호']==b_type]
                    test_b_type = test_0[test_0['건물번호']==b_type]

                X_train, y_train = train_b_type.drop('전력소비량(kWh)',axis=1), train_b_type['전력소비량(kWh)']
                X_valid, y_valid = valid_b_type.drop('전력소비량(kWh)',axis=1), valid_b_type['전력소비량(kWh)']
                X_test, y_test = test_b_type.drop("전력소비량(kWh)", axis=1), test_b_type['전력소비량(kWh)']

                # Only the day being predicted in this pass (Aug 15 + lag).
                test_target = test_b_type[(test_b_type['month']==8)&(test_b_type['day']==test_cut_day+lag)]
                X_test_target, y_test_target = test_target.drop('전력소비량(kWh)',axis=1), test_target['전력소비량(kWh)']

                # Create model
                if b_type not in model_list:
                    model_list[b_type] = {}
                if num not in model_list[b_type]:
                    print(f"[day {lag+1}] [b_type {b_type}] [num {num}] Train model ... {(time.time()-start_time)/60:.2f}min")
                    print("--------")

                    model = lgb.train(params=params,
                            train_set=lgb.Dataset(X_train.drop(['건물번호'],axis=1), y_train),
                            valid_sets=lgb.Dataset(X_valid.drop(['건물번호'],axis=1), y_valid),
                            feval=lgbm_smape,
                            verbose_eval=2000,
                            early_stopping_rounds=100)
                    model_list[b_type][num] = model

                    del model
                    gc.collect()

                    print("--------")

                # Validation predict
                pred_val = model_list[b_type][num].predict(X_valid.drop(['건물번호'],axis=1))
                val_pred_list += list(pred_val)
                val_real_list += list(y_valid)
                smape_val = np.mean(2 * abs(pred_val - y_valid) / (abs(pred_val) + abs(y_valid))) * 100

                # Test predict
                pred_eva = model_list[b_type][num].predict(X_test_target.drop(['건물번호'],axis=1))
                ans = X_test_target[['건물번호','month','day','time']].copy()
                ans['answer'] = pred_eva
                ans = ans.sort_values(['건물번호','month','day','time']).reset_index(drop=True)
                ans['num_date_time'] = ans['건물번호'].astype(str) + "_2022" + ans['month'].map(lambda x: str(x).zfill(2)) + ans['day'].map(lambda x: str(x).zfill(2)) + ' ' + ans['time'].map(lambda x: str(x).zfill(2))
                ans_temp_list.append((smape_val,ans))

                del df_b_type, train_b_type, valid_b_type, test_b_type, X_train, y_train, X_valid, y_valid, X_test, y_test, test_target, X_test_target, y_test_target
                gc.collect()

            # Sort on the SMAPE only: comparing whole tuples falls through to comparing
            # DataFrames when two scores tie, which raises ValueError.
            ans_temp_list = [ans_tuple[-1] for ans_tuple in sorted(ans_temp_list, key=lambda ans_tuple: ans_tuple[0])[:final_ensemble_cnt]]
            # `ans` (from the last model) is reused as the container for the ensemble median.
            ans.loc[:,'answer'] = pd.concat(ans_temp_list,axis=1)[['answer']].median(axis=1)
            ans_list.append(ans)

            # Update TestSet
            print(f"[day {lag+1}] [b_type {b_type}] Update Recursive")
            test.loc[test['num_date_time'].isin(ans['num_date_time']),'전력소비량(kWh)'] = ans['answer'].values
            print("------------------------")

            del ans, smape_val, ans_temp_list
            gc.collect()


        print("--------")
        print("Done\n")

    val_pred_list = np.array(val_pred_list)
    val_real_list = np.array(val_real_list)
    smape_val = np.mean(2 * abs(val_pred_list - val_real_list) / (abs(val_pred_list) + abs(val_real_list))) * 100
    print(f"final_val_SMAPE : {smape_val}")
    print("--------")

    Recursive_ans = pd.concat(ans_list,axis=0,ignore_index=True)
    Recursive_ans = Recursive_ans.sort_values(['건물번호','month','day','time']).reset_index(drop=True)
    Recursive_ans = Recursive_ans[['num_date_time','month','day','time','answer']].copy()
    Recursive_ans_list.append(Recursive_ans['answer'].values)
# Average the per-seed answers (a single seed here, so this is the identity).
Recursive_ans['answer'] = np.mean(Recursive_ans_list,axis=0)
Recursive_ans

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
--------
[day 1] [b_type 60] [num 11] Train model ... 34.12min
--------
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1693]	valid_0's mape: 0.064956	valid_0's SMAPE: 6.94945
--------
[day 1] [b_type 60] [num 12] Train model ... 34.56min
--------
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1204]	valid_0's mape: 0.0636266	valid_0's SMAPE: 6.85524
--------
[day 1] [b_type 60] [num 13] Train model ... 34.58min
--------
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[821]	valid_0's mape: 0.0676562	valid_0's SMAPE: 7.22451
--------
[day 1] [b_type 60] [num 14] Train model ... 34.59min
--------
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[621]	valid_0's mape: 0.0569929	valid_0's SMAPE: 6.06963
--------
[day 1] [b_type 60] [num 15] Train m

Unnamed: 0,num_date_time,month,day,time,answer
0,1_20220815 00,8,15,0,1930.899502
1,1_20220815 01,8,15,1,1875.406035
2,1_20220815 02,8,15,2,1788.252578
3,1_20220815 03,8,15,3,1707.473613
4,1_20220815 04,8,15,4,1728.369127
...,...,...,...,...,...
2395,100_20220815 19,8,15,19,1142.708285
2396,100_20220815 20,8,15,20,1033.975356
2397,100_20220815 21,8,15,21,926.747014
2398,100_20220815 22,8,15,22,813.332572


In [None]:
# Keep an independent (deep) snapshot of the 8/15 forecast under its own name;
# `Recursive_ans` is reassigned later by the main modeling loop.
ans_815 = Recursive_ans.copy(deep=True)
ans_815

Unnamed: 0,num_date_time,month,day,time,answer
0,1_20220815 00,8,15,0,1930.899502
1,1_20220815 01,8,15,1,1875.406035
2,1_20220815 02,8,15,2,1788.252578
3,1_20220815 03,8,15,3,1707.473613
4,1_20220815 04,8,15,4,1728.369127
...,...,...,...,...,...
2395,100_20220815 19,8,15,19,1142.708285
2396,100_20220815 20,8,15,20,1033.975356
2397,100_20220815 21,8,15,21,926.747014
2398,100_20220815 22,8,15,22,813.332572


In [None]:
# Persist the 8/15 forecast (key column + prediction only).
# os.path.join is used instead of f"{root_path}/..." so the file lands inside
# `root_path` whether or not that string ends with a path separator; the CSV
# reads in this notebook build paths as f"{root_path}train.csv", i.e. without
# inserting a separator.
ans_815[['num_date_time','answer']].to_csv(os.path.join(root_path, "ans_815.csv"), index=False)

# Main Modeling

In [None]:
# Read DataSet, again
train = pd.read_csv(f"{root_path}train.csv")
test = pd.read_csv(f"{root_path}test.csv")

# Replace the target of every `train` row whose key appears in `ans_815` with
# the corresponding forecast. Values are matched on `num_date_time` instead of
# by position, so the result does not depend on `train` and `ans_815` sharing
# the same row order.
answer_by_key = ans_815.set_index('num_date_time')['answer']
replace_mask = train['num_date_time'].isin(answer_by_key.index)
train.loc[replace_mask, '전력소비량(kWh)'] = train.loc[replace_mask, 'num_date_time'].map(answer_by_key)

# Add the target column to `test` as NaN; the recursive loop fills it in.
test['전력소비량(kWh)'] = float('nan')

In [None]:
# Reference LightGBM parameters. The modeling loop copies this mapping and
# overrides the sampling / tree-shape entries per candidate, and rewrites
# 'seed' for every entry of its seed list.
ref_params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='mape',
    subsample=1,
    subsample_freq=1,
    learning_rate=0.01,
    num_leaves=(1 << 5) - 1,
    min_data_in_leaf=(1 << 8) - 1,
    feature_fraction=1,
    max_bin=1024,
    n_estimators=50000,
    boost_from_average=False,
    verbose=-1,
    seed=777,
)

In [None]:
# Main Recursive Modeling
#
# Forecasts seven consecutive August days (test_cut_day .. test_cut_day + 6),
# one day per `lag`. For each day the features are rebuilt from `train`/`test`,
# every building's candidate models predict that day, the best candidates are
# blended, and the blend is written back into `test`, which is handed to
# preprocess_and_create_features again on the next day.
candidate_num = 48        # forwarded to read_params (semantics defined there)
model_num = 20            # forwarded to read_params (semantics defined there)
final_ensemble_cnt = 16   # number of lowest-validation-SMAPE candidates kept for the median blend
test_cut_day = 25         # first day of August that is forecast (lag 0)

BO_target_num = []
start_time = time.time()
Recursive_ans_list = []
seed_list = [777]
for seed in seed_list:
    print("--------------------------------")
    print(f"[seed: {seed}]\n")
    ref_params['seed'] = seed

    val_pred_list, val_real_list = [], []
    ans_list = []
    # model_list[b_type][num] caches each trained booster: it is fitted the
    # first time that (building, candidate) pair is reached and reused for the
    # remaining lags.
    model_list = {}
    test['전력소비량(kWh)'] = float('nan')
    for lag in range(7):
        df_0, train_0, valid_0, test_0 = preprocess_and_create_features(train, test, befor_815=False)
        b_type_list = sorted(list(df_0['건물번호'].unique()))
        for b_type in b_type_list:
            ans_temp_list = []
            feature_marker_matrix, selected_params = read_params(b_type, candidate_num=candidate_num, model_num=model_num)
            for num in range(len(feature_marker_matrix)):
                # Start every candidate from the reference parameters so that
                # nothing leaks over from the previous candidate.
                params = ref_params.copy()
                try:
                    feature_marker_list, best_params = feature_marker_matrix[num], selected_params[num]
                    feature_list = [df_0.columns[_] for _ in feature_marker_list]
                    train_b_type = train_0.loc[train_0['건물번호']==b_type, feature_list]
                    valid_b_type = valid_0.loc[valid_0['건물번호']==b_type, feature_list]
                    test_b_type = test_0.loc[test_0['건물번호']==b_type, feature_list]

                    params['subsample'] = best_params['subsample']
                    params['subsample_freq'] =  int(best_params['subsample_freq'])
                    #params['learning_rate'] =  10**best_params['learning_rate']
                    params['num_leaves'] =  int(2**best_params['num_leaves']-1)
                    params['min_data_in_leaf'] =  int(2**best_params['min_data_in_leaf']-1)
                    params['feature_fraction'] =  best_params['feature_fraction']
                    params['max_bin'] =  int(2**best_params['max_bin'])
                except Exception as err:
                    # Fallback: all columns and the untouched reference
                    # parameters. `Exception` (not a bare except) lets
                    # KeyboardInterrupt/SystemExit propagate, and the cause is
                    # reported instead of being hidden.
                    print(f"No param json b_type {b_type}...")
                    print(f"  -> {type(err).__name__}: {err}")
                    params = ref_params.copy()  # discard any half-applied overrides
                    train_b_type = train_0[train_0['건물번호']==b_type]
                    valid_b_type = valid_0[valid_0['건물번호']==b_type]
                    test_b_type = test_0[test_0['건물번호']==b_type]

                X_train, y_train = train_b_type.drop('전력소비량(kWh)',axis=1), train_b_type['전력소비량(kWh)']
                X_valid, y_valid = valid_b_type.drop('전력소비량(kWh)',axis=1), valid_b_type['전력소비량(kWh)']

                # Rows of the single day being forecast in this lag.
                test_target = test_b_type[(test_b_type['month']==8)&(test_b_type['day']==test_cut_day+lag)]
                X_test_target = test_target.drop('전력소비량(kWh)',axis=1)

                # Create model
                if b_type not in model_list:
                    model_list[b_type] = {}
                if num not in model_list[b_type]:
                    print(f"[day {lag+1}] [b_type {b_type}] [num {num}] Train model ... {(time.time()-start_time)/60:.2f}min")
                    print("--------")

                    model = lgb.train(params=params,
                            train_set=lgb.Dataset(X_train.drop(['건물번호'],axis=1), y_train),
                            valid_sets=lgb.Dataset(X_valid.drop(['건물번호'],axis=1), y_valid),
                            feval=lgbm_smape,
                            verbose_eval=2000,
                            early_stopping_rounds=100)
                    model_list[b_type][num] = model

                    del model
                    gc.collect()

                    print("--------")

                # Validation predict
                pred_val = model_list[b_type][num].predict(X_valid.drop(['건물번호'],axis=1))
                val_pred_list += list(pred_val)
                val_real_list += list(y_valid)
                smape_val = np.mean(2 * abs(pred_val - y_valid) / (abs(pred_val) + abs(y_valid))) * 100

                # Test predict
                pred_eva = model_list[b_type][num].predict(X_test_target.drop(['건물번호'],axis=1))
                ans = X_test_target[['건물번호','month','day','time']].copy()
                ans['answer'] = pred_eva
                ans = ans.sort_values(['건물번호','month','day','time']).reset_index(drop=True)
                ans['num_date_time'] = ans['건물번호'].astype(str) + "_2022" + ans['month'].map(lambda x: str(x).zfill(2)) + ans['day'].map(lambda x: str(x).zfill(2)) + ' ' + ans['time'].map(lambda x: str(x).zfill(2))
                ans_temp_list.append((smape_val,ans))

                del train_b_type, valid_b_type, test_b_type, X_train, y_train, X_valid, y_valid, test_target, X_test_target
                gc.collect()

            # Rank candidates by validation SMAPE only. Sorting the raw
            # (score, DataFrame) tuples would compare the DataFrames whenever
            # two scores tie, which raises ValueError.
            ranked = sorted(ans_temp_list, key=lambda pair: pair[0])
            ans_temp_list = [frame for _, frame in ranked[:final_ensemble_cnt]]
            # `ans` is still the last candidate's frame; it is reused as the key
            # template and its 'answer' column is replaced by the row-wise
            # median of the selected candidates.
            ans.loc[:,'answer'] = pd.concat(ans_temp_list,axis=1)[['answer']].median(axis=1)
            ans_list.append(ans)

            # Update TestSet
            print(f"[day {lag+1}] [b_type {b_type}] Update Recursive")
            test.loc[test['num_date_time'].isin(ans['num_date_time']),'전력소비량(kWh)'] = ans['answer'].values
            print("------------------------")

            del ans, smape_val, ans_temp_list
            gc.collect()

        print("--------")
        print("Done\n")

    # Overall validation SMAPE over every candidate prediction collected above.
    val_pred_list = np.array(val_pred_list)
    val_real_list = np.array(val_real_list)
    smape_val = np.mean(2 * abs(val_pred_list - val_real_list) / (abs(val_pred_list) + abs(val_real_list))) * 100
    print(f"final_val_SMAPE : {smape_val}")
    print("--------")

    Recursive_ans = pd.concat(ans_list,axis=0,ignore_index=True)
    Recursive_ans = Recursive_ans.sort_values(['건물번호','month','day','time']).reset_index(drop=True)
    Recursive_ans = Recursive_ans[['num_date_time','month','day','time','answer']].copy()
    Recursive_ans_list.append(Recursive_ans['answer'].values)
# Average the per-seed forecasts element-wise.
Recursive_ans['answer'] = np.mean(Recursive_ans_list,axis=0)
Recursive_ans

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
--------
[day 1] [b_type 70] [num 6] Train model ... 53.34min
--------
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[346]	valid_0's mape: 0.126714	valid_0's SMAPE: 13.3219
--------
[day 1] [b_type 70] [num 7] Train model ... 53.35min
--------
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[338]	valid_0's mape: 0.0950127	valid_0's SMAPE: 9.52525
--------
[day 1] [b_type 70] [num 8] Train model ... 53.36min
--------
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[455]	valid_0's mape: 0.123568	valid_0's SMAPE: 13.0587
--------
[day 1] [b_type 70] [num 9] Train model ... 53.38min
--------
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[409]	valid_0's mape: 0.121319	valid_0's SMAPE: 12.8006
--------
[day 1] [b_type 70] [num 10] Train model ...

Unnamed: 0,num_date_time,month,day,time,answer
0,1_20220825 00,8,25,0,1965.626490
1,1_20220825 01,8,25,1,1910.975698
2,1_20220825 02,8,25,2,1794.561720
3,1_20220825 03,8,25,3,1699.175230
4,1_20220825 04,8,25,4,1717.304688
...,...,...,...,...,...
16795,100_20220831 19,8,31,19,870.565485
16796,100_20220831 20,8,31,20,763.572874
16797,100_20220831 21,8,31,21,709.570542
16798,100_20220831 22,8,31,22,591.059985


In [None]:
# Write the final submission (key column + prediction only).
# os.path.join is used instead of f"{root_path}/..." so the file lands inside
# `root_path` whether or not that string ends with a path separator; the CSV
# reads in this notebook build paths as f"{root_path}train.csv", i.e. without
# inserting a separator.
Recursive_ans[['num_date_time','answer']].to_csv(os.path.join(root_path, "my_submission_LGBM_Final.csv"), index=False)

In [None]:
# plot_series
# One chart per building number 1..100: the building's target series from
# train_0 and valid_0, followed by the recursive forecast.
plot_df = pd.concat([train, test], axis=0, ignore_index=True)
forecast_rows = plot_df['num_date_time'].isin(Recursive_ans['num_date_time'])
plot_df = plot_df[forecast_rows]
# Positional assignment: assumes the selected rows are in the same order as
# Recursive_ans -- TODO confirm.
plot_df.loc[:, 'Recursive_ans'] = Recursive_ans['answer'].values
for building_no in range(1, 101):
    history = train_0[train_0['건물번호'] == building_no]['전력소비량(kWh)']
    holdout = valid_0[valid_0['건물번호'] == building_no]['전력소비량(kWh)']
    forecast = plot_df[plot_df['건물번호'] == building_no]['Recursive_ans']
    plot_series(history, holdout, forecast, markers=[','] * 3, title=f"b_type: {building_no}")
