## Import 

In [1]:
import random
import pandas as pd
import numpy as np
import os
import math
import matplotlib.pyplot as plt

# data pre-processing load
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor

# others ML model load
# from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
# from catboost import CatBoostRegressor

from tqdm import tqdm

## Fixed Random-Seed

In [2]:
# Fix every random seed for reproducibility

def seed_everything(seed):
    """Seed Python's `random`, NumPy's global RNG and PYTHONHASHSEED with `seed`."""
    seed_text = str(seed)
    os.environ['PYTHONHASHSEED'] = seed_text
    np.random.seed(seed)
    random.seed(seed)


seed_everything(42)  # fix the seed

## Load Data

In [3]:
# Configure the input paths (all files live under `dir_path`)

dir_path = "./open"

train_path, original_train_path, test_path, building_path = (
    os.path.join(dir_path, file_name)
    for file_name in (
        "train_split.csv",     # training split
        "train.csv",           # original (unsplit) training data
        "valid_split.csv",     # validation split, used as the "test" set below
        "building_info.csv",   # building information
    )
)

In [4]:
# Load the DataFrames (same read order as the paths were defined)

train_df, original_train_df, test_df, building_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, original_train_path, test_path, building_path)
)

In [5]:
# Preview the raw training split as loaded from `train_path`
train_df

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh)
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,,,1085.28
1,1_20220601 01,1,20220601 01,18.0,,1.1,45.0,,,1047.36
2,1_20220601 02,1,20220601 02,17.7,,1.5,45.0,,,974.88
3,1_20220601 03,1,20220601 03,16.7,,1.4,48.0,,,953.76
4,1_20220601 04,1,20220601 04,18.4,,2.8,43.0,,,986.40
...,...,...,...,...,...,...,...,...,...,...
187195,100_20220817 19,100,20220817 19,26.3,,2.8,68.0,1.0,,1049.52
187196,100_20220817 20,100,20220817 20,24.7,,1.4,75.0,0.0,,874.32
187197,100_20220817 21,100,20220817 21,22.4,,0.5,89.0,,,678.24
187198,100_20220817 22,100,20220817 22,22.6,,1.2,88.0,,,632.64


In [6]:
# Preview the raw validation split as loaded from `test_path`
test_df

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh)
0,1_20220818 00,1,20220818 00,24.1,,2.1,78.0,,,1898.88
1,1_20220818 01,1,20220818 01,23.4,,2.2,82.0,,,1804.80
2,1_20220818 02,1,20220818 02,23.1,,2.0,84.0,,,1626.72
3,1_20220818 03,1,20220818 03,23.2,,1.9,84.0,,,1565.28
4,1_20220818 04,1,20220818 04,22.7,,2.1,87.0,,,1584.48
...,...,...,...,...,...,...,...,...,...,...
16795,100_20220824 19,100,20220824 19,23.1,,0.9,86.0,0.5,,881.04
16796,100_20220824 20,100,20220824 20,22.4,,1.3,86.0,0.0,,798.96
16797,100_20220824 21,100,20220824 21,21.3,,1.0,92.0,,,825.12
16798,100_20220824 22,100,20220824 22,21.0,,0.3,94.0,,,640.08


## Data Pre-processing

### train과 test 동일하게 진행

In [7]:
def rename_columns(df):
    """Return a copy of `df` whose columns are renamed, by position, to English names.

    The i-th column of `df` receives the i-th name of the list below. When `df`
    has fewer columns than there are names the surplus names are unused, and
    any column beyond the tenth keeps its original name.
    """
    english_names = ('num_date_time', 'building_num', 'date', 'temp', 'prec',
                     'wind', 'hum', 'sunshine', 'solar', 'power')

    # Positional pairing: zip stops at the shorter of the two sequences.
    positional_mapping = dict(zip(df.columns, english_names))

    return df.rename(columns=positional_mapping)

def handling_missing_values(df):
    """Fill missing weather values in `df` (modified in place) and return it.

    * `prec` (precipitation): missing values become 0.0.
    * `wind`, `hum`: missing values become that column's mean, rounded to
      two decimals.
    """
    # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form operates on a temporary Series, is deprecated,
    # and no longer updates `df` once pandas Copy-on-Write is active.
    df['prec'] = df['prec'].fillna(0.0)

    for col in ('wind', 'hum'):
        df[col] = df[col].fillna(round(df[col].mean(), 2))

    return df

def create_time_columns(df):
    """Parse `date` into datetimes and add `hour`, `day`, `month`, `week` to `df` in place.

    `day` is the weekday number (Monday=0 ... Sunday=6) and `week` the ISO
    calendar week. Returns the same `df` object.
    """
    timestamps = pd.to_datetime(df['date'])
    parts = timestamps.dt

    df['date'] = timestamps
    df['hour'] = parts.hour
    df['day'] = parts.weekday
    df['month'] = parts.month
    df['week'] = parts.isocalendar().week

    return df

def create_holiday(df, holidays=('2022-06-01', '2022-06-06', '2022-08-15')):
    """Add a binary `holiday` column to `df` (modified in place) and return it.

    A row is flagged 1 when it is not a weekday (`day` < 5 is False) or when
    its `date` falls on one of the calendar days in `holidays`; otherwise 0.

    Parameters
    ----------
    df : DataFrame
        Needs a datetime `date` column and a numeric `day` column
        (Monday=0 ... Sunday=6).
    holidays : iterable of date-likes, optional
        Public holidays to flag. Defaults to the three dates used so far:
        local election day (Jun 1), Memorial Day (Jun 6) and Liberation Day
        (Aug 15) of 2022.
    """
    # Vectorised replacement of the former row-wise `df.apply(..., axis=1)`.
    is_weekend = ~(df['day'] < 5)

    # Compare calendar days only, so every hour of a holiday is flagged.
    holiday_days = pd.to_datetime(list(holidays))
    is_public_holiday = df['date'].dt.normalize().isin(holiday_days)

    df['holiday'] = (is_weekend | is_public_holiday).astype(int)

    return df

def create_sin_cos_hour(df):
    """Add `sin_hour` / `cos_hour`, a cyclical (24-hour period) encoding of `hour`, to `df` in place."""
    hour_angle = 2*np.pi*df['hour']/24

    df['sin_hour'] = np.sin(hour_angle)
    df['cos_hour'] = np.cos(hour_angle)

    return df

def create_temp_f(df):
    """Add `temp_f`, the Fahrenheit equivalent of the Celsius `temp` column, to `df` in place."""
    celsius = df['temp']
    df['temp_f'] = celsius * 9 / 5 + 32

    return df

def create_wind_chill_temp(df):
    """Add `wind_chill_temp` (perceived temperature) computed from `temp` and `wind` to `df` in place.

    Formula reference:
    https://www.weather.go.kr/w/theme/daily-life/regional-composite-index.do
    """
    temp = df['temp']
    # `wind` is multiplied by 3.6 (m/s -> km/h) before the 0.16 power is taken
    wind_term = (df['wind']*3.6)**0.16

    df['wind_chill_temp'] = 13.12 + 0.6215*temp - 11.37*wind_term + 0.3965*wind_term*temp

    return df

# Add the heat-index column
# https://www.wpc.ncep.noaa.gov/html/heatindex_equation.shtml
def create_Heat_index(df):
    """Add `Heat_index`, computed from `temp_f` (Fahrenheit) and `hum` (%), to `df` in place.

    * `temp_f` < 80: the simple averaging formula is used.
    * `temp_f` >= 80: the full regression equation is used, followed by the
      high-humidity (RH > 85, 80 < T < 87) and low-humidity
      (RH < 13, 80 < T < 112) adjustments.

    Rows matching neither temperature condition (e.g. missing `temp_f`) keep 0.0.
    Returns the same `df` object.
    """
    T = df['temp_f']
    RH = df['hum']

    # Float Series aligned on df's own index: a fresh integer RangeIndex Series
    # cannot take float values cleanly and breaks the boolean masks / final
    # assignment whenever `df` does not carry a default 0..n-1 index.
    HI = pd.Series(0.0, index=df.index, name='Heat_index')

    below_80 = T < 80
    at_least_80 = T >= 80
    humid_adjust = (RH > 85) & ((T > 80) & (T < 87))
    dry_adjust = (RH < 13) & ((T > 80) & (T < 112))

    # Simple formula for the cooler rows
    T_lo, RH_lo = T[below_80], RH[below_80]
    HI[below_80] = 0.5 * (T_lo + 61.0 + ((T_lo-68.0)*1.2) + (RH_lo*0.094))

    # Full regression equation for the warmer rows
    T_hi, RH_hi = T[at_least_80], RH[at_least_80]
    HI[at_least_80] = (-42.379 + 2.04901523*T_hi + 10.14333127*RH_hi
                       - .22475541*T_hi*RH_hi - .00683783*T_hi*T_hi
                       - .05481717*RH_hi*RH_hi + .00122874*T_hi*T_hi*RH_hi
                       + .00085282*T_hi*RH_hi*RH_hi
                       - .00000199*T_hi*T_hi*RH_hi*RH_hi)

    # Adjustments applied on top of the regression value
    HI[humid_adjust] = HI[humid_adjust] + ((RH[humid_adjust]-85)/10) * ((87-T[humid_adjust])/5)

    HI[dry_adjust] = HI[dry_adjust] - ((13-RH[dry_adjust])/4)*np.sqrt((17-abs(T[dry_adjust]-95.))/17)

    df['Heat_index'] = HI

    return df

    
def create_thi(df):
    """Add `THI` (Temperature Humidity Index), computed from `temp` and `hum`, to `df` in place."""
    scaled_temp = 9/5*df['temp']
    dryness = 1-df['hum']/100

    df['THI'] = scaled_temp - 0.55*dryness*(scaled_temp-26)+32

    return df

def create_cdh(df):
    """Add `CDH` (Cooling Degree Hour) to `df` in place and return it.

    For every building (`building_num`) and every row, CDH is the sum of
    `temp - 26` over the current row and up to 11 preceding rows of the same
    building, in row order (a trailing window of at most 12 rows).

    The values are computed per group and written back aligned on df's index,
    so the frame needs neither to be sorted by building nor to contain exactly
    the buildings 1..100.
    """
    def CDH(xs):
        # Trailing window of up to 12 observations ending at position i
        return np.array(
            [np.sum(xs[max(0, i-11):(i+1)] - 26) for i in range(len(xs))],
            dtype=float,
        )

    df['CDH'] = (
        df.groupby('building_num', sort=False)['temp']
          .transform(lambda temps: CDH(temps.to_numpy()))
          .astype('float64')
    )

    return df

def create_working_hour(df):
    """Add binary time-of-day indicator columns derived from `hour` and `day` to `df` in place.

    * work_hour    : 8 <= hour <= 19
    * lunch_hour   : 11 <= hour <= 13 and day <= 4
    * lunch_hour2  : 12 <= hour <= 14 and day > 4
    * dinner_hour  : 17 <= hour <= 22
    * dinner_hour2 : hour >= 18 and 4 <= day <= 5
    """
    hour, weekday = df['hour'], df['day']

    indicators = {
        'work_hour': (hour >= 8) & (hour <= 19),
        'lunch_hour': (hour >= 11) & (hour <= 13) & (weekday <= 4),
        'lunch_hour2': (hour >= 12) & (hour <= 14) & (weekday > 4),
        'dinner_hour': (hour >= 17) & (hour <= 22),
        'dinner_hour2': (hour >= 18) & (weekday >= 4) & (weekday <= 5),
    }

    # Insertion order of the dict fixes the order of the new columns.
    for column, mask in indicators.items():
        df[column] = mask.astype(int)

    return df

In [8]:
### Aggregated `power` statistics used as features

## Mean power per building, hour and weekday : day_hour_mean
def create_day_hour_mean(df):
    """Return the mean of `power` for each (`building_num`, `hour`, `day`) group.

    The result has the columns `building_num`, `hour`, `day`, `day_hour_mean`.
    """
    # Use the string alias: passing `np.mean` is deprecated in pandas and is
    # slated to stop being translated into the groupby implementation.
    day_hour_power_mean = pd.pivot_table(df, values='power',
                                         index=['building_num', 'hour', 'day'],
                                         aggfunc='mean').reset_index()

    day_hour_power_mean.columns = ['building_num', 'hour', 'day', 'day_hour_mean']

    return day_hour_power_mean

def create_day_hour_std(df):
    """Return the sample standard deviation of `power` for each (`building_num`, `hour`, `day`) group.

    The result has the columns `building_num`, `hour`, `day`, `day_hour_std`.
    """
    # 'std' pins pandas' sample standard deviation (ddof=1). Passing `np.std`
    # is deprecated and, once applied directly, would switch to ddof=0.
    day_hour_power_std = pd.pivot_table(df, values='power',
                                        index=['building_num', 'hour', 'day'],
                                        aggfunc='std').reset_index()

    day_hour_power_std.columns = ['building_num', 'hour', 'day', 'day_hour_std']

    return day_hour_power_std


def create_hour_mean(df):
    """Return the mean of `power` for each (`building_num`, `hour`) group.

    The result has the columns `building_num`, `hour`, `hour_mean`.
    """
    # Use the string alias: passing `np.mean` is deprecated in pandas.
    hour_power_mean = pd.pivot_table(df, values='power',
                                     index=['building_num', 'hour'],
                                     aggfunc='mean').reset_index()

    hour_power_mean.columns = ['building_num', 'hour', 'hour_mean']

    return hour_power_mean

def create_hour_std(df):
    """Return the sample standard deviation of `power` for each (`building_num`, `hour`) group.

    The result has the columns `building_num`, `hour`, `hour_std`.
    """
    # 'std' pins pandas' sample standard deviation (ddof=1). Passing `np.std`
    # is deprecated and, once applied directly, would switch to ddof=0.
    hour_power_std = pd.pivot_table(df, values='power',
                                    index=['building_num', 'hour'],
                                    aggfunc='std').reset_index()

    hour_power_std.columns = ['building_num', 'hour', 'hour_std']

    return hour_power_std

In [9]:
def train_pre_processing(train_df):
    """Run the full feature-engineering pipeline on the training frame and return the result.

    Steps, in order: rename columns to English, fill missing values, derive the
    time / working-hour / holiday / cyclical-hour features, derive the weather
    indices (Fahrenheit temperature, wind chill, heat index, THI, CDH), merge
    the per-(building, hour, weekday) mean and std of `power`, then drop the
    columns `num_date_time`, `date`, `sunshine`, `solar` and `hour`.
    """
    # Row-wise feature builders; each takes a frame and returns a frame.
    feature_steps = (
        rename_columns,            # English column names
        handling_missing_values,   # fill missing values
        create_time_columns,       # time-series related columns
        create_working_hour,
        create_holiday,            # holiday flag
        create_sin_cos_hour,       # sin & cos of the hour
        create_temp_f,             # Fahrenheit temperature
        create_wind_chill_temp,    # wind chill temperature
        create_Heat_index,         # heat index
        create_thi,                # Temperature Humidity Index (THI)
        create_cdh,                # Cooling Degree Hour (CDH)
    )
    for step in feature_steps:
        train_df = step(train_df)

    # day_hour_mean, then day_hour_std: aggregated from this same frame and
    # joined back on the grouping keys.
    group_keys = ['building_num', 'hour', 'day']
    for build_stat in (create_day_hour_mean, create_day_hour_std):
        stat_table = build_stat(train_df)
        train_df = pd.merge(train_df, stat_table, how='left', on=group_keys)

    # Drop the columns that are not used as model inputs
    return train_df.drop(columns=['num_date_time', 'date', 'sunshine', 'solar', 'hour'])

In [10]:
def test_pre_processing(test_df, train_df=None):
    # 컬럼명 영어로 수정
    test_df = rename_columns(test_df)
    train_df = rename_columns(train_df)
    
    # 결측치 채우기
    test_df = handling_missing_values(test_df)
    
    # 시계열 - 시간 관련 변수들 생성
    test_df = create_time_columns(test_df)
    train_df = create_time_columns(train_df)
    
    test_df = create_working_hour(test_df)

    ## 공휴일 변수 추가
    test_df = create_holiday(test_df)
    
    # sin & cos 변수 추가
    test_df = create_sin_cos_hour(test_df)
        
    # 화씨 온도 변수 추가
    test_df = create_temp_f(test_df)

    # 체감 온도 변수 추가
    test_df = create_wind_chill_temp(test_df)

    # 열지수(Heat index) 변수 추가
    test_df = create_Heat_index(test_df)

    # 불쾌지수
    
    
    # Temperature Humidity Index(THI) 변수 추가
    test_df = create_thi(test_df)

    # Cooling Degree Hour(CDH) 변수 추가
    test_df = create_cdh(test_df)
    
    #  day_hour_mean 변수 추가
    day_hour_power_mean = create_day_hour_mean(train_df)
    test_df = pd.merge(test_df, day_hour_power_mean, how='left', on=['building_num', 'hour', 'day'])
    
    #  day_hour_std 변수 추가
    day_hour_power_std = create_day_hour_std(train_df)
    test_df = pd.merge(test_df, day_hour_power_std, how='left', on=['building_num', 'hour', 'day'])
    
    # 컬럼 제거
    # num_date_time, date, sunshine, solar, hour drop 컬럼 제거
    if 'sunshine' in test_df.columns.tolist() :
        test_df = test_df.drop(['sunshine', 'solar'], axis=1)
    
    test_df = test_df.drop(['num_date_time', 'date', 'hour'], axis=1)
    
    return test_df

In [11]:
# Build the engineered training features and preview them
dp_train_df = train_pre_processing(train_df)
dp_train_df

Unnamed: 0,building_num,temp,prec,wind,hum,power,day,month,week,work_hour,...,holiday,sin_hour,cos_hour,temp_f,wind_chill_temp,Heat_index,THI,CDH,day_hour_mean,day_hour_std
0,1,18.6,0.0,0.9,42.0,1085.28,2,6,22,0,...,1,0.000000,1.000000,65.48,19.858037,63.702,63.09388,-7.4,1730.20,514.351157
1,1,18.0,0.0,1.1,45.0,1047.36,2,6,22,0,...,1,0.258819,0.965926,64.40,19.031307,62.655,62.46400,-15.4,1645.24,498.420165
2,1,17.7,0.0,1.5,45.0,974.88,2,6,22,0,...,1,0.500000,0.866025,63.86,18.420654,62.061,62.08735,-23.7,1531.04,461.428496
3,1,16.7,0.0,1.4,48.0,953.76,2,6,22,0,...,1,0.707107,0.707107,62.06,17.348120,60.222,60.89884,-33.0,1485.80,434.979228
4,1,18.4,0.0,2.8,43.0,986.40,2,6,22,0,...,1,0.866025,0.500000,65.12,18.658787,63.353,62.88788,-40.6,1468.60,398.379783
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187195,100,26.3,0.0,2.8,68.0,1049.52,2,8,33,1,...,0,-0.965926,0.258819,79.34,28.102036,80.170,75.58416,22.5,971.86,169.075481
187196,100,24.7,0.0,1.4,75.0,874.32,2,8,33,0,...,0,-0.866025,0.500000,76.46,26.428987,77.331,73.92175,23.0,889.12,157.735045
187197,100,22.4,0.0,0.5,89.0,678.24,2,8,33,0,...,0,-0.707107,0.707107,72.32,24.307819,73.435,71.45364,20.8,775.26,149.094821
187198,100,22.6,0.0,1.2,88.0,632.64,2,8,33,0,...,0,-0.500000,0.866025,72.68,24.121286,73.784,71.71112,16.9,665.20,109.581312


In [12]:
# Build the engineered validation features; the raw training frame is passed
# as the source of the day_hour_mean / day_hour_std statistics
dp_test_df = test_pre_processing(test_df, train_df)
dp_test_df

Unnamed: 0,building_num,temp,prec,wind,hum,power,day,month,week,work_hour,...,holiday,sin_hour,cos_hour,temp_f,wind_chill_temp,Heat_index,THI,CDH,day_hour_mean,day_hour_std
0,1,24.1,0.0,2.1,78.0,1898.88,3,8,33,0,...,0,0.000000,1.000000,75.38,25.590398,76.284,73.27702,-1.9,1603.156364,460.172345
1,1,23.4,0.0,2.2,82.0,1804.80,3,8,33,0,...,0,0.258819,0.965926,74.12,24.750124,75.086,72.52412,-4.5,1526.923636,463.436431
2,1,23.1,0.0,2.0,84.0,1626.72,3,8,33,0,...,0,0.500000,0.866025,73.58,24.444627,74.586,72.20896,-7.4,1413.338182,430.915405
3,1,23.2,0.0,1.9,84.0,1565.28,3,8,33,0,...,0,0.707107,0.707107,73.76,24.585492,74.784,72.37312,-10.2,1354.647273,391.412133
4,1,22.7,0.0,2.1,87.0,1584.48,3,8,33,0,...,0,0.866025,0.500000,72.86,23.953052,73.935,71.79751,-13.5,1363.287273,371.955950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16795,100,23.1,0.0,0.9,86.0,881.04,2,8,34,1,...,0,-0.965926,0.258819,73.58,24.808277,74.680,72.38034,-19.6,971.860000,169.075481
16796,100,22.4,0.0,1.3,86.0,798.96,2,8,34,0,...,0,-0.866025,0.500000,72.32,23.856233,73.294,71.21736,-20.2,889.120000,157.735045
16797,100,21.3,0.0,1.0,92.0,825.12,2,8,34,0,...,0,-0.707107,0.707107,70.34,22.768173,71.398,69.79704,-22.3,775.260000,149.094821
16798,100,21.0,0.0,0.3,94.0,640.08,2,8,34,0,...,0,-0.500000,0.866025,69.80,23.090291,70.898,69.41060,-25.1,665.200000,109.581312


In [13]:
# Per-building weather frames: key 'df<N>' -> the temp/prec/wind/hum rows of
# building N from dp_test_df, re-indexed from 0
dic = {
    f'df{i}': dp_test_df.loc[dp_test_df['building_num'] == i,
                             ['temp', 'prec', 'wind', 'hum']].reset_index(drop=True)
    for i in range(1, 101)
}

In [14]:
# DataFrame grouping function
def group_equal_dataframes(dataframes, frames=None):
    """Group the positions of `dataframes` whose DataFrames are identical.

    Parameters
    ----------
    dataframes : sequence of keys
        Keys that are looked up in `frames`.
    frames : mapping of key -> DataFrame, optional
        Where the DataFrames are looked up. Defaults to the notebook-level
        `dic`, which keeps existing one-argument calls working.

    Returns
    -------
    list of list of int
        One list of 0-based positions (into `dataframes`) per group of frames
        that compare equal via `DataFrame.equals`. Frames that match no other
        frame are omitted.
    """
    if frames is None:
        frames = dic  # backward-compatible fallback to the global mapping

    grouped = []  # list of groups of positions
    grouped_indices = set()  # positions already assigned to an earlier group

    for i, key_i in enumerate(dataframes):
        if i in grouped_indices:
            continue

        group = [i]

        # Only later positions need checking; earlier ones were handled already.
        for j in range(i + 1, len(dataframes)):
            if j not in grouped_indices and frames[key_i].equals(frames[dataframes[j]]):
                group.append(j)
                grouped_indices.add(j)

        if len(group) > 1:
            grouped.append(group)

    return grouped

# Group the DataFrames; the returned groups hold 0-based positions in
# `dataframes_to_group`
dataframes_to_group = list(dic.keys())
grouped_dataframes = group_equal_dataframes(dataframes_to_group)

def add_one_to_2d_list(matrix):
    """Increment every element of the nested list `matrix` by one, in place, and return it."""
    for row in matrix:
        # Slice assignment keeps each inner list object and only replaces its items.
        row[:] = [value + 1 for value in row]
    return matrix

# Shift the 0-based positions by one so they line up with the 'df1'..'df100' building numbers
grouped_dataframes = add_one_to_2d_list(grouped_dataframes)

In [15]:
# Number of groups that contain more than one building
len(grouped_dataframes)

13

In [16]:
# Building numbers per group
grouped_dataframes

[[1,
  2,
  3,
  4,
  5,
  17,
  19,
  24,
  25,
  32,
  33,
  34,
  35,
  37,
  38,
  41,
  45,
  53,
  57,
  59,
  60,
  61,
  62,
  68,
  69,
  74,
  80,
  81,
  85,
  93,
  94],
 [6, 7, 12, 18, 36, 47, 58, 73, 77, 78, 79, 83, 84, 86, 92],
 [8, 11],
 [10, 13, 54, 67, 96],
 [14, 64, 82, 88, 89],
 [15, 51, 90],
 [22, 43, 44],
 [23, 66],
 [27, 39, 56, 63, 70, 71],
 [28, 98],
 [40, 48, 49, 55, 65, 72, 91],
 [42, 50],
 [52, 97]]

In [17]:
# Flatten the 2-D list of groups into a 1-D list of building numbers
flatten_grouped_dataframes = []
for sublist in grouped_dataframes:
    flatten_grouped_dataframes.extend(sublist)
len(flatten_grouped_dataframes)

85

In [18]:
# Buildings 1..100 that belong to no group (sorted, unique)
all_building_nums = np.arange(1, 101)
independent_buildings_15 = np.setdiff1d(all_building_nums, flatten_grouped_dataframes)
independent_buildings_15

array([  9,  16,  20,  21,  26,  29,  30,  31,  46,  75,  76,  87,  95,
        99, 100])

In [19]:
# Print the result: one line per group, numbered from 1
for group_id, group in enumerate(grouped_dataframes, start=1):
    print(f"Group {group_id}: {group}")

Group 1: [1, 2, 3, 4, 5, 17, 19, 24, 25, 32, 33, 34, 35, 37, 38, 41, 45, 53, 57, 59, 60, 61, 62, 68, 69, 74, 80, 81, 85, 93, 94]
Group 2: [6, 7, 12, 18, 36, 47, 58, 73, 77, 78, 79, 83, 84, 86, 92]
Group 3: [8, 11]
Group 4: [10, 13, 54, 67, 96]
Group 5: [14, 64, 82, 88, 89]
Group 6: [15, 51, 90]
Group 7: [22, 43, 44]
Group 8: [23, 66]
Group 9: [27, 39, 56, 63, 70, 71]
Group 10: [28, 98]
Group 11: [40, 48, 49, 55, 65, 72, 91]
Group 12: [42, 50]
Group 13: [52, 97]



## SMAPE 평가지표 정의

$$ SMAPE(y, \hat{y}) = \frac{100}{n}\sum_{i=1}^{n}\frac{\left|y_i - \hat{y_i}\right|}{(|y_i| + |\hat{y_i}|) / 2} $$


In [20]:
# Define SMAPE loss function
def SMAPE(actual, pred):
    """Symmetric mean absolute percentage error, in percent.

    SMAPE = 100 * mean( 2 * |actual - pred| / (|actual| + |pred|) )

    Parameters
    ----------
    actual, pred : scalar or array-like of numbers
        Compared element by element, by position (any pandas index is ignored).

    Returns
    -------
    float
        Terms where `actual` and `pred` are both 0 contribute 0 (a perfect
        prediction) instead of producing NaN from 0/0.
    """
    actual = np.atleast_1d(np.asarray(actual, dtype=float))
    pred = np.atleast_1d(np.asarray(pred, dtype=float))

    numerator = 2 * np.abs(actual - pred)
    denominator = np.abs(actual) + np.abs(pred)

    # Divide only where the denominator is non-zero; other terms stay 0.
    ratio = np.divide(numerator, denominator,
                      out=np.zeros_like(numerator), where=denominator != 0)

    return 100 * np.mean(ratio)

아래 예시와 같이 평가 Metric인 SMAPE는 오차의 크기가 같더라도 <span style="color: red; font-weight: bold;">실제값보다 작게 추정(underestimate)할 때 더 큰 페널티를 부여합니다.</span>


이는 전력사용량을 높게 예측하는 것보다 작게 예측할 때 실제로 더 큰 문제가 될 수 있음을 반영한 것으로 보입니다.

In [21]:
# Same absolute error (50) in both directions: under- vs. over-estimating a true value of 100
print("실제값이 100일 때 50으로 underestimate할 때의 SMAPE : {}".format(SMAPE(100, 50)))
print("실제값이 100일 때 150으로 overestimate할 때의 SMAPE : {}".format(SMAPE(100, 150)))

실제값이 100일 때 50으로 underestimate할 때의 SMAPE : 66.66666666666666
실제값이 100일 때 150으로 overestimate할 때의 SMAPE : 40.0


## Custom Loss XGBoost 

In [22]:
#### The real objective function is wrapped in a factory that takes alpha, so alpha is easy to tune.
# custom objective function for forcing model not to underestimate
def weighted_mse(alpha = 1):
    """Build a squared-error objective whose under-predictions are weighted by `alpha`.

    The returned callable takes `(label, pred)` and returns `(grad, hess)`:
    gradient and hessian of the squared error with respect to `pred`,
    multiplied by `alpha` wherever `label - pred > 0` (the prediction is below
    the label) and by 1 elsewhere.
    """
    def weighted_mse_fixed(label, pred):
        residual = (label - pred).astype("float")
        # Per-sample weight: alpha for under-predictions, 1 otherwise
        weight = np.where(residual > 0, alpha, 1.0)
        grad = -2*weight*residual
        hess = 2*weight
        return grad, hess
    return weighted_mse_fixed

In [23]:
# Best boosting iteration per trained model: keyed by a tuple of building
# numbers (grouped models) or by a single building number
best_iterations_dict = {}

In [None]:
all_y_test = []
all_y_pred = []

# One model per group of buildings whose validation-period weather features are identical
for grouped_list in tqdm(grouped_dataframes):
    train_rows = dp_train_df[dp_train_df["building_num"].isin(grouped_list)]
    test_rows = dp_test_df[dp_test_df["building_num"].isin(grouped_list)]

    # `week` is cast to int64 before training (presumably because the ISO-week
    # column is not a plain NumPy integer dtype; verify against create_time_columns)
    X_train = train_rows.drop(columns=['building_num', 'power']).astype({'week': 'int64'})
    y_train = train_rows['power']

    X_test = test_rows.drop(columns=['building_num', 'power']).astype({'week': 'int64'})
    y_test = test_rows['power']

    xgb_reg = XGBRegressor(n_estimators = 10000, eta = 0.01, min_child_weight = 6,
                           max_depth = 5, colsample_bytree = 0.5,
                           subsample = 0.9, seed=42)

    # Asymmetric objective (alpha=2) plus early stopping on the eval set
    xgb_reg.set_params(objective=weighted_mse(2), early_stopping_rounds=300)

    xgb_reg.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

    y_pred = xgb_reg.predict(X_test)

    # A list is not hashable, so the group is stored under its tuple form
    best_iterations_dict[tuple(grouped_list)] = xgb_reg.best_iteration

    all_y_test.extend(y_test)
    all_y_pred.extend(y_pred)

    print('best iterations: {}'.format(xgb_reg.best_iteration))
    print('SMAPE : {}'.format(SMAPE(y_test, y_pred)))
    print()

  0%|                                                                                        | 0/13 [00:00<?, ?it/s]

In [None]:
# One model per building that belongs to no group
for building_num in independent_buildings_15:

    train_rows = dp_train_df[dp_train_df["building_num"] == building_num]
    test_rows = dp_test_df[dp_test_df["building_num"] == building_num]

    # `week` is cast to int64 before training (presumably because the ISO-week
    # column is not a plain NumPy integer dtype; verify against create_time_columns)
    X_train = train_rows.drop(columns=['building_num', 'power']).astype({'week': 'int64'})
    y_train = train_rows['power']

    X_test = test_rows.drop(columns=['building_num', 'power']).astype({'week': 'int64'})
    y_test = test_rows['power']

    xgb_reg = XGBRegressor(n_estimators = 10000, eta = 0.01, min_child_weight = 6,
                           max_depth = 5, colsample_bytree = 0.5,
                           subsample = 0.9, seed=42)

    # Asymmetric objective (alpha=2) plus early stopping on the eval set
    xgb_reg.set_params(objective=weighted_mse(2), early_stopping_rounds=300)

    xgb_reg.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

    y_pred = xgb_reg.predict(X_test)

    best_iterations_dict[building_num] = xgb_reg.best_iteration

    # Appends to the lists created in the grouped-buildings cell above
    all_y_test.extend(y_test)
    all_y_pred.extend(y_pred)

    print(f"building_num : {building_num}")
    print('best iterations: {}'.format(xgb_reg.best_iteration))
    print('SMAPE : {}'.format(SMAPE(y_test, y_pred)))
    print()

In [None]:
# Overall SMAPE over every prediction collected by the two training loops above
print("-"*45)
print(f'building_num 1-100 SMAPE : {SMAPE(np.array(all_y_test), np.array(all_y_pred))}')
print("-"*45)