In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings(action='ignore')

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [3]:
path = '/content/drive/MyDrive/2023_Dacon_Electricity/'
import os
os.listdir(path)

['building_info.csv',
 'sample_submission.csv',
 'test.csv',
 'train.csv',
 'pr_report.html',
 'AutoML.ipynb',
 'Baseline.ipynb']

In [4]:
import pandas as pd
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')
submission = pd.read_csv(path + 'sample_submission.csv')
building_info = pd.read_csv(path + 'building_info.csv')

In [None]:
print(train.shape)
print(test.shape)
print(submission.shape)

(204000, 10)
(16800, 7)
(16800, 2)


In [5]:
train = train.rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
})
train.drop('num_date_time', axis = 1, inplace=True)

In [6]:
test = test.rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
})
test.drop('num_date_time', axis = 1, inplace=True)

In [9]:
building_info = building_info.rename(columns={
    '건물번호': 'building_number',
    '건물유형': 'building_type',
    '연면적(m2)': 'total_area',
    '냉방면적(m2)': 'cooling_area',
    '태양광용량(kW)': 'solar_power_capacity',
    'ESS저장용량(kWh)': 'ess_capacity',
    'PCS용량(kW)': 'pcs_capacity'
})

translation_dict = {
    '건물기타': 'Other Buildings',
    '공공': 'Public',
    '대학교': 'University',
    '데이터센터': 'Data Center',
    '백화점및아울렛': 'Department Store and Outlet',
    '병원': 'Hospital',
    '상용': 'Commercial',
    '아파트': 'Apartment',
    '연구소': 'Research Institute',
    '지식산업센터': 'Knowledge Industry Center',
    '할인마트': 'Discount Mart',
    '호텔및리조트': 'Hotel and Resort'
}

building_info['building_type'] = building_info['building_type'].replace(translation_dict)
#building_info.drop('Unnamed: 0', axis = 1 , inplace=True)

In [10]:
train = pd.merge(train, building_info, on='building_number', how='left')
test = pd.merge(test, building_info, on='building_number', how='left')

In [11]:
columns_to_replace = ['solar_power_capacity', 'ess_capacity', 'pcs_capacity']

for column in columns_to_replace:
    train[column] = train[column].replace('-', 0)
    test[column] = test[column].replace('-', 0)

In [12]:
for column in columns_to_replace:
    train[column] = train[column].astype(float)
    test[column] = test[column].astype(float)

In [13]:
import datetime

def to_datetime(s):
    """
    Args:
        s: ex) '20220601 01'
    Returns:
        weekday: 0~6(int), 0: 월요일, 1: 화요일, ...
    """
    s = s.split()[0]  # 20220601
    date = datetime.datetime.strptime(s, '%Y%m%d')
    weekday = date.weekday()  #
    return weekday



In [14]:
train_origin_ = train.copy()
train['Weekday'] = train_origin_.apply(lambda x:to_datetime(x['date_time']), axis=1)


test_origin_ = test.copy()
test['Weekday'] = test_origin_.apply(lambda x:to_datetime(x['date_time']), axis=1)

In [15]:
train['date_time'] = pd.to_datetime(train['date_time'], format='%Y%m%d %H')

# date time feature 생성
train['hour'] = train['date_time'].dt.hour
train['day'] = train['date_time'].dt.day
train['month'] = train['date_time'].dt.month
train['year'] = train['date_time'].dt.year




In [16]:
test['date_time'] = pd.to_datetime(test['date_time'], format='%Y%m%d %H')

# date time feature 생성
test['hour'] = test['date_time'].dt.hour
test['day'] = test['date_time'].dt.day
test['month'] = test['date_time'].dt.month
test['year'] = test['date_time'].dt.year


In [17]:
train.drop(columns=['date_time'], inplace=True)
test.drop(columns=['date_time'], inplace=True)

In [18]:
# one hot encoding 생성
train = pd.get_dummies(train, columns=['building_type'], drop_first=True)
test = pd.get_dummies(test, columns=['building_type'], drop_first=True)

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import numpy as np

In [20]:
train['windspeed'].fillna(train['windspeed'].mean(), inplace=True)
train['humidity'].fillna(train['humidity'].mean(), inplace=True)
test['rainfall'].fillna(0, inplace=True)
train = train.fillna(0)

In [21]:
train['sin_hour'] = np.sin((2 * np.pi * train['hour']/24.0).values.astype(float))
train['cos_hour'] = np.cos((2 * np.pi * train['hour']/24.0).values.astype(float))

test['sin_hour'] = np.sin((2 * np.pi * test['hour']/24.0).values.astype(float))
test['cos_hour'] = np.cos((2 * np.pi * test['hour']/24.0).values.astype(float))

In [22]:
train['THI'] = 9/5*train['temperature'] - 0.55*(1-train['humidity']/100)*(9/5*train['humidity']-26)+32
test['THI'] = 9/5*test['temperature'] - 0.55*(1-test['humidity']/100)*(9/5*test['humidity']-26)+32

In [23]:
def CDH(xs):
    ys = []
    for i in range(len(xs)):
        if i < 11:
            ys.append(np.sum(xs[:(i+1)]-26))
        else:
            ys.append(np.sum(xs[(i-11):(i+1)]-26))
    return np.array(ys)

cdhs = np.array([])
for num in range(1,101,1):
    temp = train[train['building_number'] == num]
    cdh = CDH(temp['temperature'].values)
    cdhs = np.concatenate([cdhs, cdh])
train['CDH'] = cdhs

cdhs = np.array([])
for num in range(1,101,1):
    temp = test[test['building_number'] == num]
    cdh = CDH(temp['temperature'].values)
    cdhs = np.concatenate([cdhs, cdh])
test['CDH'] = cdhs

In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 204000 entries, 0 to 203999
Data columns (total 31 columns):
 #   Column                                     Non-Null Count   Dtype  
---  ------                                     --------------   -----  
 0   building_number                            204000 non-null  int64  
 1   temperature                                204000 non-null  float64
 2   rainfall                                   204000 non-null  float64
 3   windspeed                                  204000 non-null  float64
 4   humidity                                   204000 non-null  float64
 5   sunshine                                   204000 non-null  float64
 6   solar_radiation                            204000 non-null  float64
 7   power_consumption                          204000 non-null  float64
 8   total_area                                 204000 non-null  float64
 9   cooling_area                               204000 non-null  float64
 10  solar_po

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

correlation_matrix = np.corrcoef(train, rowvar=False)

sns.set(style="white")  # Set the style of the plot

plt.figure(figsize=(20, 20))  # Set the figure size

# Create the heatmap
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")

plt.title("Pearson Correlation Coefficient Heatmap")
plt.show()



In [None]:
train['cooling_ratio'] = train['cooling_area'] / train['total_area']
test['cooling_ratio'] = test['cooling_area'] / test['total_area']

In [None]:
train

Unnamed: 0,building_number,temperature,rainfall,windspeed,humidity,sunshine,solar_radiation,power_consumption,total_area,cooling_area,...,building_type_Hospital,building_type_Hotel and Resort,building_type_Knowledge Industry Center,building_type_Other Buildings,building_type_Public,building_type_Research Institute,building_type_University,THI,CDH,cooling_ratio
0,1,18.6,0.0,0.9,42.0,0.0,0.0,1085.28,110634.00,39570.00,...,0,0,0,1,0,0,0,49.6576,-7.4,0.357666
1,1,18.0,0.0,1.1,45.0,0.0,0.0,1047.36,110634.00,39570.00,...,0,0,0,1,0,0,0,47.7625,-15.4,0.357666
2,1,17.7,0.0,1.5,45.0,0.0,0.0,974.88,110634.00,39570.00,...,0,0,0,1,0,0,0,47.2225,-23.7,0.357666
3,1,16.7,0.0,1.4,48.0,0.0,0.0,953.76,110634.00,39570.00,...,0,0,0,1,0,0,0,44.7856,-33.0,0.357666
4,1,18.4,0.0,2.8,43.0,0.0,0.0,986.40,110634.00,39570.00,...,0,0,0,1,0,0,0,49.0061,-40.6,0.357666
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100,23.1,0.0,0.9,86.0,0.5,0.0,881.04,57497.84,40035.23,...,0,1,0,0,0,0,0,63.6624,-19.6,0.696291
203996,100,22.4,0.0,1.3,86.0,0.0,0.0,798.96,57497.84,40035.23,...,0,1,0,0,0,0,0,62.4024,-20.2,0.696291
203997,100,21.3,0.0,1.0,92.0,0.0,0.0,825.12,57497.84,40035.23,...,0,1,0,0,0,0,0,64.1976,-22.3,0.696291
203998,100,21.0,0.0,0.3,94.0,0.0,0.0,640.08,57497.84,40035.23,...,0,1,0,0,0,0,0,65.0744,-25.1,0.696291


In [24]:
train_x = train.drop(columns=['sunshine', 'solar_radiation', 'power_consumption'])
train_y = train['power_consumption']

In [None]:
#컬럼순서바꾸기 필요시

train_x_columns = train_x.columns
test = test[train_x_columns]

In [None]:
scaler = StandardScaler()
train_x_scaled = scaler.fit_transform(train_x)
test_x_scaled = scaler.transform(test)

In [None]:
train_x

Unnamed: 0,building_number,temperature,rainfall,windspeed,humidity,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity,...,building_type_Hotel and Resort,building_type_Knowledge Industry Center,building_type_Other Buildings,building_type_Public,building_type_Research Institute,building_type_University,sin_hour,cos_hour,THI,CDH
0,1,18.6,0.0,0.9,42.0,110634.00,39570.00,0,0,0,...,0,0,1,0,0,0,0.000000,1.000000,49.6576,-7.4
1,1,18.0,0.0,1.1,45.0,110634.00,39570.00,0,0,0,...,0,0,1,0,0,0,0.258819,0.965926,47.7625,-15.4
2,1,17.7,0.0,1.5,45.0,110634.00,39570.00,0,0,0,...,0,0,1,0,0,0,0.500000,0.866025,47.2225,-23.7
3,1,16.7,0.0,1.4,48.0,110634.00,39570.00,0,0,0,...,0,0,1,0,0,0,0.707107,0.707107,44.7856,-33.0
4,1,18.4,0.0,2.8,43.0,110634.00,39570.00,0,0,0,...,0,0,1,0,0,0,0.866025,0.500000,49.0061,-40.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100,23.1,0.0,0.9,86.0,57497.84,40035.23,0,0,0,...,1,0,0,0,0,0,-0.965926,0.258819,63.6624,-19.6
203996,100,22.4,0.0,1.3,86.0,57497.84,40035.23,0,0,0,...,1,0,0,0,0,0,-0.866025,0.500000,62.4024,-20.2
203997,100,21.3,0.0,1.0,92.0,57497.84,40035.23,0,0,0,...,1,0,0,0,0,0,-0.707107,0.707107,64.1976,-22.3
203998,100,21.0,0.0,0.3,94.0,57497.84,40035.23,0,0,0,...,1,0,0,0,0,0,-0.500000,0.866025,65.0744,-25.1


In [None]:
test

Unnamed: 0,building_number,temperature,rainfall,windspeed,humidity,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity,...,building_type_Hotel and Resort,building_type_Knowledge Industry Center,building_type_Other Buildings,building_type_Public,building_type_Research Institute,building_type_University,sin_hour,cos_hour,THI,CDH
0,1,23.5,0.0,2.2,72,110634.00,39570.00,0,0,0,...,0,0,1,0,0,0,0.000000,1.000000,58.3456,-2.5
1,1,23.0,0.0,0.9,72,110634.00,39570.00,0,0,0,...,0,0,1,0,0,0,0.258819,0.965926,57.4456,-5.5
2,1,22.7,0.0,1.5,75,110634.00,39570.00,0,0,0,...,0,0,1,0,0,0,0.500000,0.866025,57.8725,-8.8
3,1,22.1,0.0,1.3,78,110634.00,39570.00,0,0,0,...,0,0,1,0,0,0,0.707107,0.707107,57.9376,-12.7
4,1,21.8,0.0,1.0,77,110634.00,39570.00,0,0,0,...,0,0,1,0,0,0,0.866025,0.500000,56.9961,-16.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16795,100,22.5,0.0,0.9,84,57497.84,40035.23,0,0,0,...,1,0,0,0,0,0,-0.965926,0.258819,61.4824,-34.5
16796,100,20.7,0.0,0.4,95,57497.84,40035.23,0,0,0,...,1,0,0,0,0,0,-0.866025,0.500000,65.2725,-34.4
16797,100,20.2,0.0,0.4,98,57497.84,40035.23,0,0,0,...,1,0,0,0,0,0,-0.707107,0.707107,66.7056,-35.3
16798,100,20.1,0.0,1.1,97,57497.84,40035.23,0,0,0,...,1,0,0,0,0,0,-0.500000,0.866025,65.7281,-36.8


In [25]:
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

# Score

In [None]:
def SMAPE(y, pred):
    smape = abs((y - pred))/((abs(y) + abs(pred)) / 2) * 100
    smape = np.mean(smape)
    return smape

def mae(y, pred):
    return np.mean(abs(y-pred))

def validate(valid_x, valid_y, model):
    pred = model.predict(valid_x)
    smape_score, mae_score = SMAPE(valid_y, pred), mae(valid_y, pred)
    return smape_score, mae_score

#Data Report

In [None]:
!pip install -U pandas-profiling
import pandas_profiling

In [None]:
pr = train.profile_report()
pr

Output hidden; open in https://colab.research.google.com to view.

In [None]:
pr.to_file(path +'pr_report.html')

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#RandomForestRegressor

In [None]:
rf_model = RandomForestRegressor()
rf_model.fit(X_train_split,y_train_split)

In [None]:
validate(X_val_split, y_val_split, rf_model)

(53.13398915341288, 0.02533700100985709)

In [None]:
model = RandomForestRegressor()
model.fit(train_x, train_y)

In [None]:
preds = model.predict(test)

In [None]:
submission['answer'] = preds
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,2084.6976
1,1_20220825 01,2103.0288
2,1_20220825 02,1957.5792
3,1_20220825 03,1947.8736
4,1_20220825 04,1966.7760
...,...,...
16795,100_20220831 19,888.1914
16796,100_20220831 20,850.8942
16797,100_20220831 21,799.3296
16798,100_20220831 22,691.7598


In [None]:
submission['answer'] = preds
submission.to_csv('./10_RF_notscaled_submission.csv', index=False)

#XGBoost

In [None]:
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

In [None]:
xgb_model.fit(X_train_split,y_train_split)

In [None]:
validate(X_val_split, y_val_split, xgb_model)

(147.69270729887333, 0.08519403187056283)

In [None]:
xgb_model.fit(train_x_scaled, train_y)
preds = xgb_model.predict(test_x_scaled)

submission['answer'] = preds
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,1876.576050
1,1_20220825 01,1820.768311
2,1_20220825 02,1778.096069
3,1_20220825 03,1580.162476
4,1_20220825 04,1503.693359
...,...,...
16795,100_20220831 19,1007.754456
16796,100_20220831 20,877.997803
16797,100_20220831 21,755.926880
16798,100_20220831 22,694.781982


In [None]:
submission.to_csv('./9_XGBoost_scaled_submission.csv', index=False)

# LightGBM

In [None]:
import lightgbm as lgb

In [None]:
lgb_model = lgb.LGBMRegressor(objective='regression', verbose=-1)
lgb_model.fit(X_train_split,y_train_split)

In [None]:
validate(X_val_split, y_val_split, lgb_model)

(12.826893380642927, 227.75366036522453)

# CatBoost

In [None]:
!pip install catboost
from catboost import CatBoost, Pool



In [None]:
cat_model = CatBoost()
cat_model.fit(X_train_split,y_train_split)

Learning rate set to 0.091578
0:	learn: 2276.9598879	total: 109ms	remaining: 1m 48s
1:	learn: 2132.5021961	total: 186ms	remaining: 1m 32s
2:	learn: 1999.5738612	total: 257ms	remaining: 1m 25s
3:	learn: 1887.4150828	total: 341ms	remaining: 1m 24s
4:	learn: 1789.8099551	total: 412ms	remaining: 1m 22s
5:	learn: 1698.3922662	total: 516ms	remaining: 1m 25s
6:	learn: 1617.5603562	total: 620ms	remaining: 1m 27s
7:	learn: 1548.2667801	total: 699ms	remaining: 1m 26s
8:	learn: 1490.2935331	total: 776ms	remaining: 1m 25s
9:	learn: 1433.0246417	total: 838ms	remaining: 1m 22s
10:	learn: 1388.3063639	total: 893ms	remaining: 1m 20s
11:	learn: 1344.0414993	total: 969ms	remaining: 1m 19s
12:	learn: 1303.9952763	total: 1.03s	remaining: 1m 18s
13:	learn: 1258.5798698	total: 1.09s	remaining: 1m 17s
14:	learn: 1226.2750330	total: 1.16s	remaining: 1m 16s
15:	learn: 1193.8069712	total: 1.22s	remaining: 1m 14s
16:	learn: 1162.8674938	total: 1.28s	remaining: 1m 14s
17:	learn: 1137.0521232	total: 1.34s	remainin

<catboost.core.CatBoost at 0x79e8f4487490>

In [None]:
validate(X_val_split, y_val_split, cat_model)

(8.796508962978297, 151.39223786967403)

# Extratrees

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

et_model = ExtraTreesRegressor()
#et_model.fit(X_train_split,y_train_split)
et_model.fit(train_x_scaled, train_y)

In [None]:
preds = et_model.predict(test_x_scaled)
submission['answer'] = preds
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,1970.1168
1,1_20220825 01,2027.4624
2,1_20220825 02,1895.8656
3,1_20220825 03,1898.1072
4,1_20220825 04,1929.7824
...,...,...
16795,100_20220831 19,886.0944
16796,100_20220831 20,811.6596
16797,100_20220831 21,804.6702
16798,100_20220831 22,649.2618


In [None]:
submission.to_csv('./4_Extratrees_scaled_submission.csv', index=False)

#AutoML

In [26]:
!pip install pycaret
from pycaret.regression import *

Collecting pycaret
  Downloading pycaret-3.0.4-py3-none-any.whl (484 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.4/484.4 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting pyod>=1.0.8 (from pycaret)
  Downloading pyod-1.1.0.tar.gz (153 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.4/153.4 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting category-encoders>=2.4.0 (from pycaret)
  Downloading category_encoders-2.6.2-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.8/81.8 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting deprecation>=2.1.0 (from pycaret)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Collecting xxhash (from pycaret)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 

In [27]:
clf = setup(data = train, target = 'power_consumption')

Unnamed: 0,Description,Value
0,Session id,1974
1,Target,power_consumption
2,Target type,Regression
3,Original data shape,"(204000, 33)"
4,Transformed data shape,"(204000, 33)"
5,Transformed train set shape,"(142800, 33)"
6,Transformed test set shape,"(61200, 33)"
7,Numeric features,32
8,Preprocess,True
9,Imputation type,simple


In [28]:
best_3 = compare_models(sort = 'MAPE', fold=5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,80.2188,24845.6325,157.5791,0.9959,0.0846,0.0451,59.252
rf,Random Forest Regressor,93.5737,35841.5908,189.257,0.994,0.091,0.0507,109.192
dt,Decision Tree Regressor,123.1618,72723.6887,269.4161,0.9879,0.1225,0.0646,2.01
xgboost,Extreme Gradient Boosting,157.9195,59837.4859,244.6099,0.99,0.1569,0.0968,21.77
lightgbm,Light Gradient Boosting Machine,220.2284,101761.4412,318.9951,0.9831,0.1869,0.1388,4.726
knn,K Neighbors Regressor,279.9649,267479.6375,517.1788,0.9555,0.2318,0.1579,19.236
gbr,Gradient Boosting Regressor,492.1012,458883.2703,677.371,0.9237,0.3481,0.3109,23.048
huber,Huber Regressor,1379.0763,6500204.4585,2549.2387,-0.0807,0.7541,0.678,1.128
lasso,Lasso Regression,1203.4986,3837792.0033,1958.7576,0.362,0.6801,0.6844,9.806
llar,Lasso Least Angle Regression,1205.6962,3837265.1216,1958.6251,0.362,0.6793,0.6866,0.394


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

In [None]:
model_et = create_model('et', fold = 5)
model_et = tune_model(model_et, fold=5, optimize = 'MAPE', choose_better = True)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,81.4234,27084.3059,164.5731,0.9957,0.0855,0.0457
1,78.5587,23646.0257,153.7726,0.996,0.0786,0.0439
2,79.5207,24107.1583,155.2648,0.9961,0.0812,0.0445
3,80.5175,24303.7999,155.8968,0.9958,0.0829,0.0453
4,81.0738,25086.873,158.3884,0.9957,0.0949,0.0459
Mean,80.2188,24845.6325,157.5791,0.9959,0.0846,0.0451
Std,1.0502,1212.2573,3.8012,0.0002,0.0056,0.0007


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
blended = blend_models(estimator_list = best_3, fold = 5)

Processing:   0%|          | 0/6 [00:00<?, ?it/s]

In [None]:
pred_holdout = predict_model(blended)

In [None]:
final_model = finalize_model(model_et)

In [None]:
predictions = predict_model(final_model, data = test)

In [None]:
submission['answer'] = predictions['Score']
submission.to_csv('13_AutoML_submission.csv', index = False)
submission.to_csv(path + '13_AutoML_submission.csv', index = False)

#Ensemble

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

rf_model = RandomForestRegressor()
#rf_model.fit(X_train_split,y_train_split)
rf_model.fit(train_x_scaled, train_y)

et_model = ExtraTreesRegressor()
#et_model.fit(X_train_split,y_train_split)
et_model.fit(train_x_scaled, train_y)

dt_model = DecisionTreeRegressor()
#dt_model.fit(X_train_split,y_train_split)
dt_model.fit(train_x_scaled, train_y)

In [None]:
from sklearn.ensemble import VotingRegressor
voting = VotingRegressor(estimators=[('RandomForest', rf_model), ('ExtraTrees', et_model), ('DecisionTree', dt_model)])
#voting.fit(X_train_split,y_train_split)
voting.fit(train_x_scaled, train_y)

In [None]:
print(validate(X_val_split, y_val_split, rf_model))
print(validate(X_val_split, y_val_split, et_model))
print(validate(X_val_split, y_val_split, dt_model))
print(validate(X_val_split, y_val_split, voting))

(4.03396992140522, 76.94686773870369)
(3.786344845487288, 71.0607785217613)
(5.17103096472522, 98.52319346404411)
(3.925988452655767, 73.95239347479797)


In [None]:
preds = voting.predict(test_x_scaled)
submission['answer'] = preds
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,1965.4976
1,1_20220825 01,2113.3456
2,1_20220825 02,1989.3856
3,1_20220825 03,1954.5120
4,1_20220825 04,1963.8688
...,...,...
16795,100_20220831 19,879.9320
16796,100_20220831 20,816.4912
16797,100_20220831 21,803.4836
16798,100_20220831 22,590.5058


In [None]:
submission.to_csv('./3_Voting_scaled_submission.csv', index=False)

#Multi-Model

In [None]:
def SMAPE(y, pred):
    smape = abs((y - pred))/((abs(y) + abs(pred)) / 2) * 100
    smape = np.mean(smape)
    return smape

def mae(y, pred):
    return np.mean(abs(y-pred))

In [None]:
from tqdm import tqdm

def validate_multi(valid_x, valid_y, models):
    """
    Args:
        models: dict, {1: model1, 2: model2, ..., 100: model100}
    """
    preds = []
    for i in range(1, 101):
        _x = valid_x[valid_x['building_number'] == i]
        _x = _x.drop(columns=['building_number'])#, 'total_area', 'cooling_area'])
        pred = models[i].predict(_x).tolist()
        preds.extend(pred)
    preds = np.array(preds)
    smape_score, mae_score = SMAPE(valid_y, preds), mae(valid_y, preds)
    return smape_score, mae_score

def train_multiple_models(train_x, train_y, n_estimators=100):
    models = {}
    for i in tqdm(range(1, 101)):
        _x = train_x[train_x['building_number'] == i]
        _x = _x.drop(columns=['building_number'])#, 'total_area', 'cooling_area'])
        _y = train_y[_x.index]
        model_lgb = RandomForestRegressor(n_estimators=n_estimators)
        model_lgb.fit(_x, _y)
        models[i] = model_lgb
    return models

In [None]:
#pred = train_multiple_models(train_x, train_y)
pred = train_multiple_models(train_x, train_y)

100%|██████████| 100/100 [02:01<00:00,  1.21s/it]


In [None]:
preds_real = []
for i in tqdm(range(1, 101)):
    _x = test[test['building_number'] == i]
    _x = _x.drop(columns=['building_number'])#, 'total_area', 'cooling_area'])
    preds = pred[i].predict(_x).tolist()
    preds_real.extend(preds)

100%|██████████| 100/100 [00:00<00:00, 116.84it/s]


In [None]:
submission['answer'] = preds_real
submission.to_csv('./12_RF_multimodel_ratio_submission.csv', index=False)

In [None]:
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,2066.3808
1,1_20220825 01,2117.7168
2,1_20220825 02,1912.9680
3,1_20220825 03,1918.5840
4,1_20220825 04,1951.8528
...,...,...
16795,100_20220831 19,863.6424
16796,100_20220831 20,813.9984
16797,100_20220831 21,744.5376
16798,100_20220831 22,658.9896


In [None]:
X_train_split, X_val_split, y_train_split, y_val_split

In [None]:
models_f2 = train_multiple_models(train_x, train_y, 50)
preds = []
for i in range(1, 101):
    _x = X_val_split[X_val_split['building_number'] == i]
    _x = _x.drop(columns=['building_number', 'total_area', 'cooling_area'])
    pred = models_f2[i].predict(_x).tolist()
    preds.extend(pred)
preds = np.array(preds)

100%|██████████| 100/100 [01:38<00:00,  1.02it/s]


In [None]:
eda = X_val_split.copy()
eda['pred'], eda['y'] = preds, y_val_split
eda['gap'] = (eda['y'] - eda['pred']) / ((eda['y'] + eda['pred'])/2)  # SMAPE가 target이기 때문에 scale 고려

In [None]:
eda.sort_values('gap')

Unnamed: 0,building_number,temperature,rainfall,windspeed,humidity,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity,...,building_type_Public,building_type_Research Institute,building_type_University,sin_hour,cos_hour,THI,CDH,pred,y,gap
202997,100,24.2,0.0,0.8,93.0,57497.840,40035.23,0,0,0,...,0,0,0,9.659258e-01,2.588190e-01,70.1161,-11.1,21134.6240,339.60,-1.936743
202324,100,18.0,0.0,1.1,86.0,57497.840,40035.23,0,0,0,...,0,0,0,8.660254e-01,5.000000e-01,54.4824,-70.7,18558.0080,306.48,-1.935014
130590,65,17.3,0.0,1.2,92.0,183839.000,0.00,0,0,0,...,0,0,0,1.000000e+00,6.123234e-17,56.9976,-79.7,18339.8320,307.26,-1.934089
13564,7,23.5,0.0,0.9,95.0,101711.520,41341.10,0,800,300,...,0,0,0,8.660254e-01,5.000000e-01,70.3125,-3.8,17674.9440,303.60,-1.932453
86950,43,20.6,11.4,3.6,100.0,148883.850,35633.20,0,0,0,...,0,0,0,-5.000000e-01,8.660254e-01,69.0800,-22.5,24712.3520,436.80,-1.930526
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47724,24,30.7,0.0,1.8,67.0,659696.910,516097.85,97.65,0,0,...,0,0,1,1.224647e-16,-1.000000e+00,70.0901,26.3,300.7080,16122.60,1.926761
53842,27,29.7,0.0,1.1,68.0,578484.113,501381.53,30,0,0,...,0,0,1,5.000000e-01,-8.660254e-01,68.4936,-14.2,428.8608,24626.00,1.931533
54060,27,24.7,0.9,3.2,88.0,578484.113,501381.53,30,0,0,...,0,0,1,1.224647e-16,-1.000000e+00,67.7216,-10.7,388.9512,22688.40,1.932583
53891,27,31.3,0.0,0.8,63.0,578484.113,501381.53,30,0,0,...,0,0,1,2.588190e-01,-9.659258e-01,70.5541,14.7,399.2328,25051.60,1.937254


In [None]:
bads = eda.groupby('building_number').agg(lambda x:np.mean(abs(x))).sort_values('gap').tail(5)
goods = eda.groupby('building_number').agg(lambda x:np.mean(abs(x))).sort_values('gap').head(5)

In [None]:
bads

Unnamed: 0_level_0,temperature,rainfall,windspeed,humidity,total_area,cooling_area,Weekday,hour,day,month,...,building_type_Public,building_type_Research Institute,building_type_University,sin_hour,cos_hour,THI,CDH,pred,y,gap
building_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
66,25.939642,0.313811,1.573146,77.846547,105073.0,0.0,2.923274,10.769821,15.015345,6.987212,...,0.0,0.0,0.0,0.620629,0.645191,66.378788,27.771611,2334.052292,397.346317,1.168305
33,25.498123,0.494638,2.281501,77.782842,28059.0,20397.0,2.959786,11.337802,15.243968,6.89008,...,0.0,0.0,0.0,0.635011,0.617245,65.699931,29.616086,2574.41868,8696.689866,1.208632
32,25.775196,0.71201,2.325326,77.266319,35300.0,14687.0,2.929504,11.668407,15.018277,6.979112,...,0.0,0.0,0.0,0.619134,0.650212,66.06111,29.216188,2708.35337,9866.934517,1.271629
24,26.104218,0.680645,2.363027,76.002481,659696.91,516097.85,3.042184,12.148883,15.096774,6.937965,...,0.0,0.0,1.0,0.646023,0.61946,66.515883,27.476675,2451.779407,10314.343176,1.29459
27,26.268702,0.230789,2.235369,72.348601,578484.113,501381.53,2.92112,11.282443,14.824427,6.933842,...,0.0,0.0,1.0,0.638525,0.62761,65.761543,29.817557,2422.394671,17644.089567,1.540772


In [None]:
goods

Unnamed: 0_level_0,temperature,rainfall,windspeed,humidity,total_area,cooling_area,Weekday,hour,day,month,...,building_type_Public,building_type_Research Institute,building_type_University,sin_hour,cos_hour,THI,CDH,pred,y,gap
building_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,24.709499,0.522691,1.135884,83.158311,45956.56,13190.98,3.002639,11.129288,14.635884,6.915567,...,0.0,0.0,0.0,0.645422,0.627529,67.252384,32.838522,2419.136572,1933.12781,0.526125
15,26.273544,0.100971,2.121359,77.65534,167012.31,167012.31,2.980583,11.080097,15.104369,6.946602,...,0.0,0.0,0.0,0.619628,0.644014,67.758003,38.820631,2396.318023,1784.637306,0.546021
48,26.253398,0.184223,1.807767,87.502427,66729.0,50000.0,2.941748,11.456311,15.752427,6.963592,...,0.0,0.0,0.0,0.648654,0.619952,72.140214,27.187379,2230.952337,1906.677961,0.554932
14,24.832143,0.398333,2.82119,79.488095,16844.16,14102.92,2.97381,11.802381,14.685714,7.004762,...,0.0,0.0,0.0,0.637953,0.623679,65.1141,28.425714,2348.139345,1927.984,0.560034
35,25.930127,0.671139,2.295696,77.625316,9736.0,6070.0,3.012658,11.893671,15.35443,6.951899,...,0.0,0.0,0.0,0.626688,0.636254,66.297574,27.512658,2493.563117,2175.224051,0.566356


In [None]:
res = eda.groupby('building_number').agg(lambda x:np.mean(abs(x))).sort_values('gap')

In [None]:
building_type_columns

['building_type_Commercial',
 'building_type_Data Center',
 'building_type_Department Store and Outlet',
 'building_type_Discount Mart',
 'building_type_Hospital',
 'building_type_Hotel and Resort',
 'building_type_Knowledge Industry Center',
 'building_type_Other Buildings',
 'building_type_Public',
 'building_type_Research Institute',
 'building_type_University']