In [21]:
import pandas as pd
import os 

import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Forecast and observed-weather tables, each indexed by its first CSV column.
# Any weather column that contains a missing value is dropped.
forecast = pd.read_csv(os.path.join("Data", "forecast.csv"), index_col=0)
weather = pd.read_csv(os.path.join("Data", "weather.csv"), index_col=0).dropna(axis=1)

# PV table: keep only the columns whose name contains "시간당발전량".
pv = pd.read_csv(os.path.join("Data", "pv_day_merged.csv"), index_col=0)
excol = [c for c in pv.columns if "시간당발전량" not in c]
pv = pv.drop(columns=excol)

# The final 24 rows are set aside as `test`; everything before them is `pv_e`.
pv_e = pv.iloc[:-24]
test = pv.iloc[-24:]

# Attach the forecast columns to the weather rows (join on the index).
merged_factor = weather.join(forecast)

# Keep rows whose index label is <= "2023-08-30", then parse the index as
# datetimes and move every timestamp forward by 24 hours.
factor = merged_factor.loc[merged_factor.index <= "2023-08-30"].copy()
factor.index = pd.to_datetime(factor.index) + pd.DateOffset(hours=24)


In [22]:
# Build the modelling frame for the first PV column: the target itself, the
# joined `factor` columns, and a "fore" feature holding the target's value
# from 24 rows earlier.
col = pv_e.columns
name = col[0]

pv_data = pv_e[name]
pv_df = pd.DataFrame(pv_data)

# Compute the 24-row lag on the full series *before* filtering, and apply the
# same mask to both the frame and the lag. The previous version sliced
# `pv_df[:-24]` and relied on exactly 24 rows existing before the cutoff date
# for the lengths to match; this version has no such coupling.
# Rows with fewer than 24 predecessors get NaN.
# NOTE(review): the lag is 24 rows, not 24 hours; if the series has missing
# hours the two differ - confirm this is the intended "day before" value.
lagged = pv_df[name].shift(24)
in_window = pv_df.index >= "2022-07-02"

pv_df = pv_df[in_window]
pv_df.index = pd.to_datetime(pv_df.index)
fore_data = lagged[in_window].values  # day before pv

pv_df_c = pv_df.join(factor)
pv_df_c["fore"] = fore_data

In [23]:
from pycaret.regression import *
from sklearn.model_selection import train_test_split

s = RegressionExperiment()

# Modelling frame assembled in the earlier cells.
data = pv_df_c

# Hold out 20% of the rows; PyCaret only ever sees the remaining 80%.
# NOTE(review): train_test_split shuffles rows by default. With
# timestamp-indexed rows and a lagged-target feature, neighbouring hours can
# land on both sides of the split - confirm a chronological split is not
# required here.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=123)

# Configure PyCaret on the training rows only, with min-max feature scaling.
regression_setup = s.setup(
    train_data,
    target=name,
    session_id=123,
    normalize=True,
    normalize_method='minmax',
)

# Compare the candidate models, then tune the winner for MSE.
best_model = regression_setup.compare_models()
best_tune = regression_setup.tune_model(best_model, optimize='MSE', n_iter=10)

# Score the tuned model on the held-out rows. Previously this step was
# commented out and referred to an undefined `final_model`, so `test_data`
# was never used.
predictions = regression_setup.predict_model(best_tune, data=test_data)

best_tune

Unnamed: 0,Description,Value
0,Session id,123
1,Target,축구장_시간당발전량
2,Target type,Regression
3,Original data shape,"(2304, 21)"
4,Transformed data shape,"(2304, 21)"
5,Transformed train set shape,"(1612, 21)"
6,Transformed test set shape,"(692, 21)"
7,Numeric features,20
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,10.2441,341.6744,18.4105,0.5618,0.8668,4.0208,0.357
gbr,Gradient Boosting Regressor,10.8218,345.8731,18.5091,0.5611,1.0659,3.5639,0.135
lightgbm,Light Gradient Boosting Machine,10.4116,344.6186,18.4823,0.5566,1.004,3.3812,0.621
et,Extra Trees Regressor,10.3608,349.4658,18.6313,0.5522,0.8602,4.1332,0.123
xgboost,Extreme Gradient Boosting,10.6054,374.5086,19.238,0.5176,0.9703,3.2159,0.096
knn,K Neighbors Regressor,11.8676,411.8909,20.1924,0.473,1.2723,3.2544,0.017
lr,Linear Regression,13.0132,417.2465,20.3217,0.471,1.481,3.4447,0.016
br,Bayesian Ridge,13.0839,422.6892,20.4461,0.4657,1.4842,3.3246,0.013
ridge,Ridge Regression,13.0967,423.0311,20.454,0.4654,1.4852,3.3311,0.013
ada,AdaBoost Regressor,13.3395,444.6696,21.0149,0.4347,1.1309,6.1491,0.034


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,10.0359,292.8826,17.1138,0.6556,1.0366,4.616
1,11.1903,410.1633,20.2525,0.6201,0.9846,5.4123
2,12.0143,408.8848,20.2209,0.5657,0.9762,2.7474
3,10.1313,341.7662,18.4869,0.6304,0.9705,2.4853
4,9.9757,263.2957,16.2264,0.5218,1.0442,3.6348
5,8.3177,232.1122,15.2352,0.6231,0.9207,4.1057
6,9.4456,272.2902,16.5012,0.6265,0.9228,2.1446
7,10.157,328.3866,18.1214,0.4578,0.895,2.6077
8,9.2478,264.8356,16.2738,0.6923,0.9822,4.8401
9,10.4581,345.9698,18.6003,0.5861,0.9101,3.5245


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [24]:
import matplotlib.pyplot as plt
# Pick a Korean-capable font for the current OS so Hangul labels render in
# the plots. 'AppleGothic' only exists on macOS; the other names are the
# usual choices on Windows and Linux. If the chosen font is not installed,
# matplotlib falls back to its default exactly as it did before.
import platform

_KOREAN_FONTS = {'Darwin': 'AppleGothic', 'Windows': 'Malgun Gothic'}
plt.rcParams['font.family'] = _KOREAN_FONTS.get(platform.system(), 'NanumGothic')
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

regression_setup.evaluate_model(best_tune)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [22]:
# Feature columns to inspect, one per line for easier diffing.
selected_feature = [
    'fore',
    '지면온도(°C)',
    '기온(°C)',
    '풍속',
    '습도(%)',
    '남북바람성분',
    '1시간기온',
    '습도',
    '풍향',
    '동서바람성분',
]

In [25]:
# Show only the selected feature columns of the PV modelling frame.
pv_df_c.loc[:, selected_feature]

Unnamed: 0_level_0,fore,지면온도(°C),기온(°C),풍속,습도(%),남북바람성분,1시간기온,습도,풍향,동서바람성분
시간,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-07-02 00:00:00,0.0,25.0,25.6,2.840000,99,2.620000,27.200000,78.000000,157.800000,-1.080000
2022-07-02 01:00:00,0.0,24.8,25.1,2.920000,99,2.680000,28.200000,73.000000,155.200000,-1.240000
2022-07-02 02:00:00,0.0,24.4,25.0,3.155556,99,2.848148,29.259259,68.333333,155.037037,-1.340741
2022-07-02 03:00:00,0.0,24.2,24.9,3.274074,99,2.985185,30.259259,63.703704,157.074074,-1.274074
2022-07-02 04:00:00,0.0,24.0,24.5,3.218519,99,2.992593,31.259259,60.555556,159.148148,-1.166667
...,...,...,...,...,...,...,...,...,...,...
2023-08-30 19:00:00,0.0,26.5,24.9,1.525114,99,-1.068493,24.228311,97.283105,305.598174,0.214155
2023-08-30 20:00:00,0.0,26.1,24.4,1.572103,99,-1.250215,24.214592,96.072961,227.596567,0.021459
2023-08-30 21:00:00,0.0,25.9,24.2,1.486266,99,-1.250215,23.643777,95.000000,322.660944,0.150215
2023-08-30 22:00:00,0.0,25.7,24.0,1.672103,99,-1.414592,24.214592,95.000000,215.931330,0.257940


In [27]:
import pandas as pd
import os 

import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

data_dir = "Data"

# Forecast and observed-weather tables, each indexed by its first CSV column.
forecast = pd.read_csv(os.path.join(data_dir, "forecast.csv"), index_col=0)
weather = pd.read_csv(os.path.join(data_dir, "weather.csv"), index_col=0)
# Drop every weather column that contains a missing value.
weather = weather.dropna(axis="columns")

# Attach the forecast columns to the weather rows (join on the index).
merged_factor = weather.join(forecast)

# Keep rows whose index label is <= "2023-08-30", then parse the index as
# datetimes and move every timestamp forward by 24 hours.
keep_rows = merged_factor.index <= "2023-08-30"
factor = merged_factor.copy()[keep_rows]
factor.index = pd.to_datetime(factor.index) + pd.DateOffset(hours=24)

# Load table; `load_e` is a second name for the same frame.
load = load_e = pd.read_csv(os.path.join(data_dir, "load.csv"), index_col=0)

In [28]:
# Build the modelling frame for the first load column: the target itself, the
# joined `factor` columns, and a "fore" feature holding the target's value
# from 24 rows earlier.
col = load_e.columns
name = col[0]

load_data = load_e[name]
load_df = pd.DataFrame(load_data)

# Compute the 24-row lag on the full series *before* filtering, and apply the
# same mask to both the frame and the lag. The previous version sliced
# `load_df[:-24]` and relied on exactly 24 rows existing before the cutoff
# date for the lengths to match; this version has no such coupling.
# Rows with fewer than 24 predecessors get NaN.
# NOTE(review): the lag is 24 rows, not 24 hours; if the series has missing
# hours the two differ - confirm this is the intended "day before" value.
lagged = load_df[name].shift(24)
in_window = load_df.index >= "2022-07-02"

load_df = load_df[in_window]
load_df.index = pd.to_datetime(load_df.index)
fore_data = lagged[in_window].values  # day before load

load_df_c = load_df.join(factor)
load_df_c["fore"] = fore_data

In [29]:
# Display the assembled load modelling frame (target, joined factor columns, and "fore").
load_df_c

Unnamed: 0_level_0,전기전자컴퓨터공학동,기온(°C),습도(%),증기압(hPa),이슬점온도(°C),현지기압(hPa),해면기압(hPa),전운량(10분위),중하층운량(10분위),지면온도(°C),...,1시간기온,강수형태,강수확률,남북바람성분,동서바람성분,습도,풍속,풍향,하늘상태,fore
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-07-02 00:00:00,413.8,25.6,99.0,32.4,25.4,1006.1,1014.1,3.0,3.0,25.0,...,27.200000,0.000000,28.000000,2.620000,-1.080000,78.000000,2.840000,157.800000,3.800000,419.0
2022-07-02 01:00:00,411.8,25.1,99.0,31.5,24.9,1006.1,1014.1,2.0,0.0,24.8,...,28.200000,0.000000,28.000000,2.680000,-1.240000,73.000000,2.920000,155.200000,3.800000,424.5
2022-07-02 02:00:00,416.4,25.0,99.0,31.3,24.8,1005.8,1013.8,4.0,0.0,24.4,...,29.259259,0.000000,20.000000,2.848148,-1.340741,68.333333,3.155556,155.037037,3.000000,409.9
2022-07-02 03:00:00,400.0,24.9,99.0,31.1,24.7,1006.0,1014.0,2.0,0.0,24.2,...,30.259259,0.000000,28.148148,2.985185,-1.274074,63.703704,3.274074,157.074074,3.814815,403.0
2022-07-02 04:00:00,409.7,24.5,99.0,30.4,24.3,1006.2,1014.2,3.0,0.0,24.0,...,31.259259,0.000000,23.703704,2.992593,-1.166667,60.555556,3.218519,159.148148,3.370370,407.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-30 19:00:00,0.0,24.9,99.0,31.1,24.7,999.3,1007.3,10.0,5.0,26.5,...,24.228311,0.575342,60.000000,-1.068493,0.214155,97.283105,1.525114,305.598174,4.000000,0.0
2023-08-30 20:00:00,0.0,24.4,99.0,30.2,24.2,1000.1,1008.0,10.0,5.0,26.1,...,24.214592,0.600858,60.000000,-1.250215,0.021459,96.072961,1.572103,227.596567,4.000000,0.0
2023-08-30 21:00:00,0.0,24.2,99.0,29.8,24.0,1000.6,1008.6,10.0,10.0,25.9,...,23.643777,0.600858,62.493562,-1.250215,0.150215,95.000000,1.486266,322.660944,4.000000,0.0
2023-08-30 22:00:00,0.0,24.0,99.0,29.5,23.8,1000.9,1008.9,10.0,8.0,25.7,...,24.214592,0.356223,46.030043,-1.414592,0.257940,95.000000,1.672103,215.931330,4.000000,0.0


In [30]:
from pycaret.regression import *
from sklearn.model_selection import train_test_split

s = RegressionExperiment()

# Modelling frame assembled in the earlier cells.
data = load_df_c

# Hold out 20% of the rows; PyCaret only ever sees the remaining 80%.
# NOTE(review): train_test_split shuffles rows by default. With
# timestamp-indexed rows and a lagged-target feature, neighbouring hours can
# land on both sides of the split - confirm a chronological split is not
# required here.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=123)

# Configure PyCaret on the training rows only, with min-max feature scaling.
regression_setup = s.setup(
    train_data,
    target=name,
    session_id=123,
    normalize=True,
    normalize_method='minmax',
)

# Compare the candidate models, then tune the winner for MSE.
best_model = regression_setup.compare_models()
best_tune = regression_setup.tune_model(best_model, optimize='MSE', n_iter=10)

# Score the tuned model on the held-out rows. Previously this step was
# commented out and referred to an undefined `final_model`, so `test_data`
# was never used.
predictions = regression_setup.predict_model(best_tune, data=test_data)

best_tune

Unnamed: 0,Description,Value
0,Session id,123
1,Target,전기전자컴퓨터공학동
2,Target type,Regression
3,Original data shape,"(2342, 21)"
4,Transformed data shape,"(2342, 21)"
5,Transformed train set shape,"(1639, 21)"
6,Transformed test set shape,"(703, 21)"
7,Numeric features,20
8,Rows with missing values,0.6%
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,7.8771,233.3179,15.143,0.9957,0.0951,0.0339,0.109
lightgbm,Light Gradient Boosting Machine,9.9941,323.8553,17.8008,0.994,0.546,0.041,0.631
rf,Random Forest Regressor,9.2287,356.9175,18.4047,0.9934,0.1858,0.0389,0.242
gbr,Gradient Boosting Regressor,11.0405,395.5667,19.7186,0.9927,0.7144,0.0433,0.132
ada,AdaBoost Regressor,14.5398,608.0937,24.618,0.9887,0.053,0.0639,0.064
dt,Decision Tree Regressor,12.1433,771.6238,26.3111,0.9857,0.0949,0.0526,0.022
xgboost,Extreme Gradient Boosting,10.5161,781.4056,24.8448,0.9855,0.6222,0.0409,0.115
knn,K Neighbors Regressor,12.3509,863.6283,28.0982,0.984,0.116,0.0536,0.016
br,Bayesian Ridge,17.7595,1833.3462,41.0403,0.9659,1.3911,0.0514,0.012
lr,Linear Regression,17.7903,1834.0599,41.0606,0.9659,1.3982,0.0514,0.012


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,12.069,507.575,22.5294,0.9908,0.3829,0.0485
1,10.5084,437.2325,20.9101,0.9918,0.3751,0.0424
2,10.961,559.94,23.6631,0.9893,0.5517,0.039
3,10.5241,391.8857,19.7961,0.9927,0.4045,0.0409
4,12.2138,603.714,24.5706,0.9889,0.3802,0.0491
5,8.8993,242.0585,15.5582,0.9954,0.0919,0.0359
6,8.1613,205.0184,14.3185,0.9963,0.0728,0.0352
7,12.8636,716.5523,26.7685,0.9867,0.6972,0.0438
8,9.0774,371.6031,19.277,0.9931,0.3797,0.0388
9,11.7459,550.0673,23.4535,0.9897,0.3972,0.0476


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [31]:
import matplotlib.pyplot as plt
# Pick a Korean-capable font for the current OS so Hangul labels render in
# the plots. 'AppleGothic' only exists on macOS; the other names are the
# usual choices on Windows and Linux. If the chosen font is not installed,
# matplotlib falls back to its default exactly as it did before.
import platform

_KOREAN_FONTS = {'Darwin': 'AppleGothic', 'Windows': 'Malgun Gothic'}
plt.rcParams['font.family'] = _KOREAN_FONTS.get(platform.system(), 'NanumGothic')
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

regression_setup.evaluate_model(best_tune)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…