### Code Hist.

 - CODE : Model - KIER Method 02(Clustering)  
 - DESC : 각 군집별 Model Analysis 및 Evaluation  
 - DATE  
   &ensp; 2024-08-20 Created : "M02-03_Model_ML-01_Single.ipynb"에 Cross Validation 적용  

# 01. Code

## 01-01. Init

### 01-01-01. Init_Module Import

In [1]:
#region Basic_Import
## Basic
import os, sys, warnings
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.path.dirname(os.path.abspath('./__file__'))
sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname('./__file__'))))
warnings.filterwarnings('ignore')

import numpy as np, pandas as pd
from pandas import DataFrame, Series
pd.options.display.float_format = '{:.10f}'.format

import math, random

## Datetime
import time, datetime as dt
from datetime import datetime, date, timedelta

## glob
import glob, requests, json
from glob import glob

## 시각화
import matplotlib.pyplot as plt, seaborn as sns
# %matplotlib inline
plt.rcParams['figure.figsize'] = [10, 8]

from scipy import stats

## Split, 정규화
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# K-Means 알고리즘
from sklearn.cluster import KMeans, MiniBatchKMeans

# Clustering 알고리즘의 성능 평가 측도
from sklearn import metrics
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score, adjusted_rand_score, silhouette_score, rand_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics.cluster import contingency_matrix

## Modeling, Model Training
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

## Grid Search
# kfold = KFold(n_splits = 5, shuffle = False, random_state = None)

## For Web
import urllib
from urllib.request import urlopen
from urllib.parse import urlencode, unquote, quote_plus
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
#endregion Basic_Import

In [2]:
## Import_DL
## Deep-learning backend selector: set str_tar to "torch" or "tf".
str_tar = "tf"

if str_tar not in ("torch", "tf"):
    ## Unknown keyword: nothing is imported, only a hint is printed.
    print("Error : Cannot be used except for Keywords")
    print(" : torch / tf")
elif str_tar == "tf":
    ## TensorFlow / Keras stack
    import tensorflow as tf
    import tensorflow_addons as tfa
    from keras.callbacks import EarlyStopping, ModelCheckpoint
    from keras.models import Sequential, load_model
    from keras_flops import get_flops
    print("Tensorflow Imported")
else:
    ## PyTorch stack
    import torch
    import torch.nn as nn
    from torch.nn.utils import weight_norm
    print("Torch Imported")

Tensorflow Imported


In [3]:
## Import_Local
from Src_Dev_Common import Data_Datetime as com_date, KMA_Weather as com_KMA, KECO_AirKor as com_KECO, KASI_Holiday as com_Holi, KIER_Usage_M02 as com_KIER_M02, Data_Analysis as com_Analysis, Common_Model as com_Model

### 01-01-02. Config (Directory, Params)

In [4]:
## Init_config
## Seed every random source used in this notebook with the same value.
SEED = 42

## NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
## here presumably only affects child processes - confirm.
os.environ["PYTHONHASHSEED"] = str(SEED)
os.environ['TF_DETERMINISTIC_OPS'] = "1"

random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [5]:
## Define today's date / time fields
## `pd.datetime` was deprecated in pandas 1.0 and removed in pandas 2.0, so the
## standard-library `datetime` class (already imported in the Basic_Import cell)
## is used instead.
## The clock is sampled exactly once so that every field below describes the
## same instant (separate now() calls can straddle a minute or day boundary).
dt_now = datetime.now()

str_now_ymd = dt_now.date()
str_now_y, str_now_m, str_now_d = dt_now.year, dt_now.month, dt_now.day
str_now_hr, str_now_min = dt_now.hour, dt_now.minute

print(dt_now)
print(str(str_now_y) + " / " + str(str_now_m)  + " / " + str(str_now_d))
print(str(str_now_hr) + " : " + str(str_now_min))

2024-08-20 23:03:28.384470
2024 / 8 / 20
23 : 3


In [6]:
## Dict_Domain
## Run configuration. Code tables for the selectors below:
##   domain   : {0:"ELEC", 1:"HEAT", 2:"WATER", 3:"HOT_HEAT", 4:"HOT_FLOW", 99:"GAS"}
##   K        : number of clusters, 2 or 3
##   interval : {0 : '10MIN', 1 : '1H', 2 : '1D', 3 : '1W', 4 : '1M'}
##   group    : {0 : 'ALL', 1 : 'C0', 2 : 'C1', 3 : 'C2'}
int_domain = 0
K = 3
## [progress marker] only the two selectors below are changed between runs
int_interval = 4
int_grp = 1
# Alternative preset kept for reference:
# int_domain, int_interval, K, int_grp = 0, 7, 2, 0

## Group id -> suffix used in the target column name
dict_grp = dict(enumerate(['ALL', 'C0', 'C1', 'C2']))

## Domain name and ACCU / INST column names
tuple_domain = com_KIER_M02.create_domain_str(int_domain)
str_domain, str_col_accu, str_col_inst = tuple_domain

## Directory roots
tuple_dirs = com_KIER_M02.create_dir_str(str_domain)
str_dirData, str_dir_raw, str_dir_cleansed, str_dirName_bld, str_dirName_h = tuple_dirs

## Interval label and target file names
tuple_files = com_KIER_M02.create_file_str(str_domain, int_interval)
str_interval, str_fileRaw, str_fileRaw_hList, str_file = tuple_files

## Show what is available in the data root and in the household directory
print(os.listdir(str_dirData), end = "\n\n")
print(os.listdir(str_dirName_h))

0 : ELEC
str_fileRaw : KIER_RAW_ELEC_2024-06-07.csv
str_fileRaw_hList : KIER_RAW_ELEC_2024-06-07.csv
str_file : KIER_ELEC_INST_1M_Resampled.csv
['.ipynb_checkpoints', 'BS_CONFIGURATION_202309251452.csv', 'DATE_1M_2023-10-20.csv', 'KIER 전처리 현황_2024-06-25.xlsx', 'KIER_0_Raw', 'KIER_1_Cleansed', 'KIER_2_BLD', 'KIER_3_H_ELEC', 'KIER_3_H_GAS', 'KIER_3_H_HEAT', 'KIER_3_H_HOT_FLOW', 'KIER_3_H_HOT_HEAT', 'KIER_3_H_WATER', 'KIER_ASOS_WEATHER_DAILY_202309251521.csv', 'KIER_ASOS_WEATHER_HOUR_202309251521.csv', 'KIER_DATA_OLD', 'KIER_ETC', 'KIER_hList_Comparison_2024-06-26.xlsx', 'KIER_List_Table_Column_2023-09-25.xlsx', 'KIER_Query_2023-09-25.txt', 'KMA_ASOS_119_2010_2023_1st_to CSV.csv', '[IITP] 데이터 테이블 정리 (공유 원본).docx']

['KIER_ELEC_561-1-1_ACCU_01_Raw.csv', 'KIER_ELEC_561-1-2_ACCU_01_Raw.csv', 'KIER_ELEC_561-1-3_ACCU_01_Raw.csv', 'KIER_ELEC_561-1-4_ACCU_01_Raw.csv', 'KIER_ELEC_561-10-1_ACCU_01_Raw.csv', 'KIER_ELEC_561-10-2_ACCU_01_Raw.csv', 'KIER_ELEC_561-10-3_ACCU_01_Raw.csv', 'KIER_ELEC_561-

## 01-02. Data Load (df_raw)

### 01-02-01. KMA ASOS

In [7]:
## KMA_ASOS Data
# str_dir_kmaAsos = "../data/data_KMA_ASOS/"

## Interpolated / gap-filled ASOS weather data
str_file = '../data_Energy_KIER/KMA_ASOS_119_2010_2023_1st_to CSV.csv'
df_ASOS = pd.read_csv(str_file, index_col = 0).reset_index()

## METER_DATE must be a datetime column because the usage frames are later
## merged with df_ASOS on it.
try :
    df_ASOS['METER_DATE'] = pd.to_datetime(df_ASOS['METER_DATE'])
except KeyError :
    ## The file has no METER_DATE column: rebuild it through the project helper
    ## (presumably from the YEAR..MINUTE component columns - confirm against
    ## com_date.create_col_datetime).
    ## The result has to be bound back to df_ASOS; it used to be stored in an
    ## unrelated name (df_kier_raw), which left df_ASOS without METER_DATE.
    df_ASOS = com_date.create_col_datetime(
        df_ASOS, 'METER_DATE', 'YEAR', 'MONTH', 'DAY', 'HOUR', 'MINUTE'
    ).drop(labels = ['None'], axis = 1)

df_ASOS

Unnamed: 0,METER_DATE,YEAR,MONTH,DAY,HOUR,MINUTE,temp_outdoor,temp_dew_point,temp_ground,humidity,...,wind_speed,wind_direction,pressure_vapor,pressure_area,pressure_sea,sunshine,solar_radiation,cloud_total,cloud_midlow,visual_range
0,2010-01-01 00:00:00,2010,1,1,0,0,-10.3000000000,-17.6000000000,-5.8000000000,55.0000000000,...,1.6000000000,340.0000000000,1.5000000000,1019.7000000000,1024.3000000000,,,0.0000000000,0.0000000000,2000.0000000000
1,2010-01-01 01:00:00,2010,1,1,1,0,-10.4000000000,-17.9000000000,-5.7000000000,54.0000000000,...,0.3000000000,0.0000000000,1.5000000000,1019.5000000000,1024.1000000000,,,,,
2,2010-01-01 02:00:00,2010,1,1,2,0,-10.6000000000,-17.6000000000,-5.9000000000,56.0000000000,...,0.6000000000,270.0000000000,1.5000000000,1019.7000000000,1024.3000000000,,,,,
3,2010-01-01 03:00:00,2010,1,1,3,0,-11.0000000000,-18.0000000000,-5.9000000000,56.0000000000,...,1.5000000000,290.0000000000,1.5000000000,1019.7000000000,1024.3000000000,,,0.0000000000,0.0000000000,2000.0000000000
4,2010-01-01 04:00:00,2010,1,1,4,0,-11.6000000000,-18.2000000000,-6.0000000000,58.0000000000,...,0.9000000000,290.0000000000,1.5000000000,1019.4000000000,1024.0000000000,,,0.0000000000,0.0000000000,2000.0000000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116911,2023-10-17 19:00:00,2023,10,17,19,0,14.4000000000,4.9000000000,14.7000000000,53.0000000000,...,0.4000000000,0.0000000000,8.7000000000,1015.1000000000,1019.9000000000,,,0.0000000000,0.0000000000,5000.0000000000
116912,2023-10-17 20:00:00,2023,10,17,20,0,12.5000000000,5.8000000000,13.4000000000,64.0000000000,...,0.3000000000,0.0000000000,9.2000000000,1015.6000000000,1020.4000000000,,,0.0000000000,0.0000000000,3742.0000000000
116913,2023-10-17 21:00:00,2023,10,17,21,0,11.4000000000,6.5000000000,12.4000000000,72.0000000000,...,0.8000000000,320.0000000000,9.7000000000,1015.8000000000,1020.6000000000,,,0.0000000000,0.0000000000,3158.0000000000
116914,2023-10-17 22:00:00,2023,10,17,22,0,10.2000000000,6.8000000000,11.5000000000,80.0000000000,...,0.8000000000,320.0000000000,9.9000000000,1016.2000000000,1021.1000000000,,,0.0000000000,0.0000000000,3321.0000000000


In [8]:
## Cluster labels for the reference interval
## File name is assembled from the domain, the interval label and K.
str_col_label = 'target_' + str_domain
str_file_clustering = ''.join(['KIER_', str(str_domain), '_Labeled_', str_interval, '_K', str(K), '.csv'])

df_labels = pd.read_csv(str_dirName_h + str_file_clustering, index_col = 0)
df_labels = df_labels.rename(columns = {'index' : 'h_index'})
## Keep only the household identifier and its cluster label
df_kier_h_cluster = df_labels[['h_index', str_col_label]]

print(str_interval)
## Distinct cluster labels present in the file
print(df_kier_h_cluster[str_col_label].drop_duplicates())
# df_kier_h_cluster

1M
0     2
1     1
11    0
Name: target_ELEC, dtype: int64


In [9]:
## Household column names: all of them, then one Series per cluster label.
## The length of each selection is printed right after it is built.
sr_cluster_label = df_kier_h_cluster['target_' + str_domain]

list_kier_h_all = df_kier_h_cluster['h_index']
print(len(list_kier_h_all))

list_kier_h_c0 = df_kier_h_cluster.loc[sr_cluster_label == 0, 'h_index']
print(len(list_kier_h_c0))

list_kier_h_c1 = df_kier_h_cluster.loc[sr_cluster_label == 1, 'h_index']
print(len(list_kier_h_c1))

## The third cluster only exists when K is 3
if K == 3 :
    list_kier_h_c2 = df_kier_h_cluster.loc[sr_cluster_label == 2, 'h_index']
    print(len(list_kier_h_c2))

341
56
164
121


In [10]:
## Load usage data
## Hourly (1H) resampled file for the selected domain
str_file = ''.join(['KIER_', str_domain, '_INST_1H_Resampled.csv'])
str_path_usage = str_dirName_h + str_file
df_raw = pd.read_csv(str_path_usage, index_col = 0)
df_raw

Unnamed: 0,METER_DATE,ELEC_INST_EFF_561-1-1,ELEC_INST_EFF_561-1-2,ELEC_INST_EFF_561-1-3,ELEC_INST_EFF_561-1-4,ELEC_INST_EFF_561-2-1,ELEC_INST_EFF_561-2-2,ELEC_INST_EFF_561-2-3,ELEC_INST_EFF_561-2-4,ELEC_INST_EFF_561-3-1,...,ELEC_INST_EFF_563-22-3,ELEC_INST_EFF_563-22-4,ELEC_INST_EFF_563-22-5,ELEC_INST_EFF_563-22-6,ELEC_INST_EFF_563-23-1,ELEC_INST_EFF_563-23-2,ELEC_INST_EFF_563-23-3,ELEC_INST_EFF_563-23-4,ELEC_INST_EFF_563-24-1,ELEC_INST_EFF_563-24-2
0,2022-07-17 23:00:00,0.8600000000,1.2500000000,0.1700000000,1.0633876812,0.8600000000,0.2500000000,0.6000000000,0.4800000000,0.8100000000,...,0.6000000000,0.6100000000,0.1900000000,0.2200000000,0.2200000000,0.3600000000,0.2600000000,0.2100000000,0.1600000000,0.4400000000
1,2022-07-18 00:00:00,0.7277880265,1.1200000000,0.1800000000,0.9600000000,0.4500000000,0.2600000000,0.6000000000,0.4600000000,0.7100000000,...,0.2349605523,0.4600000000,0.1600000000,0.2400000000,0.3100000000,0.4100000000,0.2500000000,0.2000000000,0.1400000000,0.3868823529
2,2022-07-18 01:00:00,0.5011737005,0.9900000000,0.1900000000,0.8900000000,0.5800000000,0.2400000000,0.6300000000,0.4500000000,0.7000000000,...,0.2100000000,0.2700000000,0.1600000000,0.2700000000,0.2500000000,0.3400000000,0.2600000000,0.2000000000,0.1300000000,0.2500000000
3,2022-07-18 02:00:00,0.3300000000,1.0300000000,0.2000000000,1.0600000000,0.2700000000,0.1800000000,0.1400000000,0.2700000000,0.6100000000,...,0.2000000000,0.2500000000,0.1400000000,0.2400000000,0.2100000000,0.3400000000,0.2600000000,0.1900000000,0.1400000000,0.2200000000
4,2022-07-18 03:00:00,0.2600000000,1.0000000000,0.2700000000,0.8700000000,0.6200000000,0.1700000000,0.1300000000,0.2300000000,0.5700000000,...,0.2100000000,0.2800000000,0.1300000000,0.3200000000,0.1900000000,0.4600000000,0.2800000000,0.1800000000,0.1800000000,0.3000000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16523,2024-06-05 10:00:00,0.3600000000,0.6900000000,0.3200000000,0.3000000000,0.6400000000,0.3200000000,0.2900000000,0.4400000000,0.3800000000,...,0.1900000000,0.3100000000,0.5000000000,0.2500000000,0.1500000000,0.2500000000,0.1700000000,0.4200000000,0.2100000000,0.2400000000
16524,2024-06-05 11:00:00,0.2700000000,0.4953636364,0.3300000000,0.2100000000,0.4353636364,0.2253636364,0.2053636364,0.5653636364,0.1900000000,...,0.1753636364,0.2353636364,0.3453636364,0.1600000000,0.3453636364,0.3100000000,0.1953636364,0.2053636364,0.2453636364,0.2353636364
16525,2024-06-05 12:00:00,0.2900000000,0.7800000000,0.4000000000,0.2300000000,0.5100000000,0.1800000000,0.1900000000,0.7200000000,0.5900000000,...,0.1800000000,0.1800000000,0.3000000000,0.2000000000,0.4100000000,0.4400000000,0.1800000000,0.1800000000,0.2100000000,0.1900000000
16526,2024-06-05 13:00:00,0.3200000000,0.3900000000,1.0900000000,0.2278662554,0.5000000000,0.1700000000,0.1700000000,0.3078662554,0.3578662554,...,0.2567782655,0.2683431510,0.2200000000,0.3055204052,0.1500000000,0.3467782655,0.2472003185,0.3167782655,0.3367782655,0.2667782655


In [11]:
## Total usage over all households
str_col_sum_all = str_domain + '_INST_SUM_ALL'

df_kier_h_all = df_raw.copy()
df_kier_h_all['METER_DATE'] = pd.to_datetime(df_kier_h_all['METER_DATE'])

## Row-wise sum of every household column, shifted down by one row so that
## the per-household columns act as the previous step's usage for the total.
df_kier_h_tmp = df_raw[list_kier_h_all]
df_kier_h_all[str_col_sum_all] = df_kier_h_tmp.sum(axis = 1).shift(1)

## Displayed without NaN rows; df_kier_h_all itself is left unchanged here
df_kier_h_all.dropna()

Unnamed: 0,METER_DATE,ELEC_INST_EFF_561-1-1,ELEC_INST_EFF_561-1-2,ELEC_INST_EFF_561-1-3,ELEC_INST_EFF_561-1-4,ELEC_INST_EFF_561-2-1,ELEC_INST_EFF_561-2-2,ELEC_INST_EFF_561-2-3,ELEC_INST_EFF_561-2-4,ELEC_INST_EFF_561-3-1,...,ELEC_INST_EFF_563-22-4,ELEC_INST_EFF_563-22-5,ELEC_INST_EFF_563-22-6,ELEC_INST_EFF_563-23-1,ELEC_INST_EFF_563-23-2,ELEC_INST_EFF_563-23-3,ELEC_INST_EFF_563-23-4,ELEC_INST_EFF_563-24-1,ELEC_INST_EFF_563-24-2,ELEC_INST_SUM_ALL
1,2022-07-18 00:00:00,0.7277880265,1.1200000000,0.1800000000,0.9600000000,0.4500000000,0.2600000000,0.6000000000,0.4600000000,0.7100000000,...,0.4600000000,0.1600000000,0.2400000000,0.3100000000,0.4100000000,0.2500000000,0.2000000000,0.1400000000,0.3868823529,133.9294726347
2,2022-07-18 01:00:00,0.5011737005,0.9900000000,0.1900000000,0.8900000000,0.5800000000,0.2400000000,0.6300000000,0.4500000000,0.7000000000,...,0.2700000000,0.1600000000,0.2700000000,0.2500000000,0.3400000000,0.2600000000,0.2000000000,0.1300000000,0.2500000000,113.3287316394
3,2022-07-18 02:00:00,0.3300000000,1.0300000000,0.2000000000,1.0600000000,0.2700000000,0.1800000000,0.1400000000,0.2700000000,0.6100000000,...,0.2500000000,0.1400000000,0.2400000000,0.2100000000,0.3400000000,0.2600000000,0.1900000000,0.1400000000,0.2200000000,98.3017853575
4,2022-07-18 03:00:00,0.2600000000,1.0000000000,0.2700000000,0.8700000000,0.6200000000,0.1700000000,0.1300000000,0.2300000000,0.5700000000,...,0.2800000000,0.1300000000,0.3200000000,0.1900000000,0.4600000000,0.2800000000,0.1800000000,0.1800000000,0.3000000000,91.3885285173
5,2022-07-18 04:00:00,0.4100000000,0.6400000000,0.1900000000,0.5700000000,0.5200000000,0.1800000000,0.1600000000,0.2400000000,0.6800000000,...,0.2600000000,0.1500000000,0.2500000000,0.2200000000,0.3900000000,0.2500000000,0.1800000000,0.1900000000,0.2600000000,87.6994378719
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16523,2024-06-05 10:00:00,0.3600000000,0.6900000000,0.3200000000,0.3000000000,0.6400000000,0.3200000000,0.2900000000,0.4400000000,0.3800000000,...,0.3100000000,0.5000000000,0.2500000000,0.1500000000,0.2500000000,0.1700000000,0.4200000000,0.2100000000,0.2400000000,98.5381556196
16524,2024-06-05 11:00:00,0.2700000000,0.4953636364,0.3300000000,0.2100000000,0.4353636364,0.2253636364,0.2053636364,0.5653636364,0.1900000000,...,0.2353636364,0.3453636364,0.1600000000,0.3453636364,0.3100000000,0.1953636364,0.2053636364,0.2453636364,0.2353636364,110.9769452449
16525,2024-06-05 12:00:00,0.2900000000,0.7800000000,0.4000000000,0.2300000000,0.5100000000,0.1800000000,0.1900000000,0.7200000000,0.5900000000,...,0.1800000000,0.3000000000,0.2000000000,0.4100000000,0.4400000000,0.1800000000,0.1800000000,0.2100000000,0.1900000000,98.6715790569
16526,2024-06-05 13:00:00,0.3200000000,0.3900000000,1.0900000000,0.2278662554,0.5000000000,0.1700000000,0.1700000000,0.3078662554,0.3578662554,...,0.2683431510,0.2200000000,0.3055204052,0.1500000000,0.3467782655,0.2472003185,0.3167782655,0.3367782655,0.2667782655,105.5118301886


In [12]:
## Per-cluster usage totals
def _build_cluster_frame(df_usage, list_cols, str_col_sum):
    """Build the frame for one cluster of households.

    Parameters
    ----------
    df_usage : DataFrame
        Usage table that holds a 'METER_DATE' column and one column per household.
    list_cols : list-like of str
        Household column names that belong to the cluster.
    str_col_sum : str
        Name of the total column to create.

    Returns
    -------
    DataFrame
        The selected household columns, then 'METER_DATE' parsed as datetime,
        then `str_col_sum`: the row-wise sum of the selected columns shifted
        down by one row (so the first row of the total is NaN).
    """
    ## Explicit copy of the selection: new columns are added to an independent
    ## frame instead of to a slice of another frame.
    df_cluster = df_usage[list_cols].copy()
    ## Timestamps come straight from the usage table, so this cell does not
    ## depend on the current state of df_kier_h_all.
    df_cluster['METER_DATE'] = pd.to_datetime(df_usage['METER_DATE'])
    ## Shift the total so that the household columns act as the previous
    ## step's usage for it.
    df_cluster[str_col_sum] = df_cluster[list_cols].sum(axis = 1).shift(1)
    return df_cluster

## NaN rows are intentionally kept here; they are dropped when the modelling
## target frame is assembled.
## C0
df_kier_h_c0 = _build_cluster_frame(df_raw, list_kier_h_c0, str_domain + '_INST_SUM_C0')
## C1
df_kier_h_c1 = _build_cluster_frame(df_raw, list_kier_h_c1, str_domain + '_INST_SUM_C1')
## C2 (only defined when K is 3)
if K == 3:
    df_kier_h_c2 = _build_cluster_frame(df_raw, list_kier_h_c2, str_domain + '_INST_SUM_C2')

In [13]:
# 에너지 사용량 총계만
# df_kier_h_all = df_kier_h_all[['METER_DATE', str_domain + '_INST_SUM_ALL']].dropna()

# df_kier_h_c0 = df_kier_h_c0[['METER_DATE', str_domain + '_INST_SUM_C0']].dropna()

# df_kier_h_c1 = df_kier_h_c1[['METER_DATE', str_domain + '_INST_SUM_C1']].dropna()

# df_kier_h_c2 = df_kier_h_c2[['METER_DATE', str_domain + '_INST_SUM_C2']].dropna()


In [14]:
## Add weather data
def _attach_weather(df_usage):
    """Left-join the ASOS weather table on METER_DATE, then pass the result
    through the project's ASOS interpolation and date-column helpers."""
    df_merged = pd.merge(df_usage, df_ASOS, how = 'left', on = ['METER_DATE'])
    df_filled = com_KMA.Interpolate_KMA_ASOS(df_merged)
    return com_date.create_col_ymdhm(df_filled, 'METER_DATE')

## Same pipeline for the full set and for every cluster frame
df_kier_h_all = _attach_weather(df_kier_h_all)
df_kier_h_c0 = _attach_weather(df_kier_h_c0)
df_kier_h_c1 = _attach_weather(df_kier_h_c1)

## The third cluster frame only exists when K is 3
if K == 3:
    df_kier_h_c2 = _attach_weather(df_kier_h_c2)

In [15]:
## Diagnostics: shape, column index and per-column NaN count of every frame
pd.set_option('display.max_row', 500)

list_frames_check = [df_kier_h_all, df_kier_h_c0, df_kier_h_c1]
## The third cluster frame only exists when K is 3
if K == 3:
    list_frames_check.append(df_kier_h_c2)

for df_check in list_frames_check:
    print(df_check.shape)
    print(df_check.columns)
    print(df_check.isna().sum())

(16528, 372)
Index(['METER_DATE', 'ELEC_INST_EFF_561-1-1', 'ELEC_INST_EFF_561-1-2',
       'ELEC_INST_EFF_561-1-3', 'ELEC_INST_EFF_561-1-4',
       'ELEC_INST_EFF_561-2-1', 'ELEC_INST_EFF_561-2-2',
       'ELEC_INST_EFF_561-2-3', 'ELEC_INST_EFF_561-2-4',
       'ELEC_INST_EFF_561-3-1',
       ...
       'wind_speed', 'wind_direction', 'pressure_vapor', 'pressure_area',
       'pressure_sea', 'sunshine', 'solar_radiation', 'cloud_total',
       'cloud_midlow', 'visual_range'],
      dtype='object', length=372)
METER_DATE                0
ELEC_INST_EFF_561-1-1     0
ELEC_INST_EFF_561-1-2     0
ELEC_INST_EFF_561-1-3     0
ELEC_INST_EFF_561-1-4     0
ELEC_INST_EFF_561-2-1     0
ELEC_INST_EFF_561-2-2     0
ELEC_INST_EFF_561-2-3     0
ELEC_INST_EFF_561-2-4     0
ELEC_INST_EFF_561-3-1     0
ELEC_INST_EFF_561-3-2     0
ELEC_INST_EFF_561-3-3     0
ELEC_INST_EFF_561-3-4     0
ELEC_INST_EFF_561-4-1     0
ELEC_INST_EFF_561-4-2     0
ELEC_INST_EFF_561-4-3     0
ELEC_INST_EFF_561-4-4     0
ELEC_INST

## 01-06. Data Split (Train/Test Setting)

In [16]:
## Select the frame to model from the group id
##   0 : all households, 1 : cluster C0, 2 : cluster C1, 3 : cluster C2
dict_df_grp = {0 : df_kier_h_all, 1 : df_kier_h_c0, 2 : df_kier_h_c1}
## The C2 frame is only built when K is 3
if K == 3 : dict_df_grp[3] = df_kier_h_c2

## Fail loudly on an unusable group id instead of silently reusing a df_tar
## left over from an earlier run of this cell.
if int_grp not in dict_df_grp :
    raise ValueError("int_grp = " + str(int_grp) + " is not available for K = " + str(K)
                     + " (valid : " + str(sorted(dict_df_grp)) + ")")

df_tar = dict_df_grp[int_grp]
## Name of the target column, e.g. <domain>_INST_SUM_C0
str_col_tar = str_domain + '_INST_SUM_' + dict_grp[int_grp]

# df_tar = com_date.create_col_ymdhm(df_tar, 'METER_DATE')
# df_tar = com_date.create_col_weekdays(df_tar, 'METER_DATE')
## Drop the timestamp and DAY columns, then every row that still holds a NaN
df_tar = df_tar.drop(columns = ['METER_DATE', 'DAY']).dropna()
# df_tar = df_tar.drop(columns = ['METER_DATE', 'day_of_the_week']).dropna()
# df_tar = df_tar.drop(columns = ['METER_DATE', 'YEAR', 'MONTH', 'DAY', 'HOUR', 'None']).dropna()

# trainSet_Origin, testSet_Origin = train_test_split(df_tar, test_size = 0.3, shuffle = False)
# print(trainSet_Origin.shape, testSet_Origin.shape)
df_tar

Unnamed: 0,ELEC_INST_EFF_561-5-1,ELEC_INST_EFF_561-5-3,ELEC_INST_EFF_561-6-1,ELEC_INST_EFF_561-6-4,ELEC_INST_EFF_561-7-1,ELEC_INST_EFF_561-8-1,ELEC_INST_EFF_561-9-4,ELEC_INST_EFF_561-10-1,ELEC_INST_EFF_561-11-4,ELEC_INST_EFF_561-13-1,...,wind_speed,wind_direction,pressure_vapor,pressure_area,pressure_sea,sunshine,solar_radiation,cloud_total,cloud_midlow,visual_range
1,0.4300000000,2.8500000000,0.3600000000,0.2900000000,0.2744152454,0.5800000000,0.6000000000,0.1500000000,0.5500000000,0.4600000000,...,0.5000000000,250.0000000000,25.3000000000,997.7000000000,1002.2000000000,0.0000000000,0.0000000000,3.0000000000,3.0000000000,1897.0000000000
2,0.4300000000,0.8943440233,0.3100000000,0.2900000000,0.2100000000,0.5200000000,0.6400000000,0.1800000000,0.4500000000,0.4300000000,...,0.5000000000,110.0000000000,26.0000000000,997.2000000000,1001.7000000000,0.0000000000,0.0000000000,2.0000000000,2.0000000000,1407.0000000000
3,0.4000000000,0.2400000000,0.3200000000,0.2600000000,0.1700000000,0.4600000000,0.5000000000,0.2100000000,0.4010110892,0.3000000000,...,0.7000000000,70.0000000000,26.8000000000,996.7000000000,1001.2000000000,0.0000000000,0.0000000000,4.0000000000,3.0000000000,1044.0000000000
4,0.3400000000,0.2300000000,0.3000000000,0.5600000000,0.1900000000,0.3600000000,0.5200000000,0.1600000000,0.3900000000,0.3200000000,...,0.7000000000,180.0000000000,26.9000000000,996.7000000000,1001.2000000000,0.0000000000,0.0000000000,8.0000000000,8.0000000000,947.0000000000
5,0.5000000000,0.2400000000,0.2800000000,0.3500000000,0.1800000000,0.3100000000,0.4400000000,0.1700000000,0.3500000000,0.4100000000,...,0.3000000000,0.0000000000,26.6000000000,996.6000000000,1001.1000000000,0.0000000000,0.0000000000,7.0000000000,7.0000000000,810.0000000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16523,0.2300000000,0.5800000000,0.2900000000,0.3100000000,0.4200000000,0.4000000000,0.3500000000,0.3800000000,0.3600000000,1.4000000000,...,1.4000000000,320.0000000000,10.0000000000,1016.4000000000,1021.3000000000,0.0000000000,0.0000000000,0.0000000000,0.0000000000,0.0000000000
16524,0.2453636364,1.2900000000,0.2553636364,0.3200000000,0.3653636364,0.3400000000,0.3353636364,0.2553636364,0.3753636364,0.7853636364,...,1.4000000000,320.0000000000,10.0000000000,1016.4000000000,1021.3000000000,0.0000000000,0.0000000000,0.0000000000,0.0000000000,0.0000000000
16525,0.2900000000,0.7800000000,0.3000000000,0.4000000000,0.4200000000,0.4600000000,0.3900000000,0.2400000000,0.5600000000,2.0100000000,...,1.4000000000,320.0000000000,10.0000000000,1016.4000000000,1021.3000000000,0.0000000000,0.0000000000,0.0000000000,0.0000000000,0.0000000000
16526,0.3000000000,0.7978662554,0.4300000000,0.3278662554,0.8100000000,0.3900000000,0.3700000000,0.2400000000,0.8500000000,0.6178662554,...,1.4000000000,320.0000000000,10.0000000000,1016.4000000000,1021.3000000000,0.0000000000,0.0000000000,0.0000000000,0.0000000000,0.0000000000


In [17]:
# list_kf_scores = com_Model.model_ml_analysis_with_KFold(df_tar, 0, 0.3, str_col_tar, 5)
# list_kf_scores

In [18]:
from sklearn.model_selection import KFold, TimeSeriesSplit 

## Model id -> short model name used in the output file names
dict_model = {0 : 'CB', 1 : 'DT', 2 : 'LGBM', 3 : 'RF', 4 : 'XGB'}
float_rate = 0.3
test_size = round(len(df_tar) * float_rate)
int_fold = 10

## Make sure the output directories exist before any model is trained, so a
## missing directory cannot abort the run after the first evaluation.
for str_dir_out in ("../kf_result", "../kf_hist") :
    os.makedirs(str_dir_out, exist_ok = True)

## NOTE(review): the file names below only encode the model name and the fold
## count, so a run with another domain / interval / group overwrites them.
for int_model, str_model in dict_model.items() :
    list_res, list_hists = com_Model.model_ml_analysis_with_KFold(df_tar, int_model, float_rate, str_col_tar, int_fold)

    ## Save list_res
    ## `with` closes (and therefore flushes) each file deterministically
    ## instead of relying on the handle being garbage-collected on rebind.
    str_txt = "../kf_result/kf_result_" + str_model + '_K' + str(int_fold) + '.txt'
    with open(str_txt, 'w') as file_txt :
        print(list_res, file = file_txt)

    ## Save list_hists
    str_txt = "../kf_hist/kf_hist_" + str_model + '_K' + str(int_fold) + '.txt'
    with open(str_txt, 'w') as file_txt :
        print(list_hists, file = file_txt)


XGB
MAE  :  3.3921
MAPE :  0.1005
MSE  :  20.7254
RMSE :  4.5525
MSLE :  0.0147
MBE  :  0.0818
R2   :  0.861
XGB
MAE  :  2.3135
MAPE :  0.0852
MSE  :  9.2876
RMSE :  3.0476
MSLE :  0.0105
MBE  :  -0.1307
R2   :  0.8779
XGB
MAE  :  2.3675
MAPE :  0.0833
MSE  :  9.5099
RMSE :  3.0838
MSLE :  0.0104
MBE  :  0.2299
R2   :  0.8676
XGB
MAE  :  2.1436
MAPE :  0.0779
MSE  :  7.8853
RMSE :  2.8081
MSLE :  0.009
MBE  :  0.0811
R2   :  0.8872
XGB
MAE  :  2.5414
MAPE :  0.0958
MSE  :  11.3242
RMSE :  3.3651
MSLE :  0.0124
MBE  :  -1.2583
R2   :  0.8275
XGB
MAE  :  5.4022
MAPE :  0.1223
MSE  :  60.4637
RMSE :  7.7758
MSLE :  0.0277
MBE  :  3.6332
R2   :  0.6727
XGB
MAE  :  2.2752
MAPE :  0.0803
MSE  :  9.2459
RMSE :  3.0407
MSLE :  0.0095
MBE  :  -0.2855
R2   :  0.8759
XGB
MAE  :  2.474
MAPE :  0.0801
MSE  :  10.7534
RMSE :  3.2792
MSLE :  0.0101
MBE  :  0.9404
R2   :  0.8524
XGB
MAE  :  2.5462
MAPE :  0.0879
MSE  :  11.0058
RMSE :  3.3175
MSLE :  0.0113
MBE  :  0.1286
R2   :  0.8376
XGB
MAE  :  2.

In [19]:
## Per-fold history returned by the last iteration of the model loop
## (int_model = 4, 'XGB'); earlier models' values were rebound and are only in the saved files.
## NOTE(review): presumably one inner list per printed metric (MAE, MAPE, MSE, RMSE,
## MSLE, MBE, R2) plus one extra series - confirm against com_Model.
list_hists

[[3.3921,
  2.3135,
  2.3675,
  2.1436,
  2.5414,
  5.4022,
  2.2752,
  2.474,
  2.5462,
  2.5629],
 [0.1005,
  0.0852,
  0.0833,
  0.0779,
  0.0958,
  0.1223,
  0.0803,
  0.0801,
  0.0879,
  0.1025],
 [20.7254,
  9.2876,
  9.5099,
  7.8853,
  11.3242,
  60.4637,
  9.2459,
  10.7534,
  11.0058,
  10.942],
 [4.5525,
  3.0476,
  3.0838,
  2.8081,
  3.3651,
  7.7758,
  3.0407,
  3.2792,
  3.3175,
  3.3079],
 [0.0147,
  0.0105,
  0.0104,
  0.009,
  0.0124,
  0.0277,
  0.0095,
  0.0101,
  0.0113,
  0.0153],
 [0.0818,
  -0.1307,
  0.2299,
  0.0811,
  -1.2583,
  3.6332,
  -0.2855,
  0.9404,
  0.1286,
  -0.8428],
 [0.861,
  0.8779,
  0.8676,
  0.8872,
  0.8275,
  0.6727,
  0.8759,
  0.8524,
  0.8376,
  0.8258],
 [0.21924042701721191,
  0.40619659423828125,
  0.18417572975158691,
  0.21280980110168457,
  0.21931815147399902,
  0.1782987117767334,
  0.1943192481994629,
  0.19527101516723633,
  0.20368456840515137,
  0.24491643905639648]]

In [20]:
## Summary result returned by the last iteration of the model loop (int_model = 4, 'XGB').
## NOTE(review): presumably one aggregate per entry of list_hists - confirm against com_Model.
list_res

[2.8019, 0.0916, 16.1143, 3.7578, 0.0131, 0.2578, 0.8386, 0.2258]