### Code Hist.

 - CODE : KIER M02 - Clustering
 - DESC  
    &ensp; : 최적의 Cluster를 선정하기 위한 정량적 비교  
    &emsp; 1) Inertia 기반의 Elbow-Method  
    &emsp; 2) 군집화 계수 비교 : Silhouette / CHI / Dunn Index  

  - DATE  
    &ensp; 2024-02-01 Created  
    &ensp; 2024-04-03 코드 개선  
    &emsp;&emsp;&emsp;&emsp;&emsp;&emsp; 1) KIER M02 초기부분 공통코드화  
    &ensp; 2024-04-04 Updated  
    &emsp;&emsp;&emsp;&emsp;&emsp;&emsp; 1) 기능 구현 완료 및 논문 작성    
    &ensp; 2024-07-23 Updated  
    &emsp;&emsp;&emsp;&emsp;&emsp;&emsp; 1) Dunn Index 부분 추가    

# 01. Code

## 01-01. Init

### 01-01-01. Init_Module Import

In [None]:
#region Basic_Import
## Basic
import os, sys, warnings
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.path.dirname(os.path.abspath('./__file__'))
sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname('./__file__'))))
warnings.filterwarnings('ignore')

import numpy as np, pandas as pd
from pandas import DataFrame, Series
pd.options.display.float_format = '{:.10f}'.format

import math, random

## Datetime
import time, datetime as dt
from datetime import datetime, date, timedelta

## glob
import glob, requests, json
from glob import glob

## 시각화
import matplotlib.pyplot as plt, seaborn as sns
# %matplotlib inline
plt.rcParams['figure.figsize'] = [10, 8]

from scipy import stats

## Split, 정규화
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# K-Means 알고리즘
from sklearn.cluster import KMeans, MiniBatchKMeans

# Clustering 알고리즘의 성능 평가 측도
from sklearn import metrics
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score, adjusted_rand_score, silhouette_score, rand_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics.cluster import contingency_matrix

## For Web
import urllib
from urllib.request import urlopen
from urllib.parse import urlencode, unquote, quote_plus
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

import tqdm
from tqdm.notebook import tqdm
#endregion Basic_Import

In [None]:
## Import_DL
## Select which deep-learning framework gets imported by this notebook.
## str_tar must be "torch" or "tf"; any other value only prints an error
## message (no exception is raised, and no framework is imported).
## NOTE(review): a later cell calls tf.random.set_seed(...) unconditionally,
## which only works when the "tf" branch below has run.
str_tar = "tf"
## For Torch
if str_tar == "torch":
    import torch, torch.nn as nn
    from torch.nn.utils import weight_norm
    print("Torch Imported")
## For TF
elif str_tar == "tf":
    import tensorflow as tf, tensorflow_addons as tfa
    from keras.callbacks import EarlyStopping, ModelCheckpoint
    from keras.models import Sequential, load_model
    from keras_flops import get_flops
    print("Tensorflow Imported")
else:
    # Unknown keyword: report the accepted values and continue
    print("Error : Cannot be used except for Keywords")
    print(" : torch / tf")

In [None]:
## Import_Local
from Src_Dev_Common import Data_Datetime as com_date, KMA_Weather as com_KMA, KECO_AirKor as com_KECO, KASI_Holiday as com_Holi, KIER_Usage_M02 as com_KIER_M02, Data_Clustering as com_clustering

### 01-01-02. Config (Directory, Params)

In [None]:
## Init_config
## Use one shared seed for every random source touched by this notebook.
SEED = 42

## Seed NumPy, TensorFlow and the stdlib RNG, in that order
for seed_fn in (np.random.seed, tf.random.set_seed, random.seed):
    seed_fn(SEED)

## Environment flags, written after the seeds as before
os.environ["PYTHONHASHSEED"] = str(SEED)
os.environ['TF_DETERMINISTIC_OPS'] = "1"

In [None]:
## Define today's date / time parts
## `pd.datetime` was deprecated in pandas 1.0 and removed in pandas 2.0, so the
## `datetime` class imported at the top of the notebook is used instead.
## A single snapshot is taken so that every part describes the same instant
## (repeated now() calls could straddle a minute or day boundary).
dt_now = datetime.now()

str_now_ymd = dt_now.date()
str_now_y, str_now_m, str_now_d = dt_now.year, dt_now.month, dt_now.day
str_now_hr, str_now_min = dt_now.hour, dt_now.minute

print(dt_now)
print(f"{str_now_y} / {str_now_m} / {str_now_d}")
print(f"{str_now_hr} : {str_now_min}")

In [None]:
## Domain / interval selection
## Domain codes   : {0:"ELEC", 1:"HEAT", 2:"WATER", 3:"HOT_HEAT", 4:"HOT_FLOW", 99:"GAS"}
## Interval codes : {0 : '10MIN', 1 : '30MIN', 2 : '1H', 3 : '12H', 4 : '1D', 5 : '1W', 6 : '2W', 7 : '1M'}
int_domain = 0
int_interval = 4

## Domain name and the ACCU / INST column names
str_domain, str_col_accu, str_col_inst = com_KIER_M02.create_domain_str(int_domain)

## Directory roots for the chosen domain
(
    str_dirData,
    str_dir_raw,
    str_dir_cleansed,
    str_dirName_bld,
    str_dirName_h,
) = com_KIER_M02.create_dir_str(str_domain)

## Interval name and the target file names
(
    str_interval,
    str_fileRaw,
    str_fileRaw_hList,
    str_file,
) = com_KIER_M02.create_file_str(str_domain, int_interval)

## Show what is available in the data root and in the directory the input file is read from
print(f"{os.listdir(str_dirData)}\n")
print(os.listdir(str_dirName_h))

## 01-02. Data Load (df_raw)

### 01-02-01. KIER (Energy Usage)

In [None]:
## Load the usage table for the selected domain / interval
str_path_kier = str_dirName_h + str_file
df_kier_raw = pd.read_csv(str_path_kier, index_col=0)

## Make sure 'METER_DATE' is a datetime-typed column
try:
    df_kier_raw['METER_DATE'] = pd.to_datetime(df_kier_raw['METER_DATE'])
except KeyError:
    ## No 'METER_DATE' column in the file: let the common date helper create it
    ## from the date-part columns, then remove the 'None' column
    df_with_date = com_date.create_col_datetime(
        df_kier_raw, 'METER_DATE', 'YEAR', 'MONTH', 'DAY', 'HOUR', 'MINUTE'
    )
    df_kier_raw = df_with_date.drop(columns=['None'])

## Total count of missing values across the whole frame, then display the frame
print(df_kier_raw.isna().sum().sum())
df_kier_raw

In [None]:
## Keep only the per-unit instantaneous-usage columns
## 1H / 1D files : every column after the first one
## (10MIN files would use df_kier_raw.columns[6:-2] instead)
list_col_tar = df_kier_raw.columns[1:].tolist()

## Time-indexed view of the same table
df_kier_h = df_kier_raw.set_index(keys='METER_DATE')
df_kier_h

In [None]:
## Sanity check: how many target columns were selected, followed by their names
print(len(list_col_tar))
list_col_tar

In [None]:
## Error log: when the whole frame was transposed, the date-part rows and the
## mean / sum rows had to be cut off with
##     df_kier_h.transpose().reset_index()[5:-2]
## otherwise the clustering indices were not computed correctly.
## Selecting only the target columns before transposing avoids that problem.
df_units_as_rows = df_kier_h[list_col_tar].T.reset_index()

## After reset_index() the unit (household) identifier sits in a column named
## 'index', which caused errors; store it as 'h_index' and drop the original.
df_kier_summary_total = (
    df_units_as_rows
    .assign(h_index = df_units_as_rows['index'])
    .drop(columns = 'index')
)
df_kier_summary_total

In [None]:
## Split the summary table:
##   y : the unit (household) identifiers
##   X : every remaining column, used as the clustering features
y = df_kier_summary_total['h_index']
X = df_kier_summary_total.drop(columns = ['h_index'])

## Missing-value count per feature column
X.isnull().sum()
# y

참고 (군집분석 강의자료) : <http://bigdata.dongguk.ac.kr/lectures/datascience/_book/%EA%B5%B0%EC%A7%91%EB%B6%84%EC%84%9D.html>

In [None]:
# Standardize the features
# fit_transform() learns the per-feature mean / standard deviation and applies
# the scaling in one call; the fitted parameters stay available afterwards as
# scaler.mean_ and scaler.scale_.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)
# X_std

### Clustering (군집화) : K - Means

군집의 수 결정 방법  
1) elbow method - 군집의 개수와 군집내 변동의 합을 그래프로 나타내고, 변동량의 변화가 작아지는 지점의 군집의 수를 적정 군집의 수로 결정함  
2) 군집화 계수 비교 : Silhouette / CHI / Dunn Index

In [None]:
## Range of cluster counts to evaluate; the results collected over this range
## are what the choice of the number of clusters is based on.
int_cluster_min = 2   ## smallest number of clusters
int_cluster_max = 10  ## largest number of clusters

In [None]:
## Elbow method over the configured cluster range on the standardized data.
## The helper returns two lists; judging by the names they hold the inertia per
## k and its change between consecutive k — TODO confirm in Data_Clustering.
## NOTE(review): opt_X = 3 presumably marks k = 3 on the helper's plot — confirm.
list_intertia, list_intertia_deriv = com_clustering.clustering_elbow_method(str_interval, int_cluster_min, int_cluster_max, X_std, opt_X = 3)
print(list_intertia)
print(list_intertia_deriv)

In [None]:
## CHI (presumably Calinski-Harabasz Index — confirm in Data_Clustering) over
## the configured cluster range on the standardized data.
## NOTE(review): the trailing positional 2 is presumably the same opt_X
## argument the elbow helper takes by keyword — confirm.
list_CHI = com_clustering.clustering_CHI_method(str_interval, int_cluster_min, int_cluster_max, X_std, 2)
print(list_CHI)

In [None]:
## Silhouette evaluation over the configured cluster range on the standardized data.
## The helper returns two lists; judging by the names they hold the silhouette
## score per k and the cluster sizes per k — TODO confirm in Data_Clustering.
## NOTE(review): the trailing positional 2 is presumably the same opt_X
## argument the elbow helper takes by keyword — confirm.
list_Silhouette, list_cnt_clusters_by_K = com_clustering.clustering_Silhouette_method(str_interval, int_cluster_min, int_cluster_max, X_std, 2)
print(list_Silhouette)
print(list_cnt_clusters_by_K)

In [None]:
from Src_Dev_Common.cluster_eval import get_Dunn_index

## Dunn Index for every candidate number of clusters, plotted against k.
## opt_X : the k to highlight on the plot (None = no highlight)
opt_X = 2

## Guard 01
## Coerce both bounds to int and swap them when they were given in reverse.
## The +1 makes the upper bound inclusive for range().
if int_cluster_min > int_cluster_max:
    int_clusters_min, int_clusters_max = int(int_cluster_max), int(int_cluster_min) + 1
else:
    int_clusters_min, int_clusters_max = int(int_cluster_min), int(int_cluster_max) + 1

## Initial variables
list_Dunn = []
K = range(int_clusters_min, int_clusters_max)

## One K-Means fit per candidate k; list_Dunn[i] belongs to K[i]
for n_cluster in K:
    km_dunn = KMeans(n_clusters = n_cluster, init="k-means++", max_iter=300, n_init=1).fit(X_std)
    cluster = km_dunn.predict(X_std)
    list_Dunn.append(get_Dunn_index(X_std, cluster))

fig = plt.figure(figsize=(8,8))
fig.set_facecolor('white')
ax = fig.add_subplot()
ax.plot(K, list_Dunn, marker='.', markersize = 5, zorder = 2)

## Highlight the chosen k only when it was actually evaluated.
## The list position is looked up from K itself, so it stays correct when the
## lower bound is not 2 (the former fixed "opt_X - 2" offset did not).
if opt_X is not None and opt_X in K:
    ax.scatter(opt_X, list_Dunn[K.index(opt_X)], color = 'red', marker = '^', label = 'Point', zorder = 9999)

ax.set_xticks(K)
ax.set_xlabel('k')
ax.set_ylabel('Dunn Index')
ax.set_title('Dunn Index by number of clusters (Interval : ' + str_interval + ')')
plt.show()

print(list_Dunn)

## 선정된 군집의 수에 따라 군집화 시뮬레이션 시행

In [None]:
## Settings for the final clustering run, using the number of clusters chosen above
K = 3          ## number of clusters that was chosen / is to be evaluated
cnt_loop = 10  ## how many clustering attempts are made for the evaluation

In [None]:
## Final K-Means model for the chosen K
## (an outer "for i in range(0, 10)" loop was once used here for intervals other than 1W / 1M)
km = KMeans(n_clusters=K, init="k-means++", max_iter=300, n_init=1)
km.fit(X_std)
cluster = km.predict(X_std)

## Repeat the clustering cnt_loop times through the common helper and log the result
list_log_clusters = com_clustering.clustering_get_cnt_by_loop(K, cnt_loop, X_std)
print(f"총 {cnt_loop}회에 걸친 군집화 시뮬레이션")
print(list_log_clusters)

In [None]:
## Evaluate the final clustering and save the labeled data
print(com_clustering.get_cluster_sizes(km, X_std)) ## cluster sizes of the final clustering

com_clustering.clustering_visualization(str_interval, km, X_std)
list_scores = com_clustering.get_clustring_score(km, X_std, y)

## Attach the cluster label of every row in one column assignment.
## km.labels_ holds one label per row of X_std, in the same row order as
## df_kier_summary_total. The former per-row chained assignment
## (df[col].iloc[i] = ...) writes to a temporary object under pandas
## Copy-on-Write, which would leave every label at 0.
str_col_target = 'target_' + str_domain
df_kier_summary_total[str_col_target] = km.labels_

## Keep only the unit identifier and its label, then write the CSV
str_file_labeled = f"{str_dirName_h}KIER_{str_domain}_Labeled_{str_interval}_K{K}.csv"
df_kier_summary_total = df_kier_summary_total[['h_index', str_col_target]]
df_kier_summary_total.to_csv(str_file_labeled)
print(str_file_labeled)
df_kier_summary_total