In [1]:
# import dgl # huggingface 같은 라이브러리
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from lib import Trainer, RNNTrainer
from lib import save_figure_predict
from lib import matplotlib_plot_font
# from models import LSTM
from lib import compute_metrics
from model_select import model_selection

import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import os
from math import ceil
import warnings
warnings.filterwarnings('ignore')

matplotlib_plot_font()
device = torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm
Using TensorFlow backend.


In [2]:
def seq_data(data,sequence_length):
    x_seq = []
    y_seq = []
    for i in range(len(data) - sequence_length):
        x_seq.append(data[i:i+sequence_length])
        y_seq.append(data[i+sequence_length])
        # print(y_seq)
    return torch.FloatTensor(x_seq).to(device), torch.FloatTensor(y_seq).to(device).view(-1, 1)
    
def seed_everything(seed = 42):
    import random
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print(f'Seed set = {seed}')
    
seed_everything()

Seed set = 42


In [13]:
##################
## Load Dataset ##
##################

region_type = 'city'
df2 = pd.read_csv(f'/Users/jeonjunhwi/문서/Projects/Master_GNN/Data/KCDC_data/Processing_Results/smoothing_1_{region_type}_mean.csv', index_col=0, encoding='cp949')

# df = df2.iloc[100:533] # 델타 : 554, 540, 533 오미크론 : 707, 693, 686 
df2 = df2.iloc[:850] #340 326 319
df = df2.diff()
df = df.dropna()

############################################
## train, test, validation length setting ##
############################################

# split_date = '2020-11-23'
# split_date = '2021-06-25'
split_date = '2021-11-25'
val_ratio = 0.2

train_df = df[df.index < split_date]
test = df.iloc[len(train_df):, :]

len_val = int(train_df.shape[0] * val_ratio)
len_train = train_df.shape[0] - len_val
train = train_df.iloc[:len_train,:]
val = train_df.iloc[len_train :, :]


### 컬럼을 숫자로 바꿔줌 ###
region_dict = {}
for i, region in enumerate(df.columns):
    region_dict[i] = region
# df.columns = list(region_dict.keys())

y_pred = pd.DataFrame({})
fig = plt.figure(figsize=(25,15), facecolor='white')
gt_list = []
pred_list = []

############################
## Hyperparameter Setting ##
############################

diff_ = "1st"
suptitle_1 = 'Standard Scaler, LSTM'
epochs = 50
batch_size = 16
num_layers = 1
# hidden_size = 16
# output_size = 16
TIME_STEPS = 5
learning_rate = 1e-2
criterion = nn.MSELoss()

import shutil
MODEL_NAME = 'RNN'
tmp = ""
dataset_name = "Baseline_707"
horizon = len(test)-TIME_STEPS

result_dir = ('%s'+tmp) % (MODEL_NAME)
model_dir = ('%s'+tmp) % (MODEL_NAME)
Figure_path = os.path.join('Result', dataset_name, 'Figure', str(horizon), result_dir)
Diameter_path = os.path.join('Result', dataset_name, 'Diameter', str(horizon), result_dir)
Pred_path = os.path.join('Result', dataset_name, 'Pred', str(horizon), result_dir)
model_path = os.path.join('Save_model', dataset_name, str(horizon), model_dir)


if os.path.exists(Figure_path):
    shutil.rmtree(Figure_path) # 해당 경로 데이터 모두 삭제
if os.path.exists(Diameter_path):
    shutil.rmtree(Diameter_path) # 해당 경로 데이터 모두 삭제
if os.path.exists(Pred_path):
    shutil.rmtree(Pred_path) # 해당 경로 데이터 모두 삭제
if os.path.exists(model_path):
    shutil.rmtree(model_path) # 해당 경로 데이터 모두 삭제
    
os.makedirs(Figure_path) # 새로 폴더 생성
os.makedirs(Diameter_path) # 새로 폴더 생성
os.makedirs(Pred_path) # 새로 폴더 생성
os.makedirs(model_path) # 새로 폴더 생성

gt_list, pred_list = pd.DataFrame({}), pd.DataFrame({})
gt_5_list, pred_5_list = [], []
gt_10_list, pred_10_list = [], []
gt_15_list, pred_15_list = [], []
gt_mul_list, pred_mul_list = pd.DataFrame({}), pd.DataFrame({})
MAE_LSTM_list, RMSE_LSTM_list = [], []
# date_split = f"{df.index[0]} ~ {df.index[len_train]} ~ {df.index[len_train + len_val]} ~ {df.index[-1]}"
# print(date_split)
# for i in [0]:
for i, region in enumerate(df.columns):
# for i, region in enumerate(['인천 동구', '경기 시흥시']):
    
    #######################
    ## Define DataLoader ##
    #######################
    
    train = df[[region]][:len_train]
    val = df[[region]][ len_train: len_train + len_val]
    test = df[[region]][ len_train + len_val:]
    
    scaler = StandardScaler()

    train = scaler.fit_transform(train)
    val = scaler.transform(val)
    test = scaler.transform(test)

    x_train_seq, y_train_seq = seq_data(train, TIME_STEPS)
    x_val_seq, y_val_seq = seq_data(val, TIME_STEPS)
    x_test_seq, y_test_seq = seq_data(test, TIME_STEPS)
    # print(x_train_seq.shape, x_val_seq.shape, x_test_seq.shape)
    
    date_split = f"{df.index[0]} ~ {df.index[len(x_train_seq)-1]} ~ {df.index[len(x_train_seq)+TIME_STEPS + len(x_val_seq)+TIME_STEPS*2-1]} ~ {df.index[-1]}"
    print(date_split)
    
    # input(" stop ")
    # horizon = len(y_test_seq)
    train_torch = torch.utils.data.TensorDataset(x_train_seq, y_train_seq)
    val_torch = torch.utils.data.TensorDataset(x_val_seq, y_val_seq)
    test_torch = torch.utils.data.TensorDataset(x_test_seq, y_test_seq)
    
    train_loader = torch.utils.data.DataLoader(dataset=train_torch, batch_size=batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(dataset=val_torch, batch_size=batch_size, shuffle=False)
    # test_loader = torch.utils.data.DataLoader(dataset=test_torch, batch_size=batch_size, shuffle=False)

    # save_path = f"save_model/lstm_smooth_3_{suptitle_2}_{i}.pt"
    
    ############################
    ## Define Model and Train ##
    ############################
    
    model = model_selection(MODEL_NAME='LSTM', TIME_STEPS=TIME_STEPS, device=device, save_path=model_path)
    
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    trainer = RNNTrainer(model=model,
                        train_loader=train_loader,
                        val_loader=val_loader,
                        test_loader=torch.tensor(x_test_seq, dtype=torch.float32),
                        loss=criterion,
                        optimizer=optimizer,
                        scaler=scaler,
                        device=device,
                        save_path=model_path,
                        res_df = df)

    
    trainer.train(epochs)
    val_loss, epoch = trainer.train(epochs)
    pred = trainer.predict(diff_, horizon)
    pred_mul, gt_mul, idx_list = trainer.multi_step_ahead_predict(split_date = split_date,
                                                  TIME_STEPS = TIME_STEPS,
                                                  horizon = 30,
                                                  region = region,
                                                  y_test = torch.tensor(y_test_seq, dtype=torch.float32))


    #######################
    ## Calculate Metrics ##
    #######################
    
    ground_truth = df2.diff().loc[df[df.index >= split_date].index[TIME_STEPS:TIME_STEPS+horizon]]

    gt_list[region] = ground_truth.iloc[:,i].tolist()
    pred_list[region] = list(pred)

    # Metric of each region 
    y_pred[region] = pred
    gt_mul_list[region] = gt_mul
    pred_mul_list[region] = np.array(pred_mul[5:]).squeeze() # [[]] -> []
    
gt_list.index = ground_truth.index
pred_list.index = ground_truth.index
gt_mul_list.index = idx_list
pred_mul_list.index = idx_list

# Total Metric
RMSE, RMSE_total = compute_metrics(gt_list, pred_list, metric='rmse')
MAE, MAE_total = compute_metrics(gt_list, pred_list, metric='mae')

RMSE_5, RMSE_total_5 = compute_metrics(gt_mul_list.iloc[:5], pred_mul_list.iloc[:5], metric='rmse')
MAE_5, MAE_total_5 = compute_metrics(gt_mul_list.iloc[:5], pred_mul_list.iloc[:5], metric='mae')

RMSE_10, RMSE_total_10 = compute_metrics(gt_mul_list.iloc[:10], pred_mul_list.iloc[:10], metric='rmse')
MAE_10, MAE_total_10 = compute_metrics(gt_mul_list.iloc[:10], pred_mul_list.iloc[:10], metric='mae')

RMSE_15, RMSE_total_15 = compute_metrics(gt_mul_list.iloc[:15], pred_mul_list.iloc[:15], metric='rmse')
MAE_15, MAE_total_15 = compute_metrics(gt_mul_list.iloc[:15], pred_mul_list.iloc[:15], metric='mae')

RMSE_20, RMSE_total_20 = compute_metrics(gt_mul_list.iloc[:20], pred_mul_list.iloc[:20], metric='rmse')
MAE_20, MAE_total_20 = compute_metrics(gt_mul_list.iloc[:20], pred_mul_list.iloc[:20], metric='mae')

#####################
## Save Prediction ##
#####################

# Save for Total Metric of Each Metric
pd.DataFrame({
              'MAE_LSTM' : MAE,
              'RMSE_LSTM' : RMSE
              }, index=df.columns).to_csv(f'{Pred_path}/RNN_region_metric.csv', encoding='cp949')

pd.DataFrame({'MAE_total' : MAE_total,
              'RMSE_total' : RMSE_total,
              'MAE_total_5' : MAE_total_5,
              'RMSE_total_5' : RMSE_total_5,
              'MAE_total_10' : MAE_total_10,
              'RMSE_total_10' : RMSE_total_10,
              'MAE_total_15' : MAE_total_15,
              'RMSE_total_15' : RMSE_total_15,
              'MAE_total_20' : MAE_total_20,
              'RMSE_total_20' : RMSE_total_20,  
              }, index=['LSTM']).to_csv(f'{Pred_path}/RNN_total_metric.csv', encoding='cp949')

gt_mul_list.to_csv(f'{Pred_path}/RNN_predict_20.csv', encoding='cp949')
pred_mul_list.to_csv(f'{Pred_path}/RNN_predict_20.csv', encoding='cp949')

y_pred['신고일'] = df2.index[-horizon:]
y_pred.set_index('신고일', inplace=True)
y_pred.to_csv(f'{Pred_path}/pred_LSTM_{region_type}.csv', encoding='cp949')

2020-01-20 ~ 2021-07-07 ~ 2021-11-29 ~ 2022-05-17
[epoch : 0] train_loss : 0.9697 val_loss : 7.9054
[epoch : 0] train_loss : 0.4375 val_loss : 6.5521
2020-01-20 ~ 2021-07-07 ~ 2021-11-29 ~ 2022-05-17
[epoch : 0] train_loss : 1.0023 val_loss : 5.8885
[epoch : 0] train_loss : 0.2910 val_loss : 7.3802
2020-01-20 ~ 2021-07-07 ~ 2021-11-29 ~ 2022-05-17
[epoch : 0] train_loss : 0.9542 val_loss : 8.1341
[epoch : 0] train_loss : 0.2544 val_loss : 8.7840
2020-01-20 ~ 2021-07-07 ~ 2021-11-29 ~ 2022-05-17
[epoch : 0] train_loss : 0.9443 val_loss : 5.8587
[epoch : 0] train_loss : 0.2618 val_loss : 8.2430
2020-01-20 ~ 2021-07-07 ~ 2021-11-29 ~ 2022-05-17
[epoch : 0] train_loss : 0.9139 val_loss : 10.7984
[epoch : 0] train_loss : 0.2798 val_loss : 16.6306
2020-01-20 ~ 2021-07-07 ~ 2021-11-29 ~ 2022-05-17
[epoch : 0] train_loss : 0.9701 val_loss : 3.5257
[epoch : 0] train_loss : 0.5472 val_loss : 4.8955
2020-01-20 ~ 2021-07-07 ~ 2021-11-29 ~ 2022-05-17
[epoch : 0] train_loss : 0.9221 val_loss : 7.178

<Figure size 1800x1080 with 0 Axes>

In [8]:
pd.DataFrame({'MAE_total' : MAE_total,
              'RMSE_total' : RMSE_total,
              'MAE_total_5' : MAE_total_5,
              'RMSE_total_5' : RMSE_total_5,
              'MAE_total_10' : MAE_total_10,
              'RMSE_total_10' : RMSE_total_10,
              'MAE_total_15' : MAE_total_15,
              'RMSE_total_15' : RMSE_total_15,
              'MAE_total_20' : MAE_total_20,
              'RMSE_total_20' : RMSE_total_20,  
              }, index=['LSTM'])

Unnamed: 0,MAE_total,RMSE_total,MAE_total_5,RMSE_total_5,MAE_total_10,RMSE_total_10,MAE_total_15,RMSE_total_15,MAE_total_20,RMSE_total_20
LSTM,44.543,223.754318,1.916343,4.142653,2.328384,5.337789,2.668368,6.148017,2.88422,6.491193


### MAE, RMSE 다시 계산하는 코드 짜야함 test_data 25~26일 정도 구간으로 해서

In [5]:
from lib import save_figure_predict

GROUND_TRUTH = df.diff().iloc[-horizon:,]# + df.iloc[-(horizon+1):-1,].values
PRED = y_pred# + df.iloc[-(horizon+1):-1,].values

suptitle = f"{MODEL_NAME}"
save_figure_predict(
                    GROUND_TRUTH = gt_10,
                    y_pred = pred_10_list,
                    # test_data = None,
                    region_dict = region_dict,
                    suptitle = suptitle,
                    legend = ['GROUND TRUTH', MODEL_NAME, 'Test_data'],
                    date_split = f"{df.index[1]} ~ {df.index[len(train[0])]} ~ {df.index[len(train[0])+TIME_STEPS + len(val[0])+TIME_STEPS*2]} ~ {df.index[-1]}",
                    MAE = MAE_LSTM_list, RMSE = RMSE_LSTM_list,
                    MAE_total = MAE_total, RMSE_total = RMSE_total, 
                    PATH=Figure_path
                    )

# GROUND_TRUTH = df.diff().iloc[-horizon:,] + df.iloc[-(horizon+1):-1,].values
# PRED = y_pred + df.iloc[-(horizon+1):-1,].values
# suptitle = f"{MODEL_NAME}_{diff_}"
# save_figure_predict(

#                     df = GROUND_TRUTH,
#                     y_pred = PRED,
#                     test_data = None,
#                     region_dict = region_dict,
#                     suptitle = suptitle,
#                     legend = ['GROUND TRUTH', MODEL_NAME, 'Test_data'],
#                     date_split = f"{df.index[1]} ~ {df.index[len(train[0])]} ~ {df.index[len(train[0])+TIME_STEPS + len(val[0])+TIME_STEPS*2]} ~ {df.index[-1]}",
#                     MAE = MAE_LSTM_list, RMSE = RMSE_LSTM_list,
#                     MAE_total = MAE_total, RMSE_total = RMSE_total, 
#                     PATH=Figure_path
#                     )

NameError: name 'gt_10' is not defined