In [1]:
import pandas as pd
import numpy as np
import random
import os

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader 

from tqdm.auto import tqdm

In [3]:
from DLinear import *

In [2]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')
sample_submission = pd.read_csv('./sample_submission.csv')

In [9]:
train_df.drop(['일조(hr)', '일사(MJ/m2)'], axis=1, inplace=True)
train_df

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),전력소비량(kWh)
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,1085.28
1,1_20220601 01,1,20220601 01,18.0,,1.1,45.0,1047.36
2,1_20220601 02,1,20220601 02,17.7,,1.5,45.0,974.88
3,1_20220601 03,1,20220601 03,16.7,,1.4,48.0,953.76
4,1_20220601 04,1,20220601 04,18.4,,2.8,43.0,986.40
...,...,...,...,...,...,...,...,...
203995,100_20220824 19,100,20220824 19,23.1,,0.9,86.0,881.04
203996,100_20220824 20,100,20220824 20,22.4,,1.3,86.0,798.96
203997,100_20220824 21,100,20220824 21,21.3,,1.0,92.0,825.12
203998,100_20220824 22,100,20220824 22,21.0,,0.3,94.0,640.08


In [10]:
class LTSF_Linear(torch.nn.Module):
    def __init__(self, window_size, forcast_size, individual, feature_size):
        super(LTSF_Linear, self).__init__()
        self.window_size = window_size
        self.forcast_size = forcast_size
        self.individual = individual
        self.channels = feature_size
        if self.individual:
            self.Linear = torch.nn.ModuleList()
            for i in range(self.channels):
                self.Linear.append(torch.nn.Linear(self.window_size, self.forcast_size))
        else:
            self.Linear = torch.nn.Linear(self.window_size, self.forcast_size)

    def forward(self, x):
        if self.individual:
            output = torch.zeros([x.size(0), self.forcast_size, x.size(2)],dtype=x.dtype).to(x.device)
            for i in range(self.channels):
                output[:,:,i] = self.Linear[i](x[:,:,i])
            x = output
        else:
            x = self.Linear(x.permute(0,2,1)).permute(0,2,1)
        return x
def standardization(train_df, test_df, not_col, target):
    train_df_ = train_df.copy()
    test_df_ = test_df.copy()
    col =  [col for col in list(train_df.columns) if col not in [not_col]]
    mean_list = []
    std_list = []
    for x in col:
        mean, std = train_df_.agg(["mean", "std"]).loc[:,x]
        mean_list.append(mean)
        std_list.append(std)
        train_df_.loc[:, x] = (train_df_[x] - mean) / std
        test_df_.loc[:, x] = (test_df_[x] - mean) / std
    return train_df_, test_df_, mean_list[col.index(target)], std_list[col.index(target)]

def time_slide_df(df, window_size, forcast_size, date, target):
    df_ = df.copy()
    data_list = []
    dap_list = []
    date_list = []
    for idx in range(0, df_.shape[0]-window_size-forcast_size+1):
        x = df_.loc[idx:idx+window_size-1, target].values.reshape(window_size, 1)
        y = df_.loc[idx+window_size:idx+window_size+forcast_size-1, target].values
        date_ = df_.loc[idx+window_size:idx+window_size+forcast_size-1, date].values
        data_list.append(x)
        dap_list.append(y)
        date_list.append(date_)
    return np.array(data_list, dtype='float32'), np.array(dap_list, dtype='float32'), np.array(date_list)

class Data(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.Y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]
        

In [11]:
Model

DLinear.Model

In [12]:
train_df.rename(columns={'num_date_time': 'date_time'}, inplace=True)
train_df['date_time'] = train_df['date_time'].str.replace('.*_', '', regex=True)
train_df['date_time'] = pd.to_datetime(train_df['date_time'], format='%Y%m%d %H')
train_df['date_time'] = train_df['date_time'].dt.strftime('%Y-%m-%d %H:%M:%S')
#train_df.set_index('date_time', inplace=True)
#print(train_df)

                  date_time  건물번호           일시  기온(C)  강수량(mm)  풍속(m/s)  \
0       2022-06-01 00:00:00     1  20220601 00   18.6      NaN      0.9   
1       2022-06-01 01:00:00     1  20220601 01   18.0      NaN      1.1   
2       2022-06-01 02:00:00     1  20220601 02   17.7      NaN      1.5   
3       2022-06-01 03:00:00     1  20220601 03   16.7      NaN      1.4   
4       2022-06-01 04:00:00     1  20220601 04   18.4      NaN      2.8   
...                     ...   ...          ...    ...      ...      ...   
203995  2022-08-24 19:00:00   100  20220824 19   23.1      NaN      0.9   
203996  2022-08-24 20:00:00   100  20220824 20   22.4      NaN      1.3   
203997  2022-08-24 21:00:00   100  20220824 21   21.3      NaN      1.0   
203998  2022-08-24 22:00:00   100  20220824 22   21.0      NaN      0.3   
203999  2022-08-24 23:00:00   100  20220824 23   20.7      NaN      0.1   

        습도(%)  전력소비량(kWh)  
0        42.0     1085.28  
1        45.0     1047.36  
2        45.0  

In [15]:
test_df.rename(columns={'num_date_time': 'date_time'}, inplace=True)
test_df['date_time'] = test_df['date_time'].str.replace('.*_', '', regex=True)
test_df['date_time'] = pd.to_datetime(test_df['date_time'], format='%Y%m%d %H')
test_df['date_time'] = test_df['date_time'].dt.strftime('%Y-%m-%d %H:%M:%S')
#test_df.set_index('date_time', inplace=True)
test_df

Unnamed: 0,date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%)
0,2022-08-25 00:00:00,1,20220825 00,23.5,0.0,2.2,72
1,2022-08-25 01:00:00,1,20220825 01,23.0,0.0,0.9,72
2,2022-08-25 02:00:00,1,20220825 02,22.7,0.0,1.5,75
3,2022-08-25 03:00:00,1,20220825 03,22.1,0.0,1.3,78
4,2022-08-25 04:00:00,1,20220825 04,21.8,0.0,1.0,77
...,...,...,...,...,...,...,...
16795,2022-08-31 19:00:00,100,20220831 19,22.5,0.0,0.9,84
16796,2022-08-31 20:00:00,100,20220831 20,20.7,0.0,0.4,95
16797,2022-08-31 21:00:00,100,20220831 21,20.2,0.0,0.4,98
16798,2022-08-31 22:00:00,100,20220831 22,20.1,0.0,1.1,97


In [16]:
### Univariable ###
### 데이터 셋 생성 ###
window_size = 72
forcast_size= 24
batch_size = 32
targets = '전력소비량(kWh)'
date = 'date_time'

In [17]:
train_x, train_y, train_date = time_slide_df(train_df, window_size, forcast_size, date, targets)

In [18]:
train_x

array([[[1085.28],
        [1047.36],
        [ 974.88],
        ...,
        [1897.44],
        [1560.48],
        [1351.68]],

       [[1047.36],
        [ 974.88],
        [ 953.76],
        ...,
        [1560.48],
        [1351.68],
        [1203.84]],

       [[ 974.88],
        [ 953.76],
        [ 986.4 ],
        ...,
        [1351.68],
        [1203.84],
        [1119.36]],

       ...,

       [[ 850.8 ],
        [ 660.48],
        [ 473.28],
        ...,
        [ 959.76],
        [ 890.64],
        [ 789.6 ]],

       [[ 660.48],
        [ 473.28],
        [ 419.76],
        ...,
        [ 890.64],
        [ 789.6 ],
        [ 737.76]],

       [[ 473.28],
        [ 419.76],
        [ 396.72],
        ...,
        [ 789.6 ],
        [ 737.76],
        [ 610.32]]], dtype=float32)

In [19]:
train_x.shape

(203905, 72, 1)

In [22]:
train_y.shape

(203905, 24)

In [24]:
train_y

array([[1203.84, 1119.36, 1026.72, ..., 1733.76, 1414.08, 1191.84],
       [1119.36, 1026.72,  992.64, ..., 1414.08, 1191.84,  963.36],
       [1026.72,  992.64, 1022.88, ..., 1191.84,  963.36,  872.16],
       ...,
       [ 737.76,  610.32,  457.68, ...,  881.04,  798.96,  825.12],
       [ 610.32,  457.68,  406.56, ...,  798.96,  825.12,  640.08],
       [ 457.68,  406.56,  392.64, ...,  825.12,  640.08,  540.24]],
      dtype=float32)

In [25]:
train_dl = DataLoader(train_x, batch_size = batch_size, shuffle=True,)

In [26]:
train_dl

<torch.utils.data.dataloader.DataLoader at 0x7f46c00e6740>

In [27]:
train_loss_list = []
valid_loss_list = []
#test_loss_list = []
epoch = 50
lr = 0.001
DLinear_model = LTSF_Linear(
                            window_size=window_size,
                            forcast_size=forcast_size,
#                            kernel_size=25,
                            individual=False,
                            feature_size=1,
                            )
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(DLinear_model.parameters(), lr=lr)
max_loss = 999999999

In [28]:
for epoch in tqdm(range(1, epoch+1)):
    loss_list = []
    DLinear_model.train()
    for batch_idx, (data, target) in enumerate(train_dl):
        optimizer.zero_grad()
        output = DLinear_model(data)
        loss = criterion(output, target.unsqueeze(-1))
        loss.backward()
        optimizer.step()
        loss_list.append(loss.item())    
    train_loss_list.append(np.mean(loss_list))

    DLinear_model.eval()
    with torch.no_grad():
        for data, target in valid_dl:
            output = DLinear_model(data)
            valid_loss = criterion(output, target.unsqueeze(-1))
            valid_loss_list.append(valid_loss)
        
 #       for data, target in test_dl:
 #           output = DLinear_model(data)
 #           test_loss = criterion(output, target.unsqueeze(-1))
 #           test_loss_list.append(test_loss)

    if valid_loss < max_loss:
        torch.save(DLinear_model, 'DLinear_model.pth')
        max_loss = valid_loss
        print("valid_loss={:.3f}, Model Save".format(valid_loss))
        dlinear_best_epoch = epoch
        dlinear_best_train_loss = np.mean(loss_list)
        dlinear_best_valid_loss = np.mean(valid_loss.item())
        #dlinear_best_test_loss = np.mean(test_loss.item())

    print("epoch = {}, train_loss : {:.3f}, valid_loss : {:.3f}".format(epoch, np.mean(loss_list), valid_loss)) #, test_loss : {:.3f} , test_loss


  0%|          | 0/50 [00:00<?, ?it/s]

ValueError: too many values to unpack (expected 2)