In [66]:
import pandas as pd
import numpy as np
import random
import os

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader 

from tqdm.auto import tqdm

In [67]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')
sample_submission = pd.read_csv('./sample_submission.csv')

In [70]:
train_df

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh)
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,,,1085.28
1,1_20220601 01,1,20220601 01,18.0,,1.1,45.0,,,1047.36
2,1_20220601 02,1,20220601 02,17.7,,1.5,45.0,,,974.88
3,1_20220601 03,1,20220601 03,16.7,,1.4,48.0,,,953.76
4,1_20220601 04,1,20220601 04,18.4,,2.8,43.0,,,986.40
...,...,...,...,...,...,...,...,...,...,...
203995,100_20220824 19,100,20220824 19,23.1,,0.9,86.0,0.5,,881.04
203996,100_20220824 20,100,20220824 20,22.4,,1.3,86.0,0.0,,798.96
203997,100_20220824 21,100,20220824 21,21.3,,1.0,92.0,,,825.12
203998,100_20220824 22,100,20220824 22,21.0,,0.3,94.0,,,640.08


In [35]:
class LTSF_Linear(torch.nn.Module):
    def __init__(self, window_size, forcast_size, individual, feature_size):
        super(LTSF_Linear, self).__init__()
        self.window_size = window_size
        self.forcast_size = forcast_size
        self.individual = individual
        self.channels = feature_size
        if self.individual:
            self.Linear = torch.nn.ModuleList()
            for i in range(self.channels):
                self.Linear.append(torch.nn.Linear(self.window_size, self.forcast_size))
        else:
            self.Linear = torch.nn.Linear(self.window_size, self.forcast_size)

    def forward(self, x):
        if self.individual:
            output = torch.zeros([x.size(0), self.forcast_size, x.size(2)],dtype=x.dtype).to(x.device)
            for i in range(self.channels):
                output[:,:,i] = self.Linear[i](x[:,:,i])
            x = output
        else:
            x = self.Linear(x.permute(0,2,1)).permute(0,2,1)
        return x

In [None]:
def standardization(train_df, test_df, not_col, target):
    train_df_ = train_df.copy()
    test_df_ = test_df.copy()
    col =  [col for col in list(train_df.columns) if col not in [not_col]]
    mean_list = []
    std_list = []
    for x in col:
        mean, std = train_df_.agg(["mean", "std"]).loc[:,x]
        mean_list.append(mean)
        std_list.append(std)
        train_df_.loc[:, x] = (train_df_[x] - mean) / std
        test_df_.loc[:, x] = (test_df_[x] - mean) / std
    return train_df_, test_df_, mean_list[col.index(target)], std_list[col.index(target)]

def time_slide_df(df, window_size, forcast_size, date, target):
    df_ = df.copy()
    data_list = []
    dap_list = []
    date_list = []
    for idx in range(0, df_.shape[0]-window_size-forcast_size+1):
        x = df_.loc[idx:idx+window_size-1, target].values.reshape(window_size, 1)
        y = df_.loc[idx+window_size:idx+window_size+forcast_size-1, target].values
        date_ = df_.loc[idx+window_size:idx+window_size+forcast_size-1, date].values
        data_list.append(x)
        dap_list.append(y)
        date_list.append(date_)
    return np.array(data_list, dtype='float32'), np.array(dap_list, dtype='float32'), np.array(date_list)

class Data(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.Y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]
        

In [10]:
train_df.rename(columns={'num_date_time': 'date_time'}, inplace=True)
train_df['date_time'] = train_df['date_time'].str.replace('.*_', '', regex=True)
train_df['date_time'] = pd.to_datetime(train_df['date_time'], format='%Y%m%d %H')
train_df['date_time'] = train_df['date_time'].dt.strftime('%Y-%m-%d %H:%M:%S')
#train_df.set_index('date_time', inplace=True)
print(train_df)

                  date_time  건물번호           일시  기온(C)  강수량(mm)  풍속(m/s)  \
0       2022-06-01 00:00:00     1  20220601 00   18.6      NaN      0.9   
1       2022-06-01 01:00:00     1  20220601 01   18.0      NaN      1.1   
2       2022-06-01 02:00:00     1  20220601 02   17.7      NaN      1.5   
3       2022-06-01 03:00:00     1  20220601 03   16.7      NaN      1.4   
4       2022-06-01 04:00:00     1  20220601 04   18.4      NaN      2.8   
...                     ...   ...          ...    ...      ...      ...   
203995  2022-08-24 19:00:00   100  20220824 19   23.1      NaN      0.9   
203996  2022-08-24 20:00:00   100  20220824 20   22.4      NaN      1.3   
203997  2022-08-24 21:00:00   100  20220824 21   21.3      NaN      1.0   
203998  2022-08-24 22:00:00   100  20220824 22   21.0      NaN      0.3   
203999  2022-08-24 23:00:00   100  20220824 23   20.7      NaN      0.1   

        습도(%)  일조(hr)  일사(MJ/m2)  전력소비량(kWh)  
0        42.0     NaN        NaN     1085.28  
1    

In [11]:
test_df.rename(columns={'num_date_time': 'date_time'}, inplace=True)
test_df['date_time'] = test_df['date_time'].str.replace('.*_', '', regex=True)
test_df['date_time'] = pd.to_datetime(test_df['date_time'], format='%Y%m%d %H')
test_df['date_time'] = test_df['date_time'].dt.strftime('%Y-%m-%d %H:%M:%S')
#test_df.set_index('date_time', inplace=True)
print(test_df)

                 date_time  건물번호           일시  기온(C)  강수량(mm)  풍속(m/s)  습도(%)
0      2022-08-25 00:00:00     1  20220825 00   23.5      0.0      2.2     72
1      2022-08-25 01:00:00     1  20220825 01   23.0      0.0      0.9     72
2      2022-08-25 02:00:00     1  20220825 02   22.7      0.0      1.5     75
3      2022-08-25 03:00:00     1  20220825 03   22.1      0.0      1.3     78
4      2022-08-25 04:00:00     1  20220825 04   21.8      0.0      1.0     77
...                    ...   ...          ...    ...      ...      ...    ...
16795  2022-08-31 19:00:00   100  20220831 19   22.5      0.0      0.9     84
16796  2022-08-31 20:00:00   100  20220831 20   20.7      0.0      0.4     95
16797  2022-08-31 21:00:00   100  20220831 21   20.2      0.0      0.4     98
16798  2022-08-31 22:00:00   100  20220831 22   20.1      0.0      1.1     97
16799  2022-08-31 23:00:00   100  20220831 23   19.1      0.0      0.1     99

[16800 rows x 7 columns]


In [22]:
### Univariable ###
### 데이터 셋 생성 ###
window_size = 72
forcast_size= 24
batch_size = 32
targets = '전력소비량(kWh)'
date = 'date_time'


In [25]:
#train_df_fe, test_df_fe, mean_, std_ = standardization(train_df, test_df, 'date_time', targets)
train_x, train_y, train_date = time_slide_df(train_df, window_size, forcast_size, date, targets)

In [27]:
test_x, test_y, test_date = time_slide_df(test_df, window_size, forcast_size, date, targets)


TypeError: time_slide_df() missing 1 required positional argument: 'target'

In [31]:
train_ds = Data(train_x[:1000], train_y[:1000])
valid_ds = Data(train_x[1000:], train_y[1000:])

In [33]:

train_dl = DataLoader(train_ds, batch_size = batch_size, shuffle=True,)
valid_dl = DataLoader(valid_ds, batch_size = train_x[1000:].shape[0], shuffle=False)

In [41]:
#test_ds = Data(test_x, test_y)
#test_dl  = DataLoader(test_ds,  batch_size = test_x.shape[0], shuffle=False)


### 모델 학습 ###
train_loss_list = []
valid_loss_list = []
#test_loss_list = []
epoch = 50
lr = 0.001
DLinear_model = LTSF_Linear(
                            window_size=window_size,
                            forcast_size=forcast_size,
#                            kernel_size=25,
                            individual=False,
                            feature_size=1,
                            )
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(DLinear_model.parameters(), lr=lr)
max_loss = 999999999

for epoch in tqdm(range(1, epoch+1)):
    loss_list = []
    DLinear_model.train()
    for batch_idx, (data, target) in enumerate(train_dl):
        optimizer.zero_grad()
        output = DLinear_model(data)
        loss = criterion(output, target.unsqueeze(-1))
        loss.backward()
        optimizer.step()
        loss_list.append(loss.item())    
    train_loss_list.append(np.mean(loss_list))

    DLinear_model.eval()
    with torch.no_grad():
        for data, target in valid_dl:
            output = DLinear_model(data)
            valid_loss = criterion(output, target.unsqueeze(-1))
            valid_loss_list.append(valid_loss)
        
 #       for data, target in test_dl:
 #           output = DLinear_model(data)
 #           test_loss = criterion(output, target.unsqueeze(-1))
 #           test_loss_list.append(test_loss)

    if valid_loss < max_loss:
        torch.save(DLinear_model, 'DLinear_model.pth')
        max_loss = valid_loss
        print("valid_loss={:.3f}, Model Save".format(valid_loss))
        dlinear_best_epoch = epoch
        dlinear_best_train_loss = np.mean(loss_list)
        dlinear_best_valid_loss = np.mean(valid_loss.item())
        #dlinear_best_test_loss = np.mean(test_loss.item())

    print("epoch = {}, train_loss : {:.3f}, valid_loss : {:.3f}".format(epoch, np.mean(loss_list), valid_loss)) #, test_loss : {:.3f} , test_loss


  0%|          | 0/50 [00:00<?, ?it/s]

valid_loss=1231140.875, Model Save
epoch = 1, train_loss : 3687568.730, valid_loss : 1231140.875
valid_loss=784274.625, Model Save
epoch = 2, train_loss : 806365.745, valid_loss : 784274.625
valid_loss=670496.562, Model Save
epoch = 3, train_loss : 415167.869, valid_loss : 670496.562
valid_loss=628722.562, Model Save
epoch = 4, train_loss : 277906.645, valid_loss : 628722.562
valid_loss=611424.000, Model Save
epoch = 5, train_loss : 227544.049, valid_loss : 611424.000
valid_loss=606104.562, Model Save
epoch = 6, train_loss : 206859.927, valid_loss : 606104.562
valid_loss=600203.562, Model Save
epoch = 7, train_loss : 198856.653, valid_loss : 600203.562
valid_loss=581017.562, Model Save
epoch = 8, train_loss : 195577.667, valid_loss : 581017.562
valid_loss=573748.938, Model Save
epoch = 9, train_loss : 191387.862, valid_loss : 573748.938
valid_loss=569547.625, Model Save
epoch = 10, train_loss : 190010.167, valid_loss : 569547.625
valid_loss=559421.938, Model Save
epoch = 11, train_loss

In [45]:
test_df

Unnamed: 0,date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%)
0,2022-08-25 00:00:00,1,20220825 00,23.5,0.0,2.2,72
1,2022-08-25 01:00:00,1,20220825 01,23.0,0.0,0.9,72
2,2022-08-25 02:00:00,1,20220825 02,22.7,0.0,1.5,75
3,2022-08-25 03:00:00,1,20220825 03,22.1,0.0,1.3,78
4,2022-08-25 04:00:00,1,20220825 04,21.8,0.0,1.0,77
...,...,...,...,...,...,...,...
16795,2022-08-31 19:00:00,100,20220831 19,22.5,0.0,0.9,84
16796,2022-08-31 20:00:00,100,20220831 20,20.7,0.0,0.4,95
16797,2022-08-31 21:00:00,100,20220831 21,20.2,0.0,0.4,98
16798,2022-08-31 22:00:00,100,20220831 22,20.1,0.0,1.1,97


In [57]:
test_df['전력소비량(kWh)'] = 0

In [62]:
# test_df에 대한 입력 데이터와 목표 변수 추출
test_x, test_y, _ = time_slide_df(test_df, window_size, forcast_size, date='date_time', target='전력소비량(kWh)')

# 데이터셋 생성
test_ds = Data(torch.tensor(test_x), torch.tensor(test_y))

# 데이터로더 생성
test_dl = DataLoader(test_ds, batch_size=test_x.shape[0], shuffle=False)

# 모델 평가
DLinear_model.eval()
test_loss_list = []
with torch.no_grad():
    for data, target in test_dl:
        output = DLinear_model(data)
        test_loss = criterion(output, target.unsqueeze(-1))
        test_loss_list.append(test_loss)

print("Test Loss: {:.3f}".format(np.mean(test_loss_list)))


Test Loss: 0.010


In [52]:
DLinear_model

LTSF_Linear(
  (Linear): Linear(in_features=72, out_features=24, bias=True)
)