In [87]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import display
import seaborn as sns
import numpy as np
from typing import List, Callable
# import tensorflow as tf
# from tensorflow.keras import layers


# Plot styling: use the 'Malgun Gothic' font for both matplotlib and seaborn.
# matplotlib.font_manager._rebuild()
plt.rc('font', family='Malgun Gothic')
plt.rcParams['axes.unicode_minus'] = False # fixes the broken minus-sign glyph when a Korean font is used
# sns.set() applies its own rc settings, so the font and minus-sign options are passed again here.
sns.set(font="Malgun Gothic",rc={"axes.unicode_minus":False}, style='whitegrid')

In [88]:
# Directory containing the daily weather CSV files loaded in the weather-data cell.
data_root = "./data/daily"

## 1. Data 준비

### 1) Weather data

In [89]:
# Load the five daily weather tables (all read with the cp949 codec).
csv_encoding = 'cp949'
temp_df = pd.read_csv(f'{data_root}/Temperatures.csv', encoding=csv_encoding)
precip_df = pd.read_csv(f'{data_root}/Precipitation.csv', encoding=csv_encoding)
precip_df = precip_df.fillna(0)  # missing precipitation entries are replaced with 0
wind_df = pd.read_csv(f'{data_root}/Wind.csv', encoding=csv_encoding)
humidity_df = pd.read_csv(f'{data_root}/Humidity.csv', encoding=csv_encoding)
sunshine_df = pd.read_csv(f'{data_root}/Sunshine.csv', encoding=csv_encoding)

# Fold the tables together one by one; merge() without `on` joins on the
# columns the two frames have in common.
weather_df = temp_df
for other_df in (precip_df, wind_df, humidity_df, sunshine_df):
    weather_df = weather_df.merge(other_df)

# Keep only the date column plus the weather features used as model inputs.
weather_columns = ["평균기온", "강수량mm", "평균풍속ms", "평균습도rh","일조합"]
selected_weather_df = weather_df[['일시', *weather_columns]]
display(selected_weather_df.head(3))

Unnamed: 0,일시,평균기온,강수량mm,평균풍속ms,평균습도rh,일조합
0,2012-01-01,-3.0,0.0,2.9,64.5,4.4
1,2012-01-02,-4.8,0.0,2.3,66.5,5.9
2,2012-01-03,-4.5,0.4,2.5,68.3,2.7


### 2) Stock data

In [90]:
# Load the price history for one company and keep the date plus the target column(s).
company_name = "Samsung Electronics Co"
stock_columns = ['Close']

stock_price_df = pd.read_csv(f'./data/stock/{company_name}.csv')
selected_stock_df = stock_price_df[['Date', *stock_columns]]
display(selected_stock_df.head(3))

Unnamed: 0,Date,Close
0,2000-01-04,6110.0
1,2000-01-05,5580.0
2,2000-01-06,5620.0


### 3) Merge

In [91]:
# Rename the weather date column to 'Date' so both frames share that key,
# then merge (default inner join on the shared column).
stock_weather_df = pd.merge(
    selected_weather_df.rename(columns={"일시": "Date"}),
    selected_stock_df,
)
display(stock_weather_df)

Unnamed: 0,Date,평균기온,강수량mm,평균풍속ms,평균습도rh,일조합,Close
0,2012-01-02,-4.8,0.0,2.3,66.5,5.9,21600.0
1,2012-01-03,-4.5,0.4,2.5,68.3,2.7,22100.0
2,2012-01-04,-7.4,0.0,3.1,55.4,8.1,21600.0
3,2012-01-05,-5.7,0.0,1.5,49.8,9.0,21100.0
4,2012-01-06,-2.8,0.0,2.5,42.9,8.4,20800.0
...,...,...,...,...,...,...,...
2553,2022-05-20,20.6,0.0,2.3,52.8,10.8,68000.0
2554,2022-05-23,23.0,0.0,2.3,63.5,10.4,67900.0
2555,2022-05-24,22.9,0.0,2.1,56.8,12.3,66500.0
2556,2022-05-25,21.0,3.5,2.6,66.6,8.1,66400.0


In [92]:
# Count missing values per column, then show the affected rows for the two
# columns that are inspected here.
display(stock_weather_df.isna().sum())
for null_col in ('평균풍속ms', '일조합'):
    display(stock_weather_df.loc[stock_weather_df[null_col].isna()])

# Replace every remaining missing value with 0.
stock_weather_df = stock_weather_df.fillna(0)

Date      0
평균기온      0
강수량mm     0
평균풍속ms    2
평균습도rh    0
일조합       9
Close     0
dtype: int64

Unnamed: 0,Date,평균기온,강수량mm,평균풍속ms,평균습도rh,일조합,Close
1460,2017-12-05,-4.2,0.1,,40.1,2.1,51260.0
1461,2017-12-06,0.2,1.2,,72.6,7.7,50020.0


Unnamed: 0,Date,평균기온,강수량mm,평균풍속ms,평균습도rh,일조합,Close
1418,2017-09-28,19.2,0.0,2.8,49.1,,51260.0
1422,2017-10-12,11.4,0.0,2.0,71.0,,54800.0
1449,2017-11-20,-0.4,0.0,1.4,55.3,,55200.0
1454,2017-11-27,2.5,0.0,1.3,53.0,,52640.0
1457,2017-11-30,-2.4,0.0,3.3,26.8,,50800.0
1488,2018-01-18,2.1,0.0,1.9,59.6,,49900.0
1494,2018-01-26,-14.8,0.0,2.6,34.9,,50780.0
1882,2019-08-28,26.1,0.0,1.9,66.2,,44150.0
2217,2021-01-07,-14.5,0.0,4.1,49.9,,82900.0


## 2. Training

In [93]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class StockModel(nn.Module):
    """GRU followed by a linear head that emits one value per time step.

    Args:
        input_dim: number of features per time step fed to the GRU.
        t_dim: stored on the instance; not used by the layers themselves.
        h_c: hidden size of the GRU (and input size of the linear head).
        num_layers: number of stacked GRU layers.
    """

    def __init__(self, input_dim, t_dim, h_c=256, num_layers=2):
        super().__init__()
        self.input_dim = input_dim
        self.t_dim = t_dim

        # batch_first=True -> the GRU takes and returns (batch, seq, feature) tensors.
        self.rnn = nn.GRU(
            input_size=input_dim,
            hidden_size=h_c,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=h_c, out_features=1)

    def forward(self, x):
        """Return the head's prediction for every time step: (batch, seq, 1)."""
        hidden_seq, _ = self.rnn(x)
        # The linear head is applied to all time steps, not only the last one.
        return self.fc(hidden_seq)


### 2) Dataloader

In [94]:
from torch.utils.data import DataLoader, Dataset

class StockDataset(Dataset):
    """Sliding-window dataset for next-step sequence prediction.

    Item ``idx`` is a pair ``(x, y)``:

    * ``x`` -- rows ``idx .. idx + t_dim - 1`` of ``df`` (all columns)
    * ``y`` -- rows ``idx + 1 .. idx + t_dim`` of the ``label`` column(s),
      i.e. the same window shifted one row ahead.

    Args:
        df: source frame; its index is reset so windows are positional.
        t_dim: number of rows in each window.
        label: column(s) used as the target. Defaults to all columns of ``df``.
    """

    def __init__(self, df, t_dim, label:List = None,):
        super(StockDataset, self).__init__()
        self.df = df.reset_index(drop=True)
        self.t_dim = t_dim
        if label is None:
            label = df.columns
        self.label = label

        # Materialise the arrays once instead of slicing the DataFrame with
        # .loc on every __getitem__ call.
        self._inputs = self.df.to_numpy(copy=True)
        self._targets = self.df[self.label].to_numpy(copy=True)

        # The last valid start index is len(df) - t_dim - 1 (its target window
        # ends on the final row), which gives len(df) - t_dim windows. Clamp at
        # zero so a frame shorter than one window yields an empty dataset
        # instead of a negative length.
        self.total_length = max(0, len(self.df) - t_dim)

    def __len__(self):
        return self.total_length

    def __getitem__(self, idx):
        """Return the ``(x, y)`` window pair starting at row ``idx``.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        if idx < 0:
            idx += self.total_length
        if not 0 <= idx < self.total_length:
            raise IndexError(f"index {idx} is out of range for {self.total_length} windows")
        stop = idx + self.t_dim
        x = self._inputs[idx:stop]
        y = self._targets[idx + 1:stop + 1]
        return x, y
# Preprocess: log-transform the price, split by date, then standardise.
# NOTE: 'Close' is overwritten in place, so running this cell a second time
# without re-running the earlier cells applies the log transform again.
stock_weather_df['Close'] = np.log(stock_weather_df['Close'] + 1)

# Chronological split: rows before 2019 are used for training, the rest for testing.
split_date = '2019-01-01'
is_train = stock_weather_df['Date'] < split_date
is_test = stock_weather_df['Date'] >= split_date
train_df = stock_weather_df.loc[is_train].drop(columns='Date')
test_df = stock_weather_df.loc[is_test].drop(columns='Date')

# Standardise both splits with statistics computed on the training split only.
train_mean = train_df.mean()
train_std = train_df.std()
train_df = train_df.sub(train_mean).div(train_std)
test_df = test_df.sub(train_mean).div(train_std)

display(train_df.head(3), test_df.head(3))

Unnamed: 0,평균기온,강수량mm,평균풍속ms,평균습도rh,일조합,Close
0,-1.620981,-0.27642,-0.192611,0.498315,-0.234437,-1.366412
1,-1.593744,-0.241892,0.036974,0.620731,-1.045259,-1.281495
2,-1.857033,-0.27642,0.725729,-0.256583,0.323003,-1.366412


Unnamed: 0,평균기온,강수량mm,평균풍속ms,평균습도rh,일조합,Close
1720,-1.63006,-0.27642,-0.881366,-1.113494,0.475032,0.802278
1721,-1.502955,-0.27642,-1.225743,-1.38553,0.475032,0.690485
1722,-1.285061,-0.27642,-1.455328,-0.249782,-0.741201,0.675651


In [95]:
def train(model, train_loader, criterion, optimizer, epochs=10, device=torch.device('cpu'), valid_loader=None):
    """Train ``model`` and, optionally, report a validation loss after each epoch.

    Args:
        model: module to optimise; it is moved to ``device``.
        train_loader: iterable of ``(x, y)`` batches used for optimisation.
        criterion: loss function called as ``criterion(pred, y)``.
        optimizer: optimiser already bound to ``model``'s parameters.
        epochs: number of passes over ``train_loader``.
        device: device on which the model and the batches are placed.
        valid_loader: optional iterable of ``(x, y)`` batches. When given, the
            mean per-batch L1 loss is printed after every epoch. This is an
            L1 loss regardless of ``criterion``.

    Returns:
        The model, on ``device``, in the train/eval mode it had on entry.
    """
    model = model.to(device)
    was_training = model.training  # restored before returning

    for epoch in range(epochs):
        # train
        model.train()  # validation below switches to eval mode, so reset every epoch
        running_loss = 0
        num_steps = 0  # stays 0 when the loader yields nothing
        for num_steps, (x, y) in enumerate(train_loader, start=1):
            x, y = x.float().to(device), y.float().to(device)
            optimizer.zero_grad()

            pred = model(x)
            loss = criterion(pred, y)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if num_steps % 100 == 0:
                print(f"{epoch+1} epoch : {num_steps:5d}step - loss={running_loss/num_steps}")
        if num_steps:
            print(f"{epoch+1} epoch : {num_steps:5d}step - loss={running_loss/num_steps}")
        else:
            # Guard against an empty loader instead of failing on the average.
            print(f"{epoch+1} epoch : train_loader yielded no batches")

        # valid
        if valid_loader is not None:
            model.eval()
            valid_loss = 0
            valid_steps = 0
            # No gradients are needed for evaluation; skipping autograd saves
            # memory and time.
            with torch.no_grad():
                for valid_steps, (x, y) in enumerate(valid_loader, start=1):
                    x, y = x.float().to(device), y.float().to(device)
                    pred = model(x)
                    valid_loss += F.l1_loss(pred, y).item()
            if valid_steps:
                print(f"valid_loss={valid_loss/valid_steps}")

    model.train(was_training)
    return model

In [96]:
# config
epochs = 10
batch_size = 32
t_dim = 24  # rows per input window passed to StockDataset; TODO: this variable needs a better name
input_dim = len(weather_columns) + len(stock_columns)  # one input feature per selected column

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


Train using only stock price data

In [97]:
# Baseline experiment: the closing price is the only input feature.
stock_model = StockModel(input_dim=1, t_dim=t_dim)

# Datasets/loaders built from the 'Close' column alone; the validation loader
# keeps the original row order.
train_ds = StockDataset(train_df[['Close']], t_dim=t_dim, label=['Close'])
valid_ds = StockDataset(test_df[['Close']], t_dim=t_dim, label=['Close'])
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_ds, batch_size=batch_size, shuffle=False)

# MSE objective optimised with RMSprop.
criterion = nn.MSELoss()
optimizer = torch.optim.RMSprop(stock_model.parameters(), lr=0.001)

model = train(
    stock_model,
    train_loader,
    criterion,
    optimizer,
    epochs,
    device=device,
    valid_loader=valid_loader,
)


1 epoch :    53step - loss=0.09312282950740378
valid_loss=0.18607042815822822
2 epoch :    53step - loss=0.014180956724679694
valid_loss=0.35542036736240756
3 epoch :    53step - loss=0.009093610319431941
valid_loss=0.14008191457161537
4 epoch :    53step - loss=0.0074058351555030865
valid_loss=0.12647116298858935
5 epoch :    53step - loss=0.006942121421847984
valid_loss=0.1294395778901302
6 epoch :    53step - loss=0.012307862597429808
valid_loss=0.1306858855084731
7 epoch :    53step - loss=0.005682153776639475
valid_loss=0.35515443980693817
8 epoch :    53step - loss=0.005962995948480829
valid_loss=0.18193556511631379
9 epoch :    53step - loss=0.00679250966476382
valid_loss=0.1788157091404383
10 epoch :    53step - loss=0.010920617110008057
valid_loss=0.17282057438905424


Train using both stock price and weather data

In [98]:
# Second experiment: weather features plus the closing price as inputs.
model = StockModel(input_dim, t_dim)

train_ds = StockDataset(train_df, t_dim=t_dim, label=['Close'])
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_ds = StockDataset(test_df, t_dim=t_dim, label=['Close'])
# Do not shuffle the validation set: the reported loss is an average of
# per-batch means, so a random batch composition makes it vary between runs
# and breaks comparability with the stock-only experiment (shuffle=False).
valid_loader = DataLoader(valid_ds, batch_size=batch_size, shuffle=False)

criterion = nn.MSELoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)
model = train(model, train_loader, criterion, optimizer, epochs, device=device,
              valid_loader=valid_loader)

1 epoch :    53step - loss=0.21847050700266407
valid_loss=0.32899341216454137
2 epoch :    53step - loss=0.024235810399196058
valid_loss=0.3541843260710056
3 epoch :    53step - loss=0.01409255968898816
valid_loss=0.17057287521087205
4 epoch :    53step - loss=0.012220212434119773
valid_loss=0.20177223533391953
5 epoch :    53step - loss=0.007771459276313489
valid_loss=0.23019099006286034
6 epoch :    53step - loss=0.007766554290253036
valid_loss=0.1623167137687023
7 epoch :    53step - loss=0.007993196392045269
valid_loss=0.1933013338309068
8 epoch :    53step - loss=0.010161591026018251
valid_loss=0.24874989401835662
9 epoch :    53step - loss=0.007253518306105485
valid_loss=0.2909256168282949
10 epoch :    53step - loss=0.009728433823494136
valid_loss=0.21129524937042823
