# FinanceDataReader에서 주어진 종목 및 기간 데이터 받아오기
- 주어진 데이터에서 거래중지로 의심되는 종목 제외

In [18]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import FinanceDataReader as fdr

##### Data Load

In [2]:
# Load the base training set, the additional training set and the sample
# submission template from the ../data directory.
df_old = pd.read_csv("../data/train.csv")
df_add = pd.read_csv("../data/train_additional.csv")
sub_df = pd.read_csv("../data/sample_submission.csv")

In [3]:
# Stack the additional rows under the base rows and rebuild a 0..n-1 index.
# NOTE: df_add is rebound to the combined frame, so running this cell a
# second time would append df_old again.
df_add = pd.concat([df_old, df_add]).reset_index(drop=True)

In [4]:
# Rename the columns of both frames positionally to the same short English
# names (df_old is assigned first, then df_add).
df_old.columns = df_add.columns = [
    'date', 'code', 'name', 'volume', 'open', 'high', 'low', 'close',
]

In [9]:
# Parse the 'date' column of both frames from YYYYMMDD values into datetimes.
df_old['date'] = pd.to_datetime(df_old['date'], format='%Y%m%d')
df_add['date'] = pd.to_datetime(df_add['date'], format='%Y%m%d')

In [10]:
# Print the earliest and latest date found in each frame
# (first line: df_old, second line: df_add).
print('시작일 :', df_old['date'].min(), '종료일 :', df_old['date'].max())
print('시작일 :', df_add['date'].min(), '종료일 :', df_add['date'].max())

시작일 : 2021-06-01 00:00:00 종료일 : 2023-05-30 00:00:00
시작일 : 2021-06-01 00:00:00 종료일 : 2023-07-28 00:00:00


##### open 데이터로 pivot_table 생성

In [11]:
# Reshape each long-format frame into a date x code table of 'open' values,
# with the rows sorted by date.
open_old_df = df_old.pivot_table(index='date', columns='code', values='open').sort_index()
open_add_df = df_add.pivot_table(index='date', columns='code', values='open').sort_index()

##### FDR 데이터 Load

In [105]:
# Download prices for every code in df_old through FinanceDataReader and
# collect each 'Close' series as one column of a frame that shares the date
# index of open_old_df.
fdr_old_df = pd.DataFrame(index=open_old_df.index)
for code in df_old['code'].unique():
    # code[1:] drops the first character of the code before the lookup
    # (presumably a non-numeric prefix -- TODO confirm against the data).
    tmp = fdr.DataReader(code[1:], '2021-06-01', '2023-05-30')
    # NOTE(review): this assignment is expected to align tmp['Close'] on the
    # date index, leaving NaN where FDR returned no row -- confirm.
    fdr_old_df.loc[:, code] = tmp['Close']

In [106]:
# Same download as for df_old, but for every code in df_add and with the
# later end date; columns are collected on the date index of open_add_df.
fdr_add_df = pd.DataFrame(index=open_add_df.index)
for code in df_add['code'].unique():
    # code[1:] drops the first character of the code before the lookup
    # (presumably a non-numeric prefix -- TODO confirm against the data).
    tmp = fdr.DataReader(code[1:], '2021-06-01', '2023-07-28')
    # NOTE(review): this assignment is expected to align tmp['Close'] on the
    # date index, leaving NaN where FDR returned no row -- confirm.
    fdr_add_df.loc[:, code] = tmp['Close']

##### 마지막 날 거래가 없는(시가가 0인) 종목 제외

In [16]:
# Keep only the codes whose open value in the last row (latest date) is not
# 0. Missing values also compare as "not equal to 0", so they are kept too.
old_last_no_halt = open_old_df.loc[:, open_old_df.iloc[-1].ne(0)]
add_last_no_halt = open_add_df.loc[:, open_add_df.iloc[-1].ne(0)]

In [17]:
# Restrict the FDR close-price frames to the codes that passed the
# last-day filter above.
fdr_old_no_halt = fdr_old_df.loc[:, old_last_no_halt.columns]
fdr_add_no_halt = fdr_add_df.loc[:, add_last_no_halt.columns]

### csv 저장

In [18]:
# Write both filtered close-price frames to ../data; the date index is
# written as the first CSV column.
fdr_old_no_halt.to_csv('../data/fdr_old_no_halt.csv')
fdr_add_no_halt.to_csv('../data/fdr_add_no_halt.csv')

# LSTM 모델

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.preprocessing import StandardScaler

##### public 또는 private 기간 데이터 중 선택

In [None]:
# Selects which saved price file is loaded below: True -> fdr_old_no_halt.csv,
# False -> fdr_add_no_halt.csv.
is_old = False  # private-period data

##### Load Data

In [None]:
# Load the submission template and the close-price table chosen by is_old;
# the first CSV column becomes the index of df.
sub_df = pd.read_csv("data/sample_submission.csv")
df = pd.read_csv(
    'data/fdr_old_no_halt.csv' if is_old else 'data/fdr_add_no_halt.csv',
    index_col=0,
)

##### train data sequence 길이 설정 & 현재로부터 며칠 후의 종가를 예측할 것인지 설정

In [None]:
# Number of consecutive values in each model input window.
seq_len = 30
# Prediction horizon, passed to create_sequences as pred_day.
pred_day = 15

##### 30일 길이의 train sequence data 및 valid, test data 생성 함수 정의

In [None]:
def create_sequences(data, seq_len, pred_day=15):
    """Cut per-code series into training, validation and test windows.

    ``data`` is indexed as ``data[code][day]``. A sample built at day ``j``
    pairs the ``seq_len`` values ``data[code][j - seq_len:j]`` with the
    target ``data[code][j + pred_day - 1]``.

    * Training samples use ``j`` in ``[seq_len, num_days - 2 * pred_day)``
      and are pooled across all codes.
    * Validation samples use ``j`` in
      ``[num_days - 2 * pred_day, num_days - pred_day)`` and stay grouped
      per code.
    * The test input of each code is its last ``seq_len`` values.

    Returns:
        Tuple ``(X_train, y_train, X_valid, y_valid, X_test)`` of float
        tensors.
    """
    num_codes, num_days = data.shape
    valid_start = num_days - 2 * pred_day
    valid_stop = num_days - pred_day
    target_shift = pred_day - 1

    train_inputs, train_targets = [], []
    valid_inputs, valid_targets = [], []
    test_inputs = []

    for code_idx in range(num_codes):
        series = data[code_idx]

        train_days = range(seq_len, valid_start)
        train_inputs.extend(series[day - seq_len:day] for day in train_days)
        train_targets.extend(series[day + target_shift] for day in train_days)

        valid_days = range(valid_start, valid_stop)
        valid_inputs.append([series[day - seq_len:day] for day in valid_days])
        valid_targets.append([series[day + target_shift] for day in valid_days])

        test_inputs.append(series[-seq_len:])

    parts = (train_inputs, train_targets, valid_inputs, valid_targets, test_inputs)
    return tuple(torch.tensor(np.array(part)).float() for part in parts)

##### Scaler 적용

In [None]:
# Standardise every column of df independently; the result is a NumPy array
# with the same row/column layout as df.
# NOTE(review): the scaler is fitted on all rows of df, including the most
# recent rows that create_sequences later uses for validation and test
# windows.
scaler = StandardScaler()
scaled_df = scaler.fit_transform(df)

##### Train Valid Split

In [None]:
# create_sequences indexes its input as data[code][day], so the scaled table
# is transposed first (assumes df has one row per date and one column per
# code -- confirm against the saved CSV).
X_train, y_train, X_valid, y_valid, X_test = create_sequences(scaled_df.T, seq_len, pred_day)

##### LSTM 모델 코드

In [None]:
class LSTMModel(nn.Module):
    """LSTM regressor that predicts one value from the last time step.

    Args:
        input_size: Number of features per time step given to ``nn.LSTM``.
            ``forward`` appends a single feature axis to its input, so the
            model is used with ``input_size == 1``.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return a ``(batch, 1)`` prediction for a ``(batch, seq_len)`` input."""
        # Explicit zero initial hidden/cell states on the same device as x.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device)
        c0 = torch.zeros(state_shape, device=x.device)

        # Add the trailing feature axis expected by the batch-first LSTM.
        seq_out, _ = self.lstm(x.unsqueeze(-1), (h0, c0))

        # Only the output of the final time step feeds the linear head.
        return self.fc(seq_out[:, -1, :])

##### 하이퍼파라미터 설정

In [None]:
# Model dimensions passed to LSTMModel.
input_size = 1
hidden_size = 64
num_layers = 1

# Optimisation settings: number of passes over the training set, mini-batch
# size for the DataLoader, and the Adam learning rate.
num_epochs = 30
batch_size = 64
lr = 0.001

##### 모델 & 손실함수 & 옵티마이저 생성

In [None]:
# Build the network from the hyperparameters above.
model = LSTMModel(input_size, hidden_size, num_layers)

# Mean-squared-error objective, optimised with Adam over all model parameters.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

##### 데이터셋 준비

In [None]:
# Wrap the tensors as datasets; only the training set gets a (shuffled)
# mini-batch loader.
train_dataset = TensorDataset(X_train, y_train)
# NOTE(review): valid_dataset is not used in the cells shown here; the
# validation step indexes X_valid / y_valid directly.
valid_dataset = TensorDataset(X_valid, y_valid)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

##### 모델 학습 및 평가

In [None]:
# Train for num_epochs passes over train_loader, then report the validation
# loss on the most recent validation window of every code.
for epoch in range(num_epochs):
    # Restore training mode every epoch so the loop is still correct when the
    # cell is re-run after the evaluation step below has called model.eval().
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        # squeeze(-1) removes only the trailing size-1 output axis. A bare
        # squeeze() would also drop the batch axis when the final mini-batch
        # holds a single sample, leaving a shape mismatch with `labels`.
        loss = criterion(outputs.squeeze(-1), labels)
        loss.backward()
        optimizer.step()

    # The reported value is the loss of the last mini-batch of the epoch.
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.6f}')

model.eval()
with torch.no_grad():
    # X_valid[:, -1, :] / y_valid[:, -1]: last validation sample of each code.
    valid_outputs = model(X_valid[:, -1, :])
    valid_loss = criterion(valid_outputs.squeeze(-1), y_valid[:, -1])
    print(f'Valid loss: {valid_loss.item():.6f}')

##### test data로 15일 후 종가 예측 & 수익률에 따른 rank 계산

In [None]:
# Make predictions on the test windows (the last seq_len values of each code).
test_outputs = model(X_test)
# Lay the predictions out as a single row so the scaler can undo its
# per-column scaling (assumes prediction i belongs to column i of df, which
# matches the row order produced by create_sequences on scaled_df.T).
final_prices = scaler.inverse_transform(test_outputs.reshape(1, -1).detach().numpy())
final_prices = final_prices.reshape(-1)

# One row per column of df: last observed value, predicted value, and the
# relative change between them.
result = pd.DataFrame(df.iloc[-1])
result.columns = ['last']
result['pred'] = final_prices
result['rtn'] = (result['pred'] - result['last']) / result['last']
# Rank 1 is the highest predicted return; ties are broken by row order.
ranks = result['rtn'].rank(method='first', ascending=False).astype('int')

##### submission 파일 생성 및 저장
- 거래중지 의심 종목 : 1001등부터 연속된 순위로 배정 (기존 1001등 이하 종목은 그만큼 뒤로 밀림)

In [None]:
# Attach the model ranks to the submission template. Codes without a model
# rank come out of the left join as NaN and are marked with 0 for now.
submission = pd.merge(sub_df, ranks, left_on='종목코드', right_on=result.index, how='left')
submission['순위'] = submission['rtn'].fillna(0).astype(int)
submission = submission.drop('rtn', axis=1)

model_ranks = submission['순위'].to_numpy()
unranked = model_ranks == 0
# Count the placeholder rows directly instead of assuming a 2000-row
# template: the fill below needs exactly one rank per placeholder row.
halts = int(unranked.sum())
# Open a gap of `halts` positions starting at rank 1001 ...
ranks = np.where(model_ranks >= 1001, model_ranks + halts, model_ranks)
# ... and place the unranked codes into that gap in template order.
ranks[unranked] = np.arange(1001, 1001 + halts)
submission['순위'] = ranks
submission.to_csv('submission/lstm_02.csv', index=False)