In [107]:
import pandas as pd
import requests
import numpy as np
from io import StringIO
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

def download_stock_data():
    """Download the Dow Jones Index archive into the working directory.

    Fetches the zip file from the UCI repository URL below and, on an HTTP 200
    response, writes it to ``dow_jones_index.zip``. Any other status code only
    prints a failure message; nothing is written and nothing is raised.

    Raises:
        requests.exceptions.RequestException: If the connection fails or the
            server does not respond within the timeout.
    """
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00312/dow_jones_index.zip'
    # Without a timeout, requests waits forever on an unresponsive server,
    # which would hang the notebook cell.
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        with open('dow_jones_index.zip', 'wb') as f:
            f.write(response.content)
        print('주식 데이터 다운로드 완료!')
    else:
        print('데이터 다운로드에 실패했습니다.')

def read_stock_data():
    """Unpack ``dow_jones_index.zip`` and load its data file.

    Every member of the archive is extracted into the current working
    directory, then ``dow_jones_index.data`` is parsed as CSV.

    Returns:
        pandas.DataFrame: The parsed contents of ``dow_jones_index.data``.
    """
    from zipfile import ZipFile

    with ZipFile('dow_jones_index.zip', 'r') as archive:
        archive.extractall()

    return pd.read_csv('dow_jones_index.data')

# Download the stock data archive
download_stock_data()

# Read the data into a pandas DataFrame
stock_data = read_stock_data()
stock_data.head()

주식 데이터 다운로드 완료!


Unnamed: 0,quarter,stock,date,open,high,low,close,volume,percent_change_price,percent_change_volume_over_last_wk,previous_weeks_volume,next_weeks_open,next_weeks_close,percent_change_next_weeks_price,days_to_next_dividend,percent_return_next_dividend
0,1,AA,1/7/2011,$15.82,$16.72,$15.78,$16.42,239655616,3.79267,,,$16.71,$15.97,-4.42849,26,0.182704
1,1,AA,1/14/2011,$16.71,$16.71,$15.64,$15.97,242963398,-4.42849,1.380223,239655616.0,$16.19,$15.79,-2.47066,19,0.187852
2,1,AA,1/21/2011,$16.19,$16.38,$15.60,$15.79,138428495,-2.47066,-43.024959,242963398.0,$15.87,$16.13,1.63831,12,0.189994
3,1,AA,1/28/2011,$15.87,$16.63,$15.82,$16.13,151379173,1.63831,9.3555,138428495.0,$16.18,$17.14,5.93325,5,0.185989
4,1,AA,2/4/2011,$16.18,$17.39,$16.18,$17.14,154387761,5.93325,1.987452,151379173.0,$17.33,$17.37,0.230814,97,0.175029


In [108]:
def preprocess_stock_data_v2(stock_data, columns=['open', 'high', 'low', 'close','next_weeks_open', 'next_weeks_close']):
    """Convert dollar-formatted price columns to floats and parse the dates.

    The frame is modified in place and the same object is returned. Calling
    the function again on an already-converted frame is a no-op for the price
    columns.

    Args:
        stock_data: DataFrame holding the price columns named in ``columns``
            and a ``date`` column.
        columns: Names of the price columns to convert. Values such as
            ``'$15.82'`` become ``15.82``; missing values are forward-filled
            from the previous row. The default list is only read, never
            mutated.

    Returns:
        The same ``stock_data`` object with float price columns and a
        datetime ``date`` column.
    """
    for column in columns:
        prices = stock_data[column]
        # Only text columns carry the '$' prefix; skipping numeric columns
        # makes re-running the cell safe (``.str`` fails on float data).
        if not pd.api.types.is_numeric_dtype(prices):
            # regex=False: as a regular expression '$' is the end-of-string
            # anchor, so the dollar sign itself would never be removed.
            prices = prices.str.replace('$', '', regex=False)
        # Assign the result back instead of fillna(..., inplace=True) on a
        # column selection, which is not guaranteed to update the frame.
        stock_data[column] = prices.astype(float).ffill()

    # Parse the date strings into datetime values.
    stock_data['date'] = pd.to_datetime(stock_data['date'])

    return stock_data

In [109]:
# Convert the '$'-prefixed price columns to floats and parse the date column.
# preprocess_stock_data_v2 modifies the frame it is given and returns it.
stock_data = preprocess_stock_data_v2(stock_data)

In [110]:
# Arrange the rows stock by stock: tickers keep their order of first
# appearance, rows keep their relative order, and the index is renumbered.
stock_data = pd.concat(
    [
        stock_data.loc[stock_data['stock'] == ticker]
        for ticker in stock_data['stock'].unique()
    ]
).reset_index(drop=True)

In [111]:
# Preview up to the first 50 rows of the regrouped, type-converted frame.
stock_data.head(50)

Unnamed: 0,quarter,stock,date,open,high,low,close,volume,percent_change_price,percent_change_volume_over_last_wk,previous_weeks_volume,next_weeks_open,next_weeks_close,percent_change_next_weeks_price,days_to_next_dividend,percent_return_next_dividend
0,1,AA,2011-01-07,15.82,16.72,15.78,16.42,239655616,3.79267,,,16.71,15.97,-4.42849,26,0.182704
1,1,AA,2011-01-14,16.71,16.71,15.64,15.97,242963398,-4.42849,1.380223,239655616.0,16.19,15.79,-2.47066,19,0.187852
2,1,AA,2011-01-21,16.19,16.38,15.6,15.79,138428495,-2.47066,-43.024959,242963398.0,15.87,16.13,1.63831,12,0.189994
3,1,AA,2011-01-28,15.87,16.63,15.82,16.13,151379173,1.63831,9.3555,138428495.0,16.18,17.14,5.93325,5,0.185989
4,1,AA,2011-02-04,16.18,17.39,16.18,17.14,154387761,5.93325,1.987452,151379173.0,17.33,17.37,0.230814,97,0.175029
5,1,AA,2011-02-11,17.33,17.48,16.97,17.37,114691279,0.230814,-25.712195,154387761.0,17.39,17.28,-0.632547,90,0.172712
6,1,AA,2011-02-18,17.39,17.68,17.28,17.28,80023895,-0.632547,-30.226696,114691279.0,16.98,16.68,-1.76678,83,0.173611
7,1,AA,2011-02-25,16.98,17.15,15.96,16.68,132981863,-1.76678,66.177694,80023895.0,16.81,16.58,-1.36823,76,0.179856
8,1,AA,2011-03-04,16.81,16.94,16.13,16.58,109493077,-1.36823,-17.66315,132981863.0,16.58,16.03,-3.31725,69,0.180941
9,1,AA,2011-03-11,16.58,16.75,15.42,16.03,114332562,-3.31725,4.4199,109493077.0,15.95,16.11,1.00313,62,0.187149


In [114]:
# Show the first rows of stock_data as a quick sanity check.
stock_data.head()

Unnamed: 0,quarter,stock,date,open,high,low,close,volume,percent_change_price,percent_change_volume_over_last_wk,previous_weeks_volume,next_weeks_open,next_weeks_close,percent_change_next_weeks_price,days_to_next_dividend,percent_return_next_dividend
0,1,AA,2011-01-07,15.82,16.72,15.78,16.42,239655616,3.79267,,,16.71,15.97,-4.42849,26,0.182704
1,1,AA,2011-01-14,16.71,16.71,15.64,15.97,242963398,-4.42849,1.380223,239655616.0,16.19,15.79,-2.47066,19,0.187852
2,1,AA,2011-01-21,16.19,16.38,15.6,15.79,138428495,-2.47066,-43.024959,242963398.0,15.87,16.13,1.63831,12,0.189994
3,1,AA,2011-01-28,15.87,16.63,15.82,16.13,151379173,1.63831,9.3555,138428495.0,16.18,17.14,5.93325,5,0.185989
4,1,AA,2011-02-04,16.18,17.39,16.18,17.14,154387761,5.93325,1.987452,151379173.0,17.33,17.37,0.230814,97,0.175029


In [115]:
# Select the compute device: CUDA when it is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def preprocess_data(stock_data, seq_length, target_column='close'):
    """Min-max scale the numeric columns and cut them into sliding windows.

    The ``quarter``, ``stock`` and ``date`` columns are dropped; every
    remaining column is scaled with ``MinMaxScaler`` and used as a feature.
    Window ``i`` covers rows ``i`` to ``i + seq_length - 1`` and its target is
    the scaled ``target_column`` value of row ``i + seq_length``.

    Args:
        stock_data: DataFrame containing ``quarter``, ``stock``, ``date`` and
            the numeric feature columns, including ``target_column``.
        seq_length: Number of consecutive rows in each input window.
        target_column: Name of the column whose next value is the target.

    Returns:
        Tuple ``(X, y, scaler)``. ``X`` is a float tensor of shape
        ``(n_windows, seq_length, n_features)`` and ``y`` a float tensor of
        shape ``(n_windows,)``, both moved to ``device``. ``scaler`` is the
        fitted ``MinMaxScaler``.

    Raises:
        KeyError: If ``target_column`` is not one of the feature columns.

    Note:
        Windows are taken over the rows in order, without regard to the
        dropped ``stock`` column, so a window can span two different stocks.
        Missing values in the feature columns are not filled here.
    """
    # Drop the categorical and time columns; the rest are the features.
    features = stock_data.drop(columns=['quarter', 'stock', 'date'])

    # fit_transform returns a plain NumPy array, so the target must be
    # addressed by column position rather than by column name.
    target_idx = features.columns.get_loc(target_column)

    scaler = MinMaxScaler()
    normalized = scaler.fit_transform(features)

    n_windows = max(len(normalized) - seq_length, 0)
    windows = [normalized[i:i + seq_length] for i in range(n_windows)]
    if windows:
        # Stack once into a single array; building a tensor from a Python
        # list of arrays is much slower.
        X_np = np.stack(windows)
    else:
        # Not enough rows for a single window: keep the 3-D layout.
        X_np = np.empty((0, seq_length, normalized.shape[1]))
    # Targets are the rows immediately following each window.
    y_np = normalized[seq_length:seq_length + n_windows, target_idx]

    X = torch.tensor(X_np).float().to(device)
    y = torch.tensor(y_np).float().to(device)

    return X, y, scaler

def pad_data(stock_data, max_length):
    """Append all-``None`` rows so every stock accounts for ``max_length`` rows.

    For each distinct value in the ``stock`` column that has fewer than
    ``max_length`` rows, ``max_length - n_rows`` padding rows are appended to
    the end of the result. Every column of a padding row is ``None``,
    including ``stock``. Padding rows are labelled with consecutive integers
    starting one past the stock's last index label, so these labels may
    coincide with labels already present in the frame.

    Args:
        stock_data: DataFrame with a ``stock`` column and an integer index.
        max_length: Minimum number of rows each stock should account for.

    Returns:
        A new DataFrame: a copy of ``stock_data`` followed by the padding
        rows. The input frame is not modified.
    """
    frames = [stock_data.copy()]
    for stock in stock_data['stock'].unique():
        stock_rows = stock_data[stock_data['stock'] == stock].index
        if len(stock_rows) == 0:
            # A missing ticker (NaN) never compares equal to itself, so it
            # selects no rows and there is no last label to pad after.
            continue
        pad_size = max_length - len(stock_rows)
        if pad_size <= 0:
            continue
        last_label = stock_rows[-1]
        pad_index = pd.RangeIndex(start=last_label + 1, stop=last_label + pad_size + 1)
        frames.append(
            pd.DataFrame({col: None for col in stock_data.columns}, index=pad_index)
        )

    if len(frames) == 1:
        return frames[0]
    # Concatenate once at the end; concatenating inside the loop re-copies
    # the growing frame on every iteration.
    return pd.concat(frames, axis=0)