In [195]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [196]:
import torch

from torch.utils.data import Dataset, DataLoader, TensorDataset

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold

from db.db import init_db, get_stocks

from model.predictor import LSTMModel

from helpers.plots import data_plot, plot_loss, plot_forecasting

In [197]:
await init_db()

In [198]:
df = await get_stocks()

# get the df size
df.shape

(60263, 6)

In [199]:
categorical_columns = df.select_dtypes(include=['object']).columns
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns

print(categorical_columns)
print(numerical_columns)

Index(['symbol'], dtype='object')
Index(['open', 'high', 'low', 'close', 'volume'], dtype='object')


In [200]:
df.dropna(inplace=True, axis=0)

# Select only open and symbol columns
df = df[['open', 'symbol']]

df.head()

Unnamed: 0,open,symbol
1990-01-02,0.248953,AAPL
1990-01-03,0.268375,AAPL
1990-01-04,0.270141,AAPL
1990-01-05,0.26661,AAPL
1990-01-08,0.264843,AAPL


In [201]:
# Split the dataset into train and test based on the date
from_date = '2020-01-01'

train = df[df.index < from_date]
test = df[df.index >= from_date]

print(train.shape, test.shape)

(34263, 2) (7350, 2)


In [202]:
# Scale the data per symbol
scaler = MinMaxScaler()
for symbol in train['symbol'].unique():
    train.loc[train['symbol'] == symbol, 'open'] = scaler.fit_transform(train.loc[train['symbol'] == symbol, 'open'].values.reshape(-1, 1))
    
print(train.loc['1990-01-02'])

                open symbol
1990-01-02  0.002144   AAPL
1990-01-02  0.000060   MSFT


In [203]:
# Create sequences and target for the LSTM model
def create_sequences(data: np.ndarray, seq_length: int): 
    X, y = [], []
    for i in range(len(data) - seq_length - 1):
        X.append(data[i:i + seq_length])
        y.append(data[i+1:i+seq_length+1])
    return np.array(X), np.array(y)

seq_length = 10
X_train, y_train = create_sequences(train, seq_length)

X_train = torch.from_numpy(X_train).float()
y_train = torch.from_numpy(y_train).float()

X_train.shape, y_train.shape

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [None]:
# Define the model
input_size = X_train.shape[2]
hidden_layer_size = 100
output_size = y_train.shape[1]
num_layers = 12

model = LSTMModel(input_size, hidden_layer_size, output_size, num_layers)
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

print(model)

LSTMModel(
  (lstm): LSTM(12, 100, num_layers=10, batch_first=True)
  (fc): Linear(in_features=100, out_features=12, bias=True)
)


In [None]:
def train_model(model, loss_fn, optimizer, train_data, validation_data, epochs=20):
    train_losses = []
    val_losses = []
    
    for i in range(epochs):
        model.train()
        epoch_losses = []
        for X_batch, y_batch in train_data:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            y_pred = model(X_batch)
            
            loss = loss_fn(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            epoch_losses.append(loss.item())
        train_losses.append(np.mean(epoch_losses))
        
        model.eval()
        val_loss = []
        with torch.no_grad():
            for X_val, y_val in validation_data:
                X_val, y_val = X_val.to(device), y_val.to(device)
                y_pred = model(X_val)
                loss = loss_fn(y_pred, y_val)
                val_loss.append(loss.item())
            val_losses.append(np.mean(val_loss))
        print(f"Epoch {i}: train loss {train_losses[-1]} val loss {val_losses[-1]}")
        
    return train_losses, val_losses

In [None]:
batch_size = 64
epochs = 5
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, val_index in kf.split(X_train):
    X_tr, X_val = X_train[train_index], X_train[val_index]
    y_tr, y_val = y_train[train_index], y_train[val_index]
    
    train_data = DataLoader(TensorDataset(X_tr, y_tr), batch_size=batch_size, shuffle=True)
    val_data = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size, shuffle=True)
    
    train_losses, val_losses = train_model(model, loss_fn, optimizer, train_data, val_data, epochs=epochs)
    plot_loss(train_losses, val_losses)

Epoch 0: train loss 0.07318525403164901 val loss 0.07320621229994756
Epoch 1: train loss 0.07318613205391628 val loss 0.07321063765826133
Epoch 2: train loss 0.07318484201095998 val loss 0.07320746485717021
Epoch 3: train loss 0.07318453284768531 val loss 0.07319883758632037


KeyboardInterrupt: 