In [13]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split, ConcatDataset
from tqdm import tqdm
import pandas as pd
# import torch_directml

#### Define the TimeSeriesDataset class

In [14]:
class TimeSeriesDataset(Dataset):
    """Sliding-window samples over a 2-D tensor of time-ordered rows.

    Sample ``i`` pairs the window ``data[i : i + seq_len]`` with the value in
    the second-to-last column of the row that immediately follows the window.

    Args:
        data: 2-D tensor indexed as ``data[row, column]``. Rows are used in the
            order given; sorting them chronologically is the caller's job.
        seq_len: Number of consecutive rows in each input window.
    """

    def __init__(self, data, seq_len):
        self.data = data
        self.seq_len = seq_len

    def __len__(self):
        # Clamp at zero: a series with fewer than ``seq_len + 1`` rows has no
        # complete (window, next-row) pair, and ``len()`` raises ValueError if
        # ``__len__`` returns a negative number.
        return max(0, len(self.data) - self.seq_len)

    def __getitem__(self, i):
        """Return ``{"seq": (seq_len, n_cols) float32, "label": (1,) float32}``.

        Raises:
            IndexError: if ``i`` is outside ``[-len(self), len(self))``.
        """
        n = len(self)
        # Normalise negative indices; without this, slicing with a negative
        # start silently yields an empty window paired with an unrelated label.
        if i < 0:
            i += n
        if not 0 <= i < n:
            raise IndexError(f"index {i} out of range for dataset of length {n}")

        seq = self.data[i : i + self.seq_len]
        # Target: second-to-last column of the first row after the window.
        label = self.data[i + self.seq_len, -2]
        return {
            "seq": seq.clone().detach().float(),
            "label": label.clone().detach().float().unsqueeze(0),
        }


#### Define the Model class

In [34]:
class Model(nn.Module):
    """Stacked LSTM followed by a two-layer MLP head.

    The LSTM output at the last time step is passed through
    ``Linear(hidden_dim, 2 * hidden_dim) -> ReLU -> Linear(2 * hidden_dim, output_dim)``.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: LSTM hidden size.
        output_dim: Size of the prediction produced per sequence.
        num_layers: Number of stacked LSTM layers.
        stateful: When False (default) every forward pass starts from a zero
            hidden/cell state, so the output depends only on the input batch.
            When True, the final (h, c) of one call seeds the next call; this
            is only meaningful when consecutive batches are consecutive chunks
            of the same series, never for shuffled, independent windows.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=10, stateful=False):
        super().__init__()
        self.rnn = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            # Dropout acts between stacked layers only; nn.LSTM warns if it is
            # non-zero with a single layer.
            dropout=0.2 if num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_dim, 2 * hidden_dim)
        self.fc2 = nn.Linear(2 * hidden_dim, output_dim)
        self.stateful = stateful
        # Cached recurrent state; only populated when ``stateful`` is True.
        self.h0 = None
        self.c0 = None

    def reset_state(self):
        """Forget any cached hidden/cell state."""
        self.h0 = None
        self.c0 = None

    def _cached_state_usable(self, x):
        """Return True if the cached (h0, c0) can seed a forward pass on ``x``."""
        if self.h0 is None or self.c0 is None:
            return False
        for state in (self.h0, self.c0):
            # Batch size, device and dtype must all match the incoming batch.
            if (
                state.shape[1] != x.shape[0]
                or state.device != x.device
                or state.dtype != x.dtype
            ):
                return False
            # State produced under torch.inference_mode() is an inference
            # tensor; feeding it to a grad-tracking pass fails in backward.
            if state.is_inference() and not torch.is_inference_mode_enabled():
                return False
        return True

    def forward(self, x):
        """Map ``x`` of shape (batch, seq_len, input_dim) to (batch, output_dim)."""
        if self.stateful:
            state = (self.h0, self.c0) if self._cached_state_usable(x) else None
            out, (h_n, c_n) = self.rnn(x, state)
            # Detach so the next call does not backpropagate into this one.
            self.h0 = h_n.detach()
            self.c0 = c_n.detach()
        else:
            # No initial state given: nn.LSTM starts from zeros.
            out, _ = self.rnn(x)

        out = self.fc(out[:, -1, :])  # keep only the last time step
        out = torch.relu(out)
        out = self.fc2(out)
        return out




#### Initialize the device, model, criterion, and optimizer


In [41]:
# Seed the global RNG so weight initialisation is repeatable after a kernel restart.
SEED = 42
torch.manual_seed(SEED)

# Use CUDA when available, otherwise CPU.
# (The DirectML backend, torch_directml.device(), is left disabled.)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# input_dim=6: one feature per CSV column after the ticker
# (date, open, high, low, close, volume).
model = Model(input_dim=6, hidden_dim=64, output_dim=1, num_layers=10).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
# Halves the learning rate after every 5 calls to scheduler.step(); it has no
# effect unless the training loop actually calls scheduler.step().
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

#### Load the data

In [16]:
# Maps stock name (file name up to the first ".") to a 2-D float64 tensor with
# columns: date, open, high, low, close, volume, sorted by date ascending.
DATA_DIR = "./data"
data = {}

# sorted(): os.listdir returns entries in arbitrary order, and downstream code
# that iterates over ``data`` should see the same order on every machine.
for csv_file in sorted(os.listdir(DATA_DIR)):
    if not csv_file.endswith(".csv"):
        continue
    print(f"Processing {csv_file}")

    stock_name = csv_file.split(".")[0]
    # Hand the path to pandas so the file is closed even if parsing fails.
    # NOTE(review): with ``names=`` given, ``header=1`` discards the first two
    # lines of each file. If the CSVs carry a single header line this drops the
    # first data row and ``header=0`` is what is wanted; confirm against the files.
    df = pd.read_csv(
        os.path.join(DATA_DIR, csv_file),
        header=1,
        names=["ticker", "date", "open", "high", "low", "close", "volume"],
    )
    df = df.sort_values(by="date", ascending=True)
    # Drop the ticker column; everything else must be numeric for torch.tensor.
    # NOTE(review): the YYYYMMDD date stays in as a feature. Such integers are
    # not exactly representable once cast to float32; consider dropping or
    # re-encoding it.
    data[stock_name] = torch.tensor(df.iloc[:, 1:].to_numpy())


Processing BID.csv
Processing BVH.csv
Processing CTG.csv
Processing FPT.csv
Processing GAS.csv
Processing HDB.csv
Processing HPG.csv
Processing KDH.csv
Processing MBB.csv
Processing MSN.csv
Processing MWG.csv
Processing NVL.csv
Processing PDR.csv
Processing PLX.csv
Processing PNJ.csv
Processing POW.csv
Processing REE.csv
Processing SBT.csv
Processing SSI.csv
Processing STB.csv
Processing TCB.csv
Processing TCH.csv
Processing TPB.csv
Processing VCB.csv
Processing VHM.csv
Processing VIC.csv
Processing VJC.csv
Processing VNM.csv
Processing VPB.csv
Processing VRE.csv



#### Create the dataset and dataloader

In [48]:
SEQ_LEN = 10            # rows per input window
ANCHOR_STOCK = "BID"    # always included, and reported on its own
MAX_EXTRA_STOCKS = 3    # how many further stocks to merge in
SPLIT_SEED = 0          # fixes the train/test partition across runs

dataset = TimeSeriesDataset(data[ANCHOR_STOCK], seq_len=SEQ_LEN)
print(len(dataset))

# Pick the extra stocks explicitly rather than breaking out of an enumerate
# loop on ``i % 3``; that only selected three stocks when the anchor happened
# to be the first key in ``data``.
extra_names = [name for name in data if name != ANCHOR_STOCK][:MAX_EXTRA_STOCKS]

# One flat ConcatDataset instead of re-wrapping the previous one per stock,
# which nested the datasets and added a lookup level per stock.
merged_dataset = ConcatDataset(
    [dataset]
    + [TimeSeriesDataset(data[name], seq_len=SEQ_LEN) for name in extra_names]
)
print(len(merged_dataset))

# NOTE(review): windows overlap, so a random split puts near-duplicate windows
# on both sides; a chronological split would give an honest test score.
train_dataset, test_dataset = random_split(
    merged_dataset,
    [0.8, 0.2],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
torch.set_printoptions(sci_mode=False)
print(len(train_dataset), len(test_dataset))

dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# Evaluation needs no shuffling; a fixed order keeps the per-epoch test metric
# comparable between epochs.
testloader = DataLoader(test_dataset, batch_size=256, shuffle=False)

1836
11398
9119 2279


In [47]:
# Sanity check: show the first sample of the merged dataset.
first_sample = merged_dataset[0]
print(first_sample)

{'seq': tensor([[20140124.0000,    14.4241,    14.8037,    14.0446,    14.2723, 8417060.0000],
        [20140128.0000,    14.1964,    14.2723,    13.7409,    13.9686, 3240910.0000],
        [20140206.0000,    13.8168,    13.8927,    13.3613,    13.3613, 959590.0000],
        [20140208.0000,    13.2095,    13.2854,    12.5262,    12.6021, 2987600.0000],
        [20140210.0000,    12.5262,    13.0576,    12.2984,    12.9058, 4052090.0000],
        [20140212.0000,    13.0576,    13.0576,    12.8299,    12.8299, 1938480.0000],
        [20140212.0000,    12.7540,    12.8299,    12.6781,    12.7540, 1176180.0000],
        [20140212.0000,    12.7540,    12.8299,    12.6022,    12.6021, 2156140.0000],
        [20140214.0000,    12.7539,    12.9058,    12.6021,    12.7540, 1489380.0000],
        [20140216.0000,    12.8299,    12.8299,    12.6022,    12.6781, 1173560.0000],
        [20140218.0000,    12.6781,    12.6781,    12.6022,    12.6021, 1861540.0000],
        [20140220.0000,    12.6780, 

#### Train the model

In [49]:
# Number of full passes over the training DataLoader made by the training loop.
epochs = 100

In [50]:
# Each epoch: one pass over the training loader, one LR-scheduler step, then an
# evaluation pass over the test loader. The reported numbers are the mean of
# the per-batch RMSE values.
model.train()
for epoch in range(epochs):
    running_loss = 0.0
    with tqdm(dataloader, unit="step", ncols=90, desc=f"Epoch {epoch + 1}/{epochs}") as tepoch:
        for batch in tepoch:
            seq = batch["seq"].to(device)
            label = batch["label"].to(device)
            optimizer.zero_grad()
            out = model(seq)
            # RMSE of this batch.
            loss = torch.sqrt(criterion(out, label))
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            tepoch.set_postfix(loss=f"{loss.item():.3f}")
    running_loss /= len(dataloader)

    # Advance the LR schedule once per epoch. Without this call the scheduler
    # is inert and the learning rate never changes.
    scheduler.step()

    model.eval()
    with torch.inference_mode():
        testing_loss = 0.0
        for batch in testloader:
            seq = batch["seq"].to(device)
            label = batch["label"].to(device)
            out = model(seq)
            loss = torch.sqrt(criterion(out, label))
            testing_loss += loss.item()

        # Every third epoch, show a few predictions from the last test batch.
        if (epoch+1) % 3 == 0:
            for pred, act in zip(out[:10], label[:10]):
                print(f"Pred: {pred.item()}, Actual: {act.item()}")

        print(
            f"Epochs: {epoch+1:3d} train: {running_loss :.3f}| test: {testing_loss / (len(testloader)):.3f}"
        )

    # Back to training mode (re-enables dropout) for the next epoch.
    model.train()


Epoch 1/100:   0%|                                              | 0/285 [00:00<?, ?step/s]

Epoch 1/100: 100%|███████████████████████| 285/285 [00:19<00:00, 14.84step/s, loss=19.598]


Epochs:   1 train: 19.283| test: 19.115


Epoch 2/100: 100%|███████████████████████| 285/285 [00:18<00:00, 15.54step/s, loss=22.203]


Epochs:   2 train: 19.276| test: 19.138


Epoch 3/100:  23%|█████▌                  | 66/285 [00:04<00:15, 14.42step/s, loss=22.002]


KeyboardInterrupt: 