In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from math import sqrt
from Informer_structure import load_data, preprocess_data, StockForecastDataset, transformer
import sys
import os

In [None]:
# Google Drive folder this notebook uses for its CSV input and for the saved
# model checkpoint.
# NOTE(review): the import cell at the top of the notebook already imports
# `Informer_structure` before this folder is put on sys.path. On a fresh
# kernel that import only succeeds if the module is discoverable some other
# way (e.g. the working directory) - confirm, or run this registration first.
file_directory = '/content/drive/MyDrive/Stock_Data/'

# Register the folder only once so re-running the cell never adds duplicates.
if sys.path.count(file_directory) == 0:
    sys.path.append(file_directory)

In [None]:
# Load the multi-ticker CSV and run the project's preprocessing helpers.
# The path is derived from `file_directory` (defined in the cell above) so the
# data folder is configured in exactly one place; it resolves to the same
# location as before: /content/drive/MyDrive/Stock_Data/multi_stock.csv
data_path = os.path.join(file_directory, "multi_stock.csv")

# The helpers print a null-value report and the ticker -> ID mapping (see the
# cell output).
df_cleaned = load_data(data_path)

# `scalers` and `label_encoder` are returned alongside the processed frame;
# presumably they are needed later to invert the scaling / ticker encoding -
# TODO confirm against Informer_structure.preprocess_data.
df_preprocessed, scalers, label_encoder = preprocess_data(df_cleaned)

Checking for null values... Date      0
close     0
high      0
low       0
open      0
volume    0
ticker    0
dtype: int64
Data after dropping nulls: (113827, 7)
Ticker: AAPL, ID: 0
Ticker: AMZN, ID: 1
Ticker: BAC, ID: 2
Ticker: COST, ID: 3
Ticker: CVX, ID: 4
Ticker: GOOGL, ID: 5
Ticker: GS, ID: 6
Ticker: JNJ, ID: 7
Ticker: JPM, ID: 8
Ticker: LLY, ID: 9
Ticker: META, ID: 10
Ticker: MS, ID: 11
Ticker: MSFT, ID: 12
Ticker: NKE, ID: 13
Ticker: NVDA, ID: 14
Ticker: PFE, ID: 15
Ticker: PG, ID: 16
Ticker: TSLA, ID: 17
Ticker: WMT, ID: 18
Ticker: XOM, ID: 19


In [None]:
# Visual sanity check of the preprocessed frame: the notebook renders the
# (truncated) head and tail rows, including the encoded `ticker_id` column.
df_preprocessed

Unnamed: 0,Date,open,high,low,close,volume,ticker,ticker_id
0,2000-01-03,0.001109,0.001184,0.001081,0.001187,0.058034,AAPL,0
1,2000-01-04,0.001147,0.001163,0.001075,0.001080,0.055496,AAPL,0
2,2000-01-05,0.001096,0.001162,0.001096,0.001097,0.084307,AAPL,0
3,2000-01-06,0.001123,0.001122,0.001004,0.000994,0.083186,AAPL,0
4,2000-01-07,0.001014,0.001054,0.001010,0.001045,0.049902,AAPL,0
...,...,...,...,...,...,...,...,...
113822,2023-12-22,0.217516,0.217551,0.217489,0.216218,0.000682,CVX,4
113823,2023-12-26,0.217990,0.218437,0.219149,0.218165,0.000548,CVX,4
113824,2023-12-27,0.218276,0.218380,0.218341,0.217449,0.000567,CVX,4
113825,2023-12-28,0.216713,0.217023,0.215541,0.214385,0.000872,CVX,4


In [None]:
# Wrap the preprocessed frame in the project's forecasting dataset.
dataset = StockForecastDataset(df_preprocessed)

# Fetch the first sample once and report its (input, target) shapes as a quick
# check that the windows have the dimensions the model will be built for.
first_sample = dataset[0]

print("Number of samples:", len(dataset))
print("Sample input shape:", first_sample[0].shape)
print("Sample target shape:", first_sample[1].shape)

Number of samples: 112427
Sample input shape: torch.Size([60, 6])
Sample target shape: torch.Size([10])


In [None]:
# ---------------------------------------------------------------------------
# Train / validate the forecasting model and keep the best checkpoint.
#
# Steps: seeded 80/20 split -> data loaders -> model, optimizer, LR scheduler
# -> per-epoch training pass, validation RMSE, scheduler step, and a
# checkpoint save whenever the validation RMSE improves.
# ---------------------------------------------------------------------------

# --- Run configuration (everything a reader might want to tune) ------------
SEED = 42                 # makes the split, weight init and shuffling repeatable
NUM_EPOCHS = 24           # the original loop, range(1, 25), ran 24 epochs
BATCH_SIZE = 16
LEARNING_RATE = 0.0001
# Same location as before, but derived from the folder defined earlier in the
# notebook instead of repeating the hard-coded path.
CHECKPOINT_PATH = os.path.join(file_directory, "best_informer_model.pth")

torch.manual_seed(SEED)

# --- Split ------------------------------------------------------------------
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# A dedicated, seeded generator keeps the train/validation membership stable
# across kernel restarts, so reported RMSE values are comparable between runs.
# NOTE(review): random_split assigns individual samples at random. If
# StockForecastDataset builds overlapping sliding windows (presumably, given
# the 60-step inputs - confirm), near-duplicate windows end up on both sides
# of the split and the validation RMSE will be optimistic. Consider a
# chronological split per ticker.
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SEED),
)

# Training batches are reshuffled every epoch; validation order is irrelevant
# to the metric, so it stays fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
print("Train dataset size:", len(train_dataset))
print("Validation dataset size:", len(val_dataset))


dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using Device",dev)

# --- Model, loss, optimizer, scheduler ---------------------------------------
# input_dim / pred match the sample shapes printed above (6 features in,
# 10 values out).
model = transformer(input_dim=6, pred=10, d_model=128, n__heads=4, num_layers=3, ff_dim=256).to(dev)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.MSELoss()
# Halve the learning rate when validation RMSE has not improved for 5 epochs.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.5)

best_rmse = float("inf")

for epoch in range(1, NUM_EPOCHS + 1):
    # ---- training pass ----
    model.train()
    epoch_loss = 0.0

    for x, y in train_loader:
        x, y = x.to(dev), y.to(dev)

        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Mean of the per-batch losses.
    print(f"Epoch {epoch}, Loss: {epoch_loss/len(train_loader)}")

    # ---- validation pass ----
    model.eval()
    val_pred, val_true = [], []
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(dev), y.to(dev)
            output = model(x)
            val_pred.append(output.cpu().numpy())
            val_true.append(y.cpu().numpy())

    val_pred = np.concatenate(val_pred, axis=0)
    val_true = np.concatenate(val_true, axis=0)
    # RMSE over every predicted value of every validation sample.
    rmse = sqrt(mean_squared_error(val_true.flatten(), val_pred.flatten()))
    scheduler.step(rmse)

    print(f"Epoch {epoch}, Validation RMSE: {rmse}")

    # ---- checkpoint on improvement ----
    if rmse < best_rmse:
        best_rmse = rmse
        # Copy the weights to CPU tensor-by-tensor instead of moving the whole
        # model off the device and back: the checkpoint still loads on
        # CPU-only machines, and the live model never leaves `dev` (even if
        # the save itself fails). state_dict() returns a fresh mapping, so
        # replacing its values does not touch the model's parameters.
        cpu_state = model.state_dict()
        for key, tensor in cpu_state.items():
            cpu_state[key] = tensor.cpu()
        torch.save(cpu_state, CHECKPOINT_PATH)
        print(f"Model saved with RMSE: {best_rmse}")

print("Best RMSE:", best_rmse)


Train dataset size: 89941
Validation dataset size: 22486
Using Device cuda
Epoch 1, Loss: 0.0009709986424380374
Epoch 1, Validation RMSE: 0.008916845678640935
Model saved with RMSE: 0.008916845678640935
Epoch 2, Loss: 0.00018281941917108643
Epoch 2, Validation RMSE: 0.008294597155872219
Model saved with RMSE: 0.008294597155872219
Epoch 3, Loss: 0.00012954685296241172
Epoch 3, Validation RMSE: 0.00877188454245849
Epoch 4, Loss: 0.00010303251401890048
Epoch 4, Validation RMSE: 0.007521350190854752
Model saved with RMSE: 0.007521350190854752
Epoch 5, Loss: 9.149150824159137e-05
Epoch 5, Validation RMSE: 0.0075527822061347985
Epoch 6, Loss: 8.426223572361076e-05
Epoch 6, Validation RMSE: 0.007683095765327409
Epoch 7, Loss: 7.937061074064357e-05
Epoch 7, Validation RMSE: 0.007072808792512276
Model saved with RMSE: 0.007072808792512276
Epoch 8, Loss: 7.534571311043757e-05
Epoch 8, Validation RMSE: 0.007771931964771099
Epoch 9, Loss: 7.172781484823755e-05
Epoch 9, Validation RMSE: 0.007667745