In [13]:
import numpy as np
import pandas as pd
from tqdm.autonotebook import tqdm
import yfinance as yf
import lightning.pytorch as pl

In [14]:
# Daily S&P 500 index (^GSPC) history from 2001 onwards.
# `period` is intentionally not passed: yfinance treats `period` and
# `start`/`end` as alternatives, so combining `period="max"` with `start`
# was redundant and misleading about which one defines the date range.
sp500 = yf.download("^GSPC", start="2001-1-1")
sp500.head()

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2001-01-02,1320.280029,1320.280029,1276.050049,1283.27002,1283.27002,1129400000
2001-01-03,1283.27002,1347.76001,1274.619995,1347.560059,1347.560059,1880700000
2001-01-04,1347.560059,1350.23999,1329.140015,1333.339966,1333.339966,2131000000
2001-01-05,1333.339966,1334.77002,1294.949951,1298.349976,1298.349976,1430800000
2001-01-08,1298.349976,1298.349976,1276.290039,1295.859985,1295.859985,1115500000


In [15]:
# Summarise the frame: index range, column dtypes and non-null counts.
sp500.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5862 entries, 2001-01-02 to 2024-04-22
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       5862 non-null   float64
 1   High       5862 non-null   float64
 2   Low        5862 non-null   float64
 3   Close      5862 non-null   float64
 4   Adj Close  5862 non-null   float64
 5   Volume     5862 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 320.6 KB


In [16]:
from typing import Any
from lightning.pytorch.utilities.types import TRAIN_DATALOADERS
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pytorch_forecasting import TimeSeriesDataSet
import os


class UnivariateSeriesDataModule(pl.LightningDataModule):
    """Lightning data module that serves a single time series as lagged windows.

    The series is split chronologically (no shuffling) into train, validation
    and test partitions, the target is standardised with statistics fitted on
    the training partition only, and each partition is wrapped in a
    ``TimeSeriesDataSet`` with ``n_lags`` encoder steps and ``horizon``
    prediction steps.

    Args:
        data: Frame containing at least the column named by ``feature_name``.
            A copy is taken, so the caller's frame is never modified.
        n_lags: Number of past observations used as model input
            (``max_encoder_length``).
        horizon: Number of future observations to predict
            (``max_prediction_length``).
        test_size: Fraction of the series, taken from its end, held out for
            testing.
        batch_size: Batch size used by every dataloader.
        feature_name: Column used both as the input feature and as the source
            of the target.
        val_size: Fraction of the remaining (non-test) data, taken from its
            end, held out for validation.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        n_lags: int,
        horizon: int,
        test_size: float = 0.2,
        batch_size: int = 16,
        feature_name: str = "Close",
        val_size: float = 0.1,
    ) -> None:
        super().__init__()
        # Work on a private copy: preprocess_data() adds helper columns and
        # must not leak them into the frame the caller passed in.
        self.data = data.copy()
        self.n_lags = n_lags
        self.horizon = horizon
        self.test_size = test_size
        self.val_size = val_size
        self.batch_size = batch_size
        self.feature_name = feature_name
        # Datasets are created in setup(); declared here for type checkers.
        self.training: TimeSeriesDataSet
        self.validation: TimeSeriesDataSet
        self.test: TimeSeriesDataSet
        self.predict: TimeSeriesDataSet
        # Fitted on the training partition only (see setup()).
        self.target_scaler = StandardScaler()
        # Build the datasets eagerly so the dataloaders are usable right after
        # construction, without going through a Trainer.
        self.setup()

    def preprocess_data(self):
        """Add the ``target``, ``time_index`` and ``group_id`` helper columns.

        ``target`` is re-derived from the raw feature column on every call, so
        calling this (and therefore ``setup``) repeatedly never scales an
        already-scaled target.
        """
        self.data["target"] = self.data[self.feature_name]
        self.data["time_index"] = np.arange(len(self.data))
        # Single series, hence a single constant group.
        self.data["group_id"] = 0

    def split_data(self):
        """Split the time indices chronologically.

        Returns:
            Tuple ``(train_indices, val_indices, test_indices)`` of arrays of
            ``time_index`` values, in temporal order and non-overlapping.
        """
        time_indices = self.data["time_index"].values
        # shuffle=False keeps temporal order: the test set is the tail of the
        # series and the validation set is the tail of what remains.
        train_indices, test_indices = train_test_split(
            time_indices, test_size=self.test_size, shuffle=False
        )
        train_indices, val_indices = train_test_split(
            train_indices, test_size=self.val_size, shuffle=False
        )
        return train_indices, val_indices, test_indices

    def scale_target(self, df, indices):
        """Standardise ``df["target"]`` in place for the given index labels.

        Args:
            df: Frame owning a ``target`` column; it is modified in place.
            indices: Index labels of ``df`` whose target should be scaled.
        """
        scaled_values = self.target_scaler.transform(df.loc[indices, ["target"]])
        # The scaler returns a single-column 2-D result; flatten it so it
        # aligns unambiguously with the one column being assigned.
        df.loc[indices, "target"] = np.asarray(scaled_values).ravel()

    def setup(self, stage=None) -> None:
        """Create the train/validation/test/predict datasets.

        Args:
            stage: Accepted for compatibility with the Lightning hook
                signature; not used.
        """
        self.preprocess_data()
        train_indices, val_indices, test_indices = self.split_data()

        # Explicit copies: the partitions are modified by scale_target(), and
        # writing into a slice of self.data raises SettingWithCopyWarning.
        train_df = self.data.loc[self.data["time_index"].isin(train_indices)].copy()
        val_df = self.data.loc[self.data["time_index"].isin(val_indices)].copy()
        test_df = self.data.loc[self.data["time_index"].isin(test_indices)].copy()

        # Fit on training data only so no validation/test statistics leak in.
        self.target_scaler.fit(train_df[["target"]])
        self.scale_target(train_df, train_df.index)
        self.scale_target(val_df, val_df.index)
        self.scale_target(test_df, test_df.index)

        # Setup datasets
        self.training = TimeSeriesDataSet(
            train_df,
            time_idx="time_index",
            target="target",
            group_ids=["group_id"],
            max_encoder_length=self.n_lags,
            max_prediction_length=self.horizon,
            time_varying_unknown_reals=[self.feature_name],
            scalers={self.feature_name: StandardScaler()},
        )
        # from_dataset reuses the training dataset's configuration for the
        # other partitions.
        self.validation = TimeSeriesDataSet.from_dataset(
            self.training, val_df, stop_randomization=True
        )
        self.test = TimeSeriesDataSet.from_dataset(
            self.training, test_df, stop_randomization=True
        )
        # Prediction is served from the same held-out partition as testing.
        self.predict = TimeSeriesDataSet.from_dataset(
            self.training, test_df, stop_randomization=True
        )

    def _make_dataloader(self, dataset, train: bool) -> Any:
        """Build a non-shuffling dataloader for ``dataset``.

        Args:
            dataset: One of the ``TimeSeriesDataSet`` partitions.
            train: Forwarded to ``to_dataloader``. With ``train=True``
                pytorch-forecasting drops the last incomplete batch; with
                ``train=False`` every sample is served.
        """
        # os.cpu_count() may return None; fall back to in-process loading.
        num_workers = os.cpu_count() or 0
        return dataset.to_dataloader(
            train=train,
            batch_size=self.batch_size,
            num_workers=num_workers,
            shuffle=False,
            # persistent_workers is only valid when worker processes exist.
            persistent_workers=num_workers > 0,
        )

    def train_dataloader(self) -> Any:
        """Return the dataloader over the training partition (in time order)."""
        return self._make_dataloader(self.training, train=True)

    def val_dataloader(self) -> Any:
        """Return the dataloader over the validation partition.

        Uses ``train=False`` so the final partial batch is evaluated too.
        """
        return self._make_dataloader(self.validation, train=False)

    def test_dataloader(self) -> Any:
        """Return the dataloader over the test partition.

        Uses ``train=False`` so the final partial batch is evaluated too.
        """
        return self._make_dataloader(self.test, train=False)

    def predict_dataloader(self) -> Any:
        """Return the dataloader used for prediction (test partition).

        Uses ``train=False`` so a prediction is produced for every window.
        """
        return self._make_dataloader(self.predict, train=False)

In [17]:
# Windows of 7 past values predicting 1 step ahead; the final 20% of the
# series is held out for testing and batches contain 32 windows.
datamodule = UnivariateSeriesDataModule(
    data=sp500,
    n_lags=7,
    horizon=1,
    test_size=0.2,
    batch_size=32,
)

In [18]:
# Dataloader over the training partition, used below to inspect one batch.
train_dataloader = datamodule.train_dataloader()

In [19]:
# Fetch the first training batch: `x` is a dict of input tensors, `y` the targets.
x, y = next(iter(train_dataloader))
print("x =", x)
print("\ny =", y)
# List the shape of every tensor in the input dict.
print("\nsizes of x =")
for key in x:
    value = x[key]
    print(f"\t{key} = {value.size()}")

x = {'encoder_cat': tensor([], size=(32, 7, 0), dtype=torch.int64), 'encoder_cont': tensor([[[-0.3243],
         [-0.1724],
         [-0.2060],
         [-0.2886],
         [-0.2945],
         [-0.2829],
         [-0.2534]],

        [[-0.1724],
         [-0.2060],
         [-0.2886],
         [-0.2945],
         [-0.2829],
         [-0.2534],
         [-0.2214]],

        [[-0.2060],
         [-0.2886],
         [-0.2945],
         [-0.2829],
         [-0.2534],
         [-0.2214],
         [-0.2409]],

        [[-0.2886],
         [-0.2945],
         [-0.2829],
         [-0.2534],
         [-0.2214],
         [-0.2409],
         [-0.2218]],

        [[-0.2945],
         [-0.2829],
         [-0.2534],
         [-0.2214],
         [-0.2409],
         [-0.2218],
         [-0.2151]],

        [[-0.2829],
         [-0.2534],
         [-0.2214],
         [-0.2409],
         [-0.2218],
         [-0.2151],
         [-0.1714]],

        [[-0.2534],
         [-0.2214],
         [-0.2409],
    

In [20]:
# Dataloader over the held-out test partition.
test_dataloader = datamodule.test_dataloader()

In [21]:
# Fetch the first test batch: `x` is a dict of input tensors, `y` the targets.
x, y = next(iter(test_dataloader))
print("x =", x)
print("\ny =", y)
# List the shape of every tensor in the input dict.
print("\nsizes of x =")
for key in x:
    value = x[key]
    print(f"\t{key} = {value.size()}")

x = {'encoder_cat': tensor([], size=(32, 7, 0), dtype=torch.int64), 'encoder_cont': tensor([[[3.3710],
         [3.4449],
         [3.4231],
         [3.4675],
         [3.5540],
         [3.5585],
         [3.5108]],

        [[3.4449],
         [3.4231],
         [3.4675],
         [3.5540],
         [3.5585],
         [3.5108],
         [3.5852]],

        [[3.4231],
         [3.4675],
         [3.5540],
         [3.5585],
         [3.5108],
         [3.5852],
         [3.6755]],

        [[3.4675],
         [3.5540],
         [3.5585],
         [3.5108],
         [3.5852],
         [3.6755],
         [3.6819]],

        [[3.5540],
         [3.5585],
         [3.5108],
         [3.5852],
         [3.6755],
         [3.6819],
         [3.6813]],

        [[3.5585],
         [3.5108],
         [3.5852],
         [3.6755],
         [3.6819],
         [3.6813],
         [3.6836]],

        [[3.5108],
         [3.5852],
         [3.6755],
         [3.6819],
         [3.6813],
         [3

In [22]:
# Dataloader used for prediction; the data module builds it from the same
# held-out partition as the test dataloader.
predict_dataloader = datamodule.predict_dataloader()

In [23]:
# Fetch the first prediction batch: `x` is a dict of input tensors, `y` the targets.
x, y = next(iter(predict_dataloader))
print("x =", x)
print("\ny =", y)
# List the shape of every tensor in the input dict.
print("\nsizes of x =")
for key in x:
    value = x[key]
    print(f"\t{key} = {value.size()}")

x = {'encoder_cat': tensor([], size=(32, 7, 0), dtype=torch.int64), 'encoder_cont': tensor([[[3.3710],
         [3.4449],
         [3.4231],
         [3.4675],
         [3.5540],
         [3.5585],
         [3.5108]],

        [[3.4449],
         [3.4231],
         [3.4675],
         [3.5540],
         [3.5585],
         [3.5108],
         [3.5852]],

        [[3.4231],
         [3.4675],
         [3.5540],
         [3.5585],
         [3.5108],
         [3.5852],
         [3.6755]],

        [[3.4675],
         [3.5540],
         [3.5585],
         [3.5108],
         [3.5852],
         [3.6755],
         [3.6819]],

        [[3.5540],
         [3.5585],
         [3.5108],
         [3.5852],
         [3.6755],
         [3.6819],
         [3.6813]],

        [[3.5585],
         [3.5108],
         [3.5852],
         [3.6755],
         [3.6819],
         [3.6813],
         [3.6836]],

        [[3.5108],
         [3.5852],
         [3.6755],
         [3.6819],
         [3.6813],
         [3

In [24]:
# `y` comes from the prediction batch fetched in the previous cell; its first
# element prints as one row per window in the batch (presumably the scaled
# target values — confirm against the pytorch-forecasting batch format).
print(y[0])

tensor([[3.5852],
        [3.6755],
        [3.6819],
        [3.6813],
        [3.6836],
        [3.7345],
        [3.7549],
        [3.7497],
        [3.7274],
        [3.7457],
        [3.7482],
        [3.7483],
        [3.7135],
        [3.7128],
        [3.6533],
        [3.6965],
        [3.6794],
        [3.6420],
        [3.6773],
        [3.5911],
        [3.4667],
        [3.5211],
        [3.6189],
        [3.5876],
        [3.4796],
        [3.5418],
        [3.5861],
        [3.6620],
        [3.6523],
        [3.7220],
        [3.7079],
        [3.7274]])
