# Create a dataset

In [1]:
import sys
import os
# Prepend the parent of the current working directory to sys.path;
# presumably so the local `binconvfm` package imported below resolves
# when the notebook is run from a subdirectory of the repo (TODO confirm).
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..")))

from binconvfm.utils.download.gift_eval import list_arrow_files, PostProcessingDataset
from torch.utils.data import Dataset, DataLoader

In [2]:
# Dataset identifier handed to list_arrow_files.
dataset_name = "Salesforce/GiftEvalPretrain"
# How many entries from the start of the file listing to load.
files_to_process = 16
# Number of DataLoader worker processes.
num_workers = 8

# Use the configured name rather than repeating the literal, so that editing
# `dataset_name` above actually changes which dataset is listed.
pretrain_file_names = list_arrow_files(dataset_name)
ds = PostProcessingDataset(file_names=pretrain_file_names[:files_to_process])

# batch_size=None turns off the DataLoader's automatic batching: whatever the
# dataset yields is passed through as-is.
dataloader = DataLoader(ds, shuffle=False, batch_size=None, num_workers=num_workers)

# Define a model

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import pytorch_lightning as pl
from torch import nn

class LinearRegressionModel(pl.LightningModule):
    """Single linear layer trained with mean-squared-error loss.

    Args:
        in_features: Number of input values per sample fed to the linear
            layer. Defaults to 32, the previously hard-coded value.
        out_features: Number of predicted values per sample. Defaults to 1.
        lr: Learning rate for the Adam optimizer. Defaults to 1e-3.
    """

    def __init__(self, in_features: int = 32, out_features: int = 1, lr: float = 1e-3):
        super().__init__()
        self.lr = lr
        self.linear = nn.Linear(in_features, out_features)

    def forward(self, x):
        """Apply the linear layer to ``x`` after dropping a trailing size-1 dim.

        Args:
            x: Input tensor; expected as [batch_size, in_features, 1].

        Returns:
            Tensor of shape [batch_size, out_features].
        """
        # squeeze(-1) only removes the last dim when it has size 1.
        x = x.squeeze(-1)
        return self.linear(x)

    def training_step(self, batch, batch_idx):
        """Compute and log the MSE loss for one ``(x, y)`` batch.

        Args:
            batch: Pair ``(x, y)`` of input and target tensors.
            batch_idx: Index of the batch (unused).

        Returns:
            The scalar MSE loss tensor.
        """
        x, y = batch
        # Cast to float32 so inputs match the dtype of the linear layer's weights.
        x = x.float()
        y = y.float()
        # Intended to leave y as [batch_size, 1] so it lines up with y_hat;
        # this assumes y arrives with a trailing size-1 dim — TODO confirm
        # against the dataset.
        y = y.squeeze(-1)
        y_hat = self(x)
        loss = F.mse_loss(y_hat, y)
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters using ``self.lr``."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

# Train a model

In [None]:
# Instantiate the model with its default settings.
model = LinearRegressionModel()

# One pass over the training dataloader; the accelerator is picked automatically.
trainer = pl.Trainer(
    accelerator="auto",
    max_epochs=1,
)
trainer.fit(model=model, train_dataloaders=dataloader)

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/aklyukvin/Projects/timeseries-foundational/binconvfm/.venv/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type   | Params | Mode 
---------------------

['BEIJING_SUBWAY_30MIN/data-00000-of-00001.arrow', 'HZMETRO/data-00000-of-00001.arrow']['LOS_LOOP/data-00000-of-00001.arrow', 'PEMS03/data-00000-of-00001.arrow']['PEMS04/data-00000-of-00001.arrow', 'PEMS07/data-00000-of-00001.arrow']['Q-TRAFFIC/data-00000-of-00003.arrow', 'Q-TRAFFIC/data-00001-of-00003.arrow']['PEMS08/data-00000-of-00001.arrow', 'PEMS_BAY/data-00000-of-00001.arrow']['alibaba_cluster_trace_2018/data-00000-of-00010.arrow', 'alibaba_cluster_trace_2018/data-00001-of-00010.arrow']['Q-TRAFFIC/data-00002-of-00003.arrow', 'SHMETRO/data-00000-of-00001.arrow']
['alibaba_cluster_trace_2018/data-00002-of-00010.arrow', 'alibaba_cluster_trace_2018/data-00003-of-00010.arrow']








data-00000-of-00001.arrow:   0%|          | 0.00/36.4M [00:00<?, ?B/s]

data-00002-of-00010.arrow:   0%|          | 0.00/304M [00:00<?, ?B/s]

Training: |                                                                                                   …

data-00000-of-00003.arrow:   0%|          | 0.00/353M [00:00<?, ?B/s]

data-00000-of-00010.arrow:   0%|          | 0.00/305M [00:00<?, ?B/s]

data-00000-of-00001.arrow:   0%|          | 0.00/62.6M [00:00<?, ?B/s]

data-00002-of-00003.arrow:   0%|          | 0.00/353M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

data-00000-of-00001.arrow:   0%|          | 0.00/67.8M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

data-00000-of-00001.arrow:   0%|          | 0.00/99.7M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

data-00003-of-00010.arrow:   0%|          | 0.00/305M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

data-00001-of-00010.arrow:   0%|          | 0.00/306M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

data-00001-of-00003.arrow:   0%|          | 0.00/353M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

data-00000-of-00001.arrow:   0%|          | 0.00/20.3M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

# Test that the model produces predictions

In [None]:
# Loader over the same dataset without worker processes; batch_size=None
# leaves batching to the dataset itself.
test_loader = DataLoader(ds, shuffle=False, batch_size=None)

model.eval()

for x, y in test_loader:
    with torch.no_grad():
        # Cast to float32 and place the input on the model's device so the
        # forward pass works wherever the model ended up after training
        # (a no-op when both are already on the same device).
        x = x.float().to(model.device)
        y_pred = model(x)
    print("Input:", x.shape)
    print("Target:", y.shape)
    # Show the prediction too; the original computed y_pred but never
    # displayed it.
    print("Prediction:", y_pred.shape)
    break  # Only test one batch