In [1]:
# import sys

# !git clone https://github.com/TimeBinFM/binconvfm.git
# %cd binconvfm
# !{sys.executable} -m pip install .

In [2]:
from binconvfm.utils.download.quick import get_file_names_per_dataset, get_target_dataset, dataset_to_window_tensors
from binconvfm.utils.download.gift_eval import list_arrow_files

## Base dataset preparation

In [3]:
dataset_name = "Salesforce/GiftEvalPretrain"
file_names_per_dataset = get_file_names_per_dataset(dataset_name)

files_per_ds = 4

def get_ds(ds_name):
    file_names = file_names_per_dataset[ds_name][:files_per_ds]
    return get_target_dataset(dataset_name, file_names).select_columns(['target'])

ds1 = get_ds('buildings_900k')
# ds2 = get_ds('borg_cluster_data_2011')

Generating train split: 0 examples [00:00, ? examples/s]

## Model preparation

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import pytorch_lightning as pl
from torch import nn

class LinearRegressionModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(32, 1)

    def forward(self, x):
        # x: [batch_size, 32, 1] => squeeze last dim to [batch_size, 32]
        x = x.squeeze(-1)
        return self.linear(x)  # Output shape: [batch_size, 1]

    def training_step(self, batch, batch_idx):
        x, y = batch
        x = x.float()
        y = y.float()
        y = y.squeeze(-1)  # shape: [batch_size, 1]
        y_hat = self(x)
        loss = F.mse_loss(y_hat, y)
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        # GPU-accelerated AdamW (fused)
        return torch.optim.AdamW(self.parameters(), lr=1e-3, fused=True)

## Preprocess data

In [5]:
import torch
from torch.utils.data import Dataset

class TensorDataset(Dataset):
    def __init__(self, tensor: torch.Tensor):
        self.tensor = tensor

    def __len__(self):
        return self.tensor.size(0)

    def __getitem__(self, idx):
        x = self.tensor[idx, :-1]
        y = self.tensor[idx, -1]
        return x, y

In [6]:
window_size = 32
prediction_depth = 1
step = 1

window_tensor = dataset_to_window_tensors(ds1, window_size, prediction_depth, step)

# def preprocess_data(item):
#     target = item['target']
#     target_shape = target.shape
    
#     target_tensor = torch.cat(list(target), dim=0) if len(target_shape) > 1 else target
#     windowed_tensor = target_tensor.unfold(dimension=0, size=window_size+prediction_depth, step=step)

#     return {
#         'target': windowed_tensor
#     }
    
# ds_preprocessed = ds1.map(preprocess_data, batched=False)

In [7]:
window_tensor_dataset = TensorDataset(window_tensor)

## Train a model

In [8]:
import torch
import pytorch_lightning as pl
from pytorch_lightning.profilers import PyTorchProfiler
from torch.utils.data import DataLoader

# ----------------------------
# Dataset and DataLoader
# ----------------------------
# window_tensor_dataset must be a torch.Tensor dataset or TensorDataset
dataloader = DataLoader(
    window_tensor_dataset,
    shuffle=False,
    batch_size=8192 * 4,       # large batch size to fully utilize GPU
    num_workers=32,        # CPU parallelism for data loading
    pin_memory=True,       # speeds up CPU->GPU transfer
    persistent_workers=True,  # keep workers alive between epochs
    prefetch_factor=4,     # prefetch batches per worker
)

# # ----------------------------
# # PyTorch Lightning Profiler
# # ----------------------------
# profiler = PyTorchProfiler(
#     schedule=torch.profiler.schedule(wait=50, warmup=50, active=5, repeat=1),
#     on_trace_ready=torch.profiler.tensorboard_trace_handler("/workspace"),
#     record_shapes=True,
#     profile_memory=True,
#     with_stack=True,
# )

# ----------------------------
# Trainer
# ----------------------------
trainer = pl.Trainer(
    max_epochs=1,
    accelerator="gpu",         # force GPU
    devices=1,                 # single GPU
    precision=16,              # automatic mixed precision for speed
    # profiler=profiler,
    gradient_clip_val=0.0,     # avoid gradient clipping overhead
)

# ----------------------------
# Model to GPU
# ----------------------------
model = LinearRegressionModel().to("cuda")

# ----------------------------
# Training
# ----------------------------
trainer.fit(model, dataloader)

/venv/main/lib/python3.12/site-packages/lightning_fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
Using 16bit Automatic Mixed Precision (AMP)
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/venv/main/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Pl

Training: |          | 0/? [00:00<?, ?it/s]

  loss = F.mse_loss(y_hat, y)
  loss = F.mse_loss(y_hat, y)
`Trainer.fit` stopped: `max_epochs=1` reached.


In [9]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import time

# ----------------------------
# Parameters
# ----------------------------
input_dim = 32      # adjust to match your dataset
output_dim = 1     # adjust to match your dataset
batch_size = 8192 * 4  # 32768
device = "cuda" if torch.cuda.is_available() else "cpu"

# ----------------------------
# DataLoader
# ----------------------------
dataloader = DataLoader(
    window_tensor_dataset,     # dataset defined outside this script
    shuffle=True,
    batch_size=batch_size,
    num_workers=32,
    pin_memory=True,
    persistent_workers=True,
    prefetch_factor=4,
)

# ----------------------------
# Model
# ----------------------------
class LinearRegressionModel(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.linear(x)

model = LinearRegressionModel(input_dim, output_dim).to(device)

# ----------------------------
# Optimizer & Loss
# ----------------------------
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, fused=True)
criterion = nn.MSELoss()
scaler = torch.cuda.amp.GradScaler()

# ----------------------------
# Training loop with average IPS logging every 1000 steps
# ----------------------------
model.train()
total_steps = 0
start_time = None
step_times = []

for epoch in range(1):  # max_epochs=1
    for step, (x, y) in enumerate(dataloader):
        if start_time is None:
            start_time = time.time()
        
        batch_start = time.time()

        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)  # ensure target shape matches output_dim

        optimizer.zero_grad()

        # forward + backward with mixed precision
        with torch.cuda.amp.autocast():
            y_hat = model(x)
            loss = criterion(y_hat, y)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # ----------------------------
        # IPS logging
        # ----------------------------
        batch_time = time.time() - batch_start
        step_times.append(batch_time)
        total_steps += 1

        if total_steps % 1000 == 0:
            avg_ips = 1000 / sum(step_times[-1000:])
            print(f"Step {total_steps}: Avg IPS over last 1000 steps = {avg_ips:.2f}")

# ----------------------------
# Final average IPS
# ----------------------------
total_time = time.time() - start_time
overall_avg_ips = total_steps / total_time
print(f"Training complete. Total steps: {total_steps}, Total time: {total_time}, Overall Avg IPS: {overall_avg_ips:.2f}")


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
  return F.mse_loss(input, target, reduction=self.reduction)


Step 1000: Avg IPS over last 1000 steps = 541.73
Step 2000: Avg IPS over last 1000 steps = 462.66
Step 3000: Avg IPS over last 1000 steps = 411.61
Step 4000: Avg IPS over last 1000 steps = 351.45
Step 5000: Avg IPS over last 1000 steps = 405.42
Step 6000: Avg IPS over last 1000 steps = 424.06
Step 7000: Avg IPS over last 1000 steps = 446.51
Step 8000: Avg IPS over last 1000 steps = 428.11
Step 9000: Avg IPS over last 1000 steps = 447.23
Step 10000: Avg IPS over last 1000 steps = 401.10
Step 11000: Avg IPS over last 1000 steps = 479.73
Step 12000: Avg IPS over last 1000 steps = 541.13
Step 13000: Avg IPS over last 1000 steps = 489.01
Step 14000: Avg IPS over last 1000 steps = 476.63
Step 15000: Avg IPS over last 1000 steps = 421.62
Training complete. Total steps: 15118, Total time: 582.0803904533386, Overall Avg IPS: 25.97


  return F.mse_loss(input, target, reduction=self.reduction)


In [10]:
582 / 60

9.7