In [1]:
import pandas as pd
from pathlib import Path
import numpy as np, pandas as pd, torch, pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
import math
from typing import Tuple, List
import yaml

In [None]:
# 数据集

class LOBDataset(Dataset):
    """Sliding-window dataset over a limit-order-book table.

    Each sample pairs a window of ``seq_len`` consecutive feature rows with a
    3-class label. The label compares the mean mid price of the ``horizon``
    rows that start one row after the window's end index with the mean mid
    price of the window itself:

    * ``2`` - relative change greater than ``alpha``
    * ``0`` - relative change smaller than ``-alpha``
    * ``1`` - anything else

    Args:
        df: Table containing every column listed in
            ``data_cfg["feature_order"]`` as well as ``"ask1"`` and ``"bid1"``.
        data_cfg: Mapping with the keys ``"seq_len"``, ``"horizon"``,
            ``"alpha"`` and ``"feature_order"``.
    """

    def __init__(self, df: pd.DataFrame, data_cfg: dict):
        self.seq_len = data_cfg["seq_len"]
        self.horizon = data_cfg["horizon"]
        self.alpha = data_cfg["alpha"]
        self.X = df[data_cfg["feature_order"]].ffill().values.astype(np.float32)
        # Forward-fill the best quotes exactly like the feature columns.
        # Without this a single missing quote turns the mid price into NaN,
        # every comparison in __getitem__ becomes False and the sample is
        # silently labelled 1.
        quotes = df[["ask1", "bid1"]].ffill()
        self.mid = ((quotes["ask1"] + quotes["bid1"]) / 2).values.astype(np.float32)
        # Valid window end indices: a full window must fit before k and the
        # slice k+1 .. k+horizon must still lie inside the table.
        self.idxs = np.arange(self.seq_len, len(df) - self.horizon)

    def __len__(self):
        """Return the number of windows that can be drawn from the table."""
        return len(self.idxs)

    def __getitem__(self, i):
        """Return ``(features, label)`` for the ``i``-th window.

        Returns:
            A float32 tensor of shape ``(seq_len, n_features)`` and a scalar
            ``torch.long`` tensor holding the class index (0, 1 or 2).
        """
        k = self.idxs[i]
        start = k - self.seq_len
        x = self.X[start:k]
        past = self.mid[start:k].mean()
        # The future slice begins at k + 1, so row k itself belongs to neither
        # the input window nor the label window.
        future = self.mid[k + 1 : k + 1 + self.horizon].mean()
        pct = (future - past) / past
        if pct > self.alpha:
            y = 2
        elif pct < -self.alpha:
            y = 0
        else:
            y = 1
        return torch.from_numpy(x), torch.tensor(y, dtype=torch.long)


class LOBDataModule(pl.LightningDataModule):
    """Lightning data module that serves train/validation loaders for LOB data.

    Args:
        df: Order-book table handed to ``LOBDataset``.
        batch: Batch size used by both loaders.
        val_ratio: Fraction of the windows reserved for validation.
        data_cfg: Dataset configuration forwarded to ``LOBDataset``.
        num_workers: Number of DataLoader worker processes. Pass ``0`` to load
            in the main process, e.g. when the dataset class is defined in a
            notebook cell: workers started via ``multiprocessing.spawn``
            cannot unpickle classes that only exist in ``__main__``.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        batch: int,
        val_ratio: float,
        data_cfg: dict,
        num_workers: int = 4,
    ):
        super().__init__()
        self.df, self.batch, self.val_ratio = df, batch, val_ratio
        self.data_cfg = data_cfg
        self.num_workers = num_workers

    def setup(self, stage=None):
        """Build the dataset and split it chronologically into train/val.

        Consecutive windows overlap almost entirely, so a random split would
        place near-identical samples on both sides and inflate the validation
        score. The first windows are therefore used for training and the last
        ``val_ratio`` share for validation; the split is also deterministic.
        Windows right at the boundary still share some rows.
        """
        ds = LOBDataset(self.df, self.data_cfg)
        n_val = int(len(ds) * self.val_ratio)
        n_train = len(ds) - n_val
        self.train_set = torch.utils.data.Subset(ds, range(n_train))
        self.val_set = torch.utils.data.Subset(ds, range(n_train, len(ds)))

    def _make_loader(self, subset, shuffle: bool) -> DataLoader:
        """Create a DataLoader with the settings shared by train and val."""
        return DataLoader(
            subset,
            batch_size=self.batch,
            shuffle=shuffle,
            num_workers=self.num_workers,
            # DataLoader raises ValueError when persistent_workers is True
            # while num_workers is 0, so only enable it with real workers.
            persistent_workers=self.num_workers > 0,
        )

    def train_dataloader(self):
        """Return the shuffled training loader."""
        return self._make_loader(self.train_set, shuffle=True)

    def val_dataloader(self):
        """Return the validation loader (original order, no shuffling)."""
        return self._make_loader(self.val_set, shuffle=False)

In [3]:
# 模型类
import torch.nn as nn


class DeepLOBLightning(pl.LightningModule):
    """DeepLOB-style classifier: conv stack -> inception block -> LSTM -> linear.

    Args:
        input_width: Width of the input feature axis. ``conv3`` uses a kernel
            of width ``input_width // 4``, which collapses the width to 1 only
            if ``conv1`` and ``conv2`` each halve it (presumably
            ``kernel_size == stride == (1, 2)`` - confirm against the config).
        input_size: Feature size of the LSTM input. The inception block
            concatenates 5 branches of 8 channels, so this has to be 40.
        in_channels: Input channels of ``conv1``. ``forward`` adds exactly one
            channel axis, so this is expected to be 1.
        out_channels: Channels produced by ``conv1``/``conv2``/``conv3`` and
            consumed by every inception branch.
        kernel_size: Kernel size of ``conv1`` and ``conv2``.
        stride: Stride of ``conv1`` and ``conv2``.
        lr: Learning rate passed to AdamW.
        neg_slope: Negative slope of the LeakyReLU used throughout the model.
        hidden_size: Hidden size of the LSTM and input size of the final layer.
    """

    def __init__(
        self,
        input_width,
        input_size,
        in_channels,
        out_channels,
        kernel_size,
        stride,
        lr,
        neg_slope,
        hidden_size,
    ):
        super().__init__()
        self.save_hyperparameters()
        self.conv1 = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
        )
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size, stride=stride)
        # Collapses whatever width is left after conv1/conv2 (see class docstring).
        self.conv3 = nn.Conv2d(
            out_channels, out_channels, kernel_size=(1, input_width // 4)
        )
        # Honour the configured slope; it used to be hard-coded to 0.01.
        act = nn.LeakyReLU(neg_slope)

        # Inception block: every branch maps out_channels -> 8 channels.
        branch_channels = 8

        def branch(k):
            """1x1 conv followed by a (k x 1) conv along the time axis."""
            return nn.Sequential(
                nn.Conv2d(out_channels, branch_channels, (1, 1)),
                nn.Conv2d(branch_channels, branch_channels, (k, 1), padding="same"),
                act,
            )

        self.branch1 = nn.Conv2d(out_channels, branch_channels, (1, 1))
        self.branch3, self.branch10, self.branch20 = branch(3), branch(10), branch(20)
        self.branchP = nn.Sequential(
            nn.MaxPool2d((3, 1), 1, (1, 0)),
            nn.Conv2d(out_channels, branch_channels, (1, 1)),
        )

        self.lstm = nn.LSTM(
            input_size=input_size, hidden_size=hidden_size, batch_first=True
        )
        # Sized from hidden_size so the head always matches the LSTM output.
        self.fc = nn.Linear(hidden_size, 3)
        self.act = act
        self.loss = nn.CrossEntropyLoss()

    def forward(self, x):
        """Return class logits of shape ``(B, 3)``.

        ``x`` is expected as ``(B, T, W)`` (the original notes use 100 x 40).
        """
        x = x.unsqueeze(1)  # (B, 1, T, W)
        x = self.act(self.conv1(x))
        x = self.act(self.conv2(x))
        x = self.act(self.conv3(x))  # (B, out_channels, T, 1) when the width collapses

        b1 = self.act(self.branch1(x))
        b3 = self.branch3(x)
        b10 = self.branch10(x)
        b20 = self.branch20(x)
        bp = self.act(self.branchP(x))
        x = torch.cat([b1, b3, b10, b20, bp], dim=1)  # (B, 40, T, 1)
        x = x.squeeze(-1).permute(0, 2, 1)  # (B, T, 40)

        lstm_out, _ = self.lstm(x)  # (B, T, hidden_size) because batch_first=True
        logits = self.fc(lstm_out[:, -1, :])  # use the last time step only
        return logits

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one training batch."""
        X, y = batch
        logits = self(X)
        loss = self.loss(logits, y)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the accuracy of one validation batch."""
        X, y = batch
        logits = self(X)
        preds = logits.argmax(dim=1)
        acc = (preds == y).float().mean()
        self.log("val_acc", acc, on_step=False, on_epoch=True, prog_bar=True)
        # Returned in case per-batch accuracies are wanted in an epoch-end hook.
        return acc

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters using ``hparams.lr``."""
        return torch.optim.AdamW(self.parameters(), lr=self.hparams.lr)
    


In [4]:
import yaml
import re
from pathlib import Path

# Read the YAML configuration in one go and parse it with the safe loader.
cfg = yaml.safe_load(Path("config/config.yaml").read_text())

_sci_re = re.compile(r"""^[+-]?            # 可选正负号
                        (?:\d+\.\d*|\d*\.\d+|\d+)  # 整数或小数
                        [eE][+-]?\d+$    # e/E + 指数
                     """, re.VERBOSE)

def _cast_sci(obj):
    """
    递归把符合科学计数法格式的 str 转为 float
    """
    if isinstance(obj, dict):
        return {k: _cast_sci(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [_cast_sci(v) for v in obj]
    if isinstance(obj, str) and _sci_re.match(obj):
        return float(obj)
    return obj

# Recursively convert scientific-notation strings in the loaded config to floats.
cfg = _cast_sci(cfg)


In [5]:
# Load the order-book table from the configured directory and file name.
data = pd.read_parquet(Path(cfg["data"]["path"], cfg["data"]["parquet_file"]))

# Data module; num_workers is left at the LOBDataModule default.
dm = LOBDataModule(
    data,
    cfg["datamodule"]["batch_size"],
    cfg["datamodule"]["val_ratio"],
    cfg["data"],
)
dm.setup()


# Scalar hyper-parameters are forwarded as-is; kernel_size and stride arrive
# as YAML lists and are converted to tuples.
model = DeepLOBLightning(
    **{
        key: cfg["model"][key]
        for key in (
            "input_width",
            "input_size",
            "in_channels",
            "out_channels",
            "lr",
            "neg_slope",
            "hidden_size",
        )
    },
    kernel_size=tuple(cfg["model"]["kernel_size"]),
    stride=tuple(cfg["model"]["stride"]),
)

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

# Checkpointing: every option is taken verbatim from the "checkpoint" section.
checkpoint_callback = ModelCheckpoint(
    **{
        opt: cfg["checkpoint"][opt]
        for opt in ("dirpath", "filename", "monitor", "mode", "save_top_k")
    }
)

# TensorBoard logging: target directory and run name come from the config.
logger = TensorBoardLogger(
    **{opt: cfg["logger"]["tensorboard"][opt] for opt in ("save_dir", "name")}
)


In [6]:
# Sanity check: show the value and Python type of alpha after config post-processing.
print("alpha:", cfg["data"]["alpha"], type(cfg["data"]["alpha"]))

alpha: 0.0001 <class 'float'>


In [7]:
# Trainer options come from the "trainer" config section; the callback and
# logger objects are the ones built earlier in the notebook.
trainer = Trainer(
    callbacks=[checkpoint_callback],
    logger=logger,
    **{
        opt: cfg["trainer"][opt]
        for opt in ("accelerator", "devices", "max_epochs", "log_every_n_steps")
    },
)

# NOTE(review): the recorded run of this cell failed in a spawned DataLoader
# worker with "Can't get attribute 'LOBDataset' on <module '__main__'>".
trainer.fit(model, dm)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/sumen/Documents/workspace/25summer/.venv/lib/python3.13/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:658: Checkpoint directory /Users/sumen/Documents/workspace/25summer/DeepLOB/checkpoints exists and is not empty.

   | Name     | Type             | Params | Mode 
-------------------------------------------------------
0  | conv1    | Conv2d           | 48     | train
1  | conv2    | Conv2d           | 528    | train
2  | conv3    | Conv2d           | 1.3 K  | train
3  | branch1  | Conv2d           | 136    | train
4  | branch3  | Sequential       | 336    | train
5  | branch10 | Sequential       | 784    | train
6  | branch20 | Sequential       | 1.4 K  | train
7  | branchP  | Sequential       | 136    | train
8  | lstm     | LSTM             | 27.1 K | train
9  | fc       | Linear           | 195    | train
10 | act      | LeakyReLU        | 0    

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Traceback (most recent call last):
  File [35m"<string>"[0m, line [35m1[0m, in [35m<module>[0m
    from multiprocessing.spawn import spawn_main; [31mspawn_main[0m[1;31m(tracker_fd=77, pipe_handle=98)[0m
                                                  [31m~~~~~~~~~~[0m[1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[0m
  File [35m"/opt/homebrew/Cellar/python@3.13/3.13.0_1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/spawn.py"[0m, line [35m122[0m, in [35mspawn_main[0m
    exitcode = _main(fd, parent_sentinel)
  File [35m"/opt/homebrew/Cellar/python@3.13/3.13.0_1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/spawn.py"[0m, line [35m132[0m, in [35m_main[0m
    self = reduction.pickle.load(from_parent)
[1;35mAttributeError[0m: [35mCan't get attribute 'LOBDataset' on <module '__main__' (<class '_frozen_importlib.BuiltinImporter'>)>[0m

Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined