In [1]:


from pathlib import Path

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.parquet as pq

from src.config.app_config import AppConfig
from src.observability.instrumentation import Instrumentation
from src.pipeline.context import PipelineContext
from src.pipeline.step import BasePipelineStep
from src.utils.filesystem import FileSystem
from src.utils.logger import logs
from src.utils.path import PathManager

In [2]:
# Session-wide objects shared by the cells below.
cfg = AppConfig.load()
pm = PathManager()
inst = Instrumentation()
d = '2015-01-01'

# Resolve the per-date directory of every pipeline stage (evaluated left to right).
raw_dir, parquet_dir, symbol_dir, normalize_dir, meta_dir = (
    pm.raw_dir(d),
    pm.parquet_dir(d),
    pm.fact_dir(d),
    pm.canonical_dir(d),
    pm.meta_dir(d),
)

# Make sure each stage directory exists before any step runs.
for stage_dir in (raw_dir, parquet_dir, symbol_dir, normalize_dir, meta_dir):
    FileSystem.ensure_dir(stage_dir)

# Context handed to every step's run().
ctx = PipelineContext(
    date=d,
    raw_dir=raw_dir,
    parquet_dir=parquet_dir,
    fact_dir=symbol_dir,
    canonical_dir=normalize_dir,
    meta_dir=meta_dir,
)


In [3]:
class CsvConvertStep(BasePipelineStep):
    """
    Pipeline step that converts every ``*.7z`` archive in ``ctx.raw_dir`` into
    parquet files under ``ctx.parquet_dir``.

    An archive is skipped when all of its expected output files already exist.
    The conversion itself is delegated to ``engine.convert(zfile, out_files)``.
    """

    def __init__(self, engine, inst=None):
        super().__init__(inst)
        self.engine = engine

    def run(self, ctx: PipelineContext):
        """Convert each raw archive of ``ctx`` that does not have all outputs yet."""
        input_dir = ctx.raw_dir
        out_dir = ctx.parquet_dir

        for zfile in input_dir.glob("*.7z"):
            out_files = self._build_out_files(zfile, out_dir)
            if self._all_exist(out_files):
                print(f"[CsvConvertStep] skip {zfile.name}")
                continue
            print(f"[CsvConvertStep]  {zfile.name} {out_files}")
            self.engine.convert(zfile, out_files)

    def _detect_type(self, filename):
        """
        Derive the file type from the (case-insensitive) file-name prefix:
            SH_Stock_OrderTrade.csv.7z -> SH_MIXED
            SH_Order.csv.7z            -> SH_ORDER
            SH_Trade.csv.7z            -> SH_TRADE
            SZ_Order.csv.7z            -> SZ_ORDER
            SZ_Trade.csv.7z            -> SZ_TRADE

        Raises:
            RuntimeError: if the name matches none of the known prefixes.
        """
        lower = filename.lower()

        if lower.startswith("sh_stock_ordertrade"):
            return "SH_MIXED"

        if lower.startswith("sh_order"):
            return "SH_ORDER"
        if lower.startswith("sh_trade"):
            return "SH_TRADE"

        if lower.startswith("sz_order"):
            return "SZ_ORDER"
        if lower.startswith("sz_trade"):
            return "SZ_TRADE"

        raise RuntimeError(f"无法识别文件类型: {filename}")

    def _build_out_files(self, zfile: Path, parquet_dir: Path) -> dict[str, Path]:
        """
        Map an archive to its output parquet files, keyed by a lower-case name.

        The mixed Shanghai archive fans out into ``sh_order`` and ``sh_trade``;
        every other archive maps to a single file named after the archive.
        """
        file_type = self._detect_type(zfile.stem)
        if file_type == "SH_MIXED":
            return {
                "sh_order": parquet_dir / "sh_order.parquet",
                "sh_trade": parquet_dir / "sh_trade.parquet",
            }

        # Lower-case first so the ".csv" suffix is stripped regardless of case,
        # matching the case-insensitive prefix detection above.
        stem = zfile.stem.lower().removesuffix(".csv")
        return {
            stem: parquet_dir / f"{stem}.parquet"
        }

    @staticmethod
    def _all_exist(out_files: dict[str, Path]) -> bool:
        """Return True when every output path exists (True for an empty mapping)."""
        return all(p.exists() for p in out_files.values())


from pathlib import Path
from src.engines.extractor_engine import ExtractorEngine


class ParquetAppendWriter:
    """
    Keeps one open ``pq.ParquetWriter`` per output path so that record batches
    can be appended incrementally.

    The schema of a file is fixed by the first batch written to its path.
    Call ``close()`` when done; it closes every writer that was opened.
    """

    def __init__(self):
        self._schemas: dict[Path, pa.Schema] = {}
        self._writers: dict[Path, pq.ParquetWriter] = {}

    def write_batches(self, path: Path, batches: list[pa.RecordBatch]) -> None:
        """
        Append ``batches`` to the parquet file at ``path``.

        The writer (zstd compression) and the parent directory are created on
        the first call for a path. An empty ``batches`` list is a no-op.
        """
        if not batches:
            return

        writer = self._writers.get(path)
        if writer is None:
            # The schema comes from the first batch seen for this path.
            schema = batches[0].schema
            path.parent.mkdir(parents=True, exist_ok=True)
            writer = pq.ParquetWriter(
                path,
                schema,
                compression="zstd",
            )
            self._writers[path] = writer
            self._schemas[path] = schema

        table = pa.Table.from_batches(batches, schema=self._schemas[path])
        writer.write_table(table)

    def close(self) -> None:
        """
        Close every open writer and forget all cached writers and schemas.

        All writers are attempted even if one fails; the first error is
        re-raised after the others have been closed.
        """
        first_error = None
        try:
            for writer in self._writers.values():
                try:
                    writer.close()
                except Exception as exc:
                    if first_error is None:
                        first_error = exc
        finally:
            self._writers.clear()
            self._schemas.clear()

        if first_error is not None:
            raise first_error


import pyarrow.compute as pc


class ConvertEngine:
    """
    Streams record batches out of an archive and appends them to parquet files.

    With a single output file every batch is written as-is. With several output
    files each batch is split on the ``TickType`` column into order rows
    (A / D / M) and trade rows (T).
    """

    ORDER_TYPES = ["A", "D", "M"]
    TRADE_TYPE = "T"
    TICK_COL = "TickType"

    def __init__(self):
        self.extractor = ExtractorEngine
        self.order_set = pa.array(self.ORDER_TYPES)
        self.trade_value = pa.scalar(self.TRADE_TYPE)

    def convert(self, zfile: Path, out_files: dict[str, Path]) -> None:
        """
        Read ``zfile`` batch by batch and write the rows to ``out_files``.

        The parquet writers are closed even when reading or writing fails.
        """
        reader = self.extractor.open_reader(zfile)
        writer = ParquetAppendWriter()
        try:
            for batch in reader:
                batch = self.extractor.cast_strings(batch)

                if len(out_files) != 1:
                    # Split mode: route each non-empty part to its own file.
                    parts = self._split(batch, out_files)
                    for key, sub_batch in parts.items():
                        if sub_batch.num_rows:
                            writer.write_batches(out_files[key], [sub_batch])
                    continue

                # Pass-through mode: the single target receives the whole batch.
                (target,) = out_files.values()
                writer.write_batches(target, [batch])
        finally:
            writer.close()

    def _split(self, batch: pa.RecordBatch, out_files: dict[str, Path]) -> dict[str, pa.RecordBatch]:
        """
        Return ``{key: filtered batch}`` for the keys of ``out_files``.

        Keys containing "order" receive the order rows, keys containing "trade"
        (and not "order") receive the trade rows; other keys are omitted.

        Raises:
            ValueError: if the batch has no ``TickType`` column.
        """
        schema = batch.schema
        if self.TICK_COL not in schema.names:
            raise ValueError(f"missing column: {self.TICK_COL}")

        tick_arr = batch.column(schema.get_field_index(self.TICK_COL))

        order_mask = pc.is_in(tick_arr, self.order_set)
        trade_mask = pc.equal(tick_arr, self.trade_value)

        selected = {}
        for key in out_files:
            if "order" in key:
                mask = order_mask
            elif "trade" in key:
                mask = trade_mask
            else:
                continue
            selected[key] = batch.filter(mask)

        return selected


In [4]:


from src import DateTimeUtils
from functools import reduce


class NormalizeEngine:
    """
    Converts one exchange-level parquet file into a canonical event parquet.

    - Input: a parquet file named ``<exchange>_<kind>.parquet``.
    - Output: a file with the same name in ``output_dir`` holding the rows
      produced by ``parse_events_arrow``.
    - The symbol stays a plain column; no per-symbol splitting happens here.
    """

    VALID_EVENTS = {"ADD", "CANCEL", "TRADE"}
    # Rows per batch read from the input file (ten million).
    batch_size = 10_000_000

    def execute(self, input_file: Path, output_dir: Path) -> None:
        """
        Normalize ``input_file`` batch by batch into ``output_dir``.

        Exchange and kind are taken from the file stem (split on the first
        underscore). No output file is created when every batch is empty after
        filtering/parsing. If processing fails part-way, the writer is closed
        and the incomplete output file is removed so that it cannot be mistaken
        for a finished result.
        """
        exchange, kind = input_file.stem.split("_", 1)
        out_path = output_dir / input_file.name

        pf = pq.ParquetFile(input_file)
        writer = None
        completed = False

        try:
            for batch in pf.iter_batches(self.batch_size):
                table = pa.Table.from_batches([batch])
                table = self.filter_a_share_arrow(table)
                if table.num_rows == 0:
                    continue

                table = parse_events_arrow(
                    table,
                    exchange=exchange,
                    kind=kind,
                )

                if table.num_rows == 0:
                    continue
                if writer is None:
                    # Created lazily so the schema comes from real data.
                    writer = pq.ParquetWriter(out_path, table.schema)

                writer.write_table(table)
            completed = True
        finally:
            if writer is not None:
                writer.close()
                if not completed:
                    out_path.unlink(missing_ok=True)

    def filter_a_share_arrow(self, table: pa.Table) -> pa.Table:
        """
        Keep only rows whose ``SecurityID`` (as string) starts with one of the
        accepted prefixes.
        """
        symbol = pc.cast(table["SecurityID"], pa.string())

        # NOTE(review): "300" does not match codes such as 301xxx / 302xxx, which
        # appear in unfiltered data later in this notebook — confirm whether
        # they should be accepted as well.
        prefixes = [
            "60", "688",
            "00", "300",
        ]

        masks = [pc.starts_with(symbol, p) for p in prefixes]

        # A row is kept when any prefix matches.
        mask = reduce(pc.or_, masks)

        return table.filter(mask)


class NormalizeStep(BasePipelineStep):
    """
    Pipeline step that runs ``NormalizeEngine`` on every parquet file in
    ``ctx.parquet_dir`` and writes the results to ``ctx.canonical_dir``.
    """

    def __init__(self, engine: NormalizeEngine, inst=None):
        super().__init__(inst)
        self.engine = engine

    def run(self, ctx: PipelineContext) -> PipelineContext:
        """
        Normalize each input file that has no output yet.

        Returns:
            The same ``ctx`` that was passed in.
        """
        input_dir: Path = ctx.parquet_dir
        output_dir: Path = ctx.canonical_dir

        for file in list(input_dir.glob("*.parquet")):
            # The engine writes to output_dir / <input file name>, so the
            # existence check must use the full name including the extension.
            output_file = output_dir / file.name

            if output_file.exists():
                logs.info(f"[NormalizeStep] skip {file.name}")
                continue
            # glob() already yields paths that include input_dir.
            self.engine.execute(
                input_file=file,
                output_dir=output_dir,
            )

        return ctx


from datetime import datetime

# =============================================================================
# Internal Event Schema（唯一真相）
# =============================================================================
# from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, Optional
from typing import Literal

import pyarrow as pa
import pyarrow.compute as pc

# The two kinds of input file understood by the parser.
EventKind = Literal["order", "trade"]

# Column layout of the canonical event table; parse_events_arrow casts its
# result to this schema. "ts" is filled with base_us + offset_us there.
INTERNAL_SCHEMA = pa.schema(
    [('symbol', pa.string()),
     ("ts", pa.int64()),
     ("event", pa.string()),
     ("order_id", pa.int64()),
     ("side", pa.string()),
     ("price", pa.float64()),
     ("volume", pa.int64()),
     ("buy_no", pa.int64()),
     ("sell_no", pa.int64()),
     ]
)


@dataclass(frozen=True)
class ExchangeDefinition:
    """
    Names of the source columns (and value mappings) that parse_events_arrow
    uses to build the canonical event table for one exchange / kind pair.
    """
    symbol_field: str  # source column emitted as "symbol" (cast to string)
    time_field: str  # source column passed to tick_to_offset_us
    event_field: str  # source column translated through event_mapping
    event_mapping: Dict  # raw event code -> canonical event name
    price_field: str  # source column emitted as "price" (cast to float64)
    volume_field: str  # source column emitted as "volume" (cast to int64)
    side_field: Optional[str]  # source column for the side; None -> all-null side
    side_mapping: Optional[Dict]  # raw side code -> "B" / "S"; None -> all-null side
    id_field: str  # source column emitted as "order_id" (cast to int64)
    buy_no_field: Optional[str]  # source column for "buy_no"; None -> zeros
    sell_no_field: Optional[str]  # source column for "sell_no"; None -> zeros


# Lookup table: exchange code -> kind ("order" / "trade") -> ExchangeDefinition.
# parse_events_arrow indexes it as EXCHANGE_REGISTRY[exchange][kind].
EXCHANGE_REGISTRY = {
    # Shanghai
    'sh': {
        "order": ExchangeDefinition(
            symbol_field='SecurityID',
            time_field="TickTime",
            event_field="TickType",
            event_mapping={"A": "ADD", "D": "CANCEL"},
            price_field="Price",
            volume_field="Volume",
            side_field="Side",
            side_mapping={'1': "B", '2': "S"},
            id_field="SubSeq",
            buy_no_field=None,
            sell_no_field=None,
        ),
        "trade": ExchangeDefinition(
            symbol_field='SecurityID',
            time_field="TickTime",
            event_field="TickType",
            event_mapping={"T": "TRADE"},
            price_field="Price",
            volume_field="Volume",
            side_field="Side",
            side_mapping={'1': "B", '2': "S"},
            id_field="SubSeq",
            buy_no_field="BuyNo",
            sell_no_field="SellNo",
        ),
    },

    # Shenzhen
    'sz': {
        "order": ExchangeDefinition(
            symbol_field='SecurityID',
            time_field="OrderTime",
            event_field="OrderType",
            event_mapping={'0': "CANCEL", '1': "ADD", '2': "ADD", '3': "ADD"},
            price_field="Price",
            volume_field="Volume",
            side_field="Side",
            side_mapping={'1': "B", '2': "S"},
            id_field="SubSeq",
            buy_no_field=None,
            sell_no_field=None,
        ),

        "trade": ExchangeDefinition(
            symbol_field='SecurityID',
            time_field="TickTime",
            event_field="ExecType",
            event_mapping={'1': "TRADE", '2': "CANCEL"},
            price_field="TradePrice",
            volume_field="TradeVolume",
            side_field=None,
            side_mapping=None,
            id_field="SubSeq",
            buy_no_field="BuyNo",
            sell_no_field="SellNo",
        ),
    },
}


# MAPPING_kind = {
#     '1':'order',
#     '2':'trade',
# }

# =============================================================================
# 2. TickTime -> offset_us （执行层：Arrow vectorized）
# =============================================================================


def trade_time_to_base_us(trade_time) -> int:
    """
    Return the epoch timestamp, in microseconds, of midnight on the date of
    ``trade_time``.

    The date is obtained from ``DateTimeUtils.extract_date`` and midnight is
    built with ``DateTimeUtils.SH_TZ`` as its tzinfo.
    """
    day = DateTimeUtils.extract_date(trade_time)
    midnight = datetime(
        year=day.year,
        month=day.month,
        day=day.day,
        tzinfo=DateTimeUtils.SH_TZ,
    )
    epoch_seconds = midnight.timestamp()
    return int(epoch_seconds * 1_000_000)


def _mod(a: pa.Array, b: int) -> pa.Array:
    """
    Element-wise remainder of ``a`` divided by ``b`` built from basic kernels:
        a - int64(floor(a / b)) * b
    """
    quotient = pc.cast(pc.floor(pc.divide(a, b)), pa.int64())
    whole_part = pc.multiply(quotient, pa.scalar(b, pa.int64()))
    return pc.subtract(a, whole_part)


def tick_to_offset_us(col: pa.Array, subsec_digits: int = 2) -> pa.Array:
    """
    Convert packed clock values into microseconds since midnight.

    The value is read as ``HHMMSS`` followed by ``subsec_digits`` fractional
    digits. Hours, minutes, seconds and the fraction are all taken from
    non-overlapping digit groups.

    Args:
        col: values castable to int64.
        subsec_digits: number of trailing sub-second digits (0..6). The default
            of 2 keeps the hour / minute / second digit positions this function
            has always used.
            NOTE(review): confirm the real layout of the source time column; if
            it carries milliseconds (HHMMSSmmm), callers must pass 3.

    Returns:
        int64 array of microsecond offsets.

    Raises:
        ValueError: if ``subsec_digits`` is outside 0..6.
    """
    if not 0 <= subsec_digits <= 6:
        raise ValueError(f"subsec_digits must be in 0..6, got {subsec_digits}")

    t = pc.cast(col, pa.int64())
    scale = 10 ** subsec_digits

    def _drop_digits(divisor: int) -> pa.Array:
        # Integer part of t / divisor as int64.
        return pc.cast(pc.floor(pc.divide(t, divisor)), pa.int64())

    # HH: everything left of MMSS + fraction.
    hh = _drop_digits(10_000 * scale)
    # MM: two digits left of SS + fraction.
    mm = _mod(_drop_digits(100 * scale), 100)
    # SS: two digits left of the fraction.
    ss = _mod(_drop_digits(scale), 100)
    # Fraction: only the trailing sub-second digits, so it never re-counts seconds.
    frac = _mod(t, scale)

    return pc.add(
        pc.add(
            pc.add(
                pc.multiply(hh, pa.scalar(3_600_000_000, pa.int64())),
                pc.multiply(mm, pa.scalar(60_000_000, pa.int64())),
            ),
            pc.multiply(ss, pa.scalar(1_000_000, pa.int64())),
        ),
        pc.multiply(frac, pa.scalar(10 ** (6 - subsec_digits), pa.int64())),
    )


def map_dict(col: pa.Array, mapping: dict) -> pa.Array:
    """
    Translate each value of ``col`` through ``mapping``.

    Each value is looked up among the mapping keys and replaced by the value at
    the same position; the result is whatever ``pc.take`` yields for the
    indices returned by ``pc.index_in``.
    """
    source_values = pa.array(list(mapping))
    target_values = pa.array(list(mapping.values()))
    positions = pc.index_in(col, source_values)
    return pc.take(target_values, positions)


def zeros(n: int) -> pa.Array:
    """
    Return an int64 array of ``n`` zeros (empty when ``n`` is not positive).

    Built with ``pa.repeat`` so no n-element Python list is materialized.
    """
    return pa.repeat(pa.scalar(0, type=pa.int64()), max(n, 0))


def parse_events_arrow(
        table: pa.Table,
        kind: Literal["order", "trade"] = '',
        exchange: str = ''
) -> pa.Table:
    """
    Convert an exchange-level table into the canonical event layout.

    Args:
        table: rows of a single exchange and kind. Must contain a "TradeTime"
            column plus the columns named by the matching ExchangeDefinition.
        kind: "order" or "trade".
        exchange: key of EXCHANGE_REGISTRY (e.g. 'sh', 'sz').

    Returns:
        A table cast to INTERNAL_SCHEMA; an empty INTERNAL_SCHEMA table when
        the input has no rows.

    Raises:
        KeyError: if no definition is registered for ``exchange`` / ``kind``.
    """
    if table.num_rows == 0:
        # Keep the documented output schema even when there is nothing to parse.
        return INTERNAL_SCHEMA.empty_table()

    try:
        definition = EXCHANGE_REGISTRY[exchange][kind]
    except KeyError:
        raise KeyError(f"No registry for exchange={exchange}, kind={kind}") from None

    # ---------------------------------------------------------------------
    # ts = midnight of the trade date (taken from the first row) + intraday offset
    # ---------------------------------------------------------------------
    base_us = trade_time_to_base_us(table["TradeTime"][0].as_py())
    offset_us = tick_to_offset_us(table[definition.time_field])
    ts = pc.add(offset_us, pa.scalar(base_us, pa.int64()))

    # ---------------------------------------------------------------------
    # event
    # ---------------------------------------------------------------------
    event = map_dict(table[definition.event_field], definition.event_mapping)

    # ---------------------------------------------------------------------
    # side (all-null when the definition has no side column / mapping)
    # ---------------------------------------------------------------------
    if definition.side_field and definition.side_mapping:
        side = map_dict(table[definition.side_field], definition.side_mapping)
    else:
        side = pa.nulls(table.num_rows)

    # ---------------------------------------------------------------------
    # buy / sell no (zeros when the definition has no such column)
    # ---------------------------------------------------------------------
    buy_no = (
        table[definition.buy_no_field]
        if definition.buy_no_field
        else zeros(table.num_rows)
    )
    sell_no = (
        table[definition.sell_no_field]
        if definition.sell_no_field
        else zeros(table.num_rows)
    )

    out = pa.table(
        {"symbol": pc.cast(table[definition.symbol_field], pa.string()),
         "ts": ts,
         "event": event,
         "order_id": pc.cast(table[definition.id_field], pa.int64()),
         "side": side,
         "price": pc.cast(table[definition.price_field], pa.float64()),
         "volume": pc.cast(table[definition.volume_field], pa.int64()),
         "buy_no": pc.cast(buy_no, pa.int64()),
         "sell_no": pc.cast(sell_no, pa.int64()),
         }
    )

    return out.cast(INTERNAL_SCHEMA)




In [5]:
#!filepath: src/engines/symbol_split_engine.py
from __future__ import annotations

import pyarrow as pa
from typing import Iterable


class SymbolSplitEngine:
    """
    Pure-logic helper that cuts one symbol's rows out of a canonical table.

    Input:
        - a canonical Arrow table
        - a symbol (or several symbols)

    Output:
        - the parquet-encoded bytes of the rows for each symbol

    The engine performs no file IO, does not use Path and does not touch meta.
    """

    def __init__(self, symbol_field: str = "symbol"):
        self.symbol_field = symbol_field

    # --------------------------------------------------
    def split_one(
            self,
            table: pa.Table,
            symbol: str,
    ) -> bytes:
        """
        Return the rows of ``table`` whose symbol column equals ``symbol``,
        serialized as an in-memory parquet file.
        """
        # Imported explicitly: "import pyarrow" alone does not guarantee that
        # the compute / parquet submodules are loaded.
        import pyarrow.compute as pc
        import pyarrow.parquet as pq

        mask = pc.equal(table[self.symbol_field], symbol)
        sub = table.filter(mask)

        sink = pa.BufferOutputStream()
        pq.write_table(sub, sink)

        return sink.getvalue().to_pybytes()

    # --------------------------------------------------
    def split_many(
            self,
            table: pa.Table,
            symbols: Iterable[str],
    ) -> dict[str, bytes]:
        """
        Apply ``split_one`` to each symbol and return ``{symbol: parquet bytes}``.

        The table is filtered once per symbol.
        """
        result: dict[str, bytes] = {}

        for sym in symbols:
            result[sym] = self.split_one(table, sym)

        return result


In [12]:
#!filepath: src/steps/symbol_split_step.py
from __future__ import annotations

import pyarrow.parquet as pq

from src.pipeline.step import PipelineStep
from src.pipeline.meta import MetaRegistry


class SymbolSplitStep(PipelineStep):
    """
    Meta-aware step that splits each canonical parquet file into one parquet
    file per symbol.

    For every ``<exchange>_<kind>.parquet`` in ``ctx.canonical_dir`` the output
    of a symbol is written to ``ctx.fact_dir / <symbol> / <kind>.parquet``.

    Per input file:
    - No manifest yet, or the input changed: the symbols are discovered from
      the file's ``symbol`` column and all of them are split.
    - Otherwise only symbols whose recorded outputs fail validation are split;
      when none fail, the file is skipped without reading the canonical data.
    - Symbols that are absent from the canonical file are not detected.
    """

    def __init__(
            self,
            engine: SymbolSplitEngine,
            inst=None,
    ):
        self.engine = engine
        self.inst = inst

    # --------------------------------------------------
    def run(self, ctx):
        """Split every canonical file of ``ctx`` that needs (re)splitting."""
        input_dir: Path = ctx.canonical_dir
        output_dir: Path = ctx.fact_dir

        meta_dir: Path = ctx.meta_dir

        for file in list(input_dir.glob("*.parquet")):
            # One meta record per (step, input file).
            step_key = f"{self.__class__.__name__}:{file.stem}"

            meta = MetaRegistry(
                meta_file=meta_dir / step_key,
                step=file.stem,
                date=ctx.date,
                engine_version="v1",
                # Track the file being split, not its parent directory.
                # NOTE(review): confirm MetaRegistry expects a file path here.
                input_file=file,
            )
            manifest = meta.load()

            # ---------------------------------------------
            # 1. Decide which symbols need to be split
            # ---------------------------------------------
            if manifest is None or meta.is_input_changed():
                table = pq.read_table(file, columns=["symbol"])
                symbols = table["symbol"].unique().to_pylist()
            else:
                status = meta.validate_outputs()
                symbols = [k for k, ok in status.items() if not ok]

            if not symbols:
                continue

            # 2. Read the full canonical table
            table = pq.read_table(file)
            # 3. Split (pure logic)
            payloads = self.engine.split_many(table, symbols)

            # 4. Write the files and record them in the meta
            meta.begin_new()

            # "<exchange>_<kind>.parquet" -> "<kind>.parquet"; a name without an
            # underscore is used unchanged.
            out_name = file.name.split('_', 1)[-1]
            for sym, data in payloads.items():
                out_file = output_dir / sym / out_name
                FileSystem.safe_write(out_file, data)
                meta.record_output(sym, out_file)

            meta.commit()











In [13]:

# Run the three steps in order on the shared context:
# raw archives -> parquet, parquet -> canonical, canonical -> per-symbol files.
cs = CsvConvertStep(engine=ConvertEngine(), inst=inst)
cs.run(ctx)

ns = NormalizeStep(engine=NormalizeEngine(), inst=inst)
ns.run(ctx)

sp = SymbolSplitStep(engine=SymbolSplitEngine(), inst=inst)
sp.run(ctx)



[CsvConvertStep]  SZ_Order.csv.7z {'sz_order': PosixPath('/home/wsw/data/parquet/2015-01-01/sz_order.parquet')}
[CsvConvertStep]  SH_Stock_OrderTrade.csv.7z {'sh_order': PosixPath('/home/wsw/data/parquet/2015-01-01/sh_order.parquet'), 'sh_trade': PosixPath('/home/wsw/data/parquet/2015-01-01/sh_trade.parquet')}
[CsvConvertStep]  SZ_Trade.csv.7z {'sz_trade': PosixPath('/home/wsw/data/parquet/2015-01-01/sz_trade.parquet')}


FileNotFoundError: [Errno 2] Failed to open local file '/home/wsw/data/canonical/2015-01-01/sh_order.parquet'. Detail: [errno 2] No such file or directory

In [14]:
# Load the canonical sh_order table for inspection (hard-coded absolute path).
t = pq.read_table('/home/wsw/data/canonical/2015-01-01/sh_order.parquet')

In [None]:
t.shape

In [None]:
t.slice(0, 5)

In [None]:
t.take([0, 1, 2])

In [None]:
import json

In [None]:
# Load the JSON file stored in the meta directory for SymbolSplitStep / sh_order.
with open('/home/wsw/data/meta/2015-01-01/SymbolSplitStep:sh_order.parquet.json') as f:
    details = json.load(f)

In [None]:
details

In [None]:
# Load and display the JSON file stored in the meta directory for SymbolSplitStep / sh_trade.
with open('/home/wsw/data/meta/2015-01-01/SymbolSplitStep:sh_trade.parquet.json') as f:
    details2 = json.load(f)
details2

In [3]:
#!filepath: src/engines/orderbook_rebuild_engine.py
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Deque, Literal, Optional
from collections import defaultdict, deque

import pyarrow as pa
import pyarrow.parquet as pq

from src.pipeline.context import EngineContext
from src import logs


# ===============================
# Order / Book
# ===============================
@dataclass(slots=True)
class Order:
    """One live order as stored in OrderBook.orders."""
    order_id: int
    side: Literal["B", "S"]
    price: float
    volume: int  # remaining volume; OrderBook.trade decreases it
    ts: int  # must be an int (requirement carried over from the original note)


class OrderBook:
    """
    Minimal order book used to rebuild L2 depth from an event stream.

    State:
    - orders: order_id -> Order for every live order.
    - bids / asks: price -> deque of order ids in arrival order. Ids of
      cancelled or filled orders are not removed one by one; a deque is dropped
      only when its whole price level is removed.
    - bid_qty / ask_qty: price -> aggregated live volume of the level, so a
      snapshot never has to sum individual orders.
    - last_ts: timestamp of the most recently processed event.
    """

    def __init__(self) -> None:
        # order_id -> Order
        self.orders: Dict[int, Order] = {}

        # price -> FIFO order ids (may contain stale ids)
        self.bids: Dict[float, Deque[int]] = defaultdict(deque)
        self.asks: Dict[float, Deque[int]] = defaultdict(deque)

        # price -> aggregated volume on that level
        self.bid_qty: Dict[float, int] = defaultdict(int)
        self.ask_qty: Dict[float, int] = defaultdict(int)

        self.last_ts: Optional[int] = None

    # --------------------------------------------------
    def add_order(self, *, ts: int, order_id: int, side: Optional[str], price: Optional[float], volume: Optional[int]) -> None:
        """
        Register a new order.

        Events with an invalid side or a missing price / volume are ignored
        (without touching ``last_ts``). A repeated ``order_id`` is ignored too,
        but still advances ``last_ts``.
        """
        if side not in ("B", "S") or price is None or volume is None:
            return
        if order_id in self.orders:
            # The exchange may deliver the same order twice; ignore the repeat.
            self.last_ts = ts
            return

        o = Order(order_id=order_id, side=side, price=float(price), volume=int(volume), ts=int(ts))
        self.orders[order_id] = o

        if side == "B":
            self.bids[o.price].append(order_id)
            self.bid_qty[o.price] += o.volume
        else:
            self.asks[o.price].append(order_id)
            self.ask_qty[o.price] += o.volume

        self.last_ts = ts

    # --------------------------------------------------
    def cancel_order(self, *, ts: int, order_id: int) -> None:
        """
        Remove an order and subtract its remaining volume from its price level.

        Unknown order ids only advance ``last_ts``. A level whose aggregated
        volume drops to zero or below is removed entirely.
        """
        o = self.orders.pop(order_id, None)
        if o is None:
            self.last_ts = ts
            return

        if o.side == "B":
            self.bid_qty[o.price] -= o.volume
            if self.bid_qty[o.price] <= 0:
                self.bid_qty.pop(o.price, None)
                # The deque may still hold stale ids; it goes away with the level.
                self.bids.pop(o.price, None)
        else:
            self.ask_qty[o.price] -= o.volume
            if self.ask_qty[o.price] <= 0:
                self.ask_qty.pop(o.price, None)
                self.asks.pop(o.price, None)

        self.last_ts = ts

    # --------------------------------------------------
    def trade(self, *, ts: int, order_id: int, volume: Optional[int]) -> None:
        """
        Reduce the order identified by ``order_id`` by ``volume``; an order whose
        remaining volume reaches zero is removed.

        A missing or non-positive volume and unknown order ids only advance
        ``last_ts``. A trade larger than the order's remaining volume is capped
        at that remaining volume, so the level total is never reduced by more
        than the order contributed to it.
        """
        if volume is None:
            self.last_ts = ts
            return

        o = self.orders.get(order_id)
        if o is None:
            self.last_ts = ts
            return

        dv = int(volume)
        if dv <= 0:
            self.last_ts = ts
            return

        # Cap the fill: the excess of an over-sized trade does not belong to
        # other orders resting at the same price.
        dv = min(dv, o.volume)

        # Reduce the aggregated level volume.
        if o.side == "B":
            self.bid_qty[o.price] -= dv
        else:
            self.ask_qty[o.price] -= dv

        o.volume -= dv

        if o.volume <= 0:
            # cancel_order would subtract o.volume from the level a second time,
            # so fully filled orders use a dedicated cleanup path.
            self._remove_filled(ts=ts, o=o)
        else:
            # Defensive: never leave a negative level total behind.
            if o.side == "B" and self.bid_qty[o.price] < 0:
                self.bid_qty[o.price] = 0
            if o.side == "S" and self.ask_qty[o.price] < 0:
                self.ask_qty[o.price] = 0
            self.last_ts = ts

    def _remove_filled(self, *, ts: int, o: Order) -> None:
        """Drop a fully filled order; its volume was already taken off the level."""
        self.orders.pop(o.order_id, None)

        # Remove the whole level once its aggregated volume is zero or below.
        if o.side == "B":
            if self.bid_qty.get(o.price, 0) <= 0:
                self.bid_qty.pop(o.price, None)
                self.bids.pop(o.price, None)
        else:
            if self.ask_qty.get(o.price, 0) <= 0:
                self.ask_qty.pop(o.price, None)
                self.asks.pop(o.price, None)

        self.last_ts = ts

    # --------------------------------------------------
    def snapshot_table(self, depth: int = 10) -> pa.Table:
        """
        Return an L2 snapshot with the columns ts, side, level, price, volume.

        Bids are listed from the highest price down, asks from the lowest price
        up, at most ``depth`` levels per side, with levels numbered from 1.
        Volumes come straight from bid_qty / ask_qty; all price levels of a
        side are sorted to pick the top ``depth``. ``ts`` is ``last_ts`` (0
        when no event has been processed).
        """
        ts = self.last_ts if self.last_ts is not None else 0

        rows_ts: list[int] = []
        rows_side: list[str] = []
        rows_level: list[int] = []
        rows_price: list[float] = []
        rows_vol: list[int] = []

        # Bids: high to low.
        bid_prices = sorted(self.bid_qty.keys(), reverse=True)[:depth]
        for lvl, p in enumerate(bid_prices, start=1):
            q = int(self.bid_qty[p])
            rows_ts.append(ts)
            rows_side.append("B")
            rows_level.append(lvl)
            rows_price.append(float(p))
            rows_vol.append(q)

        # Asks: low to high.
        ask_prices = sorted(self.ask_qty.keys(), reverse=False)[:depth]
        for lvl, p in enumerate(ask_prices, start=1):
            q = int(self.ask_qty[p])
            rows_ts.append(ts)
            rows_side.append("S")
            rows_level.append(lvl)
            rows_price.append(float(p))
            rows_vol.append(q)

        schema = pa.schema(
            [
                ("ts", pa.int64()),
                ("side", pa.string()),
                ("level", pa.int16()),
                ("price", pa.float64()),
                ("volume", pa.int64()),
            ]
        )

        return pa.table(
            {
                "ts": pa.array(rows_ts, type=pa.int64()),
                "side": pa.array(rows_side, type=pa.string()),
                "level": pa.array(rows_level, type=pa.int16()),
                "price": pa.array(rows_price, type=pa.float64()),
                "volume": pa.array(rows_vol, type=pa.int64()),
            },
            schema=schema,
        )


# ===============================
# Engine
# ===============================
class OrderBookRebuildEngine:
    """
    Order book rebuild engine shared by the offline and realtime modes.

    - IO goes through Arrow / parquet only.
    - Every event, whatever its source, is applied through ``_apply``.

    NOTE(review): TRADE events reduce the order identified by ``order_id``;
    confirm that the ``order_id`` of canonical trade rows refers to the resting
    order rather than to the trade itself.
    """

    # Columns needed to replay events, in the order _apply_batch unpacks them.
    _COLUMNS = ("ts", "event", "order_id", "side", "price", "volume")

    def __init__(self) -> None:
        self.book: Optional[OrderBook] = None

    # ======================================================
    def execute(self, ctx: EngineContext) -> None:
        """
        Run one engine invocation.

        Offline mode replays ``ctx.input_path`` and writes a snapshot to
        ``ctx.output_path``. Any other mode applies the single ``ctx.event``
        and writes a snapshot only when ``ctx.emit_snapshot`` is set.
        The book is created on first use and kept between calls.
        """
        if self.book is None:
            self.book = OrderBook()

        if ctx.mode == "offline":
            assert ctx.input_path and ctx.output_path
            self._run_offline(ctx.input_path, ctx.output_path)
        else:
            assert ctx.event is not None
            ev = ctx.event
            self._apply(
                ts=int(ev.ts),
                event=ev.event,
                order_id=int(ev.order_id),
                side=ev.side,
                price=float(ev.price) if ev.price is not None else None,
                volume=int(ev.volume) if ev.volume is not None else None,
            )
            if ctx.emit_snapshot:
                assert ctx.output_path is not None
                self._emit_snapshot(ctx.output_path)

    # ======================================================
    def _run_offline(self, input_path: Path, output_path: Path) -> None:
        """Replay every event of ``input_path`` in file order, then write one snapshot."""
        pf = pq.ParquetFile(input_path)

        # Read only the columns needed for the rebuild.
        for batch in pf.iter_batches(columns=list(self._COLUMNS)):
            self._apply_batch(batch)

        self._emit_snapshot(output_path)

    def _apply_batch(self, batch) -> None:
        """
        Apply the rows of one record batch in order.

        Each column is converted to a Python list once per batch instead of
        boxing one Arrow scalar per cell; the values passed to ``_apply`` are
        the same.
        """
        schema = batch.schema
        columns = [
            batch.column(schema.get_field_index(name)).to_pylist()
            for name in self._COLUMNS
        ]

        # Events must be applied one at a time: each one changes the book state.
        for ts, event, order_id, side, price, volume in zip(*columns):
            self._apply(
                ts=int(ts),
                event=event,
                order_id=int(order_id),
                side=side,
                price=price,
                volume=volume,
            )

    # ======================================================
    def _apply(
        self,
        *,
        ts: int,
        event: str,
        order_id: int,
        side: Optional[str],
        price: Optional[float],
        volume: Optional[int],
    ) -> None:
        """
        Dispatch one event to the book.

        Raises:
            ValueError: if ``event`` is not ADD, CANCEL or TRADE.
        """
        assert self.book is not None

        if event == "ADD":
            self.book.add_order(ts=ts, order_id=order_id, side=side, price=price, volume=volume)
        elif event == "CANCEL":
            self.book.cancel_order(ts=ts, order_id=order_id)
        elif event == "TRADE":
            self.book.trade(ts=ts, order_id=order_id, volume=volume)
        else:
            raise ValueError(f"Unknown event={event}")

    # ======================================================
    def _emit_snapshot(self, out: Path) -> None:
        """Write a 10-level snapshot of the current book to ``out`` as parquet."""
        assert self.book is not None
        table = self.book.snapshot_table(depth=10)
        pq.write_table(table, out)


In [1]:
import pyarrow.compute as pc
import pyarrow.parquet as pq

table = pq.read_table('~/data/parquet/2015-01-01/sz_order.parquet')

In [4]:
sym = table["ExchangeID"]

In [6]:
ree = pc.run_end_encode(sym)

In [12]:
ree.chunk(0).run_ends

<pyarrow.lib.Int32Array object at 0x7fb6d812c460>
[
  131072
]

In [29]:
single_array = ree.combine_chunks()
len(single_array)

5000000

In [16]:
run_ends = single_array.run_ends

In [19]:
run_values = single_array.values

In [22]:
# Convert ONLY per-symbol info to Python
values_py = run_values.to_pylist()
run_ends_py = run_ends.to_pylist()

In [27]:
from typing import Dict, Tuple

# Turn the run-end encoding into {value: (start_row, run_length)}.
# NOTE: keys are run values, so a value that occurs in several runs keeps
# only its last run.
index: Dict[str, Tuple[int, int]] = {}
start = 0

for sym_val, end_exclusive in zip(values_py, run_ends_py):
    end_exclusive = int(end_exclusive)
    index[str(sym_val)] = (start, end_exclusive - start)
    start = end_exclusive


In [28]:
index

{'2': (4912985, 87015)}

In [30]:
from src.meta.symbol_accessor import SymbolAccessor
from src.meta.meta import BaseMeta

In [31]:
path = '~/data/meta/2015-01-01/sh_order.normalize.manifest.json'

In [3]:
from src.engines.parser_engine import parse_events_arrow
from typing import Dict, Tuple

import pyarrow.compute as pc
import pyarrow.parquet as pq

# Read the raw SZ order file, then run it through the event parser.
table = pq.read_table('~/data/parquet/2015-01-01/sz_order.parquet')
table = parse_events_arrow(table, exchange='sz', kind='order')

# Reorder rows by symbol, then by ts within each symbol, so the rows of one
# symbol sit next to each other.
sort_indices = pc.sort_indices(
    table,
    sort_keys=[("symbol", "ascending"), ("ts", "ascending")],
)
table = table.take(sort_indices)
sym = table["symbol"]

# Run-end encode the sorted symbol column and flatten the chunks.
ree = pc.run_end_encode(sym)
single_array = ree.combine_chunks()
run_ends = single_array.run_ends
run_values = single_array.values
# Convert only the run-level arrays (one entry per run) to Python lists.
values_py = run_values.to_pylist()
run_ends_py = run_ends.to_pylist()

# Build {symbol: (start_row, row_count)}.
#
# After combine_chunks() the same value can show up in several back-to-back
# runs (e.g. a symbol whose rows straddle a chunk boundary), so a plain
# `index[key] = ...` would keep only the last run. Adjacent runs of one value
# are merged instead. A value that reappears after a gap cannot be described
# by one (start, length) pair, so it is rejected rather than silently
# overwriting the earlier slice.
index: Dict[str, Tuple[int, int]] = {}
start = 0

for sym_val, end_exclusive in zip(values_py, run_ends_py):
    key = str(sym_val)
    end_exclusive = int(end_exclusive)
    if key in index:
        first, length = index[key]
        if first + length != start:
            raise ValueError(
                f"value {key!r} occurs in non-contiguous runs; "
                "sort the column before building the index"
            )
        # Extend the existing slice over the adjacent run.
        index[key] = (first, end_exclusive - first)
    else:
        index[key] = (start, end_exclusive - start)
    start = end_exclusive

In [63]:
# Show the run ends and the run values of the first chunk side by side.
ree.chunk(0).run_ends, ree.chunk(0).values

(<pyarrow.lib.Int32Array object at 0x7fb5e65c1000>
 [
   3936,
   9829,
   10109,
   12336,
   13080,
   14639,
   17108,
   17985,
   18699,
   19737,
   ...
   4989057,
   4989147,
   4994984,
   4995548,
   4996230,
   4997308,
   4997796,
   4998057,
   4998806,
   5000000
 ],
 <pyarrow.lib.StringArray object at 0x7fb5e65c0ee0>
 [
   "000001",
   "000002",
   "000004",
   "000006",
   "000007",
   "000008",
   "000009",
   "000010",
   "000011",
   "000012",
   ...
   "301633",
   "301636",
   "301638",
   "301656",
   "301658",
   "301662",
   "301665",
   "301668",
   "301678",
   "302132"
 ])

In [64]:
# NOTE(review): an earlier note here said combine_chunks() yields a StructArray,
# but the code below reads the result through .values / .run_ends, exactly as
# done with `single_array` earlier — presumably a run-end-encoded array. Confirm.
struct = ree.combine_chunks()
# StructArray-style accessors the author had marked as "correct" (not used here):
# values = struct.field("values")
# run_ends = struct.field("run_ends")
values = struct.values
run_ends = struct.run_ends

In [65]:
# Convert only the run-level arrays (one entry per run) to Python lists.
values_py = values.to_pylist()
run_ends_py = run_ends.to_pylist()

# Build {symbol: (start_row, row_count)}.
#
# After combine_chunks() the same value can show up in several back-to-back
# runs, so a plain `index[key] = ...` would keep only the last run. Adjacent
# runs of one value are merged instead. A value that reappears after a gap
# cannot be described by one (start, length) pair, so it is rejected rather
# than silently overwriting the earlier slice.
index: Dict[str, Tuple[int, int]] = {}
start = 0

for sym_val, end_exclusive in zip(values_py, run_ends_py):
    key = str(sym_val)
    end_exclusive = int(end_exclusive)
    if key in index:
        first, length = index[key]
        if first + length != start:
            raise ValueError(
                f"value {key!r} occurs in non-contiguous runs; "
                "sort the column before building the index"
            )
        # Extend the existing slice over the adjacent run.
        index[key] = (first, end_exclusive - first)
    else:
        index[key] = (start, end_exclusive - start)
    start = end_exclusive

In [4]:
# Inspect the per-symbol {symbol: (start_row, row_count)} mapping.
index

{'000001': (0, 3936),
 '000002': (3936, 5893),
 '000004': (9829, 280),
 '000006': (10109, 2227),
 '000007': (12336, 744),
 '000008': (13080, 1559),
 '000009': (14639, 2469),
 '000010': (17108, 877),
 '000011': (17985, 714),
 '000012': (18699, 1038),
 '000014': (19737, 1139),
 '000016': (20876, 1940),
 '000017': (22816, 3235),
 '000019': (26051, 919),
 '000020': (26970, 930),
 '000021': (27900, 6054),
 '000025': (33954, 1490),
 '000026': (35444, 423),
 '000027': (35867, 672),
 '000028': (36539, 333),
 '000029': (36872, 516),
 '000030': (37388, 768),
 '000031': (38156, 1176),
 '000032': (39332, 1497),
 '000034': (40829, 4388),
 '000035': (45217, 1068),
 '000036': (46285, 1107),
 '000037': (47392, 526),
 '000039': (47918, 2982),
 '000042': (50900, 456),
 '000045': (51356, 702),
 '000048': (52058, 962),
 '000049': (53020, 2785),
 '000050': (55805, 1686),
 '000055': (57491, 373),
 '000056': (57864, 1202),
 '000058': (59066, 807),
 '000059': (59873, 1182),
 '000060': (61055, 4212),
 '000061'

In [44]:
from src.engines.parser_engine import parse_events_arrow
# Read the raw SZ order file, then run it through the event parser.
table = pq.read_table('~/data/parquet/2015-01-01/sz_order.parquet')
table = parse_events_arrow(table, exchange='sz', kind='order')

# Reorder rows by symbol, then by ts within each symbol, so the rows of one
# symbol sit next to each other.
sort_indices = pc.sort_indices(
    table,
    sort_keys=[("symbol", "ascending"), ("ts", "ascending")],
)
table = table.take(sort_indices)

# Materialise the whole symbol column as a Python list (one entry per row).
symbol_col = table["symbol"]
symbols = symbol_col.to_pylist()

# Plain-Python scan that builds {symbol: (start_row, row_count)} by watching
# for the position where the symbol changes.
index: Dict[str, Tuple[int, int]] = {}

start = 0
# Guard the empty case: symbols[0] would raise IndexError on an empty table.
current = symbols[0] if symbols else None

for i in range(1, len(symbols)):
    if symbols[i] != current:
        # Close the slice of the symbol that just ended.
        index[current] = (start, i - start)
        current = symbols[i]
        start = i

# Close the slice of the last symbol (nothing to close when there are no rows).
if symbols:
    index[current] = (start, len(symbols) - start)

In [45]:
# Inspect the mapping produced by the plain-Python scan.
index

{'000001': (0, 3936),
 '000002': (3936, 5893),
 '000004': (9829, 280),
 '000006': (10109, 2227),
 '000007': (12336, 744),
 '000008': (13080, 1559),
 '000009': (14639, 2469),
 '000010': (17108, 877),
 '000011': (17985, 714),
 '000012': (18699, 1038),
 '000014': (19737, 1139),
 '000016': (20876, 1940),
 '000017': (22816, 3235),
 '000019': (26051, 919),
 '000020': (26970, 930),
 '000021': (27900, 6054),
 '000025': (33954, 1490),
 '000026': (35444, 423),
 '000027': (35867, 672),
 '000028': (36539, 333),
 '000029': (36872, 516),
 '000030': (37388, 768),
 '000031': (38156, 1176),
 '000032': (39332, 1497),
 '000034': (40829, 4388),
 '000035': (45217, 1068),
 '000036': (46285, 1107),
 '000037': (47392, 526),
 '000039': (47918, 2982),
 '000042': (50900, 456),
 '000045': (51356, 702),
 '000048': (52058, 962),
 '000049': (53020, 2785),
 '000050': (55805, 1686),
 '000055': (57491, 373),
 '000056': (57864, 1202),
 '000058': (59066, 807),
 '000059': (59873, 1182),
 '000060': (61055, 4212),
 '000061'

In [4]:
import pyarrow as pa
import pyarrow.parquet as pq

In [5]:
# Location of the sh_trade L0 feature file under the 2015-01-01 directory.
path = '~/data/feature_l0_dir/2015-01-01/sh_trade_l0.parquet'

In [6]:
# Load the feature file into an Arrow table.
table = pq.read_table(path)

In [11]:
# (rows, columns) of the loaded table.
table.shape

(238, 14)

In [14]:
# Convert the Arrow table to a pandas DataFrame for inspection.
p = table.to_pandas()

In [15]:
# Preview the first rows of the feature frame.
p.head()

Unnamed: 0,minute,open,high,low,close,volume,notional,trade_count,l0_amount,l0_avg_trade_size,l0_range,l0_abs_move,l0_log_volume,l0_log_trade_count
0,2025-12-01 01:25:00,113.4,113.4,113.4,113.4,15077,1709731.8,7,1709731.8,2153.857143,0.0,0.0,9.620992,2.079442
1,2025-12-01 01:30:00,113.38,113.55,113.2,113.22,20775,2356859.74,47,2352145.5,442.021277,0.35,0.16,9.941554,3.871201
2,2025-12-01 01:31:00,113.0,113.0,112.6,112.83,14734,1662245.56,36,1662437.22,409.277778,0.4,0.17,9.597981,3.610918
3,2025-12-01 01:32:00,112.88,113.07,112.83,112.9,15168,1712518.82,30,1712467.2,505.6,0.24,0.02,9.627009,3.433987
4,2025-12-01 01:33:00,113.02,113.05,112.68,112.68,9402,1061441.83,21,1059417.36,447.714286,0.37,0.34,9.148784,3.091042


In [17]:
# Dump the minute column as a NumPy array.
# NOTE(review): the printed minutes are dated 2025-12-01 although the file sits
# under a 2015-01-01 directory, and they start at 01:25 / 01:30 — possibly UTC
# rather than exchange-local time. Confirm the date and timezone handling upstream.
p['minute'].to_numpy()

array(['2025-12-01T01:25:00.000000', '2025-12-01T01:30:00.000000',
       '2025-12-01T01:31:00.000000', '2025-12-01T01:32:00.000000',
       '2025-12-01T01:33:00.000000', '2025-12-01T01:34:00.000000',
       '2025-12-01T01:35:00.000000', '2025-12-01T01:36:00.000000',
       '2025-12-01T01:37:00.000000', '2025-12-01T01:38:00.000000',
       '2025-12-01T01:39:00.000000', '2025-12-01T01:40:00.000000',
       '2025-12-01T01:41:00.000000', '2025-12-01T01:42:00.000000',
       '2025-12-01T01:43:00.000000', '2025-12-01T01:44:00.000000',
       '2025-12-01T01:45:00.000000', '2025-12-01T01:46:00.000000',
       '2025-12-01T01:47:00.000000', '2025-12-01T01:48:00.000000',
       '2025-12-01T01:49:00.000000', '2025-12-01T01:50:00.000000',
       '2025-12-01T01:51:00.000000', '2025-12-01T01:52:00.000000',
       '2025-12-01T01:53:00.000000', '2025-12-01T01:54:00.000000',
       '2025-12-01T01:55:00.000000', '2025-12-01T01:56:00.000000',
       '2025-12-01T01:57:00.000000', '2025-12-01T01:58:00.0000