*Still in progress – I will continue refining these functions as I gain a deeper understanding of the real-world trading scenarios described in the research paper, particularly the implications of trade execution and strategy.*



In [None]:
!pip install ray

Collecting ray
  Downloading ray-2.47.1-cp311-cp311-manylinux2014_x86_64.whl.metadata (20 kB)
Downloading ray-2.47.1-cp311-cp311-manylinux2014_x86_64.whl (68.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.9/68.9 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ray
Successfully installed ray-2.47.1


In [None]:
!pip install "ray[rllib]" lz4


Collecting lz4
  Downloading lz4-4.4.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tensorboardX>=1.9 (from ray[rllib])
  Downloading tensorboardx-2.6.4-py3-none-any.whl.metadata (6.2 kB)
Collecting gymnasium==1.0.0 (from ray[rllib])
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting ormsgpack==1.7.0 (from ray[rllib])
  Downloading ormsgpack-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ormsgpack-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (220 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m220.6/220.6 kB[0m [31m16.5 MB/s[0m eta [36m

In [None]:
from __future__ import annotations
import time, math, random, datetime as dt
from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional

In [None]:
import ray
from ray import tune
from ray.rllib.models import ModelCatalog
from ray.rllib.algorithms.ppo import PPO
from ray.rllib.algorithms.impala import Impala

In [None]:
print(ray.__version__)

2.47.1


In [None]:
import gym
from gym import spaces

In [None]:
class SimpleLOBEnv(gym.Env):
    """Toy execution environment for working a single parent order.

    Each step advances the internal clock by one tick (the episode lasts
    ``episode_minutes * 60`` ticks). The agent submits one child order per
    step, described by a ``MultiDiscrete([3, 5, 4])`` action
    ``(skill, px_bucket, size_bucket)``. The child order either fills
    completely or not at all; fill probability and slippage depend only on
    ``skill``. The episode ends when the parent order is fully filled or the
    tick budget is exhausted, and the accumulated slippage P&L is paid out as
    a single terminal reward.

    NOTE(review): this class follows the classic ``gym`` API (``reset``
    returns only the observation, ``step`` returns a 4-tuple). The install
    log in this notebook shows ``ray[rllib]`` pulling in ``gymnasium`` —
    confirm which API the RLlib version in use expects.
    """

    # Probability that a child order fills in a single step, keyed by skill.
    # Presumably indexed in the same order as SKILL_NAMES — TODO confirm.
    FILL_PROB = {0: 0.3, 1: 0.8, 2: 0.5}
    # Slippage in basis points per filled share, keyed by skill.
    # A negative value increases P&L (see ``step``).
    SLIPPAGE_BP = {0: -0.5, 1: +1.5, 2: +0.6}

    def __init__(self, parent_qty: int, episode_minutes: int = 60):
        """Create the environment.

        Args:
            parent_qty: Total quantity of the parent order. For RLlib
                compatibility this may also be the ``env_config`` dict that
                RLlib passes as the sole constructor argument; it must then
                contain ``"parent_qty"`` and may contain ``"episode_minutes"``.
            episode_minutes: Episode length in minutes (60 steps per minute).

        Raises:
            KeyError: If a config dict is given without ``"parent_qty"``.
        """
        super().__init__()
        # RLlib builds environments as ``env_cls(env_config)``, so the first
        # positional argument can be the config mapping rather than an int.
        if isinstance(parent_qty, dict):
            env_config = parent_qty
            parent_qty = env_config["parent_qty"]
            episode_minutes = env_config.get("episode_minutes", episode_minutes)

        self.parent_qty = parent_qty
        self.ep_steps   = episode_minutes * 60
        self.observation_space = spaces.Box(low=-10, high=10, shape=(8,), dtype=float)
        # (skill, price bucket, size bucket)
        self.action_space = spaces.MultiDiscrete([3, 5, 4])

        self.reset()

    def reset(self):
        """Start a new episode and return the initial observation."""
        self.t          = 0
        self.qty_left   = self.parent_qty
        self.arrival_px = 100.0
        self.pnl        = 0.0
        return self._obs()

    def step(self, action):
        """Submit one child order and advance the clock by one step.

        Args:
            action: Sequence ``(skill, px_bucket, size_bucket)``.
                ``px_bucket`` is currently not used by the simulation.

        Returns:
            Tuple ``(observation, reward, done, info)``. ``reward`` is the
            accumulated P&L on the terminal step and ``0.0`` otherwise;
            ``info`` is always an empty dict.
        """
        skill, _px_bucket, size_bucket = action

        # Size buckets 0..3 map to clips of 25/50/75/100, capped by what is left.
        clip_size = max(1, int((size_bucket + 1) / 4 * 100))
        clip_size = min(clip_size, self.qty_left)

        fill_prob   = self.FILL_PROB[skill]
        slippage_bp = self.SLIPPAGE_BP[skill]

        # All-or-nothing fill of the whole clip.
        if random.random() < fill_prob:
            shares_filled = clip_size
            self.qty_left -= shares_filled
            self.pnl += -(slippage_bp / 10000) * shares_filled

        self.t += 1
        done   = (self.t >= self.ep_steps) or (self.qty_left <= 0)
        reward = self.pnl if done else 0.0
        return self._obs(), reward, done, {}

    def _obs(self):
        """Build the 8-element observation list.

        NOTE(review): entries 0-3 and 7 are hard-coded constants, and several
        entries (3_000, 3_200, steps left, quantity left) can lie outside the
        declared ``Box(-10, 10)`` bounds — confirm the intended scaling.
        """
        seconds_left = max(0, self.ep_steps - self.t)
        return [0, 3_000, 3_200, 0.0, seconds_left, self.qty_left, self.pnl, 0.2]

# Bell-shaped (Gaussian) volume curve used to build the cumulative VWAP schedule.

In [None]:
def vwap_curve(parent_qty: int, minutes: int = 60) -> List[int]:
    """Return a cumulative target schedule following a bell-shaped volume curve.

    The curve is a Gaussian bump centred at the middle of the horizon
    (mean 0.5, standard deviation 0.15 on a 0..1 time axis), sampled at
    ``minutes + 1`` evenly spaced points and accumulated.

    Args:
        parent_qty: Total quantity to schedule.
        minutes: Number of intervals in the horizon; must be positive.

    Returns:
        A list of ``minutes + 1`` integers where entry ``i`` is the cumulative
        quantity targeted by point ``i``. Values are truncated towards zero,
        and the last entry equals ``parent_qty``.

    Raises:
        ValueError: If ``minutes`` is not positive.
    """
    if minutes <= 0:
        raise ValueError("minutes must be a positive integer")

    weights = [
        math.exp(-0.5 * ((i / minutes - 0.5) / 0.15) ** 2)
        for i in range(minutes + 1)
    ]

    # Single-pass running total instead of re-summing every prefix.
    cumulative = []
    running = 0.0
    for weight in weights:
        running += weight
        cumulative.append(running)

    total = cumulative[-1]
    return [int(parent_qty * (c / total)) for c in cumulative]

In [None]:
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork
ModelCatalog.register_custom_model("tiny_mlp", FullyConnectedNetwork)

In [None]:
# Names of the low-level execution skills; each gets its own policy.
SKILL_NAMES = ["passive", "sweep", "dark"]


def policy_mapping_fn(agent_id, *args, **kwargs):
    """Map an agent to the policy with the same id.

    RLlib invokes the mapping function with additional arguments besides the
    agent id (e.g. the episode and keyword arguments); they are accepted and
    ignored so the call does not fail with a ``TypeError``.

    Args:
        agent_id: Identifier of the agent requesting a policy.

    Returns:
        ``agent_id`` unchanged, used as the policy id.
    """
    return agent_id

# One policy spec per skill: (policy class, observation space, action space, config).
# A single probe environment is built once just to read the spaces, rather
# than constructing two fresh environments for every skill.
_space_probe = SimpleLOBEnv(1000)
skill_policies = {
    name: (None, _space_probe.observation_space, _space_probe.action_space, {})
    for name in SKILL_NAMES
}
del _space_probe


In [None]:
class MetaPolicyEnv(gym.Env):
    """High-level environment that chooses which execution skill to run.

    Wraps a ``SimpleLOBEnv``. Each meta step picks one of three skills
    (``Discrete(3)``) and lets it act for ``STEPS_PER_DECISION`` inner steps
    with fixed price/size buckets, summing the inner rewards. The first
    observation entry reports how far cumulative fills are ahead of (positive)
    or behind (negative) the schedule from ``vwap_curve``.
    """

    # Number of inner-environment steps executed per meta decision.
    STEPS_PER_DECISION = 5

    def __init__(self, parent_qty: int):
        """Create the meta environment.

        Args:
            parent_qty: Total quantity of the parent order. For RLlib
                compatibility this may also be the ``env_config`` dict that
                RLlib passes as the sole constructor argument; it must then
                contain ``"parent_qty"``.
        """
        super().__init__()
        # RLlib builds environments as ``env_cls(env_config)``; unwrap the
        # mapping here so the inner env and the schedule both get a number.
        if isinstance(parent_qty, dict):
            parent_qty = parent_qty["parent_qty"]

        self.inner = SimpleLOBEnv(parent_qty)
        self.observation_space = spaces.Box(low=-1, high=1, shape=(3,), dtype=float)
        self.action_space      = spaces.Discrete(3)
        self.schedule          = vwap_curve(parent_qty)

    def reset(self):
        """Start a new episode and return the initial observation."""
        # The inner environment holds the real order state (quantity left,
        # clock, P&L); without resetting it every episode after the first
        # would start from the previous episode's final state.
        self.inner.reset()
        self.t = 0
        self.cum_filled = 0
        return self._obs()

    def _obs(self):
        """Return ``[ahead, 0.2, 0.05]``; the last two entries are constants."""
        # The schedule has one entry per minute; ``self.t`` counts inner steps.
        minute = min(self.t // 60, len(self.schedule) - 1)
        target = self.schedule[minute]
        ahead  = (self.cum_filled - target) / max(1, target)
        # Keep the value inside the declared Box(-1, 1) observation space.
        ahead  = max(-1.0, min(1.0, ahead))
        return [ahead, 0.2, 0.05]

    def step(self, action):
        """Pick a skill, let it act for 5 seconds, aggregate reward (CE later).

        Args:
            action: Index of the skill to run.

        Returns:
            Tuple ``(observation, total_reward, done, info)`` where
            ``total_reward`` sums the inner rewards of this meta step and
            ``info`` is always an empty dict.
        """
        skill_idx = int(action)
        total_reward = 0.0
        done = False
        for _ in range(self.STEPS_PER_DECISION):
            # Price and size buckets are fixed at 2 for every inner step.
            _, r, done, _ = self.inner.step([skill_idx, 2, 2])
            total_reward += r
            self.cum_filled = self.inner.parent_qty - self.inner.qty_left
            if done:
                break
        self.t += self.STEPS_PER_DECISION
        return self._obs(), total_reward, done, {}

In [None]:
def train_skills(num_iters: int = 50):
    """Train one IMPALA policy per execution skill on ``SimpleLOBEnv``.

    Args:
        num_iters: Number of ``algo.train()`` iterations to run.

    Returns:
        The trained algorithm instance, so the caller can evaluate,
        checkpoint or ``stop()`` it.

    NOTE(review): ``SimpleLOBEnv`` is a single-agent environment while the
    policies are keyed by skill name — confirm that the agent ids seen at
    rollout time match ``SKILL_NAMES``.
    """
    config = (
        Impala.get_default_config()
        # The settings below rely on legacy-stack features (a ModelCatalog
        # custom model passed via ``model=``), so select the old API stack
        # explicitly instead of depending on the version's default.
        .api_stack(
            enable_rl_module_and_learner=False,
            enable_env_runner_and_connector_v2=False,
        )
        .environment(SimpleLOBEnv, env_config={"parent_qty": 1000})
        .multi_agent(policies=skill_policies, policy_mapping_fn=policy_mapping_fn)
        .framework("torch")
        .training(model={"custom_model": "tiny_mlp"}, train_batch_size=2048)
        .env_runners(num_env_runners=4)
    )
    algo = config.build()

    for i in range(num_iters):
        res = algo.train()
        # Episode statistics are nested under "env_runners" in recent RLlib
        # results and live at the top level in older ones; the key name also
        # differs ("episode_return_mean" vs "episode_reward_mean").
        stats = res.get("env_runners", res)
        mean_reward = stats.get(
            "episode_return_mean",
            stats.get("episode_reward_mean", float("nan")),
        )
        print(f"Skill iteration {i:03d}: reward={mean_reward:.4f}")

    return algo


In [None]:
def train_meta(num_iters: int = 50, parent_qty: int = 1000):
    """Train the PPO meta-policy on ``MetaPolicyEnv``.

    Args:
        num_iters: Number of ``algo.train()`` iterations to run.
        parent_qty: Parent order quantity forwarded to the environment via
            ``env_config``.

    Returns:
        The trained algorithm instance, so the caller can evaluate,
        checkpoint or ``stop()`` it.
    """
    config = (
        PPO.get_default_config()
        # ``model=`` is a legacy-stack setting, so select the old API stack
        # explicitly instead of depending on the version's default.
        .api_stack(
            enable_rl_module_and_learner=False,
            enable_env_runner_and_connector_v2=False,
        )
        .environment(MetaPolicyEnv, env_config={"parent_qty": parent_qty})
        .framework("torch")
        # The model-config key for hidden layer sizes is "fcnet_hiddens".
        .training(model={"fcnet_hiddens": [64, 64]})
        .env_runners(num_env_runners=4)
    )
    algo = config.build()

    for i in range(num_iters):
        res = algo.train()
        # Episode statistics are nested under "env_runners" in recent RLlib
        # results and live at the top level in older ones; the key name also
        # differs ("episode_return_mean" vs "episode_reward_mean").
        stats = res.get("env_runners", res)
        mean_reward = stats.get(
            "episode_return_mean",
            stats.get("episode_reward_mean", float("nan")),
        )
        print(f"Meta iteration {i:03d}: reward={mean_reward:.4f}")

    return algo


## Child order → FIX routing

In [None]:
@dataclass
class ChildOrder:
    """A single child order handed to ``FIXRouter.send``."""
    side: str  # buy/sell
    qty: int   # order quantity; checked against FIXRouter.MAX_CLIP
    px: float  # order price, printed with two decimals
    tif: int   # time in force, printed with an "ms" suffix


class FIXRouter:
    """Guard-railed outlet for child orders.

    ``send`` currently only validates the order size and prints a one-line
    summary; no network connection is opened.
    """

    # Largest quantity a single child order may carry.
    MAX_CLIP = 500

    @staticmethod
    def send(order: ChildOrder):
        """Validate ``order`` and print it.

        Args:
            order: The child order to route.

        Raises:
            ValueError: If ``order.qty`` is not positive or exceeds
                ``MAX_CLIP``.
        """
        # A guard-rail that only caps the upper bound would let zero or
        # negative quantities through.
        if order.qty <= 0:
            raise ValueError("clip size must be positive – guard‑rail tripped")
        if order.qty > FIXRouter.MAX_CLIP:
            raise ValueError("clip too large – guard‑rail tripped")
        print(f"[FIX] {order.side.upper()} {order.qty}@{order.px:.2f} tif={order.tif}ms")

In [None]:
from ray.rllib.algorithms.impala import ImpalaConfig
from ray.rllib.algorithms.ppo import PPOConfig
