# Tiny Trading-R1 — Pairwise Ranking RL Variant

In [None]:

# Install deps
import sys, subprocess, pkgutil
def pip_install(pkg):
    """Install ``pkg`` with pip unless it is already available.

    Availability is checked by *distribution* name first (what pip installs,
    e.g. ``scikit-learn``) and then by importable module name, because the
    two can differ (``scikit-learn`` is imported as ``sklearn``). Comparing
    the pip name against module names alone would reinstall such packages on
    every run.

    Args:
        pkg: Name of the distribution to pass to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If the pip invocation fails.
    """
    import importlib.metadata
    import importlib.util

    try:
        importlib.metadata.distribution(pkg)
        return
    except importlib.metadata.PackageNotFoundError:
        pass

    # Fallback for modules that ship without distribution metadata
    # (e.g. the standard library): look for an importable module instead.
    try:
        if importlib.util.find_spec(pkg.replace("-", "_")) is not None:
            return
    except (ImportError, ValueError):
        pass

    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])
# Make sure every third-party dependency used below is present before importing it.
for dep_name in ("numpy", "pandas", "torch", "scikit-learn", "matplotlib"):
    pip_install(dep_name)

import numpy as np, pandas as pd, torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import random
# Fixed seed so the simulated market and the model initialisation are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Run on the GPU when one is visible to torch, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


In [None]:

def simulate_market(num_days=600, tickers=None):
    """Generate a synthetic multi-ticker price panel driven by latent factors.

    For every ticker four random-walk factors (quality, sentiment, momentum,
    insider) plus an autocorrelated noise term are drawn from ``np.random``
    and combined into a return series that is compounded into a price path
    starting near 100.

    Args:
        num_days: Number of business days to simulate per ticker.
        tickers: Iterable of ticker symbols; a built-in list of ten symbols
            is used when ``None``.

    Returns:
        DataFrame sorted by ``ticker`` then ``date`` with columns ``date``,
        ``ticker``, ``price``, the four ``f_*`` factors and ``ret1`` (the
        per-ticker one-step percentage change, 0.0 on each ticker's first row).
    """
    if tickers is None:
        tickers = ["NVDA","AAPL","AMZN","META","MSFT","SPY","GOOG","TSLA","NFLX","AMD"]
    dates = pd.bdate_range(end=pd.Timestamp.today().normalize(), periods=num_days)

    def random_walk(step_sd):
        # Cumulative sum of i.i.d. normal steps.
        return np.cumsum(np.random.normal(0, step_sd, size=num_days))

    records = []
    for symbol in tickers:
        # Draw order (quality, sentiment, momentum, insider, noise) fixes the
        # random stream for a given seed.
        quality = random_walk(0.02)
        sentiment = random_walk(0.03)
        momentum = random_walk(0.02)
        insider = random_walk(0.015)
        noise = np.random.normal(0, 0.01, size=num_days)
        lagged_noise = np.concatenate([[0], noise[:-1]])
        drift = 0.1*quality + 0.07*sentiment + 0.08*momentum + 0.06*insider + 0.6*lagged_noise + noise
        price_path = 100*np.exp(np.cumsum(drift*0.02))
        records.extend(
            {"date": day, "ticker": symbol, "price": px,
             "f_quality": qv, "f_sentiment": sv, "f_momentum": mv, "f_insider": iv}
            for day, px, qv, sv, mv, iv in zip(dates, price_path, quality, sentiment, momentum, insider)
        )

    frame = pd.DataFrame(records)
    frame = frame.sort_values(["ticker", "date"]).reset_index(drop=True)
    frame["ret1"] = frame.groupby("ticker")["price"].pct_change().fillna(0.0)
    return frame

def make_features_and_labels(df, horizons=(5,21), vol_lookback=60, nan_label=2):
    """Add momentum/volatility features and 5-class labels to a price panel.

    Labels are derived from ``zbar``: the mean over ``horizons`` of the
    forward return divided by rolling volatility. Thresholds are
    ``<= -1`` -> 0, ``<= -0.25`` -> 1, ``< 0.25`` -> 2, ``< 1`` -> 3, else 4.

    Args:
        df: Frame with at least ``ticker``, ``price``, ``ret1`` and the four
            ``f_*`` factor columns. It is not modified.
        horizons: Forward-return horizons (in rows per ticker) averaged
            into ``zbar``.
        vol_lookback: Rolling window for the per-ticker ``ret1`` std.
        nan_label: Class assigned when ``zbar`` is NaN, i.e. when no forward
            return exists for any horizon (the last rows of each ticker).
            Defaults to 2, the neutral middle class. Without this, NaN
            fails every threshold comparison and such rows would fall
            through to the top class.

    Returns:
        Tuple ``(df, X, y, FEATS)``: the augmented copy of ``df`` (``zbar``
        stays NaN where undefined), the feature frame with NaNs filled by
        0.0, the integer label array, and the list of feature column names.
    """
    df = df.copy()
    df["mom10"] = df.groupby("ticker")["price"].pct_change(10).fillna(0.0)
    df["mom30"] = df.groupby("ticker")["price"].pct_change(30).fillna(0.0)
    # Warm-up rows without a full window fall back to the overall ret1 std.
    df["vol"] = (df.groupby("ticker")["ret1"].rolling(vol_lookback).std().reset_index(level=0,drop=True)).fillna(df["ret1"].std())

    zbars = []
    for h in horizons:
        # Shift inside each ticker group so the forward price can never come
        # from another ticker, regardless of how the rows are ordered.
        future_price = df.groupby("ticker")["price"].shift(-h)
        fwd = future_price / df["price"] - 1.0
        z = (fwd / (df["vol"] + 1e-8)).rename(f"z{h}")
        df[f"z{h}"] = z
        zbars.append(z)
    # Row-wise mean skips NaNs, so zbar is NaN only when every horizon is.
    df["zbar"] = pd.concat(zbars, axis=1).mean(axis=1)

    zbar = df["zbar"].to_numpy(dtype=float)
    # np.select takes the first matching condition; NaN is tested first.
    labels = np.select(
        [np.isnan(zbar), zbar <= -1.0, zbar <= -0.25, zbar < 0.25, zbar < 1.0],
        [nan_label, 0, 1, 2, 3],
        default=4,
    )
    df["label"] = labels.astype(int)

    FEATS = ["f_quality","f_sentiment","f_momentum","f_insider","mom10","mom30","vol"]
    X = df[FEATS].copy().fillna(0.0)
    y = df["label"].values
    return df, X, y, FEATS

# Build the synthetic panel once and derive features/labels from it.
df = simulate_market()
df, X, y, FEATS = make_features_and_labels(df)
# Names of the report sections; teacher_structure emits one column per name.
SECTION_NAMES=["market","fundamentals","sentiment","technicals","insider","risks"]
# (claim name, predicate) pairs. Each predicate takes a row mapping keyed by
# feature name and returns whether the claim holds for that row. The list
# order defines the column order of the claim vectors built from it.
CLAIMS=[
    ("rev_growth_pos",  lambda r: r["f_quality"]>0),
    ("sentiment_pos",   lambda r: r["f_sentiment"]>0),
    ("momentum_pos",    lambda r: r["f_momentum"]>0),
    ("insider_buying",  lambda r: r["f_insider"]>0),
    ("vol_elevated",    lambda r: r["vol"]>0.02),
    ("mom10_pos",       lambda r: r["mom10"]>0),
    ("mom30_pos",       lambda r: r["mom30"]>0),
    ("divergence_risk", lambda r: (r["mom10"]>0) and (r["f_sentiment"]<0)),
]
def teacher_structure(n):
    """Return the teacher structure target: an all-ones float32 matrix.

    The result has ``n`` rows and one column per entry of ``SECTION_NAMES``.
    """
    shape = (n, len(SECTION_NAMES))
    return np.full(shape, 1.0, dtype=np.float32)
def teacher_claims(df_rows):
    """Evaluate every claim predicate on every row.

    Args:
        df_rows: Sequence of row mappings (feature name -> value).

    Returns:
        float32 array of shape ``(len(df_rows), len(CLAIMS))`` holding 1.0
        where the claim holds and 0.0 otherwise. A claim that cannot be
        evaluated for a row (missing key or incomparable value) counts as
        not holding.
    """
    out = np.zeros((len(df_rows), len(CLAIMS)), dtype=np.float32)
    for i, row in enumerate(df_rows):
        for j, (_, predicate) in enumerate(CLAIMS):
            # Only data-shaped failures are tolerated; anything else
            # (e.g. KeyboardInterrupt) must propagate.
            try:
                holds = bool(predicate(row))
            except (KeyError, TypeError, ValueError):
                holds = False
            out[i, j] = 1.0 if holds else 0.0
    return out


In [None]:

class MarketToy(Dataset):
    """Dataset of standardised features, labels and teacher targets.

    Each item is ``(x, y, teacher_structure_row, teacher_claims_row)``; when
    ``keep_index`` is true the item's position in the dataset is appended as
    a fifth element.
    """

    def __init__(self, X, y, df_rows, scaler=None, keep_index=False):
        """Scale ``X`` and precompute the teacher targets.

        Args:
            X: Feature DataFrame or array.
            y: Label array (cast to int64).
            df_rows: DataFrame holding the raw feature columns the claim
                predicates read.
            scaler: Already-fitted scaler to reuse; a new ``StandardScaler``
                is fitted on ``X`` when ``None``.
            keep_index: Whether items carry their dataset position.
        """
        self.rows = df_rows.reset_index(drop=True)
        self.keep_index = keep_index

        features = X.values if isinstance(X, pd.DataFrame) else X
        features = features.astype(np.float32)
        if scaler is not None:
            self.scaler = scaler
            features = self.scaler.transform(features)
        else:
            self.scaler = StandardScaler()
            features = self.scaler.fit_transform(features)
        self.X = features.astype(np.float32)
        self.y = y.astype(np.int64)

        self.t_struct = teacher_structure(len(self.X))
        claim_columns = ["f_quality","f_sentiment","f_momentum","f_insider","mom10","mom30","vol"]
        raw_records = df_rows[claim_columns].to_dict(orient="records")
        self.t_claims = teacher_claims(raw_records)
        self.idx = np.arange(len(self.X))

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        sample = (self.X[i], self.y[i], self.t_struct[i], self.t_claims[i])
        if not self.keep_index:
            return sample
        return sample + (self.idx[i],)

class TinyPolicy(nn.Module):
    """Small MLP policy with three heads: structure, claims and decision.

    The structure and claims heads emit independent Bernoulli logits; the
    decision head emits categorical logits over ``n_labels`` classes.
    """

    def __init__(self, in_dim, hidden=64, n_sections=6, n_claims=8, n_labels=5):
        super().__init__()
        trunk = [nn.Linear(in_dim, hidden), nn.ReLU(), nn.Linear(hidden, hidden), nn.ReLU()]
        self.enc = nn.Sequential(*trunk)
        self.h_struct = nn.Linear(hidden, n_sections)
        self.h_claims = nn.Linear(hidden, n_claims)
        self.h_dec = nn.Linear(hidden, n_labels)

    def forward(self, x):
        """Return ``(structure_logits, claim_logits, decision_logits)``."""
        hidden_state = self.enc(x)
        struct_logits = self.h_struct(hidden_state)
        claim_logits = self.h_claims(hidden_state)
        decision_logits = self.h_dec(hidden_state)
        return struct_logits, claim_logits, decision_logits

    @staticmethod
    def _bernoulli_logp(prob, action):
        # Per-row log-likelihood of a binary action vector; 1e-8 guards log(0).
        per_unit = torch.log(prob + 1e-8) * action + torch.log(1 - prob + 1e-8) * (1 - action)
        return per_unit.sum(1)

    @staticmethod
    def _bernoulli_entropy(prob):
        # Per-row sum of the entropies of independent Bernoulli units.
        per_unit = -prob * torch.log(prob + 1e-8) - (1 - prob) * torch.log(1 - prob + 1e-8)
        return per_unit.sum(1)

    def sample(self, x):
        """Sample one action per head and score it.

        Returns:
            ``(s, c, d)``: detached sampled structure bits, claim bits and
            decision class indices; ``(logp_s, logp_c, logp_d)``: per-row
            log-probabilities of those samples (differentiable); and the
            per-row total entropy of the three heads.
        """
        struct_logits, claim_logits, decision_logits = self.forward(x)
        p_struct = torch.sigmoid(struct_logits)
        p_claims = torch.sigmoid(claim_logits)
        p_decision = F.softmax(decision_logits, dim=-1)

        # Samples are detached: gradients flow only through the log-probs.
        s = torch.bernoulli(p_struct).detach()
        c = torch.bernoulli(p_claims).detach()
        d = torch.distributions.Categorical(p_decision).sample().detach()

        logp_s = self._bernoulli_logp(p_struct, s)
        logp_c = self._bernoulli_logp(p_claims, c)
        chosen = p_decision.gather(1, d.view(-1, 1))
        logp_d = torch.log(chosen + 1e-8).squeeze(1)

        decision_entropy = (-p_decision * torch.log(p_decision + 1e-8)).sum(1)
        ent = decision_entropy + self._bernoulli_entropy(p_struct) + self._bernoulli_entropy(p_claims)
        return (s, c, d), (logp_s, logp_c, logp_d), ent


In [None]:

# Decision classes in index order, from most bearish to most bullish.
LABELS = ["SSell", "Sell", "Hold", "Buy", "SBuy"]
# Direction of each class in LABELS order: -1 short, 0 flat, +1 long.
SIGN = torch.tensor([-1.0, -1.0, 0.0, 1.0, 1.0], dtype=torch.float32)

def structure_score(s):
    """Return the per-row mean of ``s`` taken along dimension 1."""
    return torch.mean(s, dim=1)
def claims_truth(c, batch_raw):
    """Score sampled claims by coverage times precision.

    Args:
        c: Tensor of sampled claim indicators, one row per example and one
            column per entry of ``CLAIMS``; values above 0.5 count as
            "claim included".
        batch_raw: Sequence of row mappings (feature name -> value), one per
            example, used to evaluate each claim's ground truth.

    Returns:
        Per-row tensor ``coverage * correct`` where ``coverage`` is the
        fraction of claims included and ``correct`` is the fraction of
        included claims that are actually true.
    """
    truth_rows = []
    for row in batch_raw:
        flags = []
        for _, predicate in CLAIMS:
            # A claim that cannot be evaluated counts as false; unrelated
            # exceptions are allowed to propagate.
            try:
                holds = bool(predicate(row))
            except (KeyError, TypeError, ValueError):
                holds = False
            flags.append(1.0 if holds else 0.0)
        truth_rows.append(flags)
    # The reshape keeps a (0, n_claims) shape for an empty batch so the
    # element-wise product below still broadcasts.
    truth = torch.tensor(truth_rows, dtype=torch.float32, device=c.device).reshape(-1, len(CLAIMS))
    included = (c > 0.5).float()
    coverage = included.mean(dim=1)
    # 1e-6 avoids 0/0 when no claim is included (score is then 0).
    correct = (included * truth).sum(dim=1) / (included.sum(dim=1) + 1e-6)
    return coverage * correct
def decision_score(d, y_true, zbar):
    """Score sampled decisions against the true labels.

    Direction is read from ``SIGN``. A matching non-zero direction earns +1,
    an opposite direction costs 1.5, and holding when the truth is hold
    earns 0.5; every other combination scores 0. The score is scaled by
    ``min(|zbar|, 2) / 2``.

    Args:
        d: Long tensor of sampled decision class indices.
        y_true: Long tensor of true class indices, on the same device as ``d``.
        zbar: Float tensor of normalised forward returns. NaN entries
            (outcome unknown) are treated as 0 and therefore yield a zero
            score instead of a NaN that would poison any mean over the batch.

    Returns:
        Per-example float tensor of decision scores.
    """
    # SIGN is created on the CPU; move it next to the indices so indexing
    # also works when the batch lives on another device.
    sign = SIGN.to(d.device)
    d_sign = sign[d]
    y_sign = sign[y_true]
    # Entries are already in {-1, 0, 1}, so the product's sign tells
    # agreement (>0) from disagreement (<0).
    agreement = d_sign * y_sign
    hit = (agreement > 0).float()
    miss = (agreement < 0).float() * 1.5
    both_hold = ((d_sign == 0) & (y_sign == 0)).float() * 0.5
    base = hit - miss + both_hold
    magnitude = torch.clamp(torch.abs(torch.nan_to_num(zbar, nan=0.0)), 0, 2.0) / 2.0
    return base * magnitude
def composite_reward(s,c,d,batch_rows,y_true,zbar,wS=0.2,wC=0.3,wD=0.5):
    """Combine structure, claims and decision scores into one reward.

    Args:
        s, c, d: Sampled structure bits, claim bits and decision indices.
        batch_rows: Mapping from feature name to an indexable of per-example
            values.
        y_true: True class indices.
        zbar: Normalised forward returns used to scale the decision score.
        wS, wC, wD: Weights of the three components.

    Returns:
        ``(total, rS, rC, rD)``: the weighted sum and its three components.
    """
    feature_keys = ("f_quality","f_sentiment","f_momentum","f_insider","mom10","mom30","vol")
    n_rows = len(batch_rows["f_quality"])
    # Rebuild one plain-float mapping per example for the claim predicates.
    raw = []
    for i in range(n_rows):
        raw.append({key: float(batch_rows[key][i]) for key in feature_keys})
    rS = structure_score(s)
    rC = claims_truth(c, raw)
    rD = decision_score(d, y_true, zbar)
    total = wS*rS + wC*rC + wD*rD
    return total, rS, rC, rD

def train_sft(model, loader, opt, epochs=2, dec_weight=1.0):
    """Supervised warm-up: fit all three heads to their teacher targets.

    The loss is BCE on the structure and claims logits plus ``dec_weight``
    times cross-entropy on the decision logits. The mean batch loss is
    printed after every epoch.

    Args:
        model: Module returning ``(structure, claims, decision)`` logits.
        loader: Iterable of batches ``(x, y, t_struct, t_claims[, idx])``.
        opt: Optimizer over ``model``'s parameters.
        epochs: Number of passes over ``loader``.
        dec_weight: Weight of the decision cross-entropy term.
    """
    bce = nn.BCEWithLogitsLoss()
    ce = nn.CrossEntropyLoss()
    model.train()
    for ep in range(epochs):
        total = 0.0
        n_batches = 0
        for batch in loader:
            # Batches may carry a trailing index element; it is not needed here.
            xb, yb, ts, tc = batch[:4]
            # as_tensor reuses already-collated tensors instead of
            # re-copying them (torch.tensor on a tensor copies and warns).
            xb = torch.as_tensor(xb, dtype=torch.float32, device=device)
            yb = torch.as_tensor(yb, dtype=torch.long, device=device)
            ts = torch.as_tensor(ts, dtype=torch.float32, device=device)
            tc = torch.as_tensor(tc, dtype=torch.float32, device=device)
            opt.zero_grad()
            ls, lc, ld = model(xb)
            loss = bce(ls, ts) + bce(lc, tc) + dec_weight*ce(ld, yb)
            loss.backward()
            opt.step()
            total += float(loss.item())
            n_batches += 1
        # Count batches directly so an empty loader reports 0 rather than
        # dividing by zero.
        mean_loss = total / max(n_batches, 1)
        print(f"[SFT] epoch {ep+1}/{epochs} loss={mean_loss:.4f}")


In [None]:

def batch_rows_dict(df_rows, idxs):
    """Gather the raw feature columns for a batch as float32 tensors.

    Args:
        df_rows: DataFrame with the seven feature columns and ``zbar``.
        idxs: Positional row indices of the batch; a tensor or any iterable
            of integers.

    Returns:
        Dict mapping each column name to a float32 tensor on ``device``,
        ordered like ``idxs``.
    """
    # Convert indices to plain Python ints up front: iterating a tensor
    # yields 0-d tensors, which iloc would have to convert one by one and
    # which cannot be converted at all when they live on another device.
    if torch.is_tensor(idxs):
        positions = idxs.detach().cpu().reshape(-1).tolist()
    else:
        positions = [int(i) for i in idxs]
    sub = df_rows.iloc[positions]
    columns = ("f_quality","f_sentiment","f_momentum","f_insider","mom10","mom30","vol","zbar")
    return {k: torch.tensor(sub[k].values, dtype=torch.float32, device=device) for k in columns}

def train_pairwise_rft(model, rl_loader, df_rows, opt, epochs=2, entropy_coef=0.001, ce_anchor=0.05):
    """Pairwise-ranking policy-gradient fine-tuning.

    For every batch two candidates are sampled from the policy and scored
    with ``composite_reward``. Each candidate's advantage is half its reward
    margin over the other one, so the better candidate's log-probability is
    pushed up and the worse one's down. An entropy bonus and a small
    cross-entropy anchor on the decision head are added to the loss.

    Args:
        model: Policy exposing ``sample(x)`` and ``forward(x)``.
        rl_loader: Iterable of 5-element batches ``(x, y, _, _, idx)``.
        df_rows: Frame that ``idx`` indexes positionally, holding the raw
            feature columns and ``zbar``.
        opt: Optimizer. Its learning rate is halved in place on entry and
            stays halved after this function returns.
        epochs: Number of passes over ``rl_loader``.
        entropy_coef: Weight of the entropy bonus.
        ce_anchor: Weight of the supervised cross-entropy anchor.
    """
    ce = nn.CrossEntropyLoss()
    for g in opt.param_groups:
        g["lr"] = g["lr"]*0.5
    model.train()
    for ep in range(epochs):
        stats = []
        for batch in rl_loader:
            xb, yb, _, _, idxs = batch
            # as_tensor reuses already-collated tensors instead of
            # re-copying them (torch.tensor on a tensor copies and warns).
            xb = torch.as_tensor(xb, dtype=torch.float32, device=device)
            yb = torch.as_tensor(yb, dtype=torch.long, device=device)
            rows = batch_rows_dict(df_rows, idxs)
            # candidate 1
            (s1, c1, d1), (lp_s1, lp_c1, lp_d1), ent1 = model.sample(xb)
            R1, _, _, _ = composite_reward(s1, c1, d1, rows, yb, rows["zbar"])
            # candidate 2
            (s2, c2, d2), (lp_s2, lp_c2, lp_d2), ent2 = model.sample(xb)
            R2, _, _, _ = composite_reward(s2, c2, d2, rows, yb, rows["zbar"])
            logp1 = lp_s1 + lp_c1 + lp_d1
            logp2 = lp_s2 + lp_c2 + lp_d2
            # Each candidate is the other's baseline: advantages are antisymmetric.
            A1 = 0.5*(R1 - R2)
            A2 = -A1
            pol_loss = -(A1.detach()*logp1 + A2.detach()*logp2).mean()
            ent = torch.cat([ent1, ent2], dim=0).mean()
            # Supervised anchor on the decision head.
            _, _, ld = model(xb)
            anchor = ce(ld, yb)
            loss = pol_loss - entropy_coef*ent + ce_anchor*anchor
            opt.zero_grad()
            loss.backward()
            opt.step()
            stats.append([R1.mean().item(), R2.mean().item(), A1.abs().mean().item(), ent.item()])
        if not stats:
            # Nothing to average: the mean of an empty list is a scalar NaN
            # and indexing it would raise.
            print(f"[Rank-RFT] epoch {ep+1}/{epochs} skipped: no batches")
            continue
        s = np.array(stats).mean(axis=0)
        print(f"[Rank-RFT] epoch {ep+1}/{epochs} R1={s[0]:.3f} R2={s[1]:.3f} | |A|={s[2]:.3f} | H={s[3]:.3f}")


In [None]:

def sharpe(returns):
    """Return the Sharpe ratio of ``returns`` scaled by ``sqrt(252)``.

    Uses the population standard deviation plus 1e-9 to avoid division by
    zero. An empty input yields 0.0.
    """
    if len(returns) == 0:
        return 0.0
    series = np.array(returns)
    mean_return = series.mean()
    dispersion = series.std() + 1e-9
    annualisation = np.sqrt(252)
    return (mean_return / dispersion) * annualisation
def max_drawdown(equity):
    """Return the largest relative drop from a running peak (<= 0).

    An empty equity curve yields 0.0.
    """
    curve = np.array(equity)
    running_peak = np.maximum.accumulate(curve)
    drawdowns = (curve - running_peak) / running_peak
    if len(drawdowns) == 0:
        return 0.0
    return drawdowns.min()

def run_walk_forward(df, X, y, feats, n_folds=3, train_days=320, test_days=120, batch=256):
    """Train and evaluate the policy over rolling train/test windows.

    For each fold the model is trained with ``train_sft`` and then
    ``train_pairwise_rft`` on the training window, and its argmax decision is
    turned into a long/flat/short position on the test window. A single model
    instance is reused across folds, so weights carry over; a fresh Adam
    optimizer is created per fold.

    Args:
        df: Frame with ``ticker``, ``price``, ``zbar`` and the ``feats`` columns.
        X: Feature frame, row-aligned with ``df``.
        y: Label array, row-aligned with ``df``.
        feats: Feature column names; their count is the model's input size.
        n_folds: Maximum number of folds to run.
        train_days: Number of rows in each training window.
        test_days: Number of rows in each test window; also the step by
            which the window start advances between folds.
        batch: Batch size for all data loaders.

    Returns:
        ``(all_equity, S, MDD)``: the equity curve (starting at 1.0), the
        value of ``sharpe`` over all test-period returns, and the value of
        ``max_drawdown`` over the equity curve.
    """
    # NOTE(review): windows are positional row ranges applied with .iloc. If
    # df is the ticker-then-date sorted panel from simulate_market, a window
    # of `train_days` rows is a run of consecutive rows (mostly one ticker),
    # not `train_days` calendar days across all tickers - confirm intended.
    # NOTE(review): labels inside the training window are built from forward
    # returns, which may reach into the following test window - confirm
    # whether a gap between train and test is wanted.
    model=TinyPolicy(len(feats)).to(device)
    all_equity=[1.0]; all_returns=[]
    start_idx=0; fold=0
    while True:
        fold+=1
        # Stop once the next train+test window would reach the end of df.
        if start_idx+train_days+test_days >= len(df): break
        tr_idx=range(start_idx, start_idx+train_days)
        te_idx=range(start_idx+train_days, start_idx+train_days+test_days)
        start_idx += test_days
        if fold>n_folds: break
        # Fit the scaler on the training window only and reuse it for test.
        scaler=StandardScaler().fit(X.iloc[tr_idx])
        tr_ds=MarketToy(X.iloc[tr_idx], y[tr_idx], pd.concat([df.iloc[tr_idx][feats], df.iloc[tr_idx][["zbar"]]], axis=1), scaler=scaler)
        # Same training data, but items also carry their dataset position so
        # the RL step can look up raw rows.
        tr_ds_rl=MarketToy(X.iloc[tr_idx], y[tr_idx], pd.concat([df.iloc[tr_idx][feats], df.iloc[tr_idx][["zbar"]]], axis=1), scaler=scaler, keep_index=True)
        te_ds=MarketToy(X.iloc[te_idx], y[te_idx], pd.concat([df.iloc[te_idx][feats], df.iloc[te_idx][["zbar"]]], axis=1), scaler=scaler)
        tr_loader=DataLoader(tr_ds,batch_size=batch,shuffle=True,drop_last=False)
        rl_loader=DataLoader(tr_ds_rl,batch_size=batch,shuffle=False,drop_last=False)
        te_loader=DataLoader(te_ds,batch_size=batch,shuffle=False,drop_last=False)
        opt=torch.optim.Adam(model.parameters(), lr=3e-3)
        print(f"\n=== Fold {fold} ===")
        train_sft(model, tr_loader, opt, epochs=2, dec_weight=1.0)
        # Index reset so the positions yielded by rl_loader address this frame.
        df_tr_rows=pd.concat([df.iloc[tr_idx][feats], df.iloc[tr_idx][["zbar"]]], axis=1).reset_index(drop=True)
        train_pairwise_rft(model, rl_loader, df_tr_rows, opt, epochs=2, entropy_coef=0.001, ce_anchor=0.05)
        # Test: evaluate on the whole test window in one forward pass
        # (te_loader is not iterated).
        model.eval(); eq=all_equity[-1]
        equity_curve=[]; daily_rets=[]
        with torch.no_grad():
            xb=torch.tensor(te_ds.X, dtype=torch.float32, device=device)
            _,_,ld=model(xb)
            pred=ld.argmax(1).cpu().numpy()
            # Classes 0-1 -> short (-1), 3-4 -> long (+1), class 2 -> flat (0).
            pos=np.where(pred<=1,-1,np.where(pred>=3,1,0))
            # 21-step forward return computed inside the test slice; rows
            # without one are filled with 0.0.
            fwd21=df.iloc[te_idx].groupby("ticker")["price"].pct_change(21).shift(-21).fillna(0.0).values
            # The 21-step P&L is spread evenly to get a per-row return.
            pnl = pos * fwd21; daily = pnl/21.0
            for r in daily:
                eq *= (1.0 + r)
                equity_curve.append(eq); daily_rets.append(r)
        all_equity += equity_curve; all_returns += daily_rets
    S=sharpe(all_returns); MDD=max_drawdown(all_equity)
    return all_equity, S, MDD

# Run the walk-forward experiment with its default window sizes and report
# the resulting Sharpe ratio and maximum drawdown.
equity, S, MDD = run_walk_forward(df, X, y, FEATS)
print(f"Sharpe: {S:.2f} | Max Drawdown: {MDD:.2%}")


In [None]:

import matplotlib.pyplot as plt
# Plot the concatenated test-period equity curve.
plt.figure(figsize=(8, 4))
plt.plot(equity)
plt.grid(True)
plt.title("Equity Curve — Pairwise Ranking RL (Toy)")
plt.xlabel("Test Days")
plt.ylabel("Equity")
plt.show()
