# 01 — Build training dataset + Kronos embeddings (512d)

Runs on Colab T4. Pulls yfinance daily bars, builds TF-align/SMC/TA vectors via in-repo preprocessors, encodes Kronos embeddings, and saves `training_data/v1/dataset.parquet`.

In [None]:
# Install the third-party packages this notebook imports (quiet mode).
!pip -q install yfinance pandas numpy pyarrow duckdb torch huggingface_hub tqdm

In [None]:
import os, sys, json, pathlib

# If running in Colab, clone the repo first. Set REPO_URL env if needed.
# NOTE: the default REPO_URL is a placeholder ("your-org"); override it via the
# REPO_URL environment variable before running.
REPO_URL = os.getenv("REPO_URL", "https://github.com/your-org/AI_TRADER.git")
REPO_DIR = os.getenv("REPO_DIR", "AI_TRADER")
# A missing ./backend directory means we are not at the repo root yet.
if not pathlib.Path("backend").exists():
    # Clone only when no checkout exists, so re-running the cell is cheap.
    if not pathlib.Path(REPO_DIR).exists():
        !git clone $REPO_URL $REPO_DIR
    # Move into the checkout so the relative paths below resolve.
    %cd $REPO_DIR

# Make the `app.*` packages under backend/ importable.
sys.path.append(os.path.abspath("backend"))
# Output directory for the dataset written at the end of the notebook.
os.makedirs("training_data/v1", exist_ok=True)

In [None]:
import numpy as np
import pandas as pd
import yfinance as yf
import torch
from tqdm import tqdm

from app.ml.preprocess.normalize import (
    normalize_ohlcv_120,
    build_tf_align_vec,
    build_smc_vec,
    build_ta_vec,
)
from app.services.kronos_loader import load_kronos_hf
from app.services.feature_engine import compute_ta_features, compute_smc_features

# This notebook only runs forward passes, so autograd is switched off globally.
torch.set_grad_enabled(False)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Config
# Window length (bars per sample) and the forward horizons (in bars) to label.
LOOKBACK = 120
HORIZONS = [3, 5, 10]

# Download range; both ends can be overridden through the environment.
START = os.environ.get("DATA_START", "2020-01-01")
END = os.environ.get("DATA_END")  # None == today

# Universe: one symbol per line in TICKER_FILE, else a small built-in list.
TICKER_FILE = os.environ.get("TICKER_FILE", "training_data/nifty200_symbols.txt")
DEFAULT_TICKERS = ["RELIANCE.NS", "TCS.NS", "HDFCBANK.NS", "INFY.NS", "ICICIBANK.NS"]

_ticker_path = pathlib.Path(TICKER_FILE)
if not _ticker_path.exists():
    TICKERS = DEFAULT_TICKERS
else:
    with open(TICKER_FILE) as handle:
        # Strip whitespace and drop blank lines.
        TICKERS = [name for name in map(str.strip, handle) if name]

OUT_PATH = pathlib.Path("training_data") / "v1" / "dataset.parquet"
print(f"Using {len(TICKERS)} tickers; saving to {OUT_PATH}")

In [None]:
def fetch_daily(sym: str) -> pd.DataFrame:
    """Download daily OHLCV bars for one symbol from yfinance.

    Uses the module-level ``START``/``END`` range and requests unadjusted
    prices (``auto_adjust=False``).

    Args:
        sym: Ticker symbol as understood by yfinance (e.g. ``"TCS.NS"``).

    Returns:
        A frame indexed by ``date`` (datetime) with lowercase columns
        ``open, high, low, close, volume`` and rows containing NaN removed.
        An empty frame is returned when yfinance yields no data.

    Raises:
        KeyError: if the download lacks one of the five OHLCV columns.
    """
    required = ["open", "high", "low", "close", "volume"]

    df = yf.download(sym, start=START, end=END, interval="1d", auto_adjust=False, progress=False)
    if df is None or df.empty:
        return pd.DataFrame() if df is None else df

    # Recent yfinance releases return MultiIndex (field, ticker) columns even
    # for a single symbol. Left as-is, df["close"] would be a DataFrame rather
    # than a Series and every downstream computation would break, so collapse
    # to whichever level carries the OHLCV field names.
    if isinstance(df.columns, pd.MultiIndex):
        for level in range(df.columns.nlevels):
            labels = df.columns.get_level_values(level)
            if set(required) <= {str(label).lower() for label in labels}:
                df.columns = labels
                break
        df.columns.name = None

    df = df.rename(columns=str.lower)[required].dropna()

    # Normalise the index in place instead of a reset/rename/set_index round
    # trip, which raised KeyError unless the index was named "Date" or unnamed.
    df.index = pd.to_datetime(df.index)
    df.index.name = "date"
    return df


def add_labels(df: pd.DataFrame, horizons=None) -> pd.DataFrame:
    """Return a copy of ``df`` with forward-return labels appended.

    For every horizon ``h`` two columns are added:

    * ``ret_{h}``: simple return of ``close`` from the current row to the row
      ``h`` positions later (``close[t+h] / close[t] - 1``).
    * ``up_{h}``: ``1`` when ``ret_{h}`` is strictly positive, else ``0``
      (int32).

    The last ``h`` rows have no future bar, so ``ret_{h}`` is NaN there and
    ``up_{h}`` is 0; callers must filter on the NaN return rather than trust
    ``up_{h}`` for those rows.

    Args:
        df: Frame with a ``close`` column; it is not modified.
        horizons: Iterable of positive bar offsets. Defaults to the
            module-level ``HORIZONS`` when omitted.

    Returns:
        A new frame holding the original columns plus the label columns.
    """
    if horizons is None:
        horizons = HORIZONS

    out = df.copy()
    close = out["close"]
    for h in horizons:
        ret_col = f"ret_{h}"
        out[ret_col] = (close.shift(-h) / close) - 1.0
        out[f"up_{h}"] = (out[ret_col] > 0).astype(np.int32)
    return out

In [None]:
# Feature extraction helpers aligned with backend
def compute_alignment(window: pd.DataFrame) -> dict:
    """Derive a per-timeframe trend bias from one window of bars.

    The window is re-bucketed to weekly (``W-FRI``), month-end (``ME``) and
    4-hour (``4h``) bars; each frame is passed through ``compute_ta_features``
    and scored ``1.0`` when the latest ``ema_fast`` exceeds the latest
    ``ema_slow``, otherwise ``-1.0``. A frame that comes back empty scores
    ``0.0``, and an empty input window yields ``0.0`` for every key.

    Args:
        window: OHLCV frame with a datetime index (required by ``resample``).

    Returns:
        Dict with keys ``monthly_bias``, ``weekly_bias``, ``daily_bias``,
        ``h4_align`` and ``h1_align``.
    """
    keys = ("monthly_bias", "weekly_bias", "daily_bias", "h4_align", "h1_align")
    if window.empty:
        return dict.fromkeys(keys, 0.0)

    ohlcv_rules = {"open": "first", "high": "max", "low": "min", "close": "last", "volume": "sum"}

    def rebucket(rule: str) -> pd.DataFrame:
        # Aggregate the window to a coarser/finer bar size, dropping empty bins.
        return window.resample(rule).agg(ohlcv_rules).dropna()

    def bias(df: pd.DataFrame) -> float:
        enriched = compute_ta_features(df)
        if enriched.empty:
            return 0.0
        latest = enriched.iloc[-1]
        fast = latest.get("ema_fast", 0)
        slow = latest.get("ema_slow", 0)
        if fast > slow:
            return 1.0
        return -1.0

    # Build every frame before scoring any of them.
    weekly = rebucket("W-FRI")
    monthly = rebucket("ME")
    four_hour = rebucket("4h")
    # With daily input there are no intraday bars, so the window itself is
    # reused for the 1H slot.
    hourly = window.copy()

    frames = (monthly, weekly, window, four_hour, hourly)
    return {key: bias(frame) for key, frame in zip(keys, frames)}


def compute_feature_dict(window: pd.DataFrame) -> dict:
    """Return the latest-bar feature record for a window.

    The window is run through ``compute_ta_features`` and then
    ``compute_smc_features``; the last row of the result becomes a plain dict.
    The raw OHLCV values of the window's final bar are then written over the
    matching keys as floats so the vector builders always see them.

    Args:
        window: OHLCV frame; its last row supplies the raw bar values.

    Returns:
        Feature dict for the final bar, or ``{}`` when enrichment produces an
        empty frame.
    """
    features = compute_smc_features(compute_ta_features(window))
    if features.empty:
        return {}

    record = features.iloc[-1].to_dict()
    last_bar = window.iloc[-1]
    # ensure raw OHLCV for TA vec builders
    for col in ("open", "high", "low", "close", "volume"):
        record[col] = float(last_bar[col])
    return record

In [None]:
# Kronos 512d encoder
# Loaded once via the in-repo loader and placed on `device`; `kronos_embed`
# below uses its tokenizer.
# NOTE(review): `max_context=512` presumably caps the input sequence length and
# is unrelated to the 512-wide embedding — confirm against `load_kronos_hf`.
kronos = load_kronos_hf(device=device, max_context=512)

def kronos_embed(batch_norm: np.ndarray) -> np.ndarray:
    """Encode a batch of normalised windows into fixed-width embeddings.

    The batch is moved to ``device`` as float32. When it has five channels a
    zero-filled sixth channel is appended, then ``kronos.tokenizer.embed`` is
    called (the first element is taken if it returns a tuple). The output is
    averaged over dimension 1 and zero-padded or truncated to 512 columns.

    Args:
        batch_norm: Array of windows; expected as (B, 120, 5) per the caller.

    Returns:
        float32 array with 512 columns, one row per window.
    """
    target_dim = 512

    feats = torch.tensor(batch_norm, dtype=torch.float32, device=device)
    if feats.shape[-1] == 5:  # pad amount channel if tokenizer expects 6
        zero_amount = torch.zeros(feats.shape[0], feats.shape[1], 1, device=device)
        feats = torch.cat([feats, zero_amount], dim=-1)

    encoded = kronos.tokenizer.embed(feats)
    if isinstance(encoded, tuple):
        encoded = encoded[0]

    # Mean-pool over dimension 1 to get a single vector per window.
    pooled = encoded.mean(dim=1).detach().cpu().numpy().astype(np.float32)

    width = pooled.shape[1]
    if width > target_dim:
        return pooled[:, :target_dim]
    if width < target_dim:
        filler = np.zeros((pooled.shape[0], target_dim - width), dtype=np.float32)
        return np.concatenate([pooled, filler], axis=1)
    return pooled

In [None]:
# Build one sample per (symbol, as-of bar): normalised window, Kronos
# embedding, context vector and forward labels; then write everything to
# OUT_PATH as parquet.
BATCH_SIZE = 64  # windows per Kronos forward pass
OHLCV_COLS = ["open", "high", "low", "close", "volume"]
RET_COLS = [f"ret_{h}" for h in HORIZONS]
UP_COLS = [f"up_{h}" for h in HORIZONS]

rows = []


def _flush_batch(sym, batch_ohlcv, batch_meta):
    """Embed the pending windows and append one record per window to ``rows``."""
    emb = kronos_embed(np.stack(batch_ohlcv, axis=0))
    for (asof, ctx, y_r, y_u), e, o in zip(batch_meta, emb, batch_ohlcv):
        rows.append({
            "symbol": sym,
            "asof": pd.to_datetime(asof),
            # pyarrow only converts 1-D arrays held in an object column; a 2-D
            # window makes to_parquet fail after all the work is done. Store it
            # flattened in row-major order (presumably LOOKBACK x 5, so readers
            # can restore it with reshape(LOOKBACK, 5) — confirm against
            # normalize_ohlcv_120).
            "ohlcv_norm": np.asarray(o).reshape(-1),
            "kronos_emb": e,
            "context": ctx,
            "y_ret": y_r,
            "y_up": y_u,
        })


max_horizon = max(HORIZONS)
min_bars = LOOKBACK + max_horizon + 10

for sym in tqdm(TICKERS):
    df = fetch_daily(sym)
    if df.empty or len(df) < min_bars:
        continue
    df = add_labels(df)

    # Pull the labels out once per symbol instead of one .iloc lookup per row.
    ret_all = df[RET_COLS].to_numpy(dtype=np.float32)
    up_all = df[UP_COLS].to_numpy(dtype=np.float32)
    # Feature code must never see the forward-looking label columns.
    bars = df[OHLCV_COLS]

    batch_ohlcv = []
    batch_meta = []

    for i in range(LOOKBACK - 1, len(df) - max_horizon):
        # Check the labels first so unusable rows skip feature extraction.
        y_ret = ret_all[i].copy()
        if np.isnan(y_ret).any():
            continue
        y_up = up_all[i].copy()

        window = bars.iloc[i - LOOKBACK + 1 : i + 1]
        ohlcv = window.values.astype(np.float32)
        if ohlcv.shape[0] != LOOKBACK:
            continue
        norm = normalize_ohlcv_120(ohlcv)
        alignment = build_tf_align_vec(compute_alignment(window))
        feat_dict = compute_feature_dict(window)
        smc_vec = build_smc_vec(feat_dict)
        ta_vec = build_ta_vec(feat_dict)
        context = np.concatenate([alignment, smc_vec, ta_vec]).astype(np.float32)

        batch_ohlcv.append(norm)
        batch_meta.append((df.index[i], context, y_ret, y_up))

        if len(batch_ohlcv) >= BATCH_SIZE:
            _flush_batch(sym, batch_ohlcv, batch_meta)
            batch_ohlcv, batch_meta = [], []

    # Embed whatever is left over for this symbol.
    if batch_ohlcv:
        _flush_batch(sym, batch_ohlcv, batch_meta)

print(f"Total samples: {len(rows)}")
df_out = pd.DataFrame(rows)
df_out.to_parquet(OUT_PATH, index=False)
print(f"Saved to {OUT_PATH}")