# Multi-Asset Polygon Dataset: Descriptive Statistics

This notebook reproduces the **same preprocessing pipeline** as `Datasets/multi_asset_dataset.py` and then computes descriptive statistics. It uses the Polygon OHLCV parquet files and the engineered features that are fed into training (rolling z-scored returns, ranges, and volatility features).


## Configuration
Update these settings to match your training configuration (e.g., `train_jepa_initial.py`).


In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import sys


def find_project_root(start: Path) -> Path:
    """Return the nearest directory at or above ``start`` that holds the project markers.

    ``start`` is resolved first; then it and each of its parents are checked,
    closest first, for the presence of both a ``src`` and a ``configs`` entry.

    Raises:
        RuntimeError: if no directory on the way up contains both entries.
    """
    origin = start.resolve()
    for folder in (origin, *origin.parents):
        has_markers = all((folder / marker).exists() for marker in ("src", "configs"))
        if has_markers:
            return folder
    raise RuntimeError("Could not locate project root containing src/ and configs/")


def resolve_project_path(path_value: str | Path, project_root: Path) -> Path:
    p = Path(path_value)
    if p.is_absolute():
        return p
    return (project_root / p).resolve()


# Locate the repository root by walking up from the current working directory.
PROJECT_ROOT = find_project_root(Path.cwd())
print(f"Project root: {PROJECT_ROOT}")

# Dataset arguments; keep these in sync with the training configuration.
root_path = resolve_project_path("Data/polygon", PROJECT_ROOT)
data_path = Path("data_raw_1m")  # sub-directory of root_path that holds the per-asset files
start_date = None  # when set, load_asset drops rows earlier than this date (parsed as UTC)
regular_hours_only = True  # keep only weekday bars from 09:30 to 16:00 New York time
timeframe = "15min"  # resampling rule; "1min" skips the resampling step

# Feature / split arguments; keep these in sync with the training configuration.
rolling_window = 252  # number of bars in the rolling z-score window
train_split = 0.7  # fraction of the unique timestamps assigned to the train split
test_split = 0.15  # fraction assigned to the test split; the remainder is validation

tickers = None  # optional list of tickers; None loads every file ending in file_suffix
file_suffix = ".parquet"

print(f"Data dir: {root_path / data_path}")


Project root: C:\python\koulu\Gradu
Data dir: C:\python\koulu\Gradu\Data\polygon\data_raw_1m


## Helper functions (mirrors the dataset code)


In [13]:
import os


def to_datetime(series):
    """Convert ``series`` to UTC timestamps, trying epoch milliseconds first.

    If pandas cannot interpret the values as milliseconds since the epoch, the
    conversion is retried with pandas' default parsing (still forced to UTC).
    """
    try:
        parsed = pd.to_datetime(series, unit="ms", utc=True)
    except Exception:
        # Not usable as epoch milliseconds: let pandas infer the format instead.
        parsed = pd.to_datetime(series, utc=True)
    return parsed


def true_range(high, low, close):
    """Row-wise true range: the largest of |high - low|, |high - prev close|, |low - prev close|.

    The previous close is ``close`` shifted by one row, so on the first row the
    two close-based terms are NaN and the row maximum falls back to |high - low|.
    """
    previous = close.shift(1)
    spreads = (high - low, high - previous, low - previous)
    return pd.concat([spread.abs() for spread in spreads], axis=1).max(axis=1)


def filter_regular_us_hours(dt: pd.Series) -> pd.Series:
    """Boolean mask selecting weekday timestamps inside the 09:30-16:00 New York window.

    ``dt`` must be timezone-aware; it is converted to America/New_York before
    the check. Both bounds are inclusive, so a bar stamped exactly 16:00 is kept.
    """
    eastern = dt.dt.tz_convert("America/New_York")
    minute_of_day = 60 * eastern.dt.hour + eastern.dt.minute
    on_weekday = eastern.dt.weekday.lt(5)  # Monday=0 ... Friday=4
    in_session = minute_of_day.ge(9 * 60 + 30) & minute_of_day.le(16 * 60)
    return on_weekday & in_session


def resample_ohlcv(df: pd.DataFrame, rule: str) -> pd.DataFrame:
    """Aggregate OHLCV rows into bars of width ``rule`` keyed on the ``_dt`` column.

    Open/close take the first/last value in each bin, high/low the max/min and
    volume the sum. Bins with any missing aggregate (e.g. bins without rows) are
    dropped, and ``_dt`` is returned as a regular column again. Columns other
    than the five OHLCV fields are not carried over.
    """
    how = dict(open="first", high="max", low="min", close="last", volume="sum")
    bars = df.set_index("_dt").resample(rule).agg(how)
    return bars.dropna().reset_index()


def rolling_zscore(series: pd.Series, window: int) -> pd.Series:
    """Z-score each value against the mean/std of the ``window`` values before it.

    The rolling statistics require a full window and are shifted by one row, so
    the current observation never contributes to its own mean or standard
    deviation. The first ``window`` rows are therefore NaN.
    """
    trailing = series.rolling(window, min_periods=window)
    lagged_mean = trailing.mean().shift(1)
    lagged_std = trailing.std().shift(1)
    return (series - lagged_mean) / lagged_std


def rolling_zscore_df(df: pd.DataFrame, window: int) -> pd.DataFrame:
    """Apply ``rolling_zscore`` to every column of ``df`` independently.

    The result is a float frame with the same index and columns as ``df``.
    """
    zscored = {name: rolling_zscore(df[name], window) for name in df.columns}
    return pd.DataFrame(zscored, index=df.index, columns=df.columns, dtype=float)


def iter_asset_files(path: Path, tickers=None, suffix=".parquet"):
    """Yield ``(asset_id, file_path)`` pairs for the assets stored under ``path``.

    Args:
        path: directory that holds one file per asset.
        tickers: optional iterable of asset ids. When given (and non-empty), one
            pair is yielded per ticker in the given order, without checking that
            the file exists.
        suffix: file-name suffix identifying asset files.

    When ``tickers`` is empty or ``None`` the directory is scanned instead and
    every entry whose name ends with ``suffix`` is yielded, using the file stem
    as the asset id. Entries are yielded in sorted name order so the asset
    order is the same on every platform and run.
    """
    if tickers:
        for ticker in tickers:
            yield ticker, path / f"{ticker}{suffix}"
        return

    # os.listdir returns names in arbitrary order; sort for a reproducible asset order.
    for fname in sorted(os.listdir(path)):
        if fname.endswith(suffix):
            yield Path(fname).stem, path / fname


## Load assets and compute features
This step follows the exact feature engineering used for training: log returns, range/volatility features, volume features, and rolling z-scoring.


In [14]:
# Engineered features, in the order they appear in the "base" and "z" frames
# returned by load_asset: four log returns, two range measures scaled by close,
# 20-bar return volatility, and two volume features.
feature_columns = [
    "ret_close", "ret_open", "ret_high", "ret_low",
    "hl_range", "tr", "vol20", "log_vol", "vol_z20",
]


def load_asset(fpath: Path):
    """Load one asset's parquet file and build its raw, engineered and z-scored frames.

    Reads the notebook-level settings ``start_date``, ``regular_hours_only``,
    ``timeframe``, ``feature_columns`` and ``rolling_window``.

    Pipeline:
        1. Optionally drop rows before ``start_date`` and rows outside regular
           US trading hours, based on the ``timestamp`` column.
        2. Sort by time and, unless ``timeframe`` is "1min", resample to bars.
        3. Compute log returns, range, volatility and volume features.
        4. Drop rows with missing/infinite features, then rolling-z-score the
           remaining rows and drop rows whose z-scores are missing/infinite.

    Returns:
        dict with four row-aligned objects, each re-indexed from 0:
        ``"base"`` (engineered features), ``"z"`` (their rolling z-scores),
        ``"ohlcv"`` (open/high/low/close/volume) and ``"dates"`` (timestamps).
    """
    time_col = "timestamp"
    frame = pd.read_parquet(fpath)

    if start_date is not None:
        cutoff = pd.to_datetime(start_date, utc=True)
        frame = frame.loc[to_datetime(frame[time_col]) >= cutoff].copy()

    stamps = to_datetime(frame[time_col])
    if regular_hours_only:
        keep = filter_regular_us_hours(stamps)
        frame = frame.loc[keep].copy()
        stamps = stamps.loc[keep]

    frame = frame.assign(_dt=stamps).sort_values("_dt", ascending=True).reset_index(drop=True)
    if timeframe != "1min":
        frame = resample_ohlcv(frame, timeframe)

    # Log prices, then one-bar log returns for each price field.
    for col in ("open", "high", "low", "close"):
        frame[f"log_{col}"] = np.log(frame[col].astype(float))
    for col in ("close", "open", "high", "low"):
        frame[f"ret_{col}"] = frame[f"log_{col}"].diff()

    # Range measures are scaled by close; a zero close becomes NaN instead of dividing by zero.
    safe_close = frame["close"].replace(0, np.nan)
    frame["hl_range"] = (frame["high"] - frame["low"]) / safe_close
    frame["tr"] = true_range(frame["high"], frame["low"], frame["close"]) / safe_close
    frame["vol20"] = frame["ret_close"].rolling(20, min_periods=20).std()

    # Volume features: log(1 + volume) and a 20-bar z-score that includes the current bar.
    frame["log_vol"] = np.log(frame["volume"].astype(float) + 1.0)
    volume_window = frame["volume"].rolling(20, min_periods=20)
    frame["vol_z20"] = (frame["volume"] - volume_window.mean()) / volume_window.std()

    infinities = [np.inf, -np.inf]
    base = frame[feature_columns].copy().replace(infinities, np.nan)
    valid_index = base.dropna().index

    # The z-score window runs over the surviving rows only, so it spans any dropped rows.
    zscored = rolling_zscore_df(base.loc[valid_index], rolling_window)
    zscored = zscored.replace(infinities, np.nan).dropna()
    final_index = zscored.index

    ohlcv_fields = ["open", "high", "low", "close", "volume"]
    return {
        "base": base.loc[final_index].reset_index(drop=True),
        "z": zscored.reset_index(drop=True),
        "ohlcv": frame.loc[final_index, ohlcv_fields].reset_index(drop=True),
        "dates": frame.loc[final_index, "_dt"].reset_index(drop=True),
    }


data_dir = root_path / data_path
assets = []

# Missing files and assets left with no rows after preprocessing are skipped without a message.
for asset_id, fpath in iter_asset_files(data_dir, tickers=tickers, suffix=file_suffix):
    if fpath.exists():
        data = load_asset(fpath)
        if len(data["z"]) > 0:
            assets.append({"asset": asset_id, **data})

print(f"Loaded {len(assets)} assets from {data_dir}")


Loaded 33 assets from C:\python\koulu\Gradu\Data\polygon\data_raw_1m


## Data coverage summary
Summarize date ranges and row counts after preprocessing (aligned to the training features).


In [15]:
# One summary row per loaded asset: row count plus first and last timestamp
# of the preprocessed (feature-aligned) data.
coverage_columns = ["asset", "rows", "start", "end"]
coverage_rows = [
    {
        "asset": item["asset"],
        "rows": len(item["dates"]),
        "start": item["dates"].min(),
        "end": item["dates"].max(),
    }
    for item in assets
]

# Naming the columns explicitly keeps the frame sortable (and the cell runnable)
# when no assets were loaded; otherwise the empty frame has no "asset" column.
coverage = pd.DataFrame(coverage_rows, columns=coverage_columns).sort_values("asset")
coverage


Unnamed: 0,asset,rows,start,end
0,AAPL,33175,2021-02-26 15:00:00+00:00,2026-01-23 21:00:00+00:00
1,AMGN,33168,2021-02-26 15:00:00+00:00,2026-01-23 21:00:00+00:00
2,AMZN,33172,2021-02-26 15:00:00+00:00,2026-01-23 21:00:00+00:00
3,AXP,32461,2021-02-26 17:15:00+00:00,2026-01-23 21:00:00+00:00
4,BA,33169,2021-02-26 15:00:00+00:00,2026-01-23 21:00:00+00:00
5,CAT,33166,2021-02-26 15:00:00+00:00,2026-01-23 21:00:00+00:00
6,CRM,33170,2021-02-26 15:00:00+00:00,2026-01-23 21:00:00+00:00
7,CSCO,33169,2021-02-26 15:00:00+00:00,2026-01-23 21:00:00+00:00
8,CVX,32476,2021-02-26 16:30:00+00:00,2026-01-23 21:00:00+00:00
9,DIA,33119,2021-02-26 15:00:00+00:00,2026-01-23 21:00:00+00:00


## Train/val/test split boundaries
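Uses the **global date alignment** logic from `Dataset_Finance_MultiAsset`: the split boundaries are computed once on the sorted union of timestamps across all assets (`train_split` for train, `test_split` for test, the remainder for validation) and then applied to every asset.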


In [16]:
# Sorted union of the timestamps of all assets; the split boundaries are defined on it.
# pd.concat raises on an empty list, so fall back to an empty index when nothing
# was loaded and let the guards below produce None boundaries.
if assets:
    all_dates = pd.Index(pd.concat([a["dates"] for a in assets]).unique()).sort_values()
else:
    all_dates = pd.DatetimeIndex([], tz="UTC")

num_total = len(all_dates)
num_train = int(num_total * train_split)
num_test = int(num_total * test_split)
num_val = num_total - num_train - num_test  # validation takes whatever train and test leave

# Last timestamp (inclusive) of the train and validation segments.
if num_total:
    train_end = all_dates[max(num_train - 1, 0)]
    val_end = all_dates[max(num_train + num_val - 1, 0)]
else:
    train_end = val_end = None

split_info = {
    "global_total_rows": num_total,
    "train_end": train_end,
    "val_end": val_end,
    "num_train_dates": num_train,
    "num_val_dates": num_val,
    "num_test_dates": num_test,
}

split_info


{'global_total_rows': 33179,
 'train_end': Timestamp('2024-08-01 15:15:00+0000', tz='UTC'),
 'val_end': Timestamp('2025-04-29 16:30:00+0000', tz='UTC'),
 'num_train_dates': 23225,
 'num_val_dates': 4978,
 'num_test_dates': 4976}

## Split counts per asset


In [17]:
# Count, per asset, how many of its rows fall into each split given the
# global boundaries train_end / val_end (both inclusive upper bounds).
split_columns = ["asset", "train_rows", "val_rows", "test_rows"]
split_rows = []
for item in assets:
    dates = item["dates"]
    if train_end is None:
        train_rows = val_rows = test_rows = 0
    else:
        train_rows = (dates <= train_end).sum()
        val_rows = ((dates > train_end) & (dates <= val_end)).sum()
        test_rows = (dates > val_end).sum()
    split_rows.append({
        "asset": item["asset"],
        "train_rows": int(train_rows),
        "val_rows": int(val_rows),
        "test_rows": int(test_rows),
    })

# Naming the columns explicitly keeps sort_values working when no assets were loaded.
split_counts = pd.DataFrame(split_rows, columns=split_columns).sort_values("asset")
split_counts


Unnamed: 0,asset,train_rows,val_rows,test_rows
0,AAPL,23223,4977,4975
1,AMGN,23219,4976,4973
2,AMZN,23221,4977,4974
3,AXP,22515,4974,4972
4,BA,23219,4976,4974
5,CAT,23217,4976,4973
6,CRM,23221,4976,4973
7,CSCO,23220,4976,4973
8,CVX,22650,4906,4920
9,DIA,23193,4968,4958


## Descriptive statistics (training features)
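These are **z-scored features** used by the model.

**Note:** the saved outputs from here on carry execution counts 7–12 and report `count` = 31,936 for every asset, whereas the coverage and split tables above (execution counts 15–17) report, for example, 33,175 rows for AAPL. The statistics below therefore come from an earlier run than the current load step; restart the kernel and run all cells before reporting these numbers.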


In [7]:
# Percentiles reported by every describe() table in the cells that follow.
percentiles = [0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]

# One describe() table per asset (one row per feature), tagged with the asset
# id and the feature name so the tables can be stacked.
per_asset_stats = [
    item["z"]
    .describe(percentiles=percentiles)
    .T
    .assign(asset=item["asset"], feature=lambda table: table.index)
    .reset_index(drop=True)
    for item in assets
]

# pd.concat raises on an empty list; fall back to an empty frame when no assets
# were loaded, matching the guard used for the global statistics.
z_feature_stats = (
    pd.concat(per_asset_stats, ignore_index=True) if per_asset_stats else pd.DataFrame()
)
z_feature_stats.head()


Unnamed: 0,count,mean,std,min,1%,5%,25%,50%,75%,95%,99%,max,asset,feature
0,31936.0,-0.004274,1.093663,-38.225452,-2.889363,-1.339307,-0.427663,-0.002636,0.427601,1.326837,2.813851,19.124546,AAPL,ret_close
1,31936.0,-0.005049,1.100147,-33.050385,-2.962942,-1.40059,-0.434689,-0.00237,0.433534,1.394349,2.958009,28.890424,AAPL,ret_open
2,31936.0,-0.001268,1.093429,-28.314668,-2.919234,-1.330166,-0.379219,-0.014286,0.364016,1.298018,3.23819,25.876878,AAPL,ret_high
3,31936.0,-0.008161,1.119698,-38.072313,-3.165895,-1.326686,-0.377248,0.013285,0.388389,1.31765,2.872121,22.590527,AAPL,ret_low
4,31936.0,0.007781,1.083773,-1.886331,-1.215506,-1.010599,-0.647809,-0.272555,0.31723,1.965726,4.185876,20.847402,AAPL,hl_range


### Global statistics across assets (z-scored features)


In [8]:
# Pool the z-scored rows of every asset and describe each feature over the pooled sample.
z_global_stats = pd.DataFrame()  # stays empty when no assets were loaded
if assets:
    z_global = pd.concat([entry["z"] for entry in assets], ignore_index=True)
    z_global_stats = z_global.describe(percentiles=percentiles).T

z_global_stats


Unnamed: 0,count,mean,std,min,1%,5%,25%,50%,75%,95%,99%,max
ret_close,1053888.0,-0.003671,1.13,-101.954521,-2.831585,-1.362992,-0.432136,0.000981,0.431374,1.350964,2.809285,63.86547
ret_open,1053888.0,-0.003668,1.111085,-81.386081,-2.959741,-1.420296,-0.438623,0.001409,0.439127,1.409988,2.90923,68.706146
ret_high,1053888.0,0.000327,1.147987,-99.055903,-2.825973,-1.33729,-0.38624,-0.014516,0.364746,1.303648,3.201369,72.136436
ret_low,1053888.0,-0.008036,1.14977,-81.92485,-3.160788,-1.323694,-0.369857,0.0147,0.392388,1.320229,2.782983,56.983902
hl_range,1053888.0,0.0108,1.106689,-1.95531,-1.179992,-0.973377,-0.626262,-0.272608,0.290705,1.928432,4.186965,94.542943
tr,1053888.0,0.014999,1.210933,-1.846685,-1.03777,-0.857961,-0.550811,-0.257355,0.201087,1.679659,4.448765,105.436417
vol20,1053888.0,0.043031,1.225127,-3.296782,-1.730701,-1.319002,-0.722478,-0.249741,0.498521,2.420503,4.096561,83.444833
log_vol,1053888.0,-0.003124,1.050759,-5.530284,-1.843456,-1.421297,-0.737326,-0.158176,0.548455,2.031909,3.170673,7.269848
vol_z20,1053888.0,0.004315,1.0156,-2.312401,-1.076139,-0.873835,-0.605338,-0.356419,0.18587,2.543237,3.619779,4.39309


## Descriptive statistics (pre-zscore engineered features)
This table is useful to report the **raw engineered signals** before normalization.


In [9]:
# One describe() table per asset for the engineered (pre-z-score) features,
# tagged with the asset id and the feature name so the tables can be stacked.
per_asset_base_stats = [
    item["base"]
    .describe(percentiles=percentiles)
    .T
    .assign(asset=item["asset"], feature=lambda table: table.index)
    .reset_index(drop=True)
    for item in assets
]

# pd.concat raises on an empty list; fall back to an empty frame when no assets
# were loaded, matching the guard used for the global statistics.
base_feature_stats = (
    pd.concat(per_asset_base_stats, ignore_index=True)
    if per_asset_base_stats
    else pd.DataFrame()
)
base_feature_stats.head()


Unnamed: 0,count,mean,std,min,1%,5%,25%,50%,75%,95%,99%,max,asset,feature
0,31936.0,2.2e-05,0.003371,-0.09804,-0.00911,-0.0041,-0.001211,3.9e-05,0.001291,0.004139,0.008857,0.059715,AAPL,ret_close
1,31936.0,2.2e-05,0.003476,-0.10901,-0.009436,-0.004333,-0.001233,4.2e-05,0.001322,0.004366,0.009313,0.076408,AAPL,ret_open
2,31936.0,2.2e-05,0.003169,-0.078614,-0.008498,-0.003878,-0.000996,0.0,0.001025,0.003764,0.009067,0.075421,AAPL,ret_high
3,31936.0,2.2e-05,0.003297,-0.111185,-0.009164,-0.003773,-0.001,7.5e-05,0.001121,0.003922,0.008263,0.06213,AAPL,ret_low
4,31936.0,0.003729,0.002855,0.0,0.0009,0.001232,0.002017,0.002963,0.004513,0.008706,0.014245,0.080573,AAPL,hl_range


### Global statistics across assets (engineered features)


In [10]:
# Pool the engineered-feature rows of every asset and describe each feature over the pooled sample.
base_global_stats = pd.DataFrame()  # stays empty when no assets were loaded
if assets:
    base_global = pd.concat([entry["base"] for entry in assets], ignore_index=True)
    base_global_stats = base_global.describe(percentiles=percentiles).T

base_global_stats


Unnamed: 0,count,mean,std,min,1%,5%,25%,50%,75%,95%,99%,max
ret_close,1053888.0,1.5e-05,0.003303,-0.204603,-0.008451,-0.003889,-0.001124,2.8e-05,0.001165,0.003867,0.008477,0.218227
ret_open,1053888.0,1.5e-05,0.003315,-0.20279,-0.008886,-0.00409,-0.001146,3e-05,0.001192,0.004076,0.008776,0.238508
ret_high,1053888.0,1.5e-05,0.003142,-0.191138,-0.007863,-0.003583,-0.000945,0.0,0.000926,0.003472,0.0086,0.232226
ret_low,1053888.0,1.5e-05,0.003186,-0.20891,-0.008655,-0.003552,-0.000911,6.4e-05,0.001019,0.003624,0.007971,0.189131
hl_range,1053888.0,0.003397,0.002829,0.0,0.000703,0.001026,0.001757,0.00262,0.004054,0.00835,0.01427,0.160855
tr,1053888.0,0.00352,0.003533,0.0,0.000706,0.001032,0.001765,0.002632,0.00408,0.008725,0.016503,0.234271
vol20,1053888.0,0.002659,0.001962,0.000182,0.00069,0.000987,0.001559,0.002167,0.003128,0.005824,0.010184,0.049044
log_vol,1053888.0,12.289423,1.449717,7.151485,9.661917,10.27815,11.240972,12.102264,13.112513,14.958028,16.514972,19.374332
vol_z20,1053888.0,0.073089,1.126135,-2.360455,-1.146167,-0.915771,-0.603454,-0.321991,0.279236,2.894197,4.054639,4.245504


## Raw OHLCV descriptive statistics
Useful for reporting price/volume scales after filtering and resampling.


In [11]:
# One describe() table per asset for the OHLCV fields, tagged with the asset id
# and the field name so the tables can be stacked.
per_asset_ohlcv_stats = [
    item["ohlcv"]
    .describe(percentiles=percentiles)
    .T
    .assign(asset=item["asset"], field=lambda table: table.index)
    .reset_index(drop=True)
    for item in assets
]

# pd.concat raises on an empty list; fall back to an empty frame when no assets
# were loaded, matching the guard used for the global statistics.
ohlcv_stats = (
    pd.concat(per_asset_ohlcv_stats, ignore_index=True)
    if per_asset_ohlcv_stats
    else pd.DataFrame()
)
ohlcv_stats.head()


Unnamed: 0,count,mean,std,min,1%,5%,25%,50%,75%,95%,99%,max,asset,field
0,31936.0,183.7859,39.41311,116.8,121.8435,129.5375,150.49,175.6875,213.67,257.6938,276.3365,287.845,AAPL,open
1,31936.0,184.123,39.45697,117.07,122.1,129.8275,150.81,175.99,214.0,258.01,276.6997,288.62,AAPL,high
2,31936.0,183.4493,39.36906,116.21,121.62425,129.29,150.2188,175.3522,213.2602,257.3326,275.9888,287.34,AAPL,low
3,31936.0,183.7927,39.41314,116.4,121.84675,129.5625,150.51,175.695,213.67,257.6962,276.3355,287.85,AAPL,close
4,31936.0,2126165.0,1724525.0,256289.0,474351.9,626365.25,1056050.0,1616149.0,2583938.0,5334482.0,8783963.0,24090200.0,AAPL,volume


### Global statistics across assets (raw OHLCV)


In [12]:
# Pool the OHLCV rows of every asset and describe each field over the pooled sample.
ohlcv_global_stats = pd.DataFrame()  # stays empty when no assets were loaded
if assets:
    ohlcv_global = pd.concat([entry["ohlcv"] for entry in assets], ignore_index=True)
    ohlcv_global_stats = ohlcv_global.describe(percentiles=percentiles).T

ohlcv_global_stats


Unnamed: 0,count,mean,std,min,1%,5%,25%,50%,75%,95%,99%,max
open,1053888.0,219.274802,135.0763,10.966,23.315045,47.083895,124.87,188.64,297.78,485.88,611.1901,979.97
high,1053888.0,219.629181,135.2639,11.041,23.405222,47.1467,125.11,188.99,298.308875,486.5596,612.01,984.7
low,1053888.0,218.915969,134.8856,10.813,23.21587,47.02,124.61,188.28,297.2807,485.1082,610.2913,977.0301
close,1053888.0,219.276764,135.0778,10.965,23.309545,47.0833,124.8765,188.64,297.7879,485.8777,611.2207,980.085
volume,1053888.0,874013.959874,3071036.0,1275.0,15706.87,29089.0,76188.0,180278.5,495098.0,3134648.0,14871710.0,259516700.0
