# HRM Behavior Dataset Validation (Colab)

This notebook validates the HRM-formatted behavior dataset (`behavior-v1`) by:
- Loading the `all__inputs.npy` / `all__labels.npy` arrays and `dataset.json` for each split (`train`, `val`, `test`)
- Reading `vocab.json`
- Printing shapes/metadata and a token preview



In [None]:
#@title Setup
# Root directory of the HRM-formatted dataset. The validation cell joins
# 'vocab.json' and the 'train' / 'val' / 'test' split folders onto this path.
# The trailing `#@param` annotation exposes the value as an editable Colab
# form field, so it can be changed without touching the code.
DATA_ROOT = "/content/behavior-v1"  #@param {type:"string"}
# Echo the configured path so the cell output records what will be validated.
print("DATA_ROOT:", DATA_ROOT)


In [None]:
#@title Validate dataset structure and preview tokens
import os, json
import numpy as np

# Validate the on-disk layout under DATA_ROOT and print a short report per
# split. Everything here is read-only: files are parsed, memory-mapped, or
# loaded, never written.

def _read_json(path):
  """Return the parsed contents of the JSON file at `path`.

  The file is opened in a `with` block so the handle is closed right away
  instead of lingering until garbage collection, and is decoded as UTF-8
  regardless of the platform's default locale.
  """
  with open(path, 'r', encoding='utf-8') as fh:
    return json.load(fh)


assert os.path.isdir(DATA_ROOT), f"Missing {DATA_ROOT}"

# vocab.json is optional: when absent, `vocab` is empty and the token preview
# at the end of each split is skipped.
vocab_path = os.path.join(DATA_ROOT, 'vocab.json')
vocab = _read_json(vocab_path) if os.path.exists(vocab_path) else []
print('vocab_size', len(vocab))

for split in ['train','val','test']:
  d = os.path.join(DATA_ROOT, split)
  p_inputs = os.path.join(d, 'all__inputs.npy')
  p_labels = os.path.join(d, 'all__labels.npy')
  p_meta   = os.path.join(d, 'dataset.json')

  # A split with no inputs file is treated as absent and skipped.
  if not os.path.exists(p_inputs):
    print(split, '(missing)')
    continue

  # Inputs without their labels/metadata means a partially written split.
  # Name the missing files and move on to the next split instead of aborting
  # the whole report with a raw FileNotFoundError.
  missing = [os.path.basename(p) for p in (p_labels, p_meta) if not os.path.exists(p)]
  if missing:
    print(split, '(incomplete) missing', missing)
    continue

  # mmap_mode='r' maps the arrays read-only; only the parts actually accessed
  # (the shape and one preview row) are read from disk.
  inputs = np.load(p_inputs, mmap_mode='r')
  labels = np.load(p_labels, mmap_mode='r')
  meta = _read_json(p_meta)

  print(split, 'inputs', inputs.shape, 'labels', labels.shape, 'seq_len(meta)', meta.get('seq_len'), 'vocab(meta)', meta.get('vocab_size'))

  # The puzzle/group index arrays are optional, but they are only reported
  # when all three are present.
  p_pids = os.path.join(d, 'all__puzzle_identifiers.npy')
  p_pidx = os.path.join(d, 'all__puzzle_indices.npy')
  p_gidx = os.path.join(d, 'all__group_indices.npy')
  if os.path.exists(p_pids):
    missing = [os.path.basename(p) for p in (p_pidx, p_gidx) if not os.path.exists(p)]
    if missing:
      print(split, 'puzzle arrays (incomplete) missing', missing)
    else:
      pids = np.load(p_pids)
      pidx = np.load(p_pidx)
      gidx = np.load(p_gidx)
      print(split, 'puzzles', pids.shape, 'puzzle_indices', pidx.shape, 'group_indices', gidx.shape)

  if inputs.ndim >= 1 and inputs.shape[0] > 0 and len(vocab):
    # Flatten the first example before slicing so the preview also works when
    # `inputs` is not exactly 2-D.
    # NOTE(review): assumes vocab.json is a JSON list indexed by token id —
    # TODO confirm; a dict-shaped vocab would need a different lookup.
    row = np.ravel(inputs[0])[:32].tolist()
    # Ids outside the vocabulary are shown as '<unk:ID>' rather than raising.
    toks = [vocab[t] if 0 <= t < len(vocab) else f'<unk:{t}>' for t in row]
    print(split, 'preview', toks)
