# Wave Field LLM -- S2 Benchmark (55M params, 50M tokens)

Runs both **Standard Transformer** and **SPECTRE-Wave V4.3.3** at S2 scale.

| | S1 (verified) | S2 (this notebook) |
|---|---|---|
| Params | 22M | 55M |
| Tokens | 20M | 50M |
| Dataset | WikiText-2 | **WikiText-103** |
| embed/layers/heads | 384/8/8 | 512/12/8 |
| T4 time | ~25 min | ~2-3 hrs |
| A100 time | ~12 min | ~55 min |

**S1 results**: Wave PPL 229 vs Standard PPL 171 (1.34x gap on Colab T4)

**Key changes for S2:**
- Uses WikiText-103 (103M tokens) instead of WikiText-2 (2.6M tokens): training on 50M tokens would otherwise repeat the WikiText-2 data ~19x
- `torch.compile` is disabled (incompatible with gradient checkpointing)
- Bilinear gather dtype fix for fp16 AMP efficiency

In [None]:
# Cell 1: Setup
!git clone https://github.com/Pankh-AI/wave-field-llm.git
%cd wave-field-llm
!pip install -q tokenizers datasets

# Verify we have V4.3.3 code (NOT V4.3.4 which regressed)
import sys
sys.path.insert(0, '.')
from src import __version__
print(f'\nCode version: {__version__}')
assert __version__ == '4.3.3', f'ERROR: Expected V4.3.3, got {__version__}! Do NOT run with V4.3.4+.'
print('Version check PASSED')

In [None]:
# Cell 2: GPU check + VRAM estimation
# Reports the PyTorch/CUDA setup, aborts when no GPU is attached, then prints
# a VRAM sanity check and a rough wall-clock estimate for the detected GPU.
import torch

print(f'PyTorch: {torch.__version__}')
cuda_ok = torch.cuda.is_available()
print(f'CUDA: {cuda_ok}')

if not cuda_ok:
    raise RuntimeError('No GPU! Go to Runtime > Change runtime type > T4 GPU')

gpu_name = torch.cuda.get_device_name()
device_props = torch.cuda.get_device_properties(0)
vram_gb = device_props.total_memory / 1e9  # bytes -> decimal gigabytes
print(f'GPU: {gpu_name}')
print(f'VRAM: {vram_gb:.1f} GB')

# S2 VRAM estimate: ~55M model (110MB weights) + optimizer (440MB) + activations (~4GB at batch=12)
# Total: ~5-6GB. T4 (16GB) and A100 (40/80GB) both have plenty of headroom.
if vram_gb >= 10:
    print(f'VRAM OK: {vram_gb:.1f} GB (S2 needs ~6GB)')
else:
    print(f'WARNING: Only {vram_gb:.1f} GB VRAM. S2 needs ~6GB. May OOM at batch=12.')
    print('Consider reducing batch size: os.environ["BATCH_SIZE"] = "8"')

# Time estimate: the first GPU family whose tag appears in the device name
# wins; the order matches the original T4 -> A100 -> V100 checks.
TIME_ESTIMATES = (
    ('T4', ('\nEstimated time: ~2-3 hours (both models)',
            'Standard: ~45-60 min, Wave: ~1.5-2 hrs')),
    ('A100', ('\nEstimated time: ~55 min (both models)',)),
    ('V100', ('\nEstimated time: ~1.5-2 hours (both models)',)),
)
estimate_lines = next(
    (lines for tag, lines in TIME_ESTIMATES if tag in gpu_name),
    (f'\nUnknown GPU: {gpu_name}. Time varies.',),
)
print('\n'.join(estimate_lines))

In [None]:
# Cell 3: Run S2 benchmark (both Standard and Wave)
import os
os.environ['SCALE'] = 'S2'              # S2 only (55M params, 50M tokens)
os.environ['DATASET'] = '103'           # WikiText-103 (103M tokens, avoids 19x repetition)
os.environ['MONITOR'] = '0'             # Skip monitor for speed on Colab
# os.environ['DATASET'] = 'owt'         # Uncomment to use OpenWebText instead
# os.environ['BATCH_SIZE'] = '8'        # Uncomment if OOM on low-VRAM GPUs
# os.environ['MODEL'] = 'wave'          # Uncomment to run Wave only (skip Standard)

!python benchmarks/benchmark_scaling.py

In [None]:
# Cell 4: Results analysis
# Loads the benchmark results JSON, prints a per-run summary, and compares the
# Wave/Standard perplexity gap against the S1 baseline gap.
import json

# S1 baseline gap (Wave PPL / Standard PPL) that the S2 gap is judged against.
S1_GAP = 1.34

with open('results/scaling_s2.json', encoding='utf-8') as f:
    data = json.load(f)

print('=' * 60)
print('  S2 BENCHMARK RESULTS (55M params, 50M tokens)')
print(f'  Dataset: {data["metadata"].get("dataset", "unknown")}')
print('=' * 60)

# Runs keyed by name; not used again in this cell, kept for interactive lookup.
results = {}
for r in data['results']:
    results[r['run_name']] = r
    print(f"\n  {r['run_name']}")
    print(f"    PPL:    {r['best_ppl']:.2f}")
    print(f"    Acc:    {r['best_acc']:.2f}%")
    print(f"    Params: {r['params']:,}")
    print(f"    Speed:  {r['tokens_per_sec']:,} tok/s")
    print(f"    Time:   {r['total_time_s']:.0f}s ({r['total_time_s']/60:.0f} min)")
    print(f"    Epochs: {r.get('epochs', '?')}")

# Gap analysis: pick the first run whose name mentions each model; skipped
# when either one is absent from the results.
wave_r = next((r for r in data['results'] if 'SPECTRE' in r['run_name']), None)
std_r = next((r for r in data['results'] if 'Standard' in r['run_name']), None)
if wave_r and std_r:
    std_ppl = std_r['best_ppl']
    wave_ppl = wave_r['best_ppl']
    gap = wave_ppl / std_ppl
    # Compare at the two-decimal precision that is printed. Comparing the raw
    # float could report e.g. "Gap widened from 1.34x to 1.34x" and made the
    # "Same gap" branch effectively unreachable.
    gap_2dp = round(gap, 2)
    print(f'\n  GAP: {wave_ppl:.1f} / {std_ppl:.1f} = {gap:.2f}x')
    print(f'  S1 gap was {S1_GAP:.2f}x (Colab T4) -- does scaling help?')
    if gap_2dp < S1_GAP:
        print(f'  YES! Gap narrowed from {S1_GAP:.2f}x to {gap:.2f}x at S2 scale.')
    elif gap_2dp > S1_GAP:
        print(f'  NO. Gap widened from {S1_GAP:.2f}x to {gap:.2f}x at S2 scale.')
    else:
        print(f'  Same gap at S2 scale.')

In [None]:
# Cell 5: Download results (optional)
# Hands the results JSON to Colab's download helper; the import only resolves
# inside a Google Colab runtime.
from google.colab import files

results_path = 'results/scaling_s2.json'
files.download(results_path)