# Wave Field LLM — S3 Benchmark (100M params, 100M tokens)

V4.3.7 SPECTRE-Wave on Colab T4 (16GB). Optimized for free tier.

| | S1 | S2 | **S3 (this)** |
|---|---|---|---|
| Params | 22M | 55M | **100M** |
| Tokens | 20M | 50M | **100M** |
| T4 time | 25 min | 2.5 hrs | **~2.5 hrs** |

**Optimizations:** batch=20 (the T4 has 16 GB of VRAM versus 6 GB on the 3060), checkpoints saved locally with a Drive backup every 15 min, and the training monitor turned off.

**If disconnected:** Re-run all cells. Training auto-resumes from the checkpoint backed up on Drive.

In [None]:
# Cell 1: Setup everything (Drive + repo + deps + GPU check)
from google.colab import drive
drive.mount('/content/drive')

import os, shutil, subprocess

# Drive dirs for persistence
DRIVE_DIR = '/content/drive/MyDrive/wave-field-llm'
DRIVE_CKPT = os.path.join(DRIVE_DIR, 'checkpoints')
DRIVE_CACHE = os.path.join(DRIVE_DIR, 'cache')
os.makedirs(DRIVE_CKPT, exist_ok=True)
os.makedirs(DRIVE_CACHE, exist_ok=True)

# Clone / update repo
if not os.path.isdir('/content/wave-field-llm'):
    !git clone https://github.com/Pankh-AI/wave-field-llm.git /content/wave-field-llm
else:
    !cd /content/wave-field-llm && git pull --ff-only
%cd /content/wave-field-llm
!pip install -q tokenizers datasets

# Version check
import re
import sys; sys.path.insert(0, '.')
from src import __version__
print(f'\nV{__version__}')


def _version_tuple(version):
    """Return the first dotted-numeric run in *version* as a tuple of ints.

    Examples: '4.3.7' -> (4, 3, 7), '4.10.0' -> (4, 10, 0),
    '4.3.7-dev' -> (4, 3, 7). Returns an empty tuple when the string
    contains no digits, which compares lower than any real version.
    """
    match = re.search(r'\d+(?:\.\d+)*', str(version))
    if match is None:
        return ()
    return tuple(int(part) for part in match.group().split('.'))


# Compare numerically: as plain strings '4.10.0' sorts before '4.3.7',
# so a lexicographic check would wrongly reject newer releases.
assert _version_tuple(__version__) >= (4, 3, 7), f'Need V4.3.7+, got {__version__}'

# GPU: stop immediately when the runtime has no CUDA device; otherwise report
# the device name and the total memory of device 0 in decimal GB (bytes / 1e9).
import torch

assert torch.cuda.is_available(), 'No GPU! Runtime > Change runtime type > T4'
_gpu_props = torch.cuda.get_device_properties(0)
gpu = torch.cuda.get_device_name()
vram = _gpu_props.total_memory / 1e9
print(f'{gpu} — {vram:.1f} GB VRAM')

# Optimal batch size for available VRAM
# S3: 100M params. With gradient checkpointing:
#   batch=20 uses ~7-8GB, batch=24 uses ~9-10GB, batch=28 uses ~11-12GB
# Each row is (minimum VRAM in GB, batch size); the first row whose minimum
# is met wins, and anything below the last row falls back to batch 8.
_BATCH_BY_MIN_VRAM = (
    (36, 48),   # A100 40/80GB
    (14, 20),   # T4 16GB
    (10, 12),   # various 12GB GPUs
)
BATCH = next((batch for min_gb, batch in _BATCH_BY_MIN_VRAM if vram >= min_gb), 8)

# 512 is presumably the sequence length used by the benchmark — TODO confirm.
_tokens_per_step = BATCH * 512
# Unrounded step count for the 100M-token budget; the 0.4 / 0.6 factors below
# look like seconds-per-step bounds — TODO confirm.
_steps_exact = 100_000_000 / _tokens_per_step

print(f'Batch size: {BATCH} (auto-selected for {vram:.0f}GB)')
print(f'Tokens/step: {_tokens_per_step:,} — steps: {100_000_000 // _tokens_per_step:,}')
print(f'Estimated time: ~{_steps_exact * 0.4 / 60:.0f}-{_steps_exact * 0.6 / 60:.0f} min')

In [None]:
# Cell 2: Prepare local dirs + restore Drive cache
import os, shutil

# Local dirs (fast SSD)
for _local_dir in ('results/checkpoints', 'results/cache', 'results/data', 'results/monitor'):
    os.makedirs(_local_dir, exist_ok=True)

# Restore tokenizer cache from Drive (avoids re-tokenizing 133M tokens)
for f in os.listdir(DRIVE_CACHE):
    src = os.path.join(DRIVE_CACHE, f)
    dst = os.path.join('results/cache', f)
    # copy2 only handles regular files, so skip any sub-folder found on Drive
    if os.path.isfile(src) and not os.path.exists(dst):
        print(f'Restoring cache: {f}')
        shutil.copy2(src, dst)

# Restore checkpoint from Drive for resume (never overwrites a local copy)
resumed = False
for f in ['spectre-wave_s3_resume.pt', 'spectre-wave_s3.pt']:
    src = os.path.join(DRIVE_CKPT, f)
    dst = os.path.join('results/checkpoints', f)
    if os.path.exists(src) and not os.path.exists(dst):
        print(f'Restoring checkpoint: {f} ({os.path.getsize(src)/1e6:.0f} MB)')
        shutil.copy2(src, dst)
        resumed = True

# Report the resume state from what is actually on local disk. `resumed` is
# also True when only spectre-wave_s3.pt was restored, and it is False when
# this cell is re-run with the resume file already present locally, so it
# cannot be used to decide whether the resume checkpoint can be loaded.
_resume_path = 'results/checkpoints/spectre-wave_s3_resume.pt'
if os.path.exists(_resume_path):
    import torch
    # weights_only=False unpickles arbitrary objects; acceptable here only
    # because the file comes from the user's own Drive / local training run.
    ckpt = torch.load(_resume_path, map_location='cpu', weights_only=False)
    print(f'\nResuming from step {ckpt["step"]}, {ckpt["tokens_seen"]/1e6:.1f}M tokens, best PPL {ckpt["best_ppl"]:.2f}')
    del ckpt
elif os.path.exists('results/checkpoints/spectre-wave_s3.pt'):
    print('Found spectre-wave_s3.pt but no resume checkpoint (spectre-wave_s3_resume.pt)')
else:
    print('Fresh start (no checkpoint on Drive)')

In [None]:
# Cell 3: Train S3 with Drive backup thread
import os, shutil, threading, time

# Background thread: copy checkpoints to Drive every 15 min
# Training saves locally (fast), we backup to Drive periodically (slow but non-blocking)
_stop_backup = threading.Event()


def _copy_atomic(src, dst):
    """Copy *src* to *dst* through a temporary sibling file.

    The data is first written to ``dst + '.tmp'`` and then renamed over
    *dst*. If the copy is interrupted (e.g. the runtime disconnects), the
    previous *dst* is left untouched instead of being truncated.
    """
    tmp = dst + '.tmp'
    shutil.copy2(src, tmp)
    os.replace(tmp, dst)


def _backup_file(src, dst):
    """Best-effort backup of one file: report a failure instead of raising."""
    try:
        _copy_atomic(src, dst)
    except OSError as exc:
        print(f'[drive-backup] could not copy {src}: {exc}')


def _backup_once():
    """Run one backup pass: S3 checkpoints always, cache files only if missing."""
    for f in os.listdir('results/checkpoints'):
        if f.startswith('spectre-wave_s3'):
            _backup_file(os.path.join('results/checkpoints', f),
                         os.path.join(DRIVE_CKPT, f))
    # Also backup tokenizer cache (one-time, small)
    for f in os.listdir('results/cache'):
        dst = os.path.join(DRIVE_CACHE, f)
        if not os.path.exists(dst):
            _backup_file(os.path.join('results/cache', f), dst)


def _drive_backup_loop():
    """Back up to Drive every 15 minutes until `_stop_backup` is set."""
    # Event.wait() returns True as soon as the event is set, which ends the
    # loop without running another pass.
    while not _stop_backup.wait(900):  # 15 min
        try:
            _backup_once()
        except Exception as exc:
            # Thread boundary: keep the loop alive for the next interval
            # rather than letting the backup thread die unnoticed.
            print(f'[drive-backup] pass failed: {exc}')


backup_thread = threading.Thread(target=_drive_backup_loop, daemon=True)
backup_thread.start()
print('Drive backup thread started (every 15 min)')

# Run training
os.environ['SCALE'] = 'S3'
os.environ['MODEL'] = 'wave'
os.environ['DATASET'] = '103'
os.environ['RESUME'] = '1'
os.environ['MONITOR'] = '0'            # OFF for speed (saves I/O + ~5% GPU)
os.environ['BATCH_SIZE'] = str(BATCH)   # T4-optimized batch size

!python benchmarks/benchmark_scaling.py

# Stop backup thread
_stop_backup.set()

# Final Drive save (training complete)
print('\nSaving final checkpoints to Drive...')
for f in os.listdir('results/checkpoints'):
    if 's3' in f:
        shutil.copy2(os.path.join('results/checkpoints', f), os.path.join(DRIVE_CKPT, f))
        print(f'  Saved: {f}')
for f in os.listdir('results/cache'):
    dst = os.path.join(DRIVE_CACHE, f)
    if not os.path.exists(dst):
        shutil.copy2(os.path.join('results/cache', f), dst)
# Copy results JSON
for f in ['scaling_s3.json', 'scaling_benchmark.json']:
    src = os.path.join('results/data', f)
    if os.path.exists(src):
        shutil.copy2(src, os.path.join(DRIVE_DIR, f))
        print(f'  Saved: {f}')
print('Done! Results on Drive.')

In [None]:
# Cell 4: Results
import json, os

# Only the first results file that exists is reported: the S3-specific file
# is tried first, then the combined benchmark file.
_result_candidates = ('results/data/scaling_s3.json', 'results/data/scaling_benchmark.json')
_results_path = next((p for p in _result_candidates if os.path.exists(p)), None)

if _results_path is not None:
    with open(_results_path) as fh:
        data = json.load(fh)
    _rule = '=' * 60
    print(_rule)
    _meta = data['metadata']
    print(f'  S3 RESULTS — {_meta.get("dataset", "?")} — {_meta.get("gpu", "?")}')
    print(_rule)
    for run in data['results']:
        print(f'\n  {run["run_name"]}')
        print(f'    PPL: {run["best_ppl"]:.2f}  Acc: {run["best_acc"]:.1f}%  Params: {run["params"]:,}')
        print(f'    Speed: {run["tokens_per_sec"]:,} tok/s  Time: {run["total_time_s"]/60:.0f} min')
        # Optional per-checkpoint learning curve; the table is skipped when
        # the key is absent or the list is empty.
        points = run.get('curve', [])
        if not points:
            continue
        print(f'\n    {"Tokens":>8} {"PPL":>8} {"Acc":>7}')
        for point in points:
            print(f'    {point["tokens_M"]:>6.1f}M {point["ppl"]:>8.2f} {point["acc"]:>6.1f}%')
else:
    # No results file — fall back to progress stored in the resume checkpoint
    ckpt_path = 'results/checkpoints/spectre-wave_s3_resume.pt'
    if not os.path.exists(ckpt_path):
        print('No results or checkpoint found. Run Cell 3 first.')
    else:
        import torch
        ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=False)
        # BATCH comes from Cell 1; progress is measured against 100M tokens.
        total_steps = 100_000_000 // (BATCH * 512)
        pct = ckpt['tokens_seen'] / 100e6 * 100
        print(f'Training incomplete ({pct:.0f}% done)')
        print(f'  Step {ckpt["step"]}/{total_steps} | {ckpt["tokens_seen"]/1e6:.1f}M/100M tokens')
        print(f'  Best PPL: {ckpt["best_ppl"]:.2f} | Best Acc: {ckpt["best_acc"]:.1f}%')
        print(f'  Re-run Cell 3 to continue.')

In [None]:
# Cell 5: Verify results (run after training completes)
!python tests/test_causality.py
print('\n' + '='*60)
!python tests/verify_results.py --scale S3 --dataset 103 --skip-generation

In [None]:
# Cell 6: Download results
import shutil, os
from google.colab import files

# For every result file that exists locally: copy it to Drive first, then
# trigger a browser download of the local copy. Missing files are skipped.
_result_names = ('scaling_s3.json', 'scaling_benchmark.json', 'verification_s3.json')
for name in _result_names:
    local_path = os.path.join('results/data', name)
    if not os.path.exists(local_path):
        continue
    shutil.copy2(local_path, os.path.join(DRIVE_DIR, name))
    files.download(local_path)
    print(f'Saved + downloaded: {name}')