# DBA Behavioral Benchmark Runner

This notebook runs the behavioral benchmark suite on Google Colab's free GPU.

**Setup:**
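1. Checkpoints (`.pt` files) should be in Google Drive under the checkpoint path set in the Configuration cell
2. Results will be saved back to Drive, in a timestamped subfolder of the configured results path
3. Optionally sends a webhook notification on completion (email notification is currently a placeholder that only prints a message)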

In [None]:
#@title Configuration { display-mode: "form" }
# User-tunable settings for the whole notebook. Every later cell reads these
# module-level names, so this cell must be run first.
#@markdown ### Google Drive Paths
# Directory searched recursively for `*.pt` checkpoint files.
DRIVE_CHECKPOINT_DIR = "/content/drive/MyDrive/DBA/checkpoints/100k" #@param {type:"string"}
# Parent directory for results; each run creates a `behavioral_<timestamp>` subfolder here.
DRIVE_RESULTS_DIR = "/content/drive/MyDrive/DBA/results" #@param {type:"string"}

#@markdown ### Benchmark Settings
# Forwarded to the benchmark CLI as --tests-per-category, --seed and --max-new-tokens.
TESTS_PER_CATEGORY = 30 #@param {type:"integer"}
SEED = 42 #@param {type:"integer"}
MAX_NEW_TOKENS = 50 #@param {type:"integer"}

#@markdown ### Notification (optional)
# Leave empty to disable. The email path currently only prints a placeholder message.
NOTIFY_EMAIL = "" #@param {type:"string"}
# Leave empty to disable. When set, a JSON payload with a "text" field is POSTed to this URL.
NOTIFY_WEBHOOK = "" #@param {type:"string"}

#@markdown ### GitHub Repo
# "<owner>/<name>" of the repository on github.com, and the branch to check out.
GITHUB_REPO = "theapemachine/caramba" #@param {type:"string"}
GITHUB_BRANCH = "main" #@param {type:"string"}

In [None]:
#@title 1. Check GPU and Mount Drive
import torch

# Report the PyTorch build and whether a CUDA device is visible to it.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    # Device index 0 is the only device inspected.
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")
    gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"Memory: {gpu_total_bytes / 1e9:.1f} GB")

# Mount Google Drive so later cells can read checkpoints and write results.
from google.colab import drive
drive.mount('/content/drive')
print("\nDrive mounted successfully!")

In [None]:
#@title 2. Install Dependencies
!pip install -q tiktoken pyyaml
print("Dependencies installed!")

In [None]:
#@title 3. Clone/Update Repository
import os
import subprocess
from pathlib import Path

REPO_DIR = Path("/content/caramba")
REPO_URL = f"https://github.com/{GITHUB_REPO}.git"


def _run_git(*args, cwd=None):
    """Run `git <args>`, echo its output, and raise RuntimeError on failure.

    Arguments are passed as a list (no shell), so form values such as the
    branch name cannot be interpreted as shell syntax.
    """
    proc = subprocess.run(["git", *args], cwd=cwd, capture_output=True, text=True)
    for stream in (proc.stdout, proc.stderr):
        if stream.strip():
            print(stream.strip())
    if proc.returncode != 0:
        raise RuntimeError(
            f"git {' '.join(args)} failed with exit code {proc.returncode}"
        )


if (REPO_DIR / ".git").exists():
    print("Updating existing repo...")
    # Re-point the remote in case GITHUB_REPO was changed since the last run.
    _run_git("remote", "set-url", "origin", REPO_URL, cwd=REPO_DIR)
    # The initial clone is shallow and single-branch, so a plain `git fetch`
    # never creates origin/<other-branch>. Fetch the requested branch
    # explicitly and reset to what was just fetched.
    _run_git("fetch", "--depth", "1", "origin", GITHUB_BRANCH, cwd=REPO_DIR)
    _run_git("reset", "--hard", "FETCH_HEAD", cwd=REPO_DIR)
else:
    print("Cloning repo...")
    _run_git("clone", "--depth", "1", "-b", GITHUB_BRANCH, REPO_URL, str(REPO_DIR))

# Only reached when every git command above succeeded.
print(f"\nRepo ready at {REPO_DIR}")

In [None]:
#@title 4. Discover Checkpoints
from pathlib import Path
import re

checkpoint_dir = Path(DRIVE_CHECKPOINT_DIR)

if not checkpoint_dir.exists():
    raise FileNotFoundError(f"Checkpoint directory not found: {checkpoint_dir}")

# Find all .pt files (recursively). Sorting makes the order passed to the
# benchmark reproducible; rglob yields entries in filesystem order.
checkpoints = sorted(p for p in checkpoint_dir.rglob("*.pt") if p.is_file())

# Fail here rather than letting the benchmark start with no checkpoint files.
if not checkpoints:
    raise FileNotFoundError(f"No .pt checkpoints found under: {checkpoint_dir}")

print(f"Found {len(checkpoints)} checkpoint(s):")
for ckpt in checkpoints:
    size_mb = ckpt.stat().st_size / 1e6
    print(f"  - {ckpt.name} ({size_mb:.0f} MB)")

In [None]:
#@title 5. Run Benchmark
import os
import sys
from datetime import datetime
from pathlib import Path

# Set up paths
os.chdir("/content/caramba/research/dba")
sys.path.insert(0, "/content/caramba")

# Create output directory
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = Path(DRIVE_RESULTS_DIR) / f"behavioral_{timestamp}"
output_dir.mkdir(parents=True, exist_ok=True)

# Build checkpoint file list
ckpt_args = " ".join([f'"{str(c)}"' for c in checkpoints])

print(f"Output directory: {output_dir}")
print(f"Tests per category: {TESTS_PER_CATEGORY}")
print(f"Seed: {SEED}")
print("\n" + "="*70)
print("Starting benchmark...")
print("="*70 + "\n")

# Run the benchmark
!PYTHONPATH=/content/caramba python -m behavioral_suite_v2.multi_checkpoint_eval \
    --checkpoint-files {ckpt_args} \
    --output-dir "{output_dir}" \
    --tests-per-category {TESTS_PER_CATEGORY} \
    --seed {SEED} \
    --max-new-tokens {MAX_NEW_TOKENS} \
    --device cuda \
    --no-browser \
    --verbose

print("\n" + "="*70)
print("Benchmark complete!")
print(f"Results saved to: {output_dir}")
print("="*70)

In [None]:
#@title 6. Send Notification (Optional)
import json
import urllib.request
import urllib.parse
from pathlib import Path

# Upper bound on how long the cell may block on an unresponsive webhook.
WEBHOOK_TIMEOUT_S = 30

# Load results summary
results_file = output_dir / "results.json"
summary = ""
if results_file.exists():
    with open(results_file, encoding="utf-8") as f:
        results = json.load(f)

    # Tolerate a results file without a 'summaries' section instead of
    # raising KeyError; affected models are reported with default values.
    model_summaries = results.get('summaries', {})

    summary = "**DBA Benchmark Results**\n\n"
    for model_id in results.get('model_ids', []):
        s = model_summaries.get(model_id, {})
        summary += f"**{model_id}**\n"
        summary += f"  - Content Match: {s.get('content_match_rate', 0)*100:.1f}%\n"
        summary += f"  - Avg Score: {s.get('soft_score_avg', 0):.2f}\n"
        summary += f"  - Rep Loops: {s.get('repetition_loops', 0)}\n\n"

# Send webhook notification (best effort: failures are reported, not raised)
if NOTIFY_WEBHOOK:
    try:
        payload = {
            "text": f"Benchmark complete!\n\n{summary}\nResults: {output_dir}"
        }
        body = json.dumps(payload).encode('utf-8')
        req = urllib.request.Request(NOTIFY_WEBHOOK, data=body, headers={'Content-Type': 'application/json'})
        # The timeout keeps a dead endpoint from hanging the notebook; the
        # context manager closes the HTTP response. urlopen raises on HTTP
        # error statuses, so reaching the print means the POST was accepted.
        with urllib.request.urlopen(req, timeout=WEBHOOK_TIMEOUT_S):
            pass
        print("Webhook notification sent!")
    except Exception as e:
        print(f"Failed to send webhook: {e}")

# Email notification is a placeholder: nothing is sent, a message is printed.
if NOTIFY_EMAIL:
    try:
        from google.colab import output
        # This will prompt for Gmail auth if not already done
        print(f"Email notification would be sent to: {NOTIFY_EMAIL}")
        print("(Email sending requires additional setup with Gmail API)")
    except Exception as e:
        print(f"Email notification not available: {e}")

print("\n" + summary)

In [None]:
#@title 7. Display Results Summary
from IPython.display import HTML, display
from pathlib import Path

# Report whether the run left an HTML dashboard in the output directory.
dashboard_file = output_dir / "report.html"
if not dashboard_file.exists():
    print("No dashboard file found.")
else:
    print(f"Dashboard saved to: {dashboard_file}")
    print("\nYou can download it from Google Drive and open locally.")

# Enumerate every entry in the output directory, sorted by path, with its
# size converted from bytes to units of 1024 bytes.
print("\nOutput files:")
for entry in sorted(output_dir.iterdir()):
    entry_kb = entry.stat().st_size / 1024
    print(f"  - {entry.name} ({entry_kb:.1f} KB)")