# Setup

Verify environment, load Llama-3.1-8B (base), and run tokenization + probability-extraction sanity checks so later experiments aren't confounded by tokenizer quirks (e.g., `"X"` vs `" X"`).


In [None]:
# Notebook path setup: make repo imports work regardless of where you run this from.
# The repo root is taken to be the nearest directory, starting at the current
# working directory and walking upwards, that contains a `bayesian_llm` entry.
from pathlib import Path
import sys

cwd = Path.cwd().resolve()
# Check cwd first, then every ancestor (nearest first), so the notebook also works
# when launched from a folder nested more than one level below the repo root.
repo_candidates = [cwd, *cwd.parents]
repo_root = next((p for p in repo_candidates if (p / 'bayesian_llm').exists()), None)
if repo_root is None:
    raise RuntimeError(
        f"Could not find repo root from cwd={cwd}: "
        "no 'bayesian_llm' directory in it or in any parent directory."
    )

# Insert at the front of sys.path so this checkout is searched before other entries;
# the membership test keeps re-running the cell from adding duplicates.
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))

print('Repo root:', repo_root)


In [None]:
# Environment check (no installs happen automatically).
# Reports which required packages cannot be located and whether HF_TOKEN is set.
import importlib.util
import os
import pkgutil  # no longer used in this cell; kept in case other cells rely on the name
from pathlib import Path

REQUIRED = [
    'torch',
    'transformers',
    'accelerate',
    'huggingface_hub',
    'transformer_lens',
]

# importlib.util.find_spec looks a top-level package up without importing it and
# returns None when it cannot be found. It replaces pkgutil.find_loader, which is
# deprecated since Python 3.12 and removed in Python 3.14.
missing = [p for p in REQUIRED if importlib.util.find_spec(p) is None]
print('Missing packages:', missing if missing else 'None')

if missing:
    # Point at requirements.txt in the current directory, falling back to the parent
    # directory (the fallback path is printed without checking that it exists).
    req = Path('requirements.txt')
    if not req.exists():
        req = Path('../requirements.txt')
    print(f'Install with: pip install -r {req}')

# Only report whether the token is set; never print its value.
print('HF_TOKEN set:', bool(os.getenv('HF_TOKEN')))


In [None]:
# Config: checkpoint id, weight precision name, and device map for the model-loading cell
MODEL_ID = 'meta-llama/Llama-3.1-8B'  # base model
DTYPE = 'float16'  # change to 'bfloat16' if your GPU supports it
DEVICE_MAP = 'auto'

# Echo the settings so they are captured in the notebook output.
print(f'MODEL_ID: {MODEL_ID}')
print(f'DTYPE: {DTYPE}')
print(f'DEVICE_MAP: {DEVICE_MAP}')


In [None]:
# Load model + tokenizer (can take a while)
import torch

from bayesian_llm.llm import load_hf_causal_lm

# Map the DTYPE string from the config cell onto the torch dtype of the same name.
# Only these three names are accepted; anything else raises KeyError here.
dtype = {
    name: getattr(torch, name)
    for name in ('float16', 'bfloat16', 'float32')
}[DTYPE]

try:
    loaded = load_hf_causal_lm(
        MODEL_ID,
        torch_dtype=dtype,
        device_map=DEVICE_MAP,
    )
    model, tokenizer = loaded.model, loaded.tokenizer
    print('Loaded:', MODEL_ID)
    print('Vocab size:', tokenizer.vocab_size)
    # dtype/device are read from the first parameter only.
    print('Model dtype:', next(model.parameters()).dtype)
    print('First param device:', next(model.parameters()).device)
except Exception:
    # Show troubleshooting hints, then re-raise so the original traceback is kept.
    print('\n'.join((
        'Failed to load model.',
        'Common fixes:',
        '- Ensure you have accepted the model license on HuggingFace',
        '- Set env var HF_TOKEN to a valid token',
        '- If on CPU/MPS, expect this to be slow and may OOM',
    )))
    raise


In [None]:
# Tokenization sanity check for candidate answers
import pandas as pd

CANDIDATES = ['X', ' X', '\nX', 'Y', ' Y', '\nY', 'H', ' H', 'T', ' T']


def _token_report(text):
    """Return one table row describing how `tokenizer` encodes `text`."""
    # add_special_tokens=False: only the ids for the text itself are wanted.
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return {
        'string': repr(text),  # repr makes leading spaces/newlines visible
        'n_tokens': len(token_ids),
        'token_ids': token_ids,
        'decoded': tokenizer.decode(token_ids),
    }


rows = [_token_report(candidate) for candidate in CANDIDATES]

df = pd.DataFrame(rows)
display(df)
# List the candidates that encode to exactly one token.
print('Single-token variants:', df.loc[df['n_tokens'] == 1, 'string'].tolist())


In [None]:
# Next-token probability extraction sanity check
import torch
from bayesian_llm.llm import normalized_next_token_prob

prompt_abstract = (
    'Two random generators. Generator A: 50% X. Generator B: 75% X. '
    'Sequence: X X Y X X. '
    'Predict the next output (X or Y):'
)

# Each answer is supplied in three surface forms (leading space, bare, leading
# newline), matching the variants inspected in the tokenization check.
answer_variants = {
    'a_variants': [' X', 'X', '\nX'],
    'b_variants': [' Y', 'Y', '\nY'],
}

p_x = normalized_next_token_prob(model, tokenizer, prompt_abstract, **answer_variants)

print('P(next is X | {X,Y}):', round(p_x, 4))


In [None]:
# Ground-truth Bayes for the canonical A=0.5 vs B=0.75 task
from bayesian_llm.bayes import two_generator_posterior_predictive

# Same sequence as in the prompt (X X Y X X): four X's out of five draws.
observed = ['X', 'X', 'Y', 'X', 'X']
true_p = two_generator_posterior_predictive(
    n_x=observed.count('X'),
    n_total=len(observed),
)
print('True Bayes P(next is X):', round(true_p, 4))

# Signed difference: model estimate minus the Bayes value.
bayes_gap = p_x - true_p
print('LLM - Bayes error:', round(bayes_gap, 4))


In [None]:
# Minimal reproducibility footprint: print interpreter/library versions and GPU info
import platform

import torch  # imported here so this cell also runs on its own, without earlier cells
import transformers

print('Python:', platform.python_version())
print('Torch:', torch.__version__)
print('Transformers:', transformers.__version__)
print('CUDA available:', torch.cuda.is_available())
if torch.cuda.is_available():
    # Only device index 0 is reported.
    print('CUDA device:', torch.cuda.get_device_name(0))
