# Behavioral Tests

Quantify how closely the LLM's next-token probabilities match the normative Bayesian posterior predictive, and diagnose systematic deviations by fitting competing cognitive models (tempered Bayes, recency/leak, frequency, Markov/induction).


In [None]:
# Notebook path setup: make repo imports work regardless of where you run this from
from pathlib import Path
import sys

# Search the working directory and then each of its ancestors (nearest first)
# for the folder that contains the `bayesian_llm` package directory, so the
# notebook can be launched from the repo root, a notebooks/ folder, or any
# deeper subfolder.
cwd = Path.cwd().resolve()
repo_candidates = [cwd, *cwd.parents]
repo_root = next((p for p in repo_candidates if (p / 'bayesian_llm').exists()), None)
if repo_root is None:
    raise RuntimeError(f'Could not find repo root from cwd={cwd}.')

# Prepend only once so re-running this cell does not stack duplicate entries.
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))

print('Repo root:', repo_root)


In [None]:
# Imports + configuration
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

from bayesian_llm.bayes import two_generator_posterior_predictive
from bayesian_llm.data import make_sequence, permute_sequence, set_seed
from bayesian_llm.llm import load_hf_causal_lm, normalized_next_token_prob
from bayesian_llm.metrics import mae, pearson_r

# Seed via the project helper before anything stochastic runs.
# NOTE(review): which RNGs set_seed covers is not visible here - confirm.
set_seed(0)

# --- Experiment knobs -------------------------------------------------------
MODEL_ID = 'meta-llama/Llama-3.1-8B'  # checkpoint handed to load_hf_causal_lm
N_TOTAL = 10                          # number of tokens in every evidence sequence
N_PERMUTATIONS = 10                   # shuffles per n_X count; increase for tighter order-variance estimates
TEMPERATURE = 1.0                     # forwarded to normalized_next_token_prob in the evidence sweep
PRINT_EXAMPLES = True                 # not read by any cell visible in this notebook section

# Echo the main settings so they are recorded next to the outputs.
for _label, _value in (
    ('MODEL_ID', MODEL_ID),
    ('N_TOTAL', N_TOTAL),
    ('N_PERMUTATIONS', N_PERMUTATIONS),
):
    print(f'{_label}:', _value)


In [None]:
# Load model (reuse this kernel for the whole notebook)
import torch

# Load the checkpoint once in half precision; later cells reuse `model` and
# `tokenizer` from this kernel.
dtype = torch.float16
loaded = load_hf_causal_lm(MODEL_ID, device_map='auto', torch_dtype=dtype)
model = loaded.model
tokenizer = loaded.tokenizer

# Report dtype/device as seen on the first parameter tensor.
first_param = next(model.parameters())
print('Loaded:', MODEL_ID)
print('dtype:', first_param.dtype)
print('device:', first_param.device)


In [None]:
# Prompt builders (controls for order/independence wording)

def prompt_two_generators(sequence_tokens, *, control: str = 'base'):
    """Build the abstract two-generator prompt for an observed token sequence.

    Args:
        sequence_tokens: iterable of string tokens; they are joined with single
            spaces to form the "Sequence: ..." part of the prompt.
        control: wording variant. 'base' states only the two generators,
            'independent' adds "Draws are independent.", and
            'order_irrelevant' additionally adds "Order does not matter."

    Returns:
        The prompt string, ending with 'Predict the next output (X or Y):'.

    Raises:
        ValueError: if `control` is not one of the three known variants.
    """
    observed = ' '.join(sequence_tokens)

    setup = 'Two random generators. Generator A: 50% X. Generator B: 75% X.'
    # Each variant is the shared setup sentence plus an optional clarification.
    variants = (
        ('base', ''),
        ('independent', ' Draws are independent.'),
        ('order_irrelevant', ' Draws are independent. Order does not matter.'),
    )
    extra = next((tail for name, tail in variants if control == name), None)
    if extra is None:
        raise ValueError(f'Unknown control={control}')

    return f'{setup}{extra} Sequence: {observed}. Predict the next output (X or Y):'

# Smoke check: render one prompt with the most explicit control wording.
example = prompt_two_generators('X X Y X X'.split(), control='order_irrelevant')
print(example)


In [None]:
# Evidence sweep with permutation (order) variance

# For each prompt control and each count n_X in 0..N_TOTAL, score
# N_PERMUTATIONS random orderings of the same tokens and summarise the spread.

controls = ['base', 'independent', 'order_irrelevant']
results = []
rng = np.random.default_rng(0)


def _score_orderings(base_seq, control):
    """Return the LLM's normalized P(X) for N_PERMUTATIONS shuffles of base_seq.

    Shuffles are drawn from the notebook-level `rng`, so the call order
    determines which permutations are used.
    """
    scores = []
    for _ in range(N_PERMUTATIONS):
        shuffled = permute_sequence(base_seq, rng=rng)
        scores.append(
            normalized_next_token_prob(
                model, tokenizer, prompt_two_generators(shuffled, control=control),
                a_variants=[' X', 'X', '\nX'],
                b_variants=[' Y', 'Y', '\nY'],
                temperature=TEMPERATURE,
            )
        )
    return np.asarray(scores)


for control in controls:
    for n_x in tqdm(range(N_TOTAL + 1), desc=f'sweep[{control}]'):
        base_seq = make_sequence(n_x=n_x, n_total=N_TOTAL, x='X', y='Y')
        true_p = two_generator_posterior_predictive(n_x=n_x, n_total=N_TOTAL)
        preds = _score_orderings(base_seq, control)

        record = {'control': control, 'n_X': n_x, 'true_bayes': true_p}
        record['llm_mean'] = float(preds.mean())
        # Sample std needs at least two observations; report 0.0 otherwise.
        record['llm_std'] = float(preds.std(ddof=1)) if len(preds) > 1 else 0.0
        lowest, highest = preds.min(), preds.max()
        record['llm_min'] = float(lowest)
        record['llm_max'] = float(highest)
        # Order drift: widest gap between any two orderings of the same counts.
        record['drift_max_minus_min'] = float(highest - lowest)
        results.append(record)

df = pd.DataFrame(results)
print('Rows:', len(df))
display(df.head(10))


In [None]:
# Plot: Bayes curve vs LLM mean (with permutation std as error bars)

fig, ax = plt.subplots(figsize=(9, 5))

# One error-bar series per prompt control; bar height is the permutation std.
for control in controls:
    curve = df.loc[df['control'] == control].sort_values('n_X')
    ax.errorbar(
        curve['n_X'], curve['llm_mean'], yerr=curve['llm_std'],
        marker='o', capsize=3, alpha=0.8,
        label=f'LLM mean ({control})',
    )

# true_bayes depends only on n_X, so the 'base' rows supply the reference curve.
base = df.loc[df['control'] == 'base'].sort_values('n_X')
ax.plot(base['n_X'], base['true_bayes'], color='black', linewidth=2, label='True Bayes')

ax.set(
    title='Evidence sweep (A=0.5 vs B=0.75)',
    xlabel='n_X in N_TOTAL',
    ylabel='P(next is X | {X,Y})',
    ylim=(0, 1),
)
ax.grid(True, alpha=0.3)
ax.legend()
plt.show()

# Per-control agreement with Bayes and order sensitivity, lowest MAE first.
def _summarise_control(control):
    """Collect error and order-drift statistics for one prompt control."""
    sub = df[df.control == control]
    drift = sub.drift_max_minus_min
    return {
        'control': control,
        'MAE_to_Bayes': mae(sub.true_bayes, sub.llm_mean),
        'Pearson_r': pearson_r(sub.true_bayes, sub.llm_mean),
        'Mean_order_drift': float(drift.mean()),
        'Max_order_drift': float(drift.max()),
    }


rows = [_summarise_control(control) for control in controls]
summary = pd.DataFrame(rows).sort_values('MAE_to_Bayes')
display(summary)


In [None]:
# Semantic-prior control: compare X/Y vs H/T wording on the *same evidence*

def prompt_coins(sequence_tokens):
    """Build the coin-flavoured prompt (H/T wording) for an observed sequence.

    Args:
        sequence_tokens: iterable of string tokens, joined with single spaces.

    Returns:
        The prompt string, ending with 'Predict the next toss (H or T):'.
    """
    tosses = ' '.join(sequence_tokens)
    sentences = [
        'Two coins. Type A is fair (50% heads). Type B is biased (75% heads).',
        f'Sequence: {tosses}.',
        'Predict the next toss (H or T):',
    ]
    return ' '.join(sentences)

N_TOTAL_SEM = 10
N_PERM_SEM = 5
rng = np.random.default_rng(1)
rows = []

# Relabelling from the abstract tokens to coin faces (X = success = heads).
XY_TO_HT = {'X': 'H', 'Y': 'T'}

for n_h in tqdm(range(N_TOTAL_SEM + 1), desc='semantic_sweep'):
    base_seq_xy = ['X'] * n_h + ['Y'] * (N_TOTAL_SEM - n_h)

    true_p = two_generator_posterior_predictive(n_x=n_h, n_total=N_TOTAL_SEM)

    preds_xy = []
    preds_ht = []
    for _ in range(N_PERM_SEM):
        # Shuffle once and relabel, so both wordings see the same counts in the
        # same order. Shuffling each wording independently would mix order
        # effects into the wording comparison.
        seq_xy = list(permute_sequence(base_seq_xy, rng=rng))
        seq_ht = [XY_TO_HT[tok] for tok in seq_xy]

        # Pass TEMPERATURE explicitly, as the main evidence sweep does, so a
        # change to the config applies to both experiments.
        p_xy = normalized_next_token_prob(
            model, tokenizer, prompt_two_generators(seq_xy, control='order_irrelevant'),
            a_variants=[' X', 'X', '\nX'], b_variants=[' Y', 'Y', '\nY'],
            temperature=TEMPERATURE,
        )
        p_ht = normalized_next_token_prob(
            model, tokenizer, prompt_coins(seq_ht),
            a_variants=[' H', 'H', '\nH'], b_variants=[' T', 'T', '\nT'],
            temperature=TEMPERATURE,
        )
        preds_xy.append(p_xy)
        preds_ht.append(p_ht)

    xy_mean = float(np.mean(preds_xy))
    ht_mean = float(np.mean(preds_ht))
    rows.append({
        'n_success': n_h,
        'true_bayes': true_p,
        'llm_xy_mean': xy_mean,
        'llm_ht_mean': ht_mean,
        # Signed error: positive means the LLM over-predicts success.
        'xy_minus_bayes': float(xy_mean - true_p),
        'ht_minus_bayes': float(ht_mean - true_p),
    })

df_sem = pd.DataFrame(rows)
display(df_sem.head())

fig, ax = plt.subplots(figsize=(9, 5))

# Reference curve first, then the two wordings over the same success counts.
n_success = df_sem['n_success']
ax.plot(n_success, df_sem['true_bayes'], color='black', linewidth=2, label='True Bayes')
for column, marker, label in (
    ('llm_xy_mean', 'o', 'LLM (X/Y, order_irrelevant)'),
    ('llm_ht_mean', 'x', 'LLM (H/T wording)'),
):
    ax.plot(n_success, df_sem[column], marker=marker, label=label)

ax.set_title('Semantic control: abstract vs coin wording')
ax.set_xlabel('n_success in N_TOTAL_SEM')
ax.set_ylabel('P(next is success | {success,failure})')
ax.set_ylim(0, 1)
ax.grid(True, alpha=0.3)
ax.legend()
plt.show()

# Mean absolute error to the Bayes curve for each wording.
for tag, column in (('MAE(X/Y):', 'llm_xy_mean'), ('MAE(H/T):', 'llm_ht_mean')):
    print(tag, mae(df_sem['true_bayes'], df_sem[column]))


In [None]:
# Fit simple cognitive models to the LLM curve (diagnosis, not claiming ground truth)

# Fit target: the 'order_irrelevant' curve, chosen to reduce order artifacts.
sub = df.loc[df['control'] == 'order_irrelevant'].sort_values('n_X')
xs = sub['n_X'].to_numpy()       # evidence counts
ys = sub['llm_mean'].to_numpy()  # LLM mean P(X) at each count

# Model 1: Tempered Bayes (likelihood exponent λ)
def tempered_bayes_predict(n_x, n_total, lam, *, p_a=0.5, p_b=0.75, prior_a=0.5):
    """Predict P(next is X) under a two-generator Bayes model with tempered likelihood.

    The log-likelihood of each generator is scaled by `lam` before being
    combined with the prior; the predictive is then the posterior-weighted
    mix of the two generators' X rates. `lam=1` is ordinary Bayes, `lam=0`
    ignores the evidence and returns the prior-weighted mix.

    Args:
        n_x: number of X observations.
        n_total: total number of observations (Y count is n_total - n_x).
        lam: likelihood exponent λ.
        p_a: P(X) under generator A; must lie strictly between 0 and 1.
        p_b: P(X) under generator B; must lie strictly between 0 and 1.
        prior_a: prior probability of generator A; strictly between 0 and 1.

    Returns:
        The tempered posterior predictive probability that the next draw is X.

    Raises:
        ValueError: if p_a, p_b or prior_a is not strictly inside (0, 1).
    """
    for label, value in (('p_a', p_a), ('p_b', p_b), ('prior_a', prior_a)):
        if not 0.0 < value < 1.0:
            raise ValueError(f'{label} must be strictly between 0 and 1, got {value}')

    n_y = n_total - n_x

    def _loglik(p):
        # Bernoulli log-likelihood of the observed counts under X-rate p.
        return n_x * math.log(p) + n_y * math.log(1 - p)

    logpost_a = math.log(prior_a) + lam * _loglik(p_a)
    logpost_b = math.log(1 - prior_a) + lam * _loglik(p_b)

    # Subtract the max before exponentiating to avoid underflow on long sequences.
    shift = max(logpost_a, logpost_b)
    post_a = math.exp(logpost_a - shift)
    post_b = math.exp(logpost_b - shift)
    norm = post_a + post_b
    post_a /= norm
    post_b /= norm
    return p_a * post_a + p_b * post_b

# Model 2: Linear frequency heuristic: a*(n_x/n_total)+b clipped
def freq_predict(n_x, n_total, a, b):
    """Predict P(next is X) as a linear function of the observed X frequency.

    Args:
        n_x: number of X observations.
        n_total: total number of observations (must be non-zero).
        a: slope applied to the frequency n_x / n_total.
        b: intercept.

    Returns:
        a * (n_x / n_total) + b, clamped to the interval [0.0, 1.0].
    """
    raw = a * (n_x / n_total) + b
    # Clamp from below, then from above.
    floored = raw if raw > 0.0 else 0.0
    return floored if floored < 1.0 else 1.0

# Grid-search fit (kept simple / transparent)

def _mse_against_llm(predict_one):
    """Mean squared error between predict_one(n) over xs and the LLM means ys."""
    pred = np.array([predict_one(int(n)) for n in xs])
    return ((pred - ys) ** 2).mean()


best = {}

# Tempered Bayes: scan λ and keep the value with the lowest MSE.
lam_grid = np.linspace(0.1, 2.0, 40)
errs = [
    _mse_against_llm(lambda n, lam=float(lam): tempered_bayes_predict(n, N_TOTAL, lam))
    for lam in lam_grid
]
best_lam = float(lam_grid[int(np.argmin(errs))])
best['tempered_bayes'] = {'lam': best_lam, 'mse': float(min(errs))}

# Frequency: exhaustive scan over slope a and intercept b. A strict '<' keeps
# the first grid point reached (a-major order) when several tie.
a_grid = np.linspace(0.0, 1.5, 61)
b_grid = np.linspace(-0.25, 0.25, 81)
best_mse, best_ab = float('inf'), (None, None)
for a in map(float, a_grid):
    for b in map(float, b_grid):
        mse = float(_mse_against_llm(lambda n: freq_predict(n, N_TOTAL, a, b)))
        if mse < best_mse:
            best_mse, best_ab = mse, (a, b)
best['frequency'] = {'a': best_ab[0], 'b': best_ab[1], 'mse': best_mse}

print('Best fits:')
for model_name, fit in best.items():
    print('-', model_name, fit)

# Plot fits: reference Bayes curve, observed LLM means, and both fitted models.
fig, ax = plt.subplots(figsize=(9, 5))
ax.plot(xs, sub['true_bayes'].to_numpy(), color='black', linewidth=2, label='True Bayes')
ax.plot(xs, ys, marker='o', label='LLM mean (order_irrelevant)')

# Re-evaluate each model at its best grid point.
p_tb = np.array([tempered_bayes_predict(int(n), N_TOTAL, best_lam) for n in xs])
a, b = best_ab
p_fr = np.array([freq_predict(int(n), N_TOTAL, a, b) for n in xs])

for fitted, label in (
    (p_tb, f'Tempered Bayes (λ={best_lam:.2f})'),
    (p_fr, f'Frequency (a={a:.2f}, b={b:.2f})'),
):
    ax.plot(xs, fitted, linestyle='--', label=label)

ax.set_title('Diagnosing deviations: simple model fits')
ax.set_xlabel('n_X')
ax.set_ylabel('P(next is X | {X,Y})')
ax.set_ylim(0, 1)
ax.grid(True, alpha=0.3)
ax.legend()
plt.show()


In [None]:
# Takeaways for next steps (printed so you can iterate quickly)

# Lowest-MAE prompt control, reported as a plain dict.
ranked = summary.sort_values('MAE_to_Bayes')
best_control = ranked.iloc[0].to_dict()
print('Best prompt control (by MAE):', best_control)

# Where does the LLM violate the known upper/lower bounds (0.5..0.75) for this task?
# The predictive mixes generator rates 0.5 and 0.75, so it cannot leave that
# interval; a small tolerance absorbs floating-point noise.
sub = df[df.control == 'order_irrelevant'].sort_values('n_X')
report_cols = ['n_X', 'llm_mean']
too_high = sub['llm_mean'] > 0.75 + 1e-6
too_low = sub['llm_mean'] < 0.50 - 1e-6
viol_upper = sub.loc[too_high, report_cols].to_dict('records')
viol_lower = sub.loc[too_low, report_cols].to_dict('records')
print('Upper-bound violations (should be <=0.75):', viol_upper[:10])
print('Lower-bound violations (should be >=0.50):', viol_lower[:10])

print('Next notebook suggestion: use logit-lens + activation patching to see which layers/heads push probabilities beyond Bayes.')
