# Quanta Maths - Out of Bounds experiments

Using the 6-digit model PhilipQuirke/QuantaMaths_add_d6_l2_h3_t20K_s173289, this Colab notebook tests the following question formats:

- trained_padding: 6-digit questions in expected format
- no_operand_padding: questions with shorter operands (up to 4 and 3 digits) written without zero padding
- shorter_with_padding: 5-digit questions in expected format
- longer_questions1: 9-digit questions without zero padding
- longer_questions2: 9-digit questions with zero padding

# Load libraries

In [None]:
!pip install huggingface_hub transformer_lens

In [None]:
from transformer_lens import HookedTransformerConfig, HookedTransformer
import torch
from huggingface_hub import hf_hub_download

In [None]:
import random
import pandas as pd

# Load 6 digit Model

In [None]:
# Hugging Face Hub repository holding the 6-digit model used in this notebook.
repo_id  = "PhilipQuirke/QuantaMaths_add_d6_l2_h3_t20K_s173289"
# File name of the checkpoint inside that repository.
model_pth_fname = "model.pth"  # or whatever your file is called

In [None]:
# Architecture hyper-parameters. They must be identical to the ones the
# checkpoint was trained with, otherwise the saved weights will not fit.
cfg = HookedTransformerConfig(
    # Depth and attention layout
    n_layers=2,
    n_heads=3,
    d_head=170,
    # Layer widths
    d_model=510,
    d_mlp=2040,
    # Token vocabulary size and maximum sequence length
    d_vocab=15,
    n_ctx=22,
    # Non-linearity and normalisation
    act_fn="gelu",
    normalization_type="LN",
    # Change to "cuda" to run on a GPU
    device="cpu",
)

# Build the network described by cfg.
model = HookedTransformer(cfg)

In [None]:
# Download the checkpoint file from the Hugging Face Hub.
model_path = hf_hub_download(repo_id=repo_id, filename=model_pth_fname)

# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# source you trust (consider weights_only=True where the installed PyTorch
# version supports it).
state_dict = torch.load(model_path, map_location="cpu")

# strict=False tolerates key mismatches between checkpoint and model. Report
# them so a mismatched checkpoint cannot silently leave weights uninitialised.
load_result = model.load_state_dict(state_dict, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(
        "Warning: checkpoint/model key mismatch - "
        f"missing keys: {list(load_result.missing_keys)}; "
        f"unexpected keys: {list(load_result.unexpected_keys)}"
    )

# Switch to inference mode (kept as the last expression of the cell).
model.eval()

In [None]:
# Vocabulary: the digit characters take ids 0-9, then the operator symbols
# '+', '-', '=', '*' take ids 10-13 and the backslash (used as the division
# token) takes id 14.
char_to_token = {
    symbol: token_id
    for token_id, symbol in enumerate("0123456789" "+-=*" "\\")
}

# Reverse lookup: token id -> character.
token_to_char = dict(zip(char_to_token.values(), char_to_token.keys()))

In [None]:
import torch

def encode_str(s):
    """Convert a string into a tensor of token ids with shape (1, len(s)).

    Every character is looked up in ``char_to_token``; a character that is
    not in that mapping raises ``KeyError``.
    """
    token_ids = list(map(char_to_token.__getitem__, s))
    # unsqueeze(0) adds the leading batch dimension of size 1.
    return torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)

def decode_tokens(t):
    """Turn an iterable of token ids back into the string they spell.

    Each element is converted with ``int`` before the ``token_to_char``
    lookup, so single-element tensors are accepted as well as plain ints.
    """
    characters = (token_to_char[int(token_id)] for token_id in t)
    return ''.join(characters)

# Experiments

In [None]:
def pad_left(s, width):
    """Left-pad ``s`` with '0' characters until it is ``width`` long.

    A string that is already ``width`` characters or longer comes back
    unchanged (it is never truncated).
    """
    missing = width - len(s)
    # A non-positive repeat count yields '', so long inputs pass through.
    return '0' * missing + s

def generate_sample(a_len, b_len, c_len, formatted=True, pad_operands=True):
    """Create one random addition question together with its expected answer.

    Args:
        a_len: Maximum number of digits of the first operand; also its
            zero-padded width when ``pad_operands`` is true.
        b_len: Same as ``a_len`` for the second operand.
        c_len: Minimum number of answer digits when ``formatted`` is true.
        formatted: If true, the answer digits are zero-padded to at least
            ``c_len`` digits; if false they are left unpadded. In both cases
            the answer is prefixed with "+".
        pad_operands: If true, the operands in the prompt are zero-padded to
            ``a_len`` and ``b_len`` digits.

    Returns:
        A tuple ``(prompt, c_str, full_str, a, b, c)`` where ``prompt`` is
        ``"<a>+<b>="``, ``c_str`` is the expected answer string,
        ``full_str`` is ``prompt + c_str`` and ``a``, ``b``, ``c`` are the
        integer operands and their sum.
    """
    # Draw a first, then b, so a seeded run reproduces the same samples.
    a = random.randint(0, 10**a_len - 1)
    b = random.randint(0, 10**b_len - 1)
    c = a + b

    a_str, b_str = str(a), str(b)
    if pad_operands:
        a_str = pad_left(a_str, a_len)
        b_str = pad_left(b_str, b_len)
    prompt = f"{a_str}+{b_str}="

    answer_digits = str(c)
    if formatted:
        # pad_left never truncates, so a sum with more than c_len digits
        # keeps all of its digits.
        answer_digits = pad_left(answer_digits, c_len)
    c_str = "+" + answer_digits

    return prompt, c_str, prompt + c_str, a, b, c

# Ten random samples for each test category.
# Every spec row reads: (category, a_len, b_len, c_len, pad_operands).
results = {
    category: [
        generate_sample(a_len, b_len, c_len, formatted=True, pad_operands=pad)
        for _ in range(10)
    ]
    for category, a_len, b_len, c_len, pad in (
        ("trained_padding", 6, 6, 7, True),
        ("no_operand_padding", 4, 3, 7, False),
        ("shorter_with_padding", 5, 5, 7, True),
        ("longer_questions1", 9, 9, 10, False),
        ("longer_questions2", 9, 9, 10, True),
    )
}

def format_test_df(category, samples):
    """Build a DataFrame for one test category.

    ``samples`` is a sequence of 6-tuples
    ``(prompt, expected, full_expected, a, b, c)``. The returned frame tags
    every row with ``category`` and leaves out the ``full_expected`` column.
    """
    sample_fields = ["prompt", "expected", "full_expected", "a", "b", "c"]
    kept_fields = ["category"] + [
        field for field in sample_fields if field != "full_expected"
    ]
    frame = pd.DataFrame(samples, columns=sample_fields)
    return frame.assign(category=category)[kept_fields]

# Stack the per-category frames into one table with a fresh running index.
df_all = pd.concat(
    [format_test_df(*category_and_samples) for category_and_samples in results.items()],
    ignore_index=True,
)


In [None]:
from tqdm import tqdm

# Greedy decoding helper built on the globally loaded `model`.
def manual_generate(expr, max_new_tokens=10):
    """Greedily generate ``max_new_tokens`` tokens that continue ``expr``.

    Args:
        expr: Prompt string; every character must be in the vocabulary
            understood by ``encode_str``.
        max_new_tokens: Number of tokens to generate.

    Returns:
        The generated continuation (without the prompt) as a string of
        exactly ``max_new_tokens`` characters.
    """
    model.eval()
    n_ctx = model.cfg.n_ctx
    context = encode_str(expr).to(model.cfg.device)

    # Generated ids are collected separately from the model context: the
    # context is clipped to the last n_ctx tokens, so slicing it by
    # len(expr) would drop generated tokens once the window starts sliding.
    generated = []
    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Keep only the most recent n_ctx tokens (no-op while shorter).
            context = context[:, -n_ctx:]
            # Logits of the last position -> most likely next token.
            next_token = model(context)[0, -1].argmax(dim=-1)
            generated.append(int(next_token))
            context = torch.cat([context, next_token.view(1, 1)], dim=1)

    return decode_tokens(generated)


In [None]:
def evaluate_model(df):
    """Run the model on every row of ``df`` and score each prediction.

    Args:
        df: DataFrame with the columns "prompt" (question string),
            "expected" (expected answer string), "c" (integer answer) and
            "category".

    Returns:
        A DataFrame with one row per input row and the columns "prompt",
        "expected", "predicted", "numeric_correct" (prediction parses to the
        integer ``c``), "format_correct" (prediction equals ``expected``
        exactly) and "category".
    """
    outputs = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        prompt = row["prompt"]
        expected = row["expected"]
        c = row["c"]

        # Generate exactly as many tokens as the expected answer contains.
        try:
            pred = manual_generate(prompt, max_new_tokens=len(expected))
        except Exception as e:
            # Best effort: record the failure and keep evaluating other rows.
            pred = f"<error: {str(e)}>"

        # Numeric check ignores the leading "+" and any zero padding.
        try:
            numeric_correct = (int(pred.lstrip("+")) == c)
        except ValueError:
            # Prediction is not a plain integer (e.g. an error marker or an
            # operator token in the output).
            numeric_correct = False

        format_correct = (pred == expected)

        outputs.append({
            "prompt": prompt,
            "expected": expected,
            "predicted": pred,
            "numeric_correct": numeric_correct,
            "format_correct": format_correct,
            "category": row["category"]
        })

    return pd.DataFrame(outputs)

In [None]:
# Run evaluation: generate and score a prediction for every sample in df_all.
results_df = evaluate_model(df_all)

In [None]:
# Print the whole results table: cell contents are not truncated, rows are
# not wrapped onto several lines, and no columns are hidden.
pd.options.display.max_colwidth = None
pd.options.display.expand_frame_repr = False
pd.options.display.max_columns = None
# An empty first item gives one blank line before the table.
print("", results_df, sep="\n")