# Explore Activations

The proposal/hypothesis is described in this document: https://docs.google.com/document/d/1x7n2iy1_LZXZNLQpxCzF84lZ8BEG6ZT3KWXC59erhJA

In [None]:
import torch
from transformer_lens import HookedTransformer

# 1. Load an open-source model (e.g., GPT-2 or Llama-3-8B if memory allows)
# The proposal suggests models that don't call tools [cite: 90]
# Fall back to CPU so the notebook still runs on machines without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = HookedTransformer.from_pretrained("gpt2-small", device=device)

# 2. Define your "Experiment Setting" prompts [cite: 16-22]
# Index 0 is the 'sum' prompt, index 1 is the 'product' prompt.
prompts = [
    "Answer minimally: Given the numbers 25 and 9 calculate the sum",
    "Answer minimally: Given the numbers 25 and 9 calculate the product",
]

# 3. Cache activations to identify Categorization Circuits [cite: 101]
# We want the activations *after* the last word [cite: 45]
# NOTE(review): both prompts are run as one batch, so position -1 is only the
# true last token of each prompt if they tokenize to the same length --
# presumably the case for these two prompts; confirm before adding others.
logits, cache = model.run_with_cache(prompts)

# 4. Extract activation vectors for the final token (the "Categorization" point)
# Shape: [batch, pos, d_model]
for i, prompt in enumerate(prompts):
    # Get the residual stream at the final layer for the last token
    final_resid = cache["resid_post", -1][i, -1, :]
    print(f"Prompt {i} activation vector head: {final_resid[:5]}")
# NOTE: after this loop `final_resid` holds the vector of the LAST prompt
# (index 1, 'product'), not the 'sum' prompt.

# 5. Mapping categories to responses via Activation Patching [cite: 105]
# Hypothetically: Swap the 'sum' activation into the 'product' forward pass
def patch_categorization(target_resid, hook):
    """Overwrite the last-position residual with the 'sum' prompt's vector.

    Forward-hook callback: receives the hooked activation and returns the
    (in-place modified) tensor that the model should continue with.

    Args:
        target_resid: Activation at the hook point; indexed here as
            [batch, pos, d_model]. Modified in place.
        hook: The hook point object passed by the caller; unused.

    Returns:
        ``target_resid`` with its last position replaced, for every batch row.
    """
    # Read the source vector explicitly from prompt 0 (the 'sum' prompt).
    # Relying on the `final_resid` loop variable would pick up whichever
    # prompt the extraction loop visited last (the 'product' prompt), which
    # would patch the product run with its own activation -- a no-op.
    sum_resid = cache["resid_post", -1][0, -1, :]
    # A [d_model] vector broadcasts across the batch dimension of the slice.
    target_resid[:, -1, :] = sum_resid
    return target_resid

# This allows you to see if the model now calculates 'sum' instead of 'product'
# Build the hook name from the model config instead of hard-coding layer 11,
# so the patch always targets the final layer -- the same layer that
# cache["resid_post", -1] reads from -- even if a different model is loaded.
final_layer_hook = f"blocks.{model.cfg.n_layers - 1}.hook_resid_post"
# Run only the 'product' prompt (index 1) with the patching hook attached.
patched_logits = model.run_with_hooks(
    prompts[1],
    fwd_hooks=[(final_layer_hook, patch_categorization)],
)