# Lab C.1: TransformerLens Setup - SOLUTIONS

This notebook contains solutions to all exercises from Lab C.1.

---

In [None]:
# Setup
import torch
import numpy as np
import matplotlib.pyplot as plt
from transformer_lens import HookedTransformer
import plotly.express as px
import gc

# Inference only: switch autograd off globally so forward passes do not track gradients.
torch.set_grad_enabled(False)

# Prefer the GPU, but fall back to CPU so the notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = HookedTransformer.from_pretrained("gpt2-small", device=device)

## Exercise 1: Different Prompts

Analyze different prompts: "The capital of Germany is", "The opposite of hot is", "Einstein developed the theory of"

In [None]:
# Solution: Analyze multiple prompts

prompts = [
    "The capital of Germany is",
    "The opposite of hot is",
    "Einstein developed the theory of"
]

for prompt in prompts:
    print(f"\n{'='*60}")
    print(f"Prompt: '{prompt}'")
    print(f"{'='*60}")

    # Tokenize and run. Only the logits are needed in this exercise, so call the
    # model directly rather than run_with_cache, which would store every
    # intermediate activation just to have it discarded.
    tokens = model.to_tokens(prompt)
    logits = model(tokens)

    # Next-token distribution at the final position of the prompt
    probs = torch.softmax(logits[0, -1, :], dim=-1)
    top_probs, top_indices = torch.topk(probs, 5)

    print("\nTop 5 predictions:")
    for rank, (prob, idx) in enumerate(zip(top_probs, top_indices), start=1):
        token = model.tokenizer.decode(idx.item())
        print(f"  {rank}. '{token}': {prob.item():.2%}")

# Discussion (completions we hope to see; check them against the printed output,
# since a small model may rank other tokens first):
# - "The capital of Germany is" -> ideally " Berlin"
# - "The opposite of hot is" -> ideally " cold"
# - "Einstein developed the theory of" -> ideally " relativity"

## Exercise 2: Attention Pattern Analysis

For the indirect object identification (IOI) prompt, find the heads whose final position attends most strongly to the first "John".

In [None]:
# Solution: Find heads attending to John

prompt = "John and Mary went to the store. John gave a book to"
tokens = model.to_tokens(prompt)
logits, cache = model.run_with_cache(tokens)

# Print tokens so the positions used below can be checked by eye
token_strs = model.to_str_tokens(tokens)
print("Tokens:")
for i, t in enumerate(token_strs):
    print(f"  {i}: '{t}'")

# Look the positions up from the token strings instead of hard-coding them:
# to_tokens() prepends a BOS token by default, so index 0 is BOS rather than
# "John", and a hard-coded 0 would measure attention to BOS.
john_positions = [i for i, t in enumerate(token_strs) if t.strip() == "John"]
if len(john_positions) < 2:
    raise ValueError(f"Expected two 'John' tokens in the prompt, found positions {john_positions}")
john_pos_1, john_pos_2 = john_positions[0], john_positions[1]  # first / second "John"
last_pos = -1   # Final position (the query whose attention we inspect)
print(f"\n'John' found at positions {john_pos_1} and {john_pos_2}")

# Attention paid by the last position to the first John, one row per layer
attention_to_john = np.zeros((model.cfg.n_layers, model.cfg.n_heads))

for layer in range(model.cfg.n_layers):
    pattern = cache["pattern", layer][0]  # [n_heads, seq, seq] after dropping the batch dim
    attention_to_john[layer, :] = pattern[:, last_pos, john_pos_1].detach().cpu().numpy()

# Visualize
fig = px.imshow(
    attention_to_john,
    labels={"x": "Head", "y": "Layer", "color": "Attention"},
    color_continuous_scale="Blues",
    title=f"Attention from last position to 'John' (position {john_pos_1})"
)
fig.show()

# Find top heads: sort the flattened grid, then map flat indices back to (layer, head)
print("\nTop 5 heads attending to first 'John':")
flat = attention_to_john.flatten()
top_idx = np.argsort(flat)[-5:][::-1]
for idx in top_idx:
    layer, head = divmod(idx, model.cfg.n_heads)
    print(f"  L{layer}H{head}: {attention_to_john[layer, head]:.3f}")

del cache

## Exercise 3: Layer Comparison

Compare logit-lens results for a math prompt ("1 + 1 =") and a factual prompt ("The Eiffel Tower is in").

In [None]:
# Solution: Compare different types of knowledge

def logit_lens_first_correct(model, prompt, expected_answer):
    """Run a logit lens at the prompt's final position and find where the answer first wins.

    The residual stream at the last token position is read at every depth,
    passed through the model's final layer norm, and projected to vocabulary
    logits with the unembedding (weights and bias).

    Args:
        model: Object exposing ``to_tokens``, ``run_with_cache``,
            ``to_single_token``, ``ln_final``, ``W_U``, ``b_U``,
            ``cfg.n_layers`` and ``tokenizer.decode`` (e.g. a HookedTransformer).
        prompt: Text whose next-token prediction is inspected.
        expected_answer: String passed to ``model.to_single_token`` to obtain
            the answer's token id.

    Returns:
        A tuple ``(results, first_correct)``. ``results`` has
        ``cfg.n_layers + 1`` dicts with keys ``'layer'``, ``'is_correct'``,
        ``'prob'`` and ``'top_pred'``. Entry 0 reads ``resid_pre`` of block 0;
        entry ``k > 0`` reads ``resid_post`` of block ``k - 1``.
        ``first_correct`` is the smallest ``'layer'`` whose top-1 token is the
        expected answer, or ``None`` if it is never top-1.
    """
    tokens = model.to_tokens(prompt)
    _, cache = model.run_with_cache(tokens)
    answer_token = model.to_single_token(expected_answer)

    results = []
    for layer in range(model.cfg.n_layers + 1):
        if layer == 0:
            resid = cache["resid_pre", 0][0, -1, :]
        else:
            resid = cache["resid_post", layer - 1][0, -1, :]

        resid_normed = model.ln_final(resid)
        # Include the unembedding bias: it can be non-zero (for example when
        # layer-norm parameters are folded into the unembedding at load time),
        # and without it the lens does not reproduce the model's own logits.
        logits_layer = resid_normed @ model.W_U + model.b_U
        probs = torch.softmax(logits_layer, dim=-1)

        # Check if correct answer is top-1
        top_token = logits_layer.argmax().item()
        results.append({
            'layer': layer,
            'is_correct': top_token == answer_token,
            'prob': probs[answer_token].item(),
            'top_pred': model.tokenizer.decode(top_token)
        })

    del cache

    # Earliest depth at which the expected answer is the top prediction
    first_correct = next((r['layer'] for r in results if r['is_correct']), None)

    return results, first_correct

# Prompt / expected-answer pairs for the two kinds of knowledge
math_prompt, math_answer = "1 + 1 =", " 2"
factual_prompt, factual_answer = "The Eiffel Tower is in", " Paris"

# Logit lens on each prompt
math_results, math_first = logit_lens_first_correct(model, math_prompt, math_answer)
factual_results, factual_first = logit_lens_first_correct(model, factual_prompt, factual_answer)

# Report the first depth at which each expected answer is the top prediction
print(f"Math prompt: '{math_prompt}'")
print(f"  Expected: '{math_answer}'")
print(f"  First correct at layer: {math_first}")

print(f"\nFactual prompt: '{factual_prompt}'")
print(f"  Expected: '{factual_answer}'")
print(f"  First correct at layer: {factual_first}")

# Per-depth probability of the expected answer (both runs share the same x-axis)
layers = [entry['layer'] for entry in math_results]
math_probs = [entry['prob'] for entry in math_results]
factual_probs = [entry['prob'] for entry in factual_results]

lens_fig, lens_ax = plt.subplots(figsize=(10, 5))
lens_ax.plot(layers, math_probs, 'b-o', label=f"Math: '{math_prompt}' → '{math_answer}'")
lens_ax.plot(layers, factual_probs, 'r-o', label=f"Factual: '{factual_prompt}' → '{factual_answer}'")
lens_ax.set_xlabel('Layer')
lens_ax.set_ylabel('Probability of Correct Answer')
lens_ax.set_title('Logit Lens: Math vs Factual Knowledge')
lens_ax.legend()
lens_ax.grid(True, alpha=0.3)
plt.show()

# NOTE: the conclusion below is fixed text and is not derived from the numbers above;
# compare it against the printed layers and the plot.
for conclusion_line in (
    "\nConclusion:",
    "Math knowledge often emerges later in the network (layers 8-10)",
    "Factual knowledge sometimes emerges earlier, depending on frequency in training data",
):
    print(conclusion_line)

## Challenge: Find Induction Heads (Preview)

Find heads that complete "Harry Potter...Harry" → " Potter"

In [None]:
# Challenge Solution: Find induction heads

prompt = "Harry Potter is a wizard. Harry"
tokens = model.to_tokens(prompt)
token_strs = model.to_str_tokens(tokens)

print("Tokens:")
for i, t in enumerate(token_strs):
    print(f"  {i}: '{t}'")

# Run model
logits, cache = model.run_with_cache(tokens)

# For induction, we look for heads that attend from the last position (the
# second "Harry") to the token that FOLLOWED the first "Harry", i.e. " Potter".
# Positions are looked up from the token strings instead of being hard-coded:
# to_tokens() prepends a BOS token by default, so the first "Harry" is not at
# index 0 and " Potter" is not at index 1.
harry_positions = [i for i, t in enumerate(token_strs) if t.strip() == "Harry"]
if len(harry_positions) < 2:
    raise ValueError(f"Expected two 'Harry' tokens in the prompt, found positions {harry_positions}")

harry_pos = harry_positions[0]     # First "Harry"
after_harry = harry_pos + 1        # Token right after it (" Potter")
last_harry = len(token_strs) - 1   # Last "Harry" (final position of the prompt)

print(f"\nLooking for attention from '{token_strs[last_harry]}' (pos {last_harry})")
print(f"to '{token_strs[after_harry]}' (pos {after_harry})")

# Attention from the last position to the token after the first Harry, one row per layer
induction_attention = np.zeros((model.cfg.n_layers, model.cfg.n_heads))

for layer in range(model.cfg.n_layers):
    pattern = cache["pattern", layer][0]  # [n_heads, seq, seq] after dropping the batch dim
    induction_attention[layer, :] = pattern[:, last_harry, after_harry].detach().cpu().numpy()

# Visualize
fig = px.imshow(
    induction_attention,
    labels={"x": "Head", "y": "Layer", "color": "Attention"},
    color_continuous_scale="YlOrRd",
    title="Induction-like Attention: Last 'Harry' → ' Potter'"
)
fig.show()

# Find top heads: sort the flattened grid, then map flat indices back to (layer, head)
print("\nTop 5 induction-like heads:")
flat = induction_attention.flatten()
top_idx = np.argsort(flat)[-5:][::-1]
for idx in top_idx:
    layer, head = divmod(idx, model.cfg.n_heads)
    print(f"  L{layer}H{head}: {induction_attention[layer, head]:.3f}")

# Verify prediction: probability the model assigns to " Potter" as the next token
probs = torch.softmax(logits[0, -1, :], dim=-1)
potter_token = model.to_single_token(" Potter")
print(f"\nProbability of ' Potter': {probs[potter_token].item():.2%}")

del cache

## Cleanup

In [None]:
# Collect Python garbage first, then ask PyTorch to release its unused cached
# CUDA memory; the order matches the original two calls.
for release_step in (gc.collect, torch.cuda.empty_cache):
    release_step()
print("Cleanup complete!")