In [2]:
import os
import sys
import plotly.express as px
import torch as t
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path
import numpy as np
import einops
from jaxtyping import Int, Float
from typing import List, Optional, Tuple
import functools
from tqdm import tqdm
from IPython.display import display
import webbrowser
from transformer_lens.hook_points import HookPoint
from transformer_lens import utils, HookedTransformer, HookedTransformerConfig, FactoredMatrix, ActivationCache
import circuitsvis as cv

# from plotly_utils import plot_comp_scores, plot_logit_attribution, plot_loss_difference

# This notebook only runs inference, so autograd bookkeeping is switched off
# globally to save time and memory.
t.set_grad_enabled(False)

# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
device = t.device("cuda") if t.cuda.is_available() else t.device("cpu")


# Loading and Running Models

In [3]:
# Load the pretrained GPT-2 Small weights into a HookedTransformer.
# The device is passed explicitly so the model is placed on the same `device`
# that later cells use when they build tensors by hand; relying on the
# library's own default choice could leave the two on different devices.
gpt2_small: HookedTransformer = HookedTransformer.from_pretrained("gpt2-small", device=device)

Loaded pretrained model gpt2-small into HookedTransformer


In [7]:
# Inspect the `n_layers` field of the loaded model's config (shown as the cell output).
gpt2_small.cfg.n_layers

12

In [8]:
# Inspect the `n_heads` field of the loaded model's config (shown as the cell output).
gpt2_small.cfg.n_heads

12

In [9]:
# Inspect the `n_ctx` field of the loaded model's config (shown as the cell output).
# NOTE(review): presumably the maximum context length in tokens - confirm against the config docs.
gpt2_small.cfg.n_ctx

1024

In [10]:
# Sample passage fed to the model. It is kept verbatim on purpose: it is the
# model's input, so altering any character would alter the reported loss.
model_description_text = """## Loading Models

HookedTransformer comes loaded with >40 open source GPT-style models. You can load any of them in with `HookedTransformer.from_pretrained(MODEL_NAME)`. Each model is loaded into the consistent HookedTransformer architecture, designed to be clean, consistent and interpretability-friendly.

For this demo notebook we'll look at GPT-2 Small, an 80M parameter model. To try the model the model out, let's find the loss on this paragraph!"""

# Ask the model for its loss on the passage rather than for logits.
loss = gpt2_small(
    model_description_text,
    return_type="loss",
)
print("Model loss:", loss)

Model loss: tensor(4.3443)


In [18]:
# Same passage again, this time requesting logits instead of the loss.
logits: Tensor = gpt2_small(model_description_text, return_type="logits")

# Highest-scoring token at every position. The last position has no following
# token to compare against, so it is dropped.
prediction = logits.argmax(dim=-1).squeeze()[:-1]

# Targets are the input tokens shifted by one: the first token is never a
# prediction target, so it is dropped.
true_tokens = gpt2_small.to_tokens(model_description_text).squeeze()[1:]

# Count the positions where the top prediction equals the actual next token.
is_match = prediction == true_tokens
num_correct = is_match.sum()
print(f"{num_correct}/{len(true_tokens)}")

33/111


# Caching and verifying activations

In [19]:
# Input sentence for the caching experiments (adjacent literals concatenate
# into one string with the same characters as a single-line literal).
gpt2_text = (
    "Natural language processing tasks, such as question answering, machine translation, "
    "reading comprehension, and summarization, are typically approached with supervised "
    "learning on taskspecific datasets."
)
gpt2_tokens = gpt2_small.to_tokens(gpt2_text)

# Run the model once and keep the intermediate activations alongside the logits.
# remove_batch_dim=True is what lets later cells index cached tensors without a
# leading batch axis.
gpt2_logits, gpt2_cache = gpt2_small.run_with_cache(
    gpt2_tokens,
    remove_batch_dim=True,
)

In [20]:
# Layer-0 attention pattern, looked up in the cache by (activation name, layer index).
attn_patterns_layer_0 = gpt2_cache["pattern", 0]

In [21]:
# Fetch the layer-0 attention pattern again, this time by its full hook name,
# and check that it equals the tensor stored in `attn_patterns_layer_0`
# (assert_close raises if the two differ).
attn_patterns_layer_0_copy = gpt2_cache["blocks.0.attn.hook_pattern"]
t.testing.assert_close(attn_patterns_layer_0, attn_patterns_layer_0_copy)

In [37]:
# Sanity check: rebuild the layer-0 attention pattern from the cached queries
# and keys, then confirm it matches the pattern the model cached itself.
layer0_pattern_from_cache = gpt2_cache["pattern", 0]
hook_q = gpt2_cache["q", 0]  # indexed below as (seqQ, nhead, d_head)
hook_k = gpt2_cache["k", 0]  # indexed below as (seqK, nhead, d_head)

# Scaled dot-product scores: q . k / sqrt(d_head), one (seqQ, seqK) grid per head.
layer0_scores = einops.einsum(hook_q, hook_k, "seqQ nhead d_head, seqK nhead d_head -> nhead seqQ seqK")
layer0_scores = layer0_scores / (gpt2_small.cfg.d_head ** 0.5)

# Causal mask: True strictly above the diagonal, i.e. wherever a query position
# would attend to a later key position. It is created on the scores' own
# device, so the check cannot fail with a device mismatch when the cached
# activations live somewhere other than the notebook-level `device`.
sq, sk = hook_q.shape[0], hook_k.shape[0]
ones = t.ones(sq, sk, device=layer0_scores.device)
mask = t.triu(ones, diagonal=1).bool()

# A large negative score turns into ~0 probability after the softmax.
# Out-of-place fill: the result is assigned anyway, so mutating in place adds nothing.
layer0_scores = layer0_scores.masked_fill(mask, -1e5)
layer0_pattern_from_q_and_k = layer0_scores.softmax(dim=-1)

# Raises if the recomputed pattern and the cached one differ beyond tolerance.
t.testing.assert_close(layer0_pattern_from_cache, layer0_pattern_from_q_and_k)

In [44]:
# Report what kind of object the cache is and the shape of the layer-0 pattern.
print(type(gpt2_cache))
attention_pattern = gpt2_cache["pattern", 0]
print(attention_pattern.shape)

# String form of each token, used to label the axes of the visualisation.
gpt2_str_tokens = gpt2_small.to_str_tokens(gpt2_text)

# One label per head: "L0H0", "L0H1", ...
layer0_head_names = [f"L0H{head_idx}" for head_idx in range(gpt2_small.cfg.n_heads)]

print("Layer 0 Head Attention Patterns:")
layer0_heads_view = cv.attention.attention_heads(
    tokens=gpt2_str_tokens,
    attention=attention_pattern,
    attention_head_names=layer0_head_names,
)
display(layer0_heads_view)


<class 'transformer_lens.ActivationCache.ActivationCache'>
torch.Size([12, 33, 33])
Layer 0 Head Attention Patterns:
