In [1]:
import os
import sys
import plotly.express as px
import torch as t
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path
import numpy as np
import einops
from jaxtyping import Int, Float
from typing import List, Optional, Tuple
import functools
from tqdm import tqdm
from IPython.display import display
import webbrowser
from transformer_lens.hook_points import HookPoint
from transformer_lens import utils, HookedTransformer, HookedTransformerConfig, FactoredMatrix, ActivationCache
import circuitsvis as cv
from transformer.sample_transformer import get_log_probs
from plotly_utils import imshow, plot_logit_attribution
from utils import *

# This notebook only runs forward passes, so autograd bookkeeping is switched off globally.
t.set_grad_enabled(False)

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = t.device("cuda") if t.cuda.is_available() else t.device("cpu")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from huggingface_hub import hf_hub_download

# Hugging Face Hub coordinates of the pretrained 2-layer attention-only checkpoint.
REPO_ID = "callummcdougall/attn_only_2L_half"
FILENAME = "attn_only_2L_half.pth"

# Download the checkpoint; the returned value is used below as the path handed to `t.load`.
weights_path = hf_hub_download(filename=FILENAME, repo_id=REPO_ID)

In [3]:
cfg = HookedTransformerConfig(
    # --- architecture -------------------------------------------------------
    n_layers=2,
    n_heads=12,
    d_head=64,
    d_model=768,
    attn_only=True,  # defaults to False
    normalization_type=None,  # defaults to "LN", i.e. layernorm with weights & biases
    # --- attention / positional behaviour -----------------------------------
    attention_dir="causal",
    positional_embedding_type="shortformer",
    use_attn_result=True,  # per-head "result" activations are read from the cache later on
    # --- vocabulary / context -----------------------------------------------
    n_ctx=2048,
    d_vocab=50278,
    tokenizer_name="EleutherAI/gpt-neox-20b",
    # --- reproducibility ----------------------------------------------------
    seed=398,
)

In [4]:
model = HookedTransformer(cfg)
# The checkpoint is a file downloaded from the Hub. `weights_only=True` asks torch to
# restrict unpickling to tensors and plain containers instead of arbitrary Python objects.
pretrained_weights = t.load(weights_path, map_location=device, weights_only=True)
model.load_state_dict(pretrained_weights)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<All keys matched successfully>

# Hooks

We have complete control over the neural networks that we are trying to understand. We can make
precise edits and see how the model's behavior changes. TransformerLens provides hook points that let us intervene on any activation inside our model.

In [5]:
def example_hook_function(
    activation_value: Float[Tensor, "batch heads seqQ seqK"],
    hook: HookPoint
) -> Float[Tensor, "batch heads seqQ seqK"]:
    """Template forward hook: receives an activation and returns it unchanged.

    Args:
        activation_value: the activation at the hook point this function is attached to
            (annotated here with an attention-pattern shape).
        hook: the hook point object that the activation belongs to.

    Returns:
        The same `activation_value` object that was passed in.
    """

    # A real hook would edit `activation_value` here (in-place edits are allowed)
    # before handing it back.
    return activation_value

In [6]:
# Forward pass on a toy prompt with the example hook attached to layer 1's attention pattern.
loss = model.run_with_hooks(
    "hi im a robot",
    fwd_hooks=[
        # can specify a filter instead of exact activation name
        ("blocks.1.attn.hook_pattern", example_hook_function),
    ],
    return_type="loss",
)

# Accessing activations

## Calculate induction scores with hooks

How can we use hooks to access activations without modifying them? As an example, we calculate the induction scores as we did in ```2-induction_heads.ipynb``` but using hooks.
There are a few differences.
1. We have an additional batch dimension, so we take the average over the batch as well as along the diagonal.
2. We calculate induction scores for all heads at once.

In [7]:
# Shape of the repeated-token batch used to probe for induction behaviour.
seq_len = 50
batch = 10
rep_tokens_10 = generate_repeated_tokens(model, seq_len, batch)

# Global (n_layers, n_heads) buffer of induction scores; the hook defined below
# writes one row into it for each layer it is called on.
induction_score_store = t.zeros(
    model.cfg.n_layers,
    model.cfg.n_heads,
    device=model.cfg.device,
)

def induction_score_hook(pattern: Float[Tensor, "batch n_head seq_Q seq_K"], hook: HookPoint):
    """Record each head's mean attention along the `1 - seq_len` diagonal.

    Reads the global `seq_len` to choose the diagonal and writes the per-head
    means into row `hook.layer()` of the global `induction_score_store`.
    Returns nothing, so the pattern itself is only read, never replaced.
    """
    # Entries of the attention pattern where source position - destination position == 1 - seq_len.
    induction_stripe = pattern.diagonal(dim1=-2, dim2=-1, offset=1 - seq_len)
    # Average over the batch axis and along the diagonal, leaving one score per head.
    induction_score_store[hook.layer(), :] = induction_stripe.mean(dim=(0, 2))
    

def pattern_hook_names_filter(name):
    """Select every hook point whose name ends in "pattern"."""
    return name.endswith("pattern")


# Run once with the scoring hook attached to every matching hook point.
model.run_with_hooks(
    rep_tokens_10,
    fwd_hooks=[(pattern_hook_names_filter, induction_score_hook)],
    return_type=None,  # For efficiency, we don't need to calculate the logits
)

imshow(
    induction_score_store,
    title="Induction Score by Head",
    labels={"x": "Head", "y": "Layer"},
    text_auto=".2f",
    width=900,
    height=400,
)

## Find induction heads in GPT2-small

We will reuse the induction score hook we wrote above, with a slight change, to detect induction heads in GPT2-small.

In [8]:
# Declare the repeated-sequence layout in this cell, so that the token generator and the
# hook below use the same numbers and nothing depends on values left over from an earlier cell.
seq_len = 50
batch = 10

gpt2_small = HookedTransformer.from_pretrained("gpt2-small")
rep_tokens = generate_repeated_tokens(gpt2_small, seq_len, batch)

# Global (n_layers, n_heads) buffer that the hook below fills in, one row per layer.
induction_scores_gpt2 = t.zeros((gpt2_small.cfg.n_layers, gpt2_small.cfg.n_heads), device=gpt2_small.cfg.device) 

def induction_score_hook(pattern: Float[Tensor, "batch n_head seq_Q seq_K"], hook: HookPoint):
    """Store each head's mean attention along the `1 - seq_len` diagonal.

    Writes the per-head means into row `hook.layer()` of `induction_scores_gpt2`.
    Returns nothing, so the pattern is only read.
    """
    diagonals = pattern.diagonal(offset=1-seq_len, dim1=-2, dim2=-1)
    # Mean over the batch and along the diagonal, leaving one score per head.
    scores = einops.reduce(diagonals, "batch n_head seq_len -> n_head", "mean")
    layer_idx = hook.layer()
    induction_scores_gpt2[layer_idx, :] = scores


gpt2_small.run_with_hooks(
    rep_tokens,
    return_type=None,
    fwd_hooks=[
        (pattern_hook_names_filter, induction_score_hook),
    ]
)

imshow(
    induction_scores_gpt2,
    labels={"x":"Head", "y":"Layer"},
    title="Induction Score by Head",
    # One decimal place; the previous ".lf" (letter l) is not a numeric format specifier.
    text_auto=".1f",
    width=800
)

Loaded pretrained model gpt2-small into HookedTransformer


Heads that have high induction scores are 5.1, 5.5, 6.9, 7.2, 7.10

Let's visualize the attention patterns of these layers and confirm.

In [9]:
def visualize_pattern_hook(
    pattern: Float[Tensor, "batch head_index dest_pos source_pos"],
    hook: HookPoint,
):
    """Print the hooked layer's index and display its batch-averaged attention patterns.

    Token labels come from the first sequence of the global `rep_tokens`, decoded
    with the global `gpt2_small`. Returns nothing, so the pattern is only read.
    """
    print("Layer: ", hook.layer())
    token_labels = gpt2_small.to_str_tokens(rep_tokens[0])
    batch_mean_pattern = pattern.mean(0)
    display(cv.attention.attention_patterns(tokens=token_labels, attention=batch_mean_pattern))

# Layers containing the heads with high induction scores in the heatmap above.
for layer in (5, 6, 7):
    gpt2_small.run_with_hooks(
        rep_tokens,
        fwd_hooks=[(utils.get_act_name("pattern", layer), visualize_pattern_hook)],
        return_type=None,
    )

Layer:  5


Layer:  6


Layer:  7


# Building Interpretability Tools

Mechanistic understanding requires answers to questions like "How much of the model's performance on some task is attributable to each component of the model?"

## Direct Logit Attribution
A simpler question would be "What are the direct contributions of this head to the output logits?"

Logits are computed as ```logits = residual @ W_U```.
The residual stream is simply the sum of the outputs of all previous layers. Therefore,
- ```residual_0 = embed```
- ```residual_1 = residual_0 + attn_out_0```
- ```residual_2 = residual_1 + attn_out_1```
- ```residual = residual_2```

Rewriting we get, ```logits = (embed + attn_out_0 + attn_out_1) @ W_U```. We calculate the direct logit attribution for each component as follows:
- Component 1 - Embed (direct path from embedding to unembedding)
    + ```embed @ W_U```
- Component 2 - Each attention head of layer 0
    + ```attn_out_0 @ W_U```
- Component 3 - Each attention head of layer 1
    + ```attn_out_1 @ W_U```


In [10]:
def direct_logit_attribution(
        embed: Float[Tensor, "seq d_model"],
        l1_results: Float[Tensor, "seq nheads d_model"],
        l2_results: Float[Tensor, "seq nheads d_model"],
        W_U: Float[Tensor, "d_model d_vocab"],
        tokens: Int[Tensor, "seq"]
    ) -> Float[Tensor, "seq-1 n_components"]:
    """Split each correct next-token logit into per-component contributions.

    Args:
        embed: embedding output at every position.
        l1_results: per-head outputs of the first attention layer.
        l2_results: per-head outputs of the second attention layer.
        W_U: unembedding matrix.
        tokens: the token ids of the sequence.

    Returns:
        A tensor with one row per position except the last. The columns are, in order,
        the embedding's contribution, then one column per first-layer head, then one
        column per second-layer head, each measured on the logit of the token that
        actually follows that position.
    """
    # Column k of this matrix is the unembedding direction of the token at position k + 1.
    next_token_unembed = W_U[:, tokens[1:]]

    # The final position has no next token to predict, hence the [:-1] slices below.
    # component 1: direct path embed -> unembed
    embed_attr = t.einsum("sd,ds->s", embed[:-1], next_token_unembed).unsqueeze(-1)
    # component 2: heads of the first attention layer
    l1_attr = t.einsum("shd,ds->sh", l1_results[:-1], next_token_unembed)
    # component 3: heads of the second attention layer
    l2_attr = t.einsum("shd,ds->sh", l2_results[:-1], next_token_unembed)

    return t.cat([embed_attr, l1_attr, l2_attr], dim=-1)

In [11]:
# Prompt on which the logit attribution is demonstrated.
text = "We think that powerful, significantly superhuman machine intelligence is more likely than not to be created this century. If current machine learning techniques were scaled up to this level, we think they would by default produce systems that are deceptive or manipulative, and that no solid plans are known for how to avoid this."

# Token ids and their string forms for the prompt.
tokens = model.to_tokens(text)
str_tokens = model.to_str_tokens(text)

# Forward pass that also records activations; the cache is stored without its batch dimension.
logits, cache = model.run_with_cache(text, remove_batch_dim=True)

In [12]:
with t.inference_mode():
    embed, l1_results, l2_results = cache["embed"], cache["result", 0], cache["result", 1]
    logit_attr = direct_logit_attribution(embed, l1_results, l2_results, model.W_U, tokens[0])
    # For every position except the last, pick out the logit the model assigned to the
    # token that actually comes next (entry k is the logit for token k + 1).
    correct_token_logits = logits[0, :-1].gather(-1, tokens[0, 1:, None]).squeeze(-1)
    # Summing the per-component attributions must reproduce those logits.
    t.testing.assert_close(logit_attr.sum(1), correct_token_logits, atol=1e-3, rtol=0)
    print("Tests passed!")

Tests passed!


In [13]:
# Recompute the attribution from the cached activations so this cell stands on its own.
embed, l1_results, l2_results = cache["embed"], cache["result", 0], cache["result", 1]
logit_attr = direct_logit_attribution(
    embed,
    l1_results,
    l2_results,
    model.W_U,
    tokens[0],
)
print(str_tokens[1])
plot_logit_attribution(model, logit_attr, tokens)

We


Let us do logit attribution on the sequence with repeated tokens (the one we use for testing induction heads)

In [19]:
seq_len = 50
batch = 1
rep_tokens, rep_logits, rep_cache = run_and_cache_model_repeated_tokens(model, seq_len, batch)

embed_out, l1_out, l2_out = rep_cache["embed"], rep_cache["result", 0], rep_cache["result", 1]

# The two windows overlap at index `seq_len`: each holds seq_len + 1 tokens, which
# yields seq_len next-token predictions per window (checked by the asserts below).
first_half_tokens = rep_tokens[0, : seq_len + 1]
second_half_tokens = rep_tokens[0, seq_len:]

first_half_logit_attr = direct_logit_attribution(
    embed_out[0, : seq_len + 1],
    l1_out[0, : seq_len + 1],
    l2_out[0, : seq_len + 1],
    model.W_U,
    first_half_tokens,
)
second_half_logit_attr = direct_logit_attribution(
    embed_out[0, seq_len:],
    l1_out[0, seq_len:],
    l2_out[0, seq_len:],
    model.W_U,
    second_half_tokens,
)

# One row per predicted token; one column for the embedding plus one per head in each of two layers.
assert first_half_logit_attr.shape == (seq_len, 1 + 2 * model.cfg.n_heads)
assert second_half_logit_attr.shape == (seq_len, 1 + 2 * model.cfg.n_heads)

In [21]:
# Attribution heatmap restricted to the repeated (second) half of the sequence.
plot_logit_attribution(
    model,
    second_half_logit_attr,
    second_half_tokens,
    "Logit attribution (second half of repeated sequence)",
)

Heads 4 and 10 of the second layer seem to have the largest attribution on the correct tokens in the repeated half. This is further evidence that these heads are induction heads i.e., they are learning in-context.

# Intervening on Activations

## Ablation

Ablation is a simple causal intervention on a model.
Idea - Delete an activation of the model and analyze how the performance on a task changes.

Different kinds of ablation - 
1. Zero ablation - make the activation zero.
    * Problem - the specific activation may be used as a bias. For example, if the activation is always in the range [100, 102], then setting it to zero may break everything.
2. Mean ablation - replace the activation with mean activation value over training distribution.
    * Problem - the mean of the activation actually may not occur. For example, if the activation is +1 or -1 uniformly then setting it to the mean (zero) will still break things.
3. Random ablation - replace the activation with its value on a random sample from the training distribution.

Side note - models trained with dropout could be robust to ablations.

### Induction head ablation

We will use zero ablation to study induction heads

In [22]:
def get_hook(head_index_to_ablate: int):
    """Build a forward hook that zero-ablates one attention head's value vectors.

    Args:
        head_index_to_ablate: index of the head (along the `n_heads` axis) to zero out.

    Returns:
        A hook function taking `(v, hook)` that zeroes the chosen head's slice of `v`
        and returns the edited tensor.
    """
    def head_ablation_hook(
            v: Float[Tensor, "batch seq n_heads d_head"], 
            hook: HookPoint
    ) -> Float[Tensor, "batch seq n_heads d_head"]:
        # Zero the selected head at every batch element and position (in-place edit).
        v[:, :, head_index_to_ablate, :] = 0.0
        # Return the edited tensor so the hook matches its annotated return type,
        # rather than relying only on the in-place mutation.
        return v
    return head_ablation_hook

def cross_entropy_loss(logits, tokens):
    """Mean next-token negative log-likelihood.

    The logits at position i are scored against the token at position i + 1, so the
    last position's logits and the first token are not used. `logits` is indexed as
    (batch, position, vocab) and `tokens` as (batch, position).
    """
    # Log-probabilities at every position that has a following token.
    next_token_log_probs = F.log_softmax(logits, dim=-1)[:, :-1]
    # Ids of the tokens that actually follow, with a trailing axis so they can index the vocab axis.
    targets = tokens[:, 1:].unsqueeze(-1)
    picked = next_token_log_probs.gather(dim=-1, index=targets).squeeze(-1)
    return -picked.mean()

def get_ablation_scores(
        model: HookedTransformer,
        tokens: Int[Tensor, "batch seq"]
) -> Float[Tensor, "n_layers n_heads"]:
    """Measure how much the loss on the tail of `tokens` rises when each head is zero-ablated.

    Args:
        model: the hooked transformer to ablate; its existing hooks are reset first.
        tokens: token ids; the half-length is taken as `(seq - 1) // 2`.

    Returns:
        A (n_layers, n_heads) tensor where entry [layer, head] is the loss with that
        head's value vectors zeroed minus the loss of the unmodified model, both
        computed on the last half-length positions only.
    """
    model.reset_hooks()

    half_len = (tokens.shape[1] - 1) // 2
    # Only the repeated second half matters when looking for induction heads.
    tail_tokens = tokens[:, -half_len:]

    def tail_loss(all_logits):
        # Loss restricted to the last `half_len` positions.
        return cross_entropy_loss(all_logits[:, -half_len:], tail_tokens)

    baseline_loss = tail_loss(model(tokens, return_type="logits"))

    n_layers, n_heads = model.cfg.n_layers, model.cfg.n_heads
    ablation_scores = t.zeros((n_layers, n_heads))
    for layer in tqdm(range(n_layers)):
        for head in range(n_heads):
            # A fresh hook per head, attached to this layer's "v" activation for one run.
            ablated_logits = model.run_with_hooks(
                tokens,
                fwd_hooks=[(utils.get_act_name("v", layer), get_hook(head))],
            )
            ablation_scores[layer, head] = tail_loss(ablated_logits) - baseline_loss
    return ablation_scores

# Score every head of the 2-layer model on the repeated-token sequence.
ablation_scores = get_ablation_scores(model=model, tokens=rep_tokens)

100%|██████████| 2/2 [00:01<00:00,  1.82it/s]


In [23]:
imshow(
    ablation_scores, 
    # The plotted values are loss differences (ablated minus baseline), so the colour
    # label now says so; it previously read "Logit diff".
    labels={"x": "Head", "y": "Layer", "color": "Loss diff"},
    title="Loss Difference After Ablating Heads", 
    text_auto=".2f",
    width=900, height=400
)

Seems like head 7 of the first layer is the most important for induction-y behavior along with the heads 4 and 10 in layer two.