In [8]:
import os
import sys
import plotly.express as px
import torch as t
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path
import numpy as np
import einops
from jaxtyping import Int, Float
from typing import List, Optional, Tuple
import functools
from tqdm import tqdm
from IPython.display import display
import webbrowser
from transformer_lens.hook_points import HookPoint
from transformer_lens import utils, HookedTransformer, HookedTransformerConfig, FactoredMatrix, ActivationCache
import circuitsvis as cv
from transformer.sample_transformer import get_log_probs
from plotly_utils import plot_loss_difference, imshow
from utils import *

# Saves computation time, since we don't need it for the contents of this notebook
t.set_grad_enabled(False)

device = t.device("cuda" if t.cuda.is_available() else "cpu")


In [9]:
from huggingface_hub import hf_hub_download

REPO_ID = "callummcdougall/attn_only_2L_half"
FILENAME = "attn_only_2L_half.pth"

weights_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

In [10]:
cfg = HookedTransformerConfig(
    d_model=768,
    d_head=64,
    n_heads=12,
    n_layers=2,
    n_ctx=2048,
    d_vocab=50278,
    attention_dir="causal",
    attn_only=True, # defaults to False
    tokenizer_name="EleutherAI/gpt-neox-20b", 
    seed=398,
    use_attn_result=True,
    normalization_type=None, # defaults to "LN", i.e. layernorm with weights & biases
    positional_embedding_type="shortformer"
)

In [11]:
model = HookedTransformer(cfg)
pretrained_weights = t.load(weights_path, map_location=device)
model.load_state_dict(pretrained_weights)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<All keys matched successfully>

# Hooks

We have complete control over the neural networks that we are trying to understand. We can precise edits and see how the model changes or behaves differently. TransformerLens provides hook points that lets us intervene on any activation inside our model.

In [12]:
def example_hook_function(
    activation_value: Float[Tensor, "batch heads seqQ seqK"],
    hook: HookPoint
) -> Float[Tensor, "batch heads seqQ seqK"]:

    # modify activation_value (can be inplace)
    return activation_value

In [13]:
loss = model.run_with_hooks(
    "hi im a robot", 
    return_type="loss",
    fwd_hooks=[
        ('blocks.1.attn.hook_pattern', example_hook_function) # can specify a filter instead of a name
    ]
)

# Accessing activations

## Caculate induction scores with hooks

How can we use hooks to access activations without modifying them? As an example, we calculate the induction scores as we did in ```2-induction_heads.ipynb``` but using hooks.
There are a few differences.
1. We have additional batch dimension. So we take the average over the batch along with the diagonal. 
2. We calculate induction scores for all heads at once.

In [24]:
seq_len = 50
batch = 10
rep_tokens_10 = generate_repeated_tokens(model, seq_len, batch)

# global variable to store the induction scores for all attention heads, in our hook we write to this variable
induction_score_store = t.zeros((model.cfg.n_layers, model.cfg.n_heads), device=model.cfg.device) 

def induction_score_hook(pattern: Float[Tensor, "batch n_head seq_Q seq_K"], hook: HookPoint):
    diagonals = pattern.diagonal(offset=1-seq_len, dim1=-2, dim2=-1)
    scores = einops.reduce(diagonals, "batch n_head seq_len -> n_head", "mean")
    layer_idx = hook.layer()
    induction_score_store[layer_idx, :] = scores
    

pattern_hook_names_filter = lambda name: name.endswith("pattern") 

model.run_with_hooks(
    rep_tokens_10, 
    return_type=None, # For efficiency, we don't need to calculate the logits
    fwd_hooks=[(
        pattern_hook_names_filter,
        induction_score_hook
    )]
)

imshow(
    induction_score_store, 
    labels={"x": "Head", "y": "Layer"}, 
    title="Induction Score by Head", 
    text_auto=".2f",
    width=900, height=400
)

## Find induction heads in GPT2-small

In [25]:
gpt2_small = HookedTransformer.from_pretrained("gpt2-small")
rep_tokens = generate_repeated_tokens(gpt2_small, 50, 10)

def visualize_pattern_hook(
    pattern: Float[Tensor, "batch head_index dest_pos source_pos"],
    hook: HookPoint,
):
    print("Layer: ", hook.layer())
    display(
        cv.attention.attention_patterns(
            tokens=gpt2_small.to_str_tokens(rep_tokens[0]), 
            attention=pattern.mean(0)
        )
    )

def induction_score_hook(pattern: Float[Tensor, "batch n_head seq_Q seq_K"], hook: HookPoint):
    diagonals = pattern.diagonal(offset=1-seq_len, dim1=-2, dim2=-1)
    scores = einops.reduce(diagonals, "batch n_head seq_len -> n_head", "mean")
    layer_idx = hook.layer()
    induction_score_store[layer_idx, :] = scores

induction_scores_gpt2 = t.zeros((gpt2_small.cfg.n_layers, gpt2_small.cfg.n_heads), device=gpt2_small.cfg.device) 

gpt2_small.run_with_hooks(
    rep_tokens[0],
    return_type=None,
    fwd_hooks=[
        (pattern_hook_names_filter, induction_score_hook),
    ]
)

Loaded pretrained model gpt2-small into HookedTransformer


IndexError: index 2 is out of bounds for dimension 0 with size 2