In [2]:
# Janky code to do different setup when run in a Colab notebook vs VSCode
DEBUG_MODE = False
try:
    import google.colab
    IN_COLAB = True
    print("Running as a Colab notebook")
    %pip install git+https://github.com/neelnanda-io/TransformerLens.git
    # Install another version of node that makes PySvelte work way faster
    !curl -fsSL https://deb.nodesource.com/setup_16.x | sudo -E bash -; sudo apt-get install -y nodejs
    %pip install git+https://github.com/neelnanda-io/PySvelte.git
except:
    IN_COLAB = False
    print("Running as a Jupyter notebook - intended for development only!")
    from IPython import get_ipython

    ipython = get_ipython()
    # Code to automatically update the HookedTransformer code as its edited without restarting the kernel
    ipython.magic("load_ext autoreload")
    ipython.magic("autoreload 2")

# Import stuff
import torch
from fancy_einsum import einsum
from transformer_lens import HookedTransformer, HookedTransformerConfig, FactoredMatrix, ActivationCache
from torchtyping import TensorType as TT

# Inference only: switch autograd off globally to save memory.
torch.set_grad_enabled(False)

# Load gpt2-small with every weight-processing option enabled explicitly.
model = HookedTransformer.from_pretrained(
    "gpt2-small",
    fold_ln=True,
    center_writing_weights=True,
    center_unembed=True,
    refactor_factored_attn_matrices=True,
)

# IOI prompt templates; "{}" is filled with one of the two names from the matching entry of `names`.
prompt_format = [
    "When John and Mary went to the shops,{} gave the bag to",
    "When Tom and James went to the park,{} gave the ball to",
    "When Dan and Sid went to the shops,{} gave an apple to",
    "After Martin and Amy went to the park,{} gave a drink to",
]
# One name pair per template, in the same order as `prompt_format`.
names = [
    (" Mary", " John"),
    (" Tom", " James"),
    (" Dan", " Sid"),
    (" Martin", " Amy"),
]
# The three lists below are filled in lockstep, one entry per generated prompt.
prompts = []  # prompt strings
answers = []  # (correct, incorrect) answer strings
answer_tokens = []  # (correct_token, incorrect_token) integer token ids
for template, name_pair in zip(prompt_format, names):
    # Each template is used twice: once with each name of the pair as the correct answer.
    for correct, incorrect in (name_pair, name_pair[::-1]):
        answers.append((correct, incorrect))
        answer_tokens.append(
            tuple(model.to_single_token(name) for name in (correct, incorrect))
        )
        # Insert the *incorrect* answer into the prompt, making the correct answer the indirect object.
        prompts.append(template.format(incorrect))
answer_tokens = torch.tensor(answer_tokens)

# Tokenize all prompts together, with a BOS token prepended.
tokens = model.to_tokens(prompts, prepend_bos=True)

# Forward pass that also records every activation in `cache`.
original_logits, cache = model.run_with_cache(tokens)

# Reference notebook code: residual-stream direction of each answer token, then
# the (correct - incorrect) direction for every prompt.
answer_residual_directions = model.tokens_to_residual_directions(answer_tokens)
correct_directions, incorrect_directions = answer_residual_directions.unbind(dim=1)
logit_diff_directions = correct_directions - incorrect_directions

def residual_stack_to_logit_diff(residual_stack: TT["components", "batch", "d_model"], cache: ActivationCache) -> TT["components"]:
    """Reference implementation: batch-averaged logit difference per residual-stack component.

    Applies ``cache.apply_ln_to_stack`` (final layer, last position) to
    ``residual_stack``, projects each batch element onto its row of the
    notebook-level ``logit_diff_directions`` and averages over the batch.

    Args:
        residual_stack: residual-stream vectors, one per component and batch element.
        cache: activation cache of the forward pass the stack was taken from.

    Returns:
        A tensor holding one averaged logit difference per leading ("...")
        entry of ``residual_stack`` - a tensor, not a Python float.
    """
    scaled_residual_stack = cache.apply_ln_to_stack(residual_stack, layer=-1, pos_slice=-1)
    # The einsum sums over the batch axis of `logit_diff_directions`, so divide by
    # that same size to get a mean rather than relying on the global `prompts` list.
    batch_size = logit_diff_directions.shape[0]
    return einsum("... batch d_model, batch d_model -> ...", scaled_residual_stack, logit_diff_directions) / batch_size

# Accumulated residual stream (with labels) at the last position
accumulated_residual, labels = cache.accumulated_resid(
    layer=-1,
    incl_mid=True,
    pos_slice=-1,
    return_labels=True,
)

# Reference: average logit diffs from the hand-written helper
ref_ave_logit_diffs = residual_stack_to_logit_diff(accumulated_residual, cache)

# Ours: the same quantity through ActivationCache.logit_attrs, averaged over the last dim
logit_diffs = cache.logit_attrs(
    accumulated_residual,
    tokens=answer_tokens[:, 0],
    incorrect_tokens=answer_tokens[:, 1],
    pos_slice=-1,
)
ave_logit_diffs = torch.mean(logit_diffs, dim=-1)

# Report the summed absolute difference between the two results
print("Ref vs ours difference: ", torch.sum(torch.abs(ref_ave_logit_diffs - ave_logit_diffs)))

# Ref vs ours difference:  tensor(2.5898e-06)

Running as a Jupyter notebook - intended for development only!


Using pad_token, but it is not set yet.


Loaded pretrained model gpt2-small into HookedTransformer
Ref vs ours difference:  tensor(2.5898e-06)


In [6]:
# tokens.shape: [8, 15]
# Re-run the model on the full batch of prompts and cache the activations.
_, cache = model.run_with_cache(tokens)
accumulated_residual = cache.accumulated_resid(layer=-1, incl_mid=True, pos_slice=-1)
# Index 0 along dim 1 (presumably the batch axis - confirm) selects one example
# and removes that dimension from the stack.
resid_stack = accumulated_residual[:,0,:] # I tried to apply ln on a single example
# NOTE: when this cell was run, the call below raised the RuntimeError shown in
# the cell output (size 25 vs. size 8 at dimension 0); `cache` still comes from
# the full 8-prompt run above.
scaled_resid_stack = cache.apply_ln_to_stack(resid_stack, layer = -1, pos_slice=-1)


RuntimeError: The size of tensor a (25) must match the size of tensor b (8) at non-singleton dimension 0

In [10]:
# tokens.shape: [8, 15]
# Take the first example by slicing the batch dimension (dim 0); the slice keeps
# that dimension, so `token` is [1, 15]. The previous `tokens[:,0]` selected the
# first token of every prompt instead (a length-8 tensor).
token = tokens[:1]
_, cache = model.run_with_cache(token)
accumulated_residual = cache.accumulated_resid(layer=-1, incl_mid=True, pos_slice=-1)
# accumulated residual is [25, 1, 768]; squeeze only dim 1 to remove the batch dim
resid_stack = accumulated_residual.squeeze(1)
scaled_resid_stack = cache.apply_ln_to_stack(resid_stack, layer=-1, pos_slice=-1)

torch.Size([8])


In [4]:
# Shape-mismatch check (presumably deliberate): `tokens` gets every row of
# answer_tokens while `incorrect_tokens` gets only the first 3 rows. When run,
# this call raised the ValueError shown in the cell output.
logit_diffs = cache.logit_attrs(accumulated_residual, pos_slice=-1, tokens=answer_tokens[:,0], incorrect_tokens=answer_tokens[:3,1])

ValueError: tokens and incorrect_tokens must have the same shape! (tokens.shape=torch.Size([8]), incorrect_tokens.shape=torch.Size([3]))