 # Fetching activations from a pretrained model

 This notebook allows you to pass any text through a pretrained Hugging Face model and visualise the resulting activations using circuitsvis.

 If using the latest public release of [circuitsvis](https://github.com/alan-cooney/CircuitsVis), you can install with `pip install circuitsvis && yarn add circuitsvis`.

 If using a development version, clone the desired repo and run `pip install -e python && cd react && yarn` (you may also use `cd python && poetry install --with dev` if using poetry).

In [1]:
import subprocess
import sys

# Install the required packages for this script (these aren't included in the project's poetry dependencies).
# Invoking pip as `sys.executable -m pip` targets the interpreter running this notebook rather than
# whichever `pip` happens to be first on PATH, and passing an argument list avoids going through a shell.
# `text=True` decodes the captured output so it prints as readable lines instead of a bytes repr.
# A non-zero pip exit status still raises subprocess.CalledProcessError.
print(
    subprocess.check_output(
        [sys.executable, "-m", "pip", "install", "transformers", "torch"],
        text=True,
    )
)



In [2]:
import os
from functools import partial
from typing import Union
import torch
import numpy as np
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, PreTrainedModel, PreTrainedTokenizerFast
from circuitsvis.activations import text_neuron_activations

# Set TOKENIZERS_PARALLELISM=false for this process.
# NOTE(review): presumably this disables the tokenizers library's internal parallelism
# (and the warning it emits when the process forks) — confirm against the tokenizers docs.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


 ### Functions for loading model and fetching activations

In [3]:
def load_model_tokenizer(model_name: str) -> tuple[PreTrainedModel, PreTrainedTokenizerFast]:
    """Fetch a pretrained GPT-2 language model together with its fast tokenizer.

    Args:
        model_name: Identifier handed to ``from_pretrained`` for both the model and the tokenizer.

    Returns:
        A ``(model, tokenizer)`` pair. The model has been switched to eval mode and
        the tokenizer uses its end-of-sequence token as the padding token.
    """
    gpt2_model = GPT2LMHeadModel.from_pretrained(model_name)
    gpt2_model.eval()

    fast_tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
    # Reuse the EOS token for padding so that batches of unequal length can be padded
    fast_tokenizer.pad_token = fast_tokenizer.eos_token

    return gpt2_model, fast_tokenizer


def fetch_activations(model: PreTrainedModel, tokenizer: PreTrainedTokenizerFast, text: Union[list[str], str], layers: list, neurons: list) -> tuple[list[list[str]], list[np.ndarray]]:
    """Fetch block-output activations from a model for one or more texts.

    Forward hooks are registered on ``model.transformer.h[layer]`` for each requested
    layer, the batch is run through the model once, and the hooks are always removed
    afterwards (even if registration or the forward pass raises).

    Args:
        model: The model to fetch activations from. It must expose its blocks as
            ``model.transformer.h``.
        tokenizer: The PreTrainedTokenizerFast tokenizer to use (offset mappings are requested).
        text: String or list of strings to pass to the model.
        layers: Indices of the blocks to fetch activations from.
        neurons: Indices along the last dimension of each block output to keep.

    Returns:
        tokens: Nested list of token strings, one list per sample, padding removed.
        activations: One ndarray per sample of shape (n_tokens, n_layers, n_neurons),
            with layers and neurons in the order they were requested.
    """
    if isinstance(text, str):
        # Batch size of 1
        text = [text]

    # Tokenize the input text with padding to the longest sequence in the batch
    tokenized = tokenizer(text, padding=True, return_tensors="pt", return_offsets_mapping=True)
    attention_mask = tokenized["attention_mask"]

    # Recover each token's string by slicing the source text with its character offsets
    tokens = [
        [sample[start:end] for start, end in offsets.tolist()]
        for sample, offsets in zip(text, tokenized["offset_mapping"])
    ]

    # Hook that stores the selected neurons of a block's output, keyed by layer index
    save_ctx = {}

    def _save_output_hook(module, inputs, output, layer_num):
        # The block output is expected to be a tuple whose 0th element is the residual
        # stream; a bare tensor is accepted as well so either form works.
        hidden = output[0] if isinstance(output, tuple) else output
        save_ctx[layer_num] = hidden[:, :, neurons].detach()

    # Run the inputs on whichever device the model lives on
    device = model.device

    handles = []
    try:
        for layer_idx in layers:
            handles.append(
                model.transformer.h[layer_idx].register_forward_hook(
                    partial(_save_output_hook, layer_num=layer_idx)
                )
            )
        # Run through model
        with torch.inference_mode():
            model(input_ids=tokenized["input_ids"].to(device), attention_mask=attention_mask.to(device))
    finally:
        # Always detach the hooks so a failed call does not leave them on the model
        for handle in handles:
            handle.remove()

    # Stack the activations from all layers and bring them back to the CPU for numpy
    activations = torch.stack(
        [save_ctx[layer_idx] for layer_idx in layers], dim=2
    ).cpu()  # (batch_size, padded_seq_length, n_layers, n_neurons)

    # Remove the padding tokens and their corresponding activations. Selecting by the
    # attention mask (rather than slicing a prefix) works for both left and right padding.
    activations_list = []
    tokens_list = []
    for sample_idx, sample_tokens in enumerate(tokens):
        keep = attention_mask[sample_idx].bool()
        activations_list.append(activations[sample_idx][keep].numpy())
        tokens_list.append([tok for tok, kept in zip(sample_tokens, keep.tolist()) if kept])
    return tokens_list, activations_list


In [4]:
##### Enter your parameters here
# Name of the pretrained model/tokenizer to load (passed to load_model_tokenizer)
model_name = "gpt2"
# Indices into model.transformer.h: the blocks whose outputs are captured
layers = [0, 4]
# Indices along the last dimension of each block output to keep
neurons = [3, 4, 8]
# TODO: Add support for pulling activations from places other than the block outputs

In [5]:
# Load the model (in eval mode) and tokenizer selected by `model_name`
model, tokenizer = load_model_tokenizer(model_name)

In [6]:
# Example batch; samples of different lengths are padded during tokenization and the padding is stripped again
text_batch = ["Here is some text that we will get activations for.", "more text"]
# tokens: one list of token strings per sample; acts: one (n_tokens, n_layers, n_neurons) array per sample
tokens, acts = fetch_activations(model, tokenizer, text_batch, layers, neurons)

In [7]:
# Visualise the activations in the notebook, labelling the two activation dimensions
# with the requested layer and neuron indices
vis = text_neuron_activations(tokens=tokens, activations=acts, first_dimension_labels=layers, second_dimension_labels=neurons)
# Leaving `vis` as the last expression of the cell lets the notebook render it
vis

# To view this visualisation in a browser instead, uncomment the lines below to save it
# to an html file and then open that file
# vis_path = "./vis.html"
# with open(vis_path, "w") as f:
#     f.write(vis._repr_html_())