In [1]:
!pip install transformer_lens circuitsvis plotly

Collecting transformer_lens
  Downloading transformer_lens-2.16.1-py3-none-any.whl.metadata (12 kB)
Collecting circuitsvis
  Downloading circuitsvis-1.43.3-py3-none-any.whl.metadata (983 bytes)
Collecting beartype<0.15.0,>=0.14.1 (from transformer_lens)
  Downloading beartype-0.14.1-py3-none-any.whl.metadata (28 kB)
Collecting better-abc<0.0.4,>=0.0.3 (from transformer_lens)
  Downloading better_abc-0.0.3-py3-none-any.whl.metadata (1.4 kB)
Collecting fancy-einsum>=0.0.3 (from transformer_lens)
  Downloading fancy_einsum-0.0.3-py3-none-any.whl.metadata (1.2 kB)
Collecting jaxtyping>=0.2.11 (from transformer_lens)
  Downloading jaxtyping-0.3.5-py3-none-any.whl.metadata (7.3 kB)
Collecting numpy<2,>=1.26 (from transformer_lens)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting transformers-stream-generator<0

In [1]:
import transformer_lens as tl
from transformer_lens import HookedTransformer
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

In [2]:
# Two prompts sharing one sentence frame; prompt2 is prompt1 with the names
# "Alice" and "Carol" swapped.
prompt1, prompt2 = (
    'First Alice talked to Bob, then Carol talked to Bob, then Alice talked to',
    'First Carol talked to Bob, then Alice talked to Bob, then Carol talked to',
)

In [73]:
# Sanity check: show how each candidate answer is tokenized (without a BOS
# token), since get_token_stats looks up one vocabulary id per term.
words = [' Carol', ' Alice', ' both']

# This cell sits above the first cell that loads a model, so on a fresh kernel
# (Restart & Run All) `model` would not exist yet and the loop would raise
# NameError. Fall back to loading the smallest checkpoint used in this notebook.
# NOTE(review): assumes every Pythia size tokenizes these words identically — confirm.
if 'model' not in globals():
    model = HookedTransformer.from_pretrained("pythia-70m")

for word in words:
    tokens = model.to_str_tokens(word, prepend_bos=False)
    print(f"'{word}' parses to: {tokens}")

' Carol' parses to: [' Carol']
' Alice' parses to: [' Alice']
' both' parses to: [' both']


In [3]:
def plot_attention(cache, layer, n_heads, renderer="colab"):
    """Draw one attention-pattern heatmap per head for a single layer.

    Args:
        cache: Activation cache indexed as ``cache["pattern", layer]``. Only
            the first batch element is plotted. Presumably the cache returned
            by ``model.run_with_cache`` with patterns laid out as
            (batch, head, query_pos, key_pos) — TODO confirm.
        layer: Layer index whose attention pattern is plotted.
        n_heads: Maximum number of heads to show; clamped to the number of
            heads present in the pattern.
        renderer: Plotly renderer name handed to ``fig.show``. Defaults to
            ``"colab"`` (the value that used to be hard-coded); pass ``None``
            to use Plotly's default renderer when not running in Colab.

    Raises:
        ValueError: If there is no head to plot after clamping ``n_heads``.
    """
    # .float() keeps the NumPy conversion working for half/bfloat16 patterns.
    attn = cache["pattern", layer][0].detach().float().cpu().numpy()
    n_heads = min(n_heads, attn.shape[0])
    if n_heads < 1:
        # A zero-row subplot grid is rejected by Plotly with a less helpful
        # message; fail early and say what went wrong.
        raise ValueError(f"No attention heads to plot for layer {layer} (n_heads={n_heads}).")

    cols = 4
    rows = (n_heads + cols - 1) // cols  # ceiling division

    fig = make_subplots(rows=rows, cols=cols, subplot_titles=[f"Head {i+1}" for i in range(n_heads)])

    for i in range(n_heads):
        # Fill the grid row by row; Plotly subplot coordinates are 1-based.
        fig.add_trace(
            go.Heatmap(z=attn[i], showscale=False),
            row=i // cols + 1, col=i % cols + 1
        )

    fig.update_layout(height=200*rows, width=800, title=f"Layer {layer} Attention Heads")
    fig.show(renderer=renderer)

def get_token_stats(model, term, logits, cache):
    """Print the rank and logit of ``term`` as the predicted next token.

    Args:
        model: Object exposing ``to_tokens(text, prepend_bos=False)`` (a
            ``HookedTransformer`` in this notebook).
        term: Candidate continuation, including any leading space.
        logits: Model output; ``logits[0, -1]`` is read as the next-token
            scores of the first batch element — assumed layout
            (batch, position, vocab), TODO confirm.
        cache: Unused; kept so existing call sites keep working.

    Returns:
        None. The result (or a notice that ``term`` is not exactly one token)
        is printed.
    """
    # to_single_token asserts on multi-token strings rather than returning
    # None, so tokenize without BOS and check the length explicitly.
    token_ids = model.to_tokens(term, prepend_bos=False)
    if token_ids.numel() != 1:
        print(f"'{term}' is not a single token.")
        return
    token_id = int(token_ids.reshape(-1)[0])

    # Scores for the position following the last prompt token.
    next_token_logits = logits[0, -1]
    score = next_token_logits[token_id].item()

    # Rank 1 = highest logit; count how many entries strictly beat this one.
    rank = (next_token_logits > score).sum().item() + 1

    print(f"Stats for '{term}' -> Rank: {rank}, Logit Score: {score:.4f}")

def pipeline(model, prompt, layer, n_heads, graph=False,
             terms=(' Carol', ' Alice', ' both'), top_k=5):
    """Run ``prompt`` through ``model`` and report next-token predictions.

    Prints the ``top_k`` highest-logit next tokens, then the rank and logit of
    each entry in ``terms`` (via ``get_token_stats``), and optionally plots
    the attention patterns of one layer (via ``plot_attention``).

    Args:
        model: Model exposing ``to_tokens``, ``to_string`` and
            ``run_with_cache`` (a ``HookedTransformer`` in this notebook).
        prompt: Text whose continuation is analysed.
        layer: Layer index forwarded to ``plot_attention``; only used when
            ``graph`` is true.
        n_heads: Maximum number of heads forwarded to ``plot_attention``;
            only used when ``graph`` is true.
        graph: When true, plot the attention heatmaps after the report.
        terms: Candidate continuations to report on. Defaults to the three
            terms that were previously hard-coded.
        top_k: How many of the highest-logit tokens to list (default 5).
    """
    # Local import: the notebook's import cell does not bring in torch.
    import torch

    tokens = model.to_tokens(prompt)
    # Inference only: skip autograd bookkeeping so activations are not kept
    # alive for a backward pass that never happens.
    with torch.no_grad():
        logits, cache = model.run_with_cache(tokens)

    # Scores for the token following the prompt, first batch element.
    next_token_logits = logits[0, -1]
    k = min(top_k, next_token_logits.shape[-1])
    top_logits, top_indices = next_token_logits.topk(k)

    print(f"Top {k} most likely next tokens:")
    for i in range(k):
        token = model.to_string(top_indices[i])
        score = top_logits[i].item()
        print(f"Rank {i+1}: '{token}' | Logit: {score:.4f}")

    print()
    for term in terms:
        get_token_stats(model, term, logits, cache)

    if graph:
        plot_attention(cache, layer, n_heads)

In [75]:
# Pythia-70m: report next-token stats for both prompts. `layer` and `n_heads`
# only affect the attention plot, which pipeline skips unless graph=True.
model = HookedTransformer.from_pretrained("pythia-70m")
for run_idx, prompt in enumerate((prompt1, prompt2)):
    if run_idx:
        print()  # blank line between the two prompt reports
    pipeline(model, prompt, layer=5, n_heads=8)

Loaded pretrained model pythia-70m into HookedTransformer
Top 5 most likely next tokens:
Rank 1: ' Bob' | Logit: 22.3514
Rank 2: ' him' | Logit: 21.3399
Rank 3: ' her' | Logit: 21.1042
Rank 4: ' Carol' | Logit: 20.4703
Rank 5: ' the' | Logit: 19.3488

Stats for ' Carol' -> Rank: 4, Logit Score: 20.4703
Stats for ' Alice' -> Rank: 28, Logit Score: 16.8597
Stats for ' both' -> Rank: 32, Logit Score: 16.7022

Top 5 most likely next tokens:
Rank 1: ' Bob' | Logit: 22.3075
Rank 2: ' him' | Logit: 21.8679
Rank 3: ' her' | Logit: 20.6425
Rank 4: ' the' | Logit: 19.1840
Rank 5: ' his' | Logit: 19.1405

Stats for ' Carol' -> Rank: 6, Logit Score: 18.7575
Stats for ' Alice' -> Rank: 10, Logit Score: 18.1007
Stats for ' both' -> Rank: 26, Logit Score: 16.7347


In [76]:
# Pythia-160m: report next-token stats for both prompts. `layer` and `n_heads`
# only affect the attention plot, which pipeline skips unless graph=True.
model = HookedTransformer.from_pretrained("pythia-160m")
for run_idx, prompt in enumerate((prompt1, prompt2)):
    if run_idx:
        print()  # blank line between the two prompt reports
    pipeline(model, prompt, layer=10, n_heads=12)

Loaded pretrained model pythia-160m into HookedTransformer
Top 5 most likely next tokens:
Rank 1: ' Bob' | Logit: 31.4864
Rank 2: ' Carol' | Logit: 31.2838
Rank 3: ' the' | Logit: 29.7144
Rank 4: ' John' | Logit: 29.2363
Rank 5: ' Peter' | Logit: 29.1680

Stats for ' Carol' -> Rank: 2, Logit Score: 31.2838
Stats for ' Alice' -> Rank: 7, Logit Score: 28.7729
Stats for ' both' -> Rank: 53, Logit Score: 27.0730

Top 5 most likely next tokens:
Rank 1: ' Bob' | Logit: 31.7674
Rank 2: ' Alice' | Logit: 31.6405
Rank 3: ' Carol' | Logit: 30.5645
Rank 4: ' the' | Logit: 29.8524
Rank 5: ' her' | Logit: 29.7779

Stats for ' Carol' -> Rank: 3, Logit Score: 30.5645
Stats for ' Alice' -> Rank: 2, Logit Score: 31.6405
Stats for ' both' -> Rank: 122, Logit Score: 26.7902


In [77]:
# Pythia-410m: report next-token stats for both prompts. `layer` and `n_heads`
# only affect the attention plot, which pipeline skips unless graph=True.
model = HookedTransformer.from_pretrained("pythia-410m")
for run_idx, prompt in enumerate((prompt1, prompt2)):
    if run_idx:
        print()  # blank line between the two prompt reports
    pipeline(model, prompt, layer=10, n_heads=16)

Loaded pretrained model pythia-410m into HookedTransformer
Top 5 most likely next tokens:
Rank 1: ' Bob' | Logit: 18.5772
Rank 2: ' Carol' | Logit: 18.3591
Rank 3: ' Alice' | Logit: 16.2441
Rank 4: ' her' | Logit: 15.1980
Rank 5: ' him' | Logit: 14.3990

Stats for ' Carol' -> Rank: 2, Logit Score: 18.3591
Stats for ' Alice' -> Rank: 3, Logit Score: 16.2441
Stats for ' both' -> Rank: 43, Logit Score: 11.6170

Top 5 most likely next tokens:
Rank 1: ' Bob' | Logit: 18.2777
Rank 2: ' Carol' | Logit: 16.7947
Rank 3: ' Alice' | Logit: 16.5323
Rank 4: ' her' | Logit: 14.9715
Rank 5: ' him' | Logit: 14.0250

Stats for ' Carol' -> Rank: 2, Logit Score: 16.7947
Stats for ' Alice' -> Rank: 3, Logit Score: 16.5323
Stats for ' both' -> Rank: 43, Logit Score: 11.6205


In [78]:
# Pythia-1b: report next-token stats for both prompts. `layer` and `n_heads`
# only affect the attention plot, which pipeline skips unless graph=True.
model = HookedTransformer.from_pretrained("pythia-1b")
for run_idx, prompt in enumerate((prompt1, prompt2)):
    if run_idx:
        print()  # blank line between the two prompt reports
    pipeline(model, prompt, layer=10, n_heads=8)

Loaded pretrained model pythia-1b into HookedTransformer
Top 5 most likely next tokens:
Rank 1: ' Carol' | Logit: 18.3959
Rank 2: ' Bob' | Logit: 18.2088
Rank 3: ' Alice' | Logit: 14.5569
Rank 4: ' the' | Logit: 13.8756
Rank 5: ' Charlie' | Logit: 13.3573

Stats for ' Carol' -> Rank: 1, Logit Score: 18.3959
Stats for ' Alice' -> Rank: 3, Logit Score: 14.5569
Stats for ' both' -> Rank: 6, Logit Score: 13.0936

Top 5 most likely next tokens:
Rank 1: ' Bob' | Logit: 19.3744
Rank 2: ' Alice' | Logit: 18.0063
Rank 3: ' Carol' | Logit: 14.6690
Rank 4: ' the' | Logit: 13.9192
Rank 5: ' her' | Logit: 13.0405

Stats for ' Carol' -> Rank: 3, Logit Score: 14.6690
Stats for ' Alice' -> Rank: 2, Logit Score: 18.0063
Stats for ' both' -> Rank: 8, Logit Score: 12.8641


In [79]:
# Pythia-1.4b: report next-token stats for both prompts. `layer` and `n_heads`
# only affect the attention plot, which pipeline skips unless graph=True.
model = HookedTransformer.from_pretrained("pythia-1.4b")
for run_idx, prompt in enumerate((prompt1, prompt2)):
    if run_idx:
        print()  # blank line between the two prompt reports
    pipeline(model, prompt, layer=10, n_heads=16)

Loaded pretrained model pythia-1.4b into HookedTransformer
Top 5 most likely next tokens:
Rank 1: ' Carol' | Logit: 19.2994
Rank 2: ' Bob' | Logit: 17.6222
Rank 3: ' Alice' | Logit: 15.1997
Rank 4: ' both' | Logit: 14.3152
Rank 5: ' the' | Logit: 13.1826

Stats for ' Carol' -> Rank: 1, Logit Score: 19.2994
Stats for ' Alice' -> Rank: 3, Logit Score: 15.1997
Stats for ' both' -> Rank: 4, Logit Score: 14.3152

Top 5 most likely next tokens:
Rank 1: ' Alice' | Logit: 18.4025
Rank 2: ' Bob' | Logit: 18.0595
Rank 3: ' Carol' | Logit: 14.7172
Rank 4: ' both' | Logit: 13.7493
Rank 5: '
' | Logit: 13.4370

Stats for ' Carol' -> Rank: 3, Logit Score: 14.7172
Stats for ' Alice' -> Rank: 1, Logit Score: 18.4025
Stats for ' both' -> Rank: 4, Logit Score: 13.7493


In [81]:
# Pythia-2.8b: report next-token stats for both prompts. `layer` and `n_heads`
# only affect the attention plot, which pipeline skips unless graph=True.
model = HookedTransformer.from_pretrained("pythia-2.8b")
for run_idx, prompt in enumerate((prompt1, prompt2)):
    if run_idx:
        print()  # blank line between the two prompt reports
    pipeline(model, prompt, layer=10, n_heads=8)

Loaded pretrained model pythia-2.8b into HookedTransformer
Top 5 most likely next tokens:
Rank 1: ' Carol' | Logit: 17.8021
Rank 2: ' Bob' | Logit: 15.9951
Rank 3: ' Alice' | Logit: 14.7027
Rank 4: ' both' | Logit: 13.7414
Rank 5: ' herself' | Logit: 13.2054

Stats for ' Carol' -> Rank: 1, Logit Score: 17.8021
Stats for ' Alice' -> Rank: 3, Logit Score: 14.7027
Stats for ' both' -> Rank: 4, Logit Score: 13.7414

Top 5 most likely next tokens:
Rank 1: ' Alice' | Logit: 18.2920
Rank 2: ' Bob' | Logit: 15.1436
Rank 3: ' Al' | Logit: 13.8783
Rank 4: ' both' | Logit: 13.8685
Rank 5: '
' | Logit: 13.3322

Stats for ' Carol' -> Rank: 6, Logit Score: 13.1340
Stats for ' Alice' -> Rank: 1, Logit Score: 18.2920
Stats for ' both' -> Rank: 4, Logit Score: 13.8685


In [4]:
# model = HookedTransformer.from_pretrained("pythia-6.9b")
# prompt = prompt1
# pipeline(model, prompt, 10, 8)
# print()
# prompt = prompt2
# pipeline(model, prompt, 10, 8)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.94G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.91G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Loaded pretrained model pythia-6.9b into HookedTransformer
Top 5 most likely next tokens:
Rank 1: ' Carol' | Logit: 17.9819
Rank 2: ' Bob' | Logit: 17.2026
Rank 3: ' Charlie' | Logit: 13.3058
Rank 4: ' Alice' | Logit: 13.2837
Rank 5: '
' | Logit: 13.2825

Stats for ' Carol' -> Rank: 1, Logit Score: 17.9819
Stats for ' Alice' -> Rank: 4, Logit Score: 13.2837
Stats for ' both' -> Rank: 9, Logit Score: 12.7779

Top 5 most likely next tokens:
Rank 1: ' Alice' | Logit: 18.7275
Rank 2: ' Bob' | Logit: 17.8020
Rank 3: ' Carol' | Logit: 14.7108
Rank 4: '
' | Logit: 14.4201
Rank 5: ' both' | Logit: 13.7236

Stats for ' Carol' -> Rank: 3, Logit Score: 14.7108
Stats for ' Alice' -> Rank: 1, Logit Score: 18.7275
Stats for ' both' -> Rank: 5, Logit Score: 13.7236


In [5]:
# import torch
# import gc

# # 1. Delete variables holding heavy data
# # Add any other variable names you want to clear to this list
# for var in ['model', 'logits', 'cache']:
#     if var in globals():
#         del globals()[var]

# # 2. Run Python's garbage collector to clean up deleted objects
# gc.collect()

# # 3. Clear PyTorch's internal GPU cache
# if torch.cuda.is_available():
#     torch.cuda.empty_cache()
#     print("GPU memory cleared!")

GPU memory cleared!


In [6]:
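# Disabled: the last recorded run of this cell (output below) loaded pythia-12b
# and then failed with a CUDA OutOfMemoryError on a ~79 GiB GPU.
# model = HookedTransformer.from_pretrained("pythia-12b")
# prompt = prompt1
# pipeline(model, prompt, 10, 8)
# print()
# prompt = prompt2
# pipeline(model, prompt, 10, 8)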

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.93G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.81G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.11G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Loaded pretrained model pythia-12b into HookedTransformer


OutOfMemoryError: CUDA out of memory. Tried to allocate 100.00 MiB. GPU 0 has a total capacity of 79.32 GiB of which 89.88 MiB is free. Process 3531 has 79.22 GiB memory in use. Of the allocated memory 78.73 GiB is allocated by PyTorch, and 6.48 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)