<a href="https://colab.research.google.com/github/Reese-Martin/MI_practice/blob/main/streamlit_TransformerLens_intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# colab needs to have non-standard libraries reinstalled (because I am being lazy)
%pip install einops fancy_einsum torchtyping transformer_lens circuitsvis plot_utils

Collecting einops
  Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fancy_einsum
  Downloading fancy_einsum-0.0.3-py3-none-any.whl (6.2 kB)
Collecting torchtyping
  Downloading torchtyping-0.1.4-py3-none-any.whl (17 kB)
Collecting transformer_lens
  Downloading transformer_lens-2.0.0-py3-none-any.whl (144 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m145.0/145.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting circuitsvis
  Downloading circuitsvis-1.43.2-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting plot_utils
  Downloading plot_utils-0.6.14-py2.py3-none-any.whl (13.3 MB)
[2K     [90m━━

In [2]:
# imports come straight from the streamlit page
import os; os.environ["ACCELERATE_DISABLE_RICH"] = "1"
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook_connected" # or use "browser" if you want plots to open with browser
import plotly.graph_objects as go
import torch as t
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import einops
from fancy_einsum import einsum
from torchtyping import TensorType as TT
from typing import List, Optional, Tuple, Union
import functools
from tqdm import tqdm
from IPython.display import display

from transformer_lens.hook_points import HookPoint
from transformer_lens import utils, HookedTransformer, HookedTransformerConfig, FactoredMatrix, ActivationCache
import circuitsvis as cv

import tests
import plot_utils

# Saves computation time, since we don't need it for the contents of this notebook
t.set_grad_enabled(False)

MAIN = __name__ == "__main__"

def imshow(tensor, xaxis="", yaxis="", caxis="", **kwargs):
    return px.imshow(utils.to_numpy(tensor), color_continuous_midpoint=0.0, color_continuous_scale="RdBu", labels={"x":xaxis, "y":yaxis, "color":caxis}, **kwargs)

def line(tensor, xaxis="", yaxis="", **kwargs):
    return px.line(utils.to_numpy(tensor), labels={"x":xaxis, "y":yaxis}, **kwargs)

def scatter(x, y, xaxis="", yaxis="", caxis="", **kwargs):
    x = utils.to_numpy(x)
    y = utils.to_numpy(y)
    return px.scatter(y=y, x=x, labels={"x":xaxis, "y":yaxis, "color":caxis}, **kwargs)


`ACCELERATE_DISABLE_RICH` is deprecated and will be removed in v0.22.0 and deactivated by default. Please use `ACCELERATE_ENABLE_RICH` if you wish to use `rich`.



In [3]:
device = t.device("cuda" if t.cuda.is_available() else "cpu")

gpt2_small = HookedTransformer.from_pretrained("gpt2-small", device=device)


`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Loaded pretrained model gpt2-small into HookedTransformer


In [4]:
# EXERCISE: inspect model for num layers, Heads/Layer, Maximum context
# use model.cfg to see all params, model.cfg.BLAH to see blah

print('Layers: ', gpt2_small.cfg.n_layers)
print('Heads/Layer: ', gpt2_small.cfg.n_heads)
print('Max context: ', gpt2_small.cfg.n_ctx)

# weirdly, it seems the wrong model was loaded. rather than gpt2_small, we loaded gpt2
# (main difference is 12 layers instead of 2 and 2048 ctx instead of 1048). looks like
# there were changes to the transformerLens code after this tutorial was created

Layers:  12
Heads/Layer:  12
Max context:  1024


In [5]:
# digging in to running the model and loss
model_description_text = '''## Loading Models

HookedTransformer comes loaded with over 40 open source GPT-style models. You can load any of them in with `HookedTransformer.from_pretrained(MODEL_NAME)`. Each model is loaded into the consistent HookedTransformer architecture, designed to be clean, consistent and interpretability-friendly.

For this demo notebook we'll look at GPT-2 Small, an 80M parameter model. To try the model the model out, let's find the loss on this paragraph!'''

loss = gpt2_small(model_description_text, return_type="loss")
print("Model loss:", loss)

Model loss: tensor(4.3204, device='cuda:0')


Digression for the difference between Parameters and activations.

Parameters: the weights/biases of the trained model. Will not change when model input changes. Accessible directly from the model.

Activations: temp. numbers calculated during the forward pass. Normally inacessible, functions of the input. Hooks are needed to access these values during a forward pass. **Attention Scores and patterns are activations**

Useful shortcuts:
- you can access the weights of the model in two ways
  - model.blocks[n].attn.W_Q which returns the nth blocks querry weights.
  - model.W_Q returns the [nLayers, nHeads, d_model, d_head] querry weights of the entire model. such shortcuts exist for the W_E, W_U, W_Pos matrices as well.
  - models containing MLP layers will also have W_in and W_out for the linear layers
  - all true for biases

The model stores its tokenizer, accessible by model.tokenizer
- model.to_str_tokens(text) converts a string into a tensor of tokens-as-strings.
- model.to_tokens(text) converts a string into a tensor of tokens.
- model.to_string(tokens) converts a tensor of tokens into a string.

In [6]:
# tokenizer examples
print(gpt2_small.to_str_tokens("gpt2"))
print(gpt2_small.to_tokens("gpt2"))
print(gpt2_small.to_string([50256,70,457,17]))

['<|endoftext|>', 'g', 'pt', '2']
tensor([[50256,    70,   457,    17]], device='cuda:0')
<|endoftext|>gpt2


In [7]:
# Exercise- how many words does your model guess correctly?
# the logits are the guesses (well the logits are unembedded to be the guesses?)
# so if we extract the logits from the model, unembedd those??, then display the
# guesses vs. the actual text we may see how correct it was?
logits = gpt2_small(model_description_text, return_type="logits")

In [32]:
### my attempt at the problem
# first find the vocab dimension of the logits from the text entered (1st axis)
# logits.shape
maxs = t.argmax(logits,2)
# maxs should now contain the tokens that were most likely after the initial token given.
# generate the tokens for the model_description_text
input_toks = gpt2_small.to_tokens(model_description_text)

# compare by putting together the nth input token to the nth-1 output token
tmp = input_toks[0,1:]
tmp2 = maxs[0,:-1]
matched = t.eq(tmp,tmp2)
# then can use the t.eq() to evaluate how many of the predicted tokens match the string given
print('good predictions: ', sum([1 for i in matched if i])/len(tmp))



good predictions:  0.3125


In [30]:
### below is the code used by the example I am following, reproduced here for validation
logits = gpt2_small(model_description_text, return_type="logits")
prediction = logits.argmax(dim=-1).squeeze()[:-1]
true_tokens = gpt2_small.to_tokens(model_description_text).squeeze()[1:]
num_correct = (prediction == true_tokens).sum()

print("Model accuracy: ", num_correct/len(true_tokens)) # changed this slightly to get a percentage output

Model accuracy:  tensor(0.3125, device='cuda:0')


In [35]:
# the correct words
gpt2_small.to_str_tokens(tmp2[matched])
# tutorial now makes mention of induction heads, for the B follows A, so whenever you have
# A predict B logic. Looks like this is occurring with Trans and former tokens
# actually seems like it was the 'ooked' 'trans' 'former' following H.

['\n',
 '\n',
 'former',
 ' with',
 ' source',
 ' models',
 '.',
 ' can',
 ' of',
 ' them',
 'ooked',
 'Trans',
 'former',
 '.',
 '_',
 'NAME',
 '`.',
 ' model',
 'ed',
 'Trans',
 'former',
 ' to',
 ' be',
 ' and',
 '-',
 '.',
 '\n',
 ' at',
 'PT',
 '-',
 ',',
 ' model',
 ',',
 "'s",
 ' the']

In [36]:
### activation caching. 'break open the black box' by looking at internal model activations
gpt2_text = "Natural language processing tasks, such as question answering, machine translation, reading comprehension, and summarization, are typically approached with supervised learning on taskspecific datasets."

gpt2_tokens = gpt2_small.to_tokens(gpt2_text)

logits, cache = gpt2_small.run_with_cache(gpt2_tokens, remove_batch_dim=True)

In [37]:
# look at what we made
cache

ActivationCache with keys ['hook_embed', 'hook_pos_embed', 'blocks.0.hook_resid_pre', 'blocks.0.ln1.hook_scale', 'blocks.0.ln1.hook_normalized', 'blocks.0.attn.hook_q', 'blocks.0.attn.hook_k', 'blocks.0.attn.hook_v', 'blocks.0.attn.hook_attn_scores', 'blocks.0.attn.hook_pattern', 'blocks.0.attn.hook_z', 'blocks.0.hook_attn_out', 'blocks.0.hook_resid_mid', 'blocks.0.ln2.hook_scale', 'blocks.0.ln2.hook_normalized', 'blocks.0.mlp.hook_pre', 'blocks.0.mlp.hook_post', 'blocks.0.hook_mlp_out', 'blocks.0.hook_resid_post', 'blocks.1.hook_resid_pre', 'blocks.1.ln1.hook_scale', 'blocks.1.ln1.hook_normalized', 'blocks.1.attn.hook_q', 'blocks.1.attn.hook_k', 'blocks.1.attn.hook_v', 'blocks.1.attn.hook_attn_scores', 'blocks.1.attn.hook_pattern', 'blocks.1.attn.hook_z', 'blocks.1.hook_attn_out', 'blocks.1.hook_resid_mid', 'blocks.1.ln2.hook_scale', 'blocks.1.ln2.hook_normalized', 'blocks.1.mlp.hook_pre', 'blocks.1.mlp.hook_post', 'blocks.1.hook_mlp_out', 'blocks.1.hook_resid_post', 'blocks.2.hook_re

In [42]:
# each entry in the cache is a tensor, with entry tokens X other dims (i.e. 33x768)
# for tokens by residual stream
# some have
cache['hook_embed']
cache['hook_embed'].shape

torch.Size([33, 768])

In [47]:
# checking dims of the values corresponding to keys
sizes = []
keys = cache.keys()
for i in keys:
  sizes.append(cache[i].shape)

In [50]:
# can access cached values using either following method
cache['pattern',0]
cache['blocks.0.attn.hook_pattern']
# first function is using utils.get_act_name and then translating pattern to the appropriate attn pattern call

tensor([[[1.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [9.6394e-01, 3.6058e-02, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [8.3894e-01, 1.1829e-01, 4.2775e-02,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         ...,
         [7.1686e-02, 6.9244e-02, 1.9307e-02,  ..., 3.0615e-02,
          0.0000e+00, 0.0000e+00],
         [6.9108e-02, 3.7012e-02, 3.8621e-02,  ..., 5.0884e-02,
          3.4151e-02, 0.0000e+00],
         [1.4376e-01, 1.6811e-02, 9.3867e-03,  ..., 9.4509e-02,
          7.1715e-02, 3.3723e-03]],

        [[1.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [4.2467e-04, 9.9958e-01, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [5.6219e-04, 1.6407e-02, 9.8303e-01,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         ...,
         [1.3371e-05, 9.8215e-04, 4.1548e-04,  ..., 9.8880e-01,
          0.000

In [72]:
### EXERCISE: verify the activations. verify that hook_q, hook_k, and hook_pattern
# are related to eachother in the way implied by the transformer diagram.

layer0_pattern_from_cache = cache['pattern', 0]
q = cache['q',0]
k = cache['k',0]
#iirc the
layer0_pattern_scores = einsum('seqQ nhead d_head, seqK nhead d_head -> nhead seqQ seqK', q,k)

#using torch testing
#t.testing.assert_close(layer0_pattern_from_cache, layer0_pattern_from_Q_K)

# didn't work because I forgot to mask and softmax the QK matrix
mask = t.triu(t.ones((q.shape[0], q.shape[0]), device=device, dtype=bool),diagonal=1)

layer0_pattern_scores.masked_fill_(mask, -1e-7)
layer0_pattern_from_Q_K = (layer0_pattern_scores / q.shape[2]**.05).softmax(-1)

# now should be able to compare
t.testing.assert_close(layer0_pattern_from_cache, layer0_pattern_from_Q_K)


AssertionError: Tensor-likes are not close!

Mismatched elements: 10534 / 13068 (80.6%)
Greatest absolute difference: 0.9999995231628418 at index (7, 31, 32) (up to 1e-05 allowed)
Greatest relative difference: 2.845881919722175e+27 at index (1, 25, 14) (up to 1.3e-06 allowed)

In [75]:
### from the tutorial
layer0_pattern_from_cache = cache["pattern", 0]

q, k = cache["q", 0], cache["k", 0]
seq, nhead, headsize = q.shape
layer0_attn_scores = einsum("seqQ n h, seqK n h -> n seqQ seqK", q, k)
mask = t.triu(t.ones((seq, seq), device=device, dtype=bool), diagonal=1)
layer0_attn_scores.masked_fill_(mask, -1e9)
layer0_pattern_from_q_and_k = (layer0_attn_scores / headsize**0.5).softmax(-1)

t.testing.assert_close(layer0_pattern_from_cache, layer0_pattern_from_q_and_k)

In [94]:
# mine doesn't pass, the tutorial does, what is different?
layer0_pattern_from_cache = cache['pattern', 0]
q, k = cache["q", 0], cache["k", 0]
seq, nhead, headsize = q.shape

layer0_attn_scores = einsum("seqQ n h, seqK n h -> n seqQ seqK", q, k)
mask = t.triu(t.ones((seq, seq), device=device, dtype=bool), diagonal=1)

layer0_attn_scores.masked_fill_(mask, -1e9)
layer0_pattern_from_Q_K = (layer0_attn_scores / headsize**0.5).softmax(-1)

# now should be able to compare
t.testing.assert_close(layer0_pattern_from_cache, layer0_pattern_from_Q_K)
# for some reason it works with their q_and)k but not my q_k. Cannot be sure why
# but restarting the kernel fixed the error so maybe there was weird floating point math
# going on that got reset.