In [1]:
!pip list

Package                       Version
----------------------------- ------------
accelerate                    1.0.1
aiohttp                       3.8.6
aiosignal                     1.3.1
alembic                       1.12.0
altair                        5.1.2
annotated-types               0.7.0
anyio                         4.0.0
argon2-cffi                   23.1.0
argon2-cffi-bindings          21.2.0
arrow                         1.3.0
asttokens                     2.4.0
async-generator               1.10
async-lru                     2.0.4
async-timeout                 4.0.3
attrs                         23.1.0
automated-interpretability    0.0.6
babe                          0.0.7
Babel                         2.13.0
backcall                      0.2.0
backports.functools-lru-cache 1.6.5
beartype                      0.14.1
beautifulsoup4                4.12.2
better-abc                    0.0.3
bidict                        0.23.1
bleach                        6.1.0
blinker     

# Imports

In [None]:
import configparser
import json
from pathlib import Path

import pandas as pd
import torch
from tqdm import tqdm
from transformer_lens import HookedTransformer
from transformer_lens.hook_points import HookPoint
from transformer_lens.utils import get_act_name
from transformers import AutoTokenizer

from data_collection_utils import load_data, load_model, obtain_act_diff
from utils.loading_utils import get_pretrained_model

# Constants

In [3]:
# --- Dataset and files -------------------------------------------------------
DATASET = "com2sense"
DATA_PATH = f"./data/{DATASET}.json"
CONFIG_PATH = "./config.ini"

# --- Model and prompting -----------------------------------------------------
MODEL_NAME = "gemma-2-2b-it"
PRE_PROMPT = "Yes or no: "
PROMPT_KEY = "input"
N_SHOTS = 0

# --- Batching and data range -------------------------------------------------
BATCH_SIZE = 1
BATCH = 0          # Batch number; used as the suffix of the saved file names
START_IDX = 0
MAX_IDX = -1       # Number of indices to obtain data for, -1 for all

# --- Activation analysis -----------------------------------------------------
LAYERS = "ALL"     # Layers to look at activations for: "ALL" or a list of int
TOP_K = 100        # Top k activations to look at

# --- Runtime -----------------------------------------------------------------
remote = False
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Disable autograd globally for the rest of the session.
torch.set_grad_enabled(False)

print(f"Executing on device {device}")

Executing on device cuda


# Load data

In [None]:
# Load the dataset file at DATA_PATH through the project helper, passing N_SHOTS along.
# The helper returns three values, unpacked here as train, test and pair_id_lookup.
# NOTE(review): what each returned object contains is defined in data_collection_utils.load_data — confirm there.
train, test, pair_id_lookup = load_data(N_SHOTS, DATA_PATH)

# Load model

In [None]:
# Load MODEL_NAME onto the selected device through the project helper.
# NOTE(review): a later cell reads `model.blocks`, so load_model presumably returns a
# transformer_lens HookedTransformer — confirm against data_collection_utils.load_model.
model = load_model(MODEL_NAME, device=device)
# Print the loaded object's module structure as a quick check of what was loaded.
print(model)

Loading model gemma-2-2b-it...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear(in_features=2304, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2304, bias=False)
          (rotary_emb): Gemma2RotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (up_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (down_proj): Linear(in_features=9216, out_features=2304, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (pre_feedforward_layernorm): Gemma2RMSNorm((2304,), eps



After loading HookedTransformer allocated 20.019888877868652 GB cuda memory


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.20 GiB. GPU 0 has a total capacity of 31.73 GiB of which 1.23 GiB is free. Process 1493410 has 3.49 GiB memory in use. Process 1494121 has 2.47 GiB memory in use. Including non-PyTorch memory, this process has 24.54 GiB memory in use. Of the allocated memory 23.68 GiB is allocated by PyTorch, and 516.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# Obtain layer-level activations for zero-shot CoT vs. non-CoT prompts in the residual stream

In [None]:
# Resolve which layers to inspect: every block of the model when LAYERS is
# "ALL", otherwise the explicit list given in the constants cell.
if LAYERS == "ALL":
    layers = [*range(len(model.blocks))]
else:
    layers = LAYERS

# Collect the three per-sample results for the test split using the
# " let's think step by step" suffix.
# NOTE(review): the literal 1 is passed positionally; its meaning is defined by
# obtain_act_diff — confirm whether BATCH_SIZE was intended here.
diffs_resid, acts_resid, acts_resid_exp = obtain_act_diff(
    model,
    test,
    1,
    " let's think step by step",
    layers,
    PRE_PROMPT,
    max_idx=MAX_IDX,
    prompt_key=PROMPT_KEY,
    start_idx=START_IDX,
)

In [None]:
# Quick look at the collected differences, one item per output line:
# number of samples, the keys of the first sample's dict, and the shape of
# its "blocks.0.hook_resid_post" entry.
print(
    len(diffs_resid),
    diffs_resid[0].keys(),
    diffs_resid[0]["blocks.0.hook_resid_post"].shape,
    sep="\n",
)

Transform dict into tensor

In [None]:
# Collapse the per-sample dicts into a single tensor: stack the entries of each
# sample's dict (in dict order), then stack the samples along a new leading axis.
#
# This cell rebinds `diffs_resid` from a list of dicts to a tensor, so without
# the guard a second execution would fail on `.keys()`. The isinstance check
# keeps the cell safe to re-run; fresh output from obtain_act_diff is still
# stacked as before.
if not isinstance(diffs_resid, torch.Tensor):
    diffs_resid = torch.stack(
        [torch.stack(list(sample.values())) for sample in diffs_resid]
    )

# Query/key/value differences (diffs_q, diffs_k, diffs_v) are not produced by
# the cells shown here; if they are collected later, stack them the same way.

In [None]:
# Collapse the per-sample activation dicts into tensors: stack the entries of
# each sample's dict (in dict order), then stack the samples along a new
# leading axis.
#
# `acts_resid` is rebound from a list of dicts to a tensor, so it is guarded to
# keep the cell safe to re-run. `acts_exp_resid` is a new name derived from
# `acts_resid_exp`, which is left untouched, so it needs no guard.
if not isinstance(acts_resid, torch.Tensor):
    acts_resid = torch.stack(
        [torch.stack(list(sample.values())) for sample in acts_resid]
    )
acts_exp_resid = torch.stack(
    [torch.stack(list(sample.values())) for sample in acts_resid_exp]
)

# Query/key/value activations (acts_q, acts_q_exp, acts_k, acts_k_exp, acts_v,
# acts_v_exp) are not produced by the cells shown here; if they are collected
# later, stack them the same way.

Remove the sequence dimension, as we are only looking at the last token.

In [None]:
# Keep index 0 along the third axis of each stacked tensor, dropping that axis.
# NOTE(review): this is "the last token" only if that axis has length 1 — confirm
# against the shapes returned by obtain_act_diff.
diffs_resid, acts_resid, acts_exp_resid = (
    stacked[:, :, 0, :]
    for stacked in (diffs_resid, acts_resid, acts_exp_resid)
)

In [None]:
# Report the final shape of each tensor, one per output line.
print(diffs_resid.shape, acts_resid.shape, acts_exp_resid.shape, sep="\n")
# Shapes of the query/key/value differences, kept disabled as in the original:
# print(diffs_q.shape)
# print(diffs_k.shape)
# print(diffs_v.shape)

# Save activations and difference data

The tensors are saved in case we want to analyze neurons with sparse autoencoders (SAEs) in the future.

In [None]:
# Persist the three tensors for this batch under
# ./experimental_data/<MODEL_NAME>/<DATASET>/, with BATCH as the file suffix.
output_dir = Path(f"./experimental_data/{MODEL_NAME}/{DATASET}")
# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing (no-op when it is already there).
output_dir.mkdir(parents=True, exist_ok=True)

torch.save(diffs_resid, output_dir / f"diffs_resid_{BATCH}.pt")
torch.save(acts_resid, output_dir / f"acts_resid_{BATCH}.pt")
torch.save(acts_exp_resid, output_dir / f"acts_exp_resid_{BATCH}.pt")