In [1]:
# Mount Google Drive at /content/drive; the data and config paths used below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# !pip install git+https://github.com/google/BIG-bench.git # This may take a few minutes
!pip install datasets==2.21.0
!pip install transformer_lens
!pip install -r /content/drive/MyDrive/University/CoT/geometry-of-truth/requirements.txt



In [None]:
!pip list

Package                            Version
---------------------------------- -------------------
absl-py                            1.4.0
accelerate                         0.34.2
aiohappyeyeballs                   2.4.3
aiohttp                            3.10.10
aiosignal                          1.3.1
alabaster                          0.7.16
albucore                           0.0.16
albumentations                     1.4.15
altair                             4.2.2
annotated-types                    0.7.0
anyio                              3.7.1
argon2-cffi                        23.1.0
argon2-cffi-bindings               21.2.0
array_record                       0.5.1
arviz                              0.19.0
astropy                            6.1.4
astropy-iers-data                  0.2024.10.7.0.32.46
astunparse                         1.6.3
async-timeout                      4.0.3
atpublic                           4.1.0
attrs                              24.2.0
audioread        

# Imports

In [3]:
import torch
import json
import configparser
import huggingface_hub
import pandas as pd
from transformers import AutoTokenizer
from tqdm import tqdm
from transformer_lens import HookedTransformer
from transformer_lens.hook_points import HookPoint
from transformer_lens.utils import get_act_name

# Constants

In [4]:
# Input files on the mounted Google Drive.
DATA_PATH = "/content/drive/MyDrive/University/CoT/data/com2sense.json"
CONFIG_PATH = "/content/drive/MyDrive/University/CoT/geometry-of-truth/config.ini"
# Section name looked up in the config file by load_model; also used in the saved file names.
MODEL_NAME = "llama-3.2-1b"
# Number of rows that load_data samples into the `train` split.
N_SHOTS = 5
# NOTE(review): not referenced in the visible cells; obtain_act_diff is called with a literal 1.
BATCH_SIZE = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): not referenced in the visible cells.
remote = False
# Upper bound on the number of test queries passed to obtain_act_diff.
MAX_IDX = 100
LAYERS = "ALL"    # Layers to look at activations, "ALL" or list of int
TOP_K = 1        # Top k activations to look at
# Inference only: disable autograd globally for the whole notebook.
torch.set_grad_enabled(False)

print(f"Executing on device {device}")

Executing on device cuda


# Load data

In [5]:
def load_data(n_shots: int, data_path: str, data_key: str = "examples", lookup_key: str = "pair_id_lookup", random_state=None) -> tuple:
    """
    Load the dataset JSON and split it into a few-shot ("train") sample and the remaining ("test") rows.
    args:
        n_shots: Number of rows sampled (without replacement) into the train split.
        data_path: Path to a JSON file that contains the `data_key` and `lookup_key` entries.
        data_key: Key of the list of example records inside the JSON file.
        lookup_key: Key of the pair-id lookup object inside the JSON file; it is returned unchanged.
        random_state: Optional seed (or RNG) forwarded to DataFrame.sample. With the default None
            the sample is not seeded here, so the split can differ between runs.
    returns: tuple (train, test, pair_id_lookup) where train and test are DataFrames that keep the
        original row index and share no rows.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default encoding.
    with open(data_path, "r", encoding="utf-8") as file:
        raw = json.load(file)

    examples = pd.DataFrame(raw[data_key])
    pair_id_lookup = raw[lookup_key]

    train = examples.sample(n_shots, random_state=random_state)
    # Everything that was not drawn for the few-shot split becomes the test split.
    test = examples.drop(train.index)

    return train, test, pair_id_lookup

# No seed is set for the sampling, so the few-shot/test split can differ between kernel runs.
train, test, pair_id_lookup = load_data(N_SHOTS, DATA_PATH)

# Load model

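Log in with your own Hugging Face access token (create one at https://huggingface.co/settings/tokens) when prompted below. Do not paste tokens into the notebook: store the token as the `HF_TOKEN` Colab secret instead. The token that was previously written here has been redacted and should be treated as compromised and revoked.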

In [6]:
# Interactive Hugging Face login; called without arguments, so the token is entered at run time.
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files it actually parsed,
# so an empty result means CONFIG_PATH was not read. Fail here instead of with a KeyError later.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(f"Could not read config file: {CONFIG_PATH}")

def load_model(model_name: str, device='remote') -> HookedTransformer:
    """
    Build a HookedTransformer for `model_name` and attach the matching tokenizer.
    args:
        model_name: Section of the module-level `config` whose 'weights_directory' entry is used
            for both the model weights and the tokenizer.
        device: Passed through to HookedTransformer.from_pretrained.
            NOTE(review): the default 'remote' is not overridden anywhere but the call site - confirm it is still meaningful.
    returns: The loaded model, with `tokenizer` replaced by the AutoTokenizer loaded from the same directory.
    """
    print(f"Loading model {model_name}...")
    weights_dir = config[model_name]['weights_directory']
    hooked = HookedTransformer.from_pretrained(weights_dir, dtype=torch.bfloat16, device=device)
    # Swap in the tokenizer loaded directly from the weights directory.
    hooked.tokenizer = AutoTokenizer.from_pretrained(weights_dir)
    return hooked

model = load_model(MODEL_NAME, device=device)
# Pad on the right and reuse the EOS token as the padding token.
# NOTE(review): the activation hooks below read sequence position -1; with right padding that is only
# the last real token when every prompt in a batch has the same length (e.g. batch size 1).
model.tokenizer.padding_side = "right"
model.tokenizer.pad_token = model.tokenizer.eos_token
# Print the module tree to see the available hook points.
print(model)

Loading model llama-3.2-1b...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loaded pretrained model meta-llama/Llama-3.2-1B into HookedTransformer
HookedTransformer(
  (embed): Embed()
  (hook_embed): HookPoint()
  (blocks): ModuleList(
    (0-15): 16 x TransformerBlock(
      (ln1): RMSNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (ln2): RMSNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (attn): GroupedQueryAttention(
        (hook_k): HookPoint()
        (hook_q): HookPoint()
        (hook_v): HookPoint()
        (hook_z): HookPoint()
        (hook_attn_scores): HookPoint()
        (hook_pattern): HookPoint()
        (hook_result): HookPoint()
        (hook_rot_k): HookPoint()
        (hook_rot_q): HookPoint()
      )
      (mlp): GatedMLP(
        (hook_pre): HookPoint()
        (hook_pre_linear): HookPoint()
        (hook_post): HookPoint()
      )
      (hook_attn_in): HookPoint()
      (hook_q_input): HookPoint()
      (hook_k_input): HookPoint()
      (hook

### LLaMa 3.2 1b architecture
```python
LlamaForCausalLM(  
  (model): LlamaModel(  
    (embed_tokens): Embedding(128256, 2048)  
    (layers): ModuleList(  
      (0-15): 16 x LlamaDecoderLayer(  
        (self_attn): LlamaSdpaAttention(  
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)  
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)  
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)  
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)  
          (rotary_emb): LlamaRotaryEmbedding()  
        )  
        (mlp): LlamaMLP(  
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)  
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)  
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)  
          (act_fn): SiLU()  
        )  
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)  
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)  
      )  
    )  
    (norm): LlamaRMSNorm((2048,), eps=1e-05)  
    (rotary_emb): LlamaRotaryEmbedding()  
  )  
  (lm_head): Linear(in_features=2048, out_features=128256, bias=False)  
  (generator): WrapperModule()  
)
```

# Obtain layer-level residual-stream activations for zero-shot CoT vs. non-CoT prompts

In [8]:
def get_layer_acts_post_resid(statements, model: HookedTransformer, layers: list) -> dict:
    """
    Run the model once on `statements` and capture the "resid_post" activation of each requested layer
    at the final sequence position.
    args:
        statements: The statements to obtain activations for (passed to model.run_with_hooks as-is).
        model: The model to use.
        layers: The layers (int) to obtain activations for as a list.
    returns: dictionary mapping hook name to the activation sliced as value[:, -1, :]
        (one vector per batch element).
    NOTE(review): position -1 is taken for every batch element; if prompts of different lengths are
        padded on the right, this may be a padding position - confirm before using batch sizes > 1.
    """
    captured = {}

    def store_last_position(value: torch.Tensor, hook: HookPoint):
        # Keep only the last position along dim 1 of the hooked activation.
        captured[hook.name] = value[:, -1, :]

    fwd_hooks = [
        (get_act_name("resid_post", layer=layer_idx), store_last_position)
        for layer_idx in layers
    ]

    # Only the hook side effects are needed, so no model output is requested.
    model.run_with_hooks(statements, fwd_hooks=fwd_hooks, return_type=None)

    return captured

def get_layer_acts_attn(statements, model: HookedTransformer, layers: list) -> tuple:
    """
    Run the model once on `statements` and capture the attention q, k and v activations of each
    requested layer at the final sequence position.
    args:
        statements: The statements to obtain activations for (passed to model.run_with_hooks as-is).
        model: The model to use.
        layers: The layers (int) to obtain activations for as a list.
    returns: tuple (acts_q, acts_k, acts_v); each is a dictionary mapping hook name to the activation
        sliced as value[:, -1, :, :] (per batch element: heads x head dimension).
    """
    acts_q, acts_k, acts_v = {}, {}, {}

    def make_recorder(store: dict):
        # Build a hook that writes the last-position slice into the given dictionary.
        def record(value: torch.Tensor, hook: HookPoint):
            store[hook.name] = value[:, -1, :, :]
        return record

    recorders = (
        ("q", make_recorder(acts_q)),
        ("k", make_recorder(acts_k)),
        ("v", make_recorder(acts_v)),
    )
    # Hooks are registered layer by layer, in q, k, v order within each layer.
    hooks = [
        (get_act_name(kind, layer=layer), recorder)
        for layer in layers
        for kind, recorder in recorders
    ]

    # Only the hook side effects are needed, so no model output is requested.
    model.run_with_hooks(statements, fwd_hooks=hooks, return_type=None)

    return acts_q, acts_k, acts_v


def obtain_act_diff(model: HookedTransformer, queries: pd.DataFrame, batch_size: int, exp: str, layers: list, train_prompt: str, prompt_key: str = "sent", max_idx: int = -1) -> tuple:
    """
    Obtains the residual-stream activation difference between queries and queries + exp.
    args:
        model: The model to use.
        queries: The queries to use.
        batch_size: The batch size to use.
        exp: The prompt to experiment with, added to the end of the sentence.
        layers: The list of layers (int) to obtain diff.
        train_prompt: Prefix prepended (exactly once) to every query, in both the plain and the
            experimental prompt. Pass "" for no prefix.
        prompt_key: The key in the queries dataframe that contains the prompt.
        max_idx: The maximum number of queries to use. Set to -1 to use all queries. Values larger
            than len(queries) are clamped to len(queries).
    returns:
        tuple (diffs_resid, acts_resid, acts_resid_exp). Each element is a list with one entry per
        batch; every entry is a dict mapping hook name to a tensor as returned by
        get_layer_acts_post_resid. diffs_resid holds experimental minus original activations.
        tuple<list<map<str, torch.Tensor>>>
    """
    diffs_resid = []
    acts_resid = []
    acts_resid_exp = []

    # -1 means "all queries"; otherwise never run past the end of the dataframe.
    n_queries = len(queries) if max_idx == -1 else min(max_idx, len(queries))

    for batch_idx in tqdm(range(0, n_queries, batch_size), desc="Processing batches"):
        # Cap the last batch so that no more than n_queries rows are processed in total.
        batch_end = min(batch_idx + batch_size, n_queries)
        raw_queries = queries.iloc[batch_idx:batch_end][prompt_key].tolist()

        batch = [train_prompt + query for query in raw_queries]
        # Build the experimental prompt from the already prefixed prompt so that train_prompt
        # appears once; only the trailing `exp` text differs between the two runs.
        batch_exp = [prompt + exp for prompt in batch]

        act_resid = get_layer_acts_post_resid(batch, model, layers)
        act_resid_exp = get_layer_acts_post_resid(batch_exp, model, layers)
        acts_resid.append(act_resid)
        acts_resid_exp.append(act_resid_exp)

        # Keys are hook names (e.g. "blocks.0.hook_resid_post"), identical in both dictionaries.
        diff_resid = {hook_name: act_resid_exp[hook_name] - act_resid[hook_name] for hook_name in act_resid}
        diffs_resid.append(diff_resid)

    # Attention q/k/v differences are not collected here; get_layer_acts_attn can be used in the
    # same way if they are needed.
    return diffs_resid, acts_resid, acts_resid_exp

# "ALL" expands to every transformer block index; otherwise LAYERS is used as given.
layers = list(range(len(model.blocks))) if LAYERS == "ALL" else LAYERS
# Zero-shot setting: empty prefix (train_prompt="") and the CoT trigger appended as `exp`.
# The batch size is passed as a literal 1; the later [:, :, 0, :] indexing relies on that.
diffs_resid, acts_resid, acts_resid_exp = obtain_act_diff(model, test, 1, " let's think step by step", layers, "", max_idx=MAX_IDX)

Processing batches: 100%|██████████| 100/100 [00:22<00:00,  4.38it/s]


In [9]:
# Sanity check: number of processed batches, the hook names used as keys, and the shape of one entry.
print(len(diffs_resid))
print(diffs_resid[0].keys())
print(diffs_resid[0]["blocks.0.hook_resid_post"].shape)

100
dict_keys(['blocks.0.hook_resid_post', 'blocks.1.hook_resid_post', 'blocks.2.hook_resid_post', 'blocks.3.hook_resid_post', 'blocks.4.hook_resid_post', 'blocks.5.hook_resid_post', 'blocks.6.hook_resid_post', 'blocks.7.hook_resid_post', 'blocks.8.hook_resid_post', 'blocks.9.hook_resid_post', 'blocks.10.hook_resid_post', 'blocks.11.hook_resid_post', 'blocks.12.hook_resid_post', 'blocks.13.hook_resid_post', 'blocks.14.hook_resid_post', 'blocks.15.hook_resid_post'])
torch.Size([1, 2048])


# Mean and standard deviation of the CoT − non-CoT activation difference

Stack the per-query dictionaries of per-layer activations into a single tensor with dimensions (query, layer, batch, hidden).

In [10]:
# Stack each query's per-layer tensors (dict insertion order), then stack the results across queries.
# NOTE: this rebinds `diffs_resid` from a list of dicts to a tensor, so the cell cannot be re-run
# without first re-running the cell that builds the list.
diffs_resid = torch.stack([torch.stack(list(per_query.values())) for per_query in diffs_resid])
# diffs_q = torch.stack([torch.stack([diffs_q[i][layer] for layer in diffs_q[i].keys()]) for i in range(len(diffs_q))])
# diffs_k = torch.stack([torch.stack([diffs_k[i][layer] for layer in diffs_k[i].keys()]) for i in range(len(diffs_k))])
# diffs_v = torch.stack([torch.stack([diffs_v[i][layer] for layer in diffs_v[i].keys()]) for i in range(len(diffs_v))])

In [11]:
# Same stacking as for the differences: layers first (dict insertion order), then queries.
# NOTE: `acts_resid` is rebound from a list of dicts to a tensor, so this cell is not re-runnable on
# its own. The experimental activations go from `acts_resid_exp` (list) to `acts_exp_resid` (tensor).
acts_resid = torch.stack([torch.stack(list(per_query.values())) for per_query in acts_resid])
acts_exp_resid = torch.stack([torch.stack(list(per_query.values())) for per_query in acts_resid_exp])
# acts_q = torch.stack([torch.stack([acts_q[i][layer] for layer in acts_q[i].keys()]) for i in range(len(acts_q))])
# acts_q_exp = torch.stack([torch.stack([acts_q_exp[i][layer] for layer in acts_q_exp[i].keys()]) for i in range(len(acts_q_exp))])
# acts_k = torch.stack([torch.stack([acts_k[i][layer] for layer in acts_k[i].keys()]) for i in range(len(acts_k))])
# acts_k_exp = torch.stack([torch.stack([acts_k_exp[i][layer] for layer in acts_k_exp[i].keys()]) for i in range(len(acts_k_exp))])
# acts_v = torch.stack([torch.stack([acts_v[i][layer] for layer in acts_v[i].keys()]) for i in range(len(acts_v))])
# acts_v_exp = torch.stack([torch.stack([acts_v_exp[i][layer] for layer in acts_v_exp[i].keys()]) for i in range(len(acts_v_exp))])

Remove the singleton batch dimension (each forward pass used a batch of size 1). The sequence dimension was already reduced to the last token inside the hook.

In [12]:
# Dim 2 is the per-call batch dimension (size 1 in the shapes printed above); index 0 drops it.
# NOTE: with a batch size other than 1 this would keep only the first element of every batch.
diffs_resid = diffs_resid[:, :, 0, :]
acts_resid = acts_resid[:, :, 0, :]
acts_exp_resid = acts_exp_resid[:, :, 0, :]

In [13]:
# All three tensors are expected to share the same shape after the indexing above.
print(diffs_resid.shape)
print(acts_resid.shape)
print(acts_exp_resid.shape)
# print(diffs_q.shape)
# print(diffs_k.shape)
# print(diffs_v.shape)

torch.Size([100, 16, 2048])
torch.Size([100, 16, 2048])
torch.Size([100, 16, 2048])


In [14]:
# Reduce over dim 0 (the query dimension): per-layer, per-hidden-unit mean and standard deviation
# of the activation difference.
# NOTE(review): the inputs are bfloat16 (the model is loaded with dtype=torch.bfloat16), so the
# results are rounded to bfloat16 as well; consider whether float32 statistics are needed.
mean_diffs_resid = torch.mean(diffs_resid, dim=0)
std_diffs_resid = torch.std(diffs_resid, dim=0)

In [15]:
# The query dimension has been reduced away; the layer and hidden dimensions remain.
print(mean_diffs_resid.shape)
print(std_diffs_resid.shape)

torch.Size([16, 2048])
torch.Size([16, 2048])


In [16]:
# Per layer: the range of the original (non-CoT) activations, for scale, next to the standard
# deviation of the difference averaged over the hidden dimension.
for i in range(mean_diffs_resid.shape[0]):
    print(f"Layer {i}")
    print(f"act_max: {torch.max(acts_resid[:, i, :])}, act_min: {torch.min(acts_resid[:, i, :])}")
    # print(f"mean: {mean_diffs_resid[i]}")
    # print(f"std: {std_diffs_resid[i]}")
    print(f"mean std: {torch.mean(std_diffs_resid[i])}")

Layer 0
act_max: 0.365234375, act_min: -0.3984375
mean std: 0.00909423828125
Layer 1
act_max: 0.8671875, act_min: -0.62890625
mean std: 0.0159912109375
Layer 2
act_max: 1.5390625, act_min: -0.94140625
mean std: 0.0255126953125
Layer 3
act_max: 1.484375, act_min: -1.21875
mean std: 0.0380859375
Layer 4
act_max: 1.59375, act_min: -1.3046875
mean std: 0.048583984375
Layer 5
act_max: 1.453125, act_min: -1.9921875
mean std: 0.052490234375
Layer 6
act_max: 0.8203125, act_min: -2.359375
mean std: 0.05712890625
Layer 7
act_max: 0.7890625, act_min: -3.0
mean std: 0.06396484375
Layer 8
act_max: 1.1953125, act_min: -3.0625
mean std: 0.06884765625
Layer 9
act_max: 1.4375, act_min: -4.1875
mean std: 0.078125
Layer 10
act_max: 1.59375, act_min: -4.21875
mean std: 0.08349609375
Layer 11
act_max: 2.0625, act_min: -5.03125
mean std: 0.09521484375
Layer 12
act_max: 2.28125, act_min: -5.96875
mean std: 0.1142578125
Layer 13
act_max: 2.734375, act_min: -7.6875
mean std: 0.1416015625
Layer 14
act_max: 3.46

# Save activations and diffs

In [None]:
# Save the stacked differences and original activations. The paths are relative, so the files are
# written to the current working directory; file names are lower-cased.
# NOTE(review): LAYERS is interpolated verbatim, so a list value ends up in the file name with
# brackets and spaces. The experimental activations (acts_exp_resid) are not saved here.
torch.save(diffs_resid, f"{MODEL_NAME}_diffs_resid_{LAYERS}_layers.pt".lower())
# torch.save(diffs_q, f"{MODEL_NAME}_diffs_q_{LAYERS}_layers.pt".lower())
# torch.save(diffs_k, f"{MODEL_NAME}_diffs_k_{LAYERS}_layers.pt".lower())
# torch.save(diffs_v, f"{MODEL_NAME}_diffs_v_{LAYERS}_layers.pt".lower())
torch.save(acts_resid, f"{MODEL_NAME}_acts_resid_{LAYERS}_layers.pt".lower())
# torch.save(acts_q, f"{MODEL_NAME}_acts_q_{LAYERS}_layers.pt".lower())
# torch.save(acts_k, f"{MODEL_NAME}_acts_k_{LAYERS}_layers.pt".lower())
# torch.save(acts_v, f"{MODEL_NAME}_acts_v_{LAYERS}_layers.pt".lower())

# Find CoT related activations by averaging out irrelevant features

In [17]:
def get_min_std_indeces(std_resid: torch.Tensor, k: int = 10):
    """
    For every layer (row of `std_resid`), find the positions of the k smallest standard deviations.
    args:
        std_resid: The standard deviation of the activations, indexed as [layer, hidden unit].
        k: Number of smallest entries to keep per layer. If a row has fewer than k entries,
            all of its indices are returned.
    returns:
        Tensor of indices with one row per layer, ordered from smallest to larger standard deviation.
    """
    # Ascending argsort per row, truncated to the first k positions.
    smallest_per_layer = [
        torch.argsort(std_resid[row, :], dim=0, descending=False)[:k]
        for row in range(std_resid.shape[0])
    ]
    return torch.stack(smallest_per_layer)

# Per layer, the TOP_K hidden units whose CoT − non-CoT difference has the smallest standard deviation across queries.
indeces = get_min_std_indeces(std_diffs_resid, TOP_K)

In [19]:
# Show the selected indices per layer and, via gather along dim 1, the standard deviations at those indices.
print(indeces.shape)
print(indeces)
print(torch.gather(std_diffs_resid, 1, indeces))

torch.Size([16, 1])
tensor([[  85],
        [ 859],
        [2044],
        [2044],
        [1107],
        [2044],
        [2016],
        [ 978],
        [ 897],
        [1067],
        [ 556],
        [ 727],
        [1148],
        [2044],
        [2044],
        [ 943]], device='cuda:0')
tensor([[0.0049],
        [0.0085],
        [0.0134],
        [0.0245],
        [0.0299],
        [0.0299],
        [0.0356],
        [0.0386],
        [0.0420],
        [0.0486],
        [0.0471],
        [0.0510],
        [0.0664],
        [0.0801],
        [0.0942],
        [0.1436]], device='cuda:0', dtype=torch.bfloat16)
