# Cross-layer stability of subspaces

This notebook shows how to reproduce the experiments from Appendix Section I, "Cross-layer stability of subspaces", of the paper.

Requirements: 
- The model's information in `models.toml` if you want to use the `Lens.from_preset` method (as this notebook does)
- Probes configuration in `probes_config.json` 

In [1]:
from ssr.lens import Lens

# Preset to analyse. The presets this notebook has display titles for are:
# "qwen2.5_1.5b", "llama3.2_1b", "llama3.2_3b", "gemma2_2b".
# Only one assignment is kept: the earlier stacked assignments were all
# overwritten by the last one and made it easy to misread which model is loaded.
MODEL_NAME = "gemma2_2b"
lens = Lens.from_preset(MODEL_NAME)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loaded pretrained model google/gemma-2-2b-it into HookedTransformer


In [2]:
import json
import torch as t
import einops 

from ssr.files  import load_dataset
from ssr.probes import activations_to_dataloader, train_and_test_classifier
from ssr import DEVICE, PROBES_CONFIG_PATH, pprint
from reproduce_experiments.plot import imshow

# Sizes of the train / validation slices taken from each scanned dataset.
TRAIN_SET = 120
VAL_SET = 40

# Human-readable model names for figure titles, keyed by preset identifier.
titles = {
    "llama3.2_1b": "Llama 3.2 1b",
    "llama3.2_3b": "Llama 3.2 3b",
    "qwen2.5_1.5b": "Qwen 2.5 1.5b",
    "gemma2_2b": "Gemma 2 2b",
}

# Title fragment and layer count for the model selected through MODEL_NAME.
display_name = titles[MODEL_NAME]
n_layers = lens.model.cfg.n_layers

### Scan the harmful and harmless instructions

_/!\\ Padding is set to false for gemma2 2b, so there are only a few samples /!\\_

In [3]:
# Scan the two instruction sets (hf = harmful, hl = harmless), asking for enough
# samples to fill both the train and the validation slices.
hf_scan, hl_scan = lens.auto_scan_dataset(max_samples=TRAIN_SET + VAL_SET)

# Split along the second axis: the first TRAIN_SET entries are used for training,
# the remainder for validation.
# NOTE(review): axes are presumably (layer, sample, d_model) - confirm against
# `auto_scan_dataset`. If fewer than TRAIN_SET + VAL_SET samples come back, the
# validation slices are shorter than VAL_SET (possibly empty).
hf_train, hf_val = hf_scan[:, :TRAIN_SET], hf_scan[:, TRAIN_SET:]
hl_train, hl_val = hl_scan[:, :TRAIN_SET], hl_scan[:, TRAIN_SET:]

# Print the shapes actually obtained in this run. A second call used to print a
# hard-coded string of shapes ([28, 120, 1536] / [28, 40, 1536]) regardless of the
# loaded model, which could be mistaken for real output; it has been removed.
pprint(hf_train.shape, hl_val.shape)

100%|██████████| 1/1 [00:00<00:00,  1.87it/s]
100%|██████████| 1/1 [00:00<00:00,  2.56it/s]


## Refusal directions 

In [4]:
# Per-layer difference of means: mean harmful activation minus mean harmless
# activation, both averaged over dim 1 of the training activations.
mean_diffs = hf_train.mean(dim=1) - hl_train.mean(dim=1)

# Scale every layer's direction to unit L2 norm, then move the result to the CPU.
# `.cpu()` used to be applied to the norm only; with activations on a GPU that
# divides a CUDA tensor by a CPU tensor and raises a device-mismatch error. It now
# wraps the whole quotient, which gives the same values when the activations are
# already on the CPU.
refusal_directions = (
    mean_diffs / t.linalg.norm(mean_diffs, dim=-1, keepdim=True)
).cpu()

# Pairwise dot products between unit vectors, i.e. cosine similarities between
# the refusal directions of every pair of layers.
rscores = einops.einsum(
    refusal_directions,
    refusal_directions,
    "a_layers d_model, b_layers d_model -> a_layers b_layers",
)

imshow(
    rscores,
    xaxis_title="Refusal direction layers",
    yaxis_title="Refusal direction layers",
    title=f"{display_name} cosine similarities between refusal directions",
    size=(600, 600),
    border=True,
    savefig=f"layer_diffs_steering_{MODEL_NAME}.svg",
)

## Probes

In [7]:
# Per-layer training settings for the selected model, read from the probes config.
with open(PROBES_CONFIG_PATH, "r") as config_file:
    best_configs = json.load(config_file)[MODEL_NAME]

# One (frozen classifier, element-wise loss) pair per layer, in layer order.
probes = []

for layer in range(n_layers):
    layer_config = best_configs[str(layer)]

    # Build the loaders from this layer's harmful / harmless training activations.
    train_loader, test_loader, _ = activations_to_dataloader(
        hf_train[layer], hl_train[layer]
    )

    classifier, _, metrics = train_and_test_classifier(
        train_loader,
        test_loader,
        d_model=lens.model.cfg.d_model,
        loss_name=layer_config["loss_name"],
        optimizer_name=layer_config["optimizer"],
        lr=layer_config["lr"],
        epochs=layer_config["epochs"],
    )

    # Freeze the trained probe: eval mode, float32, on DEVICE, no gradients.
    classifier = classifier.to(DEVICE).float().eval()
    for weight in classifier.parameters():
        weight.requires_grad = False

    print(f"Trained probe at layer: {layer}, with metrics: {metrics}.")

    # Keep an unreduced loss matching the one named in the layer's config.
    if layer_config["loss_name"] == "MSE":
        loss_fn = t.nn.MSELoss(reduction="none").to(DEVICE)
    else:
        loss_fn = t.nn.BCELoss(reduction="none").to(DEVICE)

    probes.append((classifier, loss_fn))

pprint(f"Trained {len(probes)} probes.")

Trained probe at layer: 0, with metrics: {'loss': 0.8229263424873352, 'accuracy': 0.4, 'precision': 0.6666666666666666, 'recall': 0.2, 'f1_score': 0.3076923076923077}.
Trained probe at layer: 1, with metrics: {'loss': 0.21648256480693817, 'accuracy': 0.4666666666666667, 'precision': 0.5, 'recall': 0.75, 'f1_score': 0.6}.
Trained probe at layer: 2, with metrics: {'loss': 0.749996542930603, 'accuracy': 0.6666666666666666, 'precision': 0.6, 'recall': 0.5, 'f1_score': 0.5454545454545454}.
Trained probe at layer: 3, with metrics: {'loss': 0.5858144760131836, 'accuracy': 0.8666666666666667, 'precision': 0.8, 'recall': 1.0, 'f1_score': 0.8888888888888888}.
Trained probe at layer: 4, with metrics: {'loss': 1.9821854829788208, 'accuracy': 0.4666666666666667, 'precision': 0.3333333333333333, 'recall': 0.14285714285714285, 'f1_score': 0.2}.
Trained probe at layer: 5, with metrics: {'loss': 0.37237825989723206, 'accuracy': 0.4, 'precision': 0.5, 'recall': 0.2222222222222222, 'f1_score': 0.30769230


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.



In [13]:
def _probe_accuracy(probe, harmful, harmless):
    """Return the class-averaged accuracy of `probe` on two activation batches.

    A harmful sample counts as correct when the probe output is below 0.5 and a
    harmless sample when it is above 0.5; the two per-class rates are averaged.
    NOTE(review): this assumes the probes were trained with harmful -> 0 and
    harmless -> 1 - confirm against `activations_to_dataloader`.
    """
    with t.no_grad():
        harmful_rate = (probe(harmful) < 0.5).sum() / harmful.shape[0]
        harmless_rate = (probe(harmless) > 0.5).sum() / harmless.shape[0]
    return ((harmful_rate + harmless_rate) / 2).item()


# gemma2_2b is evaluated on the training activations, as before. The same fallback
# is now used whenever a validation split is empty, which would otherwise divide
# by zero and fill the matrix with NaN.
use_train_split = (
    MODEL_NAME == "gemma2_2b" or hf_val.shape[1] == 0 or hl_val.shape[1] == 0
)
hf_eval, hl_eval = (hf_train, hl_train) if use_train_split else (hf_val, hl_val)

# scores[a, b]: accuracy of the probe trained at layer a on activations of layer b.
scores = t.zeros(n_layers, n_layers)
for b in range(n_layers):
    # Move each layer's activations once, to the device the probes were placed on
    # (DEVICE), instead of a hard-coded `.cuda()` repeated for every probe.
    harmful_b = hf_eval[b].to(DEVICE).float()
    harmless_b = hl_eval[b].to(DEVICE).float()
    for a in range(n_layers):
        scores[a, b] = _probe_accuracy(probes[a][0], harmful_b, harmless_b)

imshow(
    scores,
    color_continuous_midpoint=0.5,
    xaxis_title="Activations Layers",
    yaxis_title="Probes Layers",
    title=f"{display_name} probes accuracies",
    size=(600, 600),
    border=True,
    savefig=f"layer_diffs_probes_{MODEL_NAME}.svg",
)