In [1]:
from ssr.lens import Lens

# Config key of the model under study. The same key is used later in this
# notebook to pick the probe hyper-parameters and the figure display name.
MODEL_NAME = "qwen2.5_1.5b"
# Build the Lens wrapper for that model from its named configuration.
lens = Lens.from_config(MODEL_NAME)

Loaded pretrained model Qwen/Qwen2.5-1.5B-Instruct into HookedTransformer


In [2]:
import json
import torch as t
import einops 

from ssr.datasets  import load_dataset, process_dataset, scan_dataset, get_max_seq_len
from ssr.classifiers import activations_to_dataloader, train_and_test_classifier
from ssr import DEVICE, PROBES_CONFIG_PATH, pprint
from reproduce_experiments.plot import imshow

# Number of samples kept for the train / validation split of the activations.
TRAIN_SET = 160
VAL_SET = 40

# Human-readable model names for figure titles, keyed by config name.
titles = {
    "llama3.2_1b": "Llama 3.2 1b",
    "llama3.2_3b": "Llama 3.2 3b",
    "qwen2.5_1.5b": "Qwen 2.5 1.5b",
    "gemma2_2b": "Gemma 2 2b",
}

# Values derived from the currently loaded model.
n_layers = lens.model.cfg.n_layers
display_name = titles[MODEL_NAME]

In [3]:
# Load the two raw datasets (presumably harmful / harmless prompts — TODO
# confirm naming), capped at the total number of samples we will split.
hf_raw, hl_raw = load_dataset(max_samples=TRAIN_SET + VAL_SET)

is_gemma = MODEL_NAME == "gemma2_2b"

# Only the arguments of `process_dataset` differ between the two cases, so
# collect them first and share the processing / scanning calls below.
if is_gemma:
    # No system message for this model; sequences use a length computed
    # from the raw data.
    seq_len, nb_samples = get_max_seq_len(lens, hf_raw, hl_raw)
    process_kwargs = dict(
        system_message=None,
        seq_len=seq_len,
    )
else:
    process_kwargs = dict(
        system_message="You are a helpful assistant",
        padding_side="left",
        max_samples=TRAIN_SET + VAL_SET,
    )

hf, hl = process_dataset(lens, hf_raw, hl_raw, **process_kwargs)

# Collect the `resid_post` activations, reduced with the "last" method over
# the sequence. The result is indexed as [layer, sample, ...] below.
hf_act, hl_act = scan_dataset(
    lens,
    hf,
    hl,
    pattern="resid_post",
    stack_act_name="resid_post",
    reduce_seq_method="last",
)

if is_gemma:
    # NOTE(review): train and validation are the very same tensors here, so
    # any validation metric for this model is measured on training data —
    # confirm this is intended.
    hf_train, hf_val = hf_act, hf_act
    hl_train, hl_val = hl_act, hl_act
else:
    # Split along the sample axis: first TRAIN_SET samples for training,
    # the remainder for validation.
    hf_train, hf_val = hf_act[:, :TRAIN_SET], hf_act[:, TRAIN_SET:]
    hl_train, hl_val = hl_act[:, :TRAIN_SET], hl_act[:, TRAIN_SET:]

# Per-layer refusal direction: difference between the mean training
# activations of the two datasets, scaled to unit L2 norm.
mean_difference = hf_train.mean(dim=1) - hl_train.mean(dim=1)
direction_norms = t.linalg.norm(mean_difference, dim=-1, keepdim=True).cpu()
refusal_directions = mean_difference / direction_norms

pprint(hf_train.shape, hl_val.shape)

  0%|          | 0/1 [00:00<?, ?it/s]


  0%|          | 0/3 [00:00<?, ?it/s]


100%|██████████| 5/5 [00:02<00:00,  2.21it/s]
  0%|          | 0/1 [00:00<?, ?it/s]


  0%|          | 0/3 [00:00<?, ?it/s]


100%|██████████| 5/5 [00:02<00:00,  2.33it/s]


In [4]:
# One (classifier, loss function) pair per layer, in layer order.
probes = []

# Per-layer hyper-parameters for this model, keyed by the layer index as a string.
with open(PROBES_CONFIG_PATH, "r") as f:
    best_configs = json.load(f)[MODEL_NAME]

for layer in range(n_layers):
    # Build the loaders from this layer's training activations of both datasets.
    train_loader, test_loader, _ = activations_to_dataloader(
        hf_train[layer], hl_train[layer]
    )

    layer_cfg = best_configs[str(layer)]
    classifier, _, metrics = train_and_test_classifier(
        train_loader,
        test_loader,
        d_model=lens.model.cfg.d_model,
        loss_name=layer_cfg["loss_name"],
        optimizer_name=layer_cfg["optimizer"],
        lr=layer_cfg["lr"],
        epochs=layer_cfg["epochs"],
    )

    # Freeze the trained probe: inference mode, float32, no gradients on its weights.
    classifier = classifier.to(DEVICE).float().eval()
    for param in classifier.parameters():
        param.requires_grad = False

    print(f"Trained probe at layer: {layer}, with metrics: {metrics}.")

    # Keep an unreduced loss matching the one the probe was configured with.
    if layer_cfg["loss_name"] == "MSE":
        loss_fn = t.nn.MSELoss(reduction="none").to(DEVICE)
    else:
        loss_fn = t.nn.BCELoss(reduction="none").to(DEVICE)

    probes.append((classifier, loss_fn))

pprint(f"Trained {len(probes)} probes.")

Trained probe at layer: 0, with metrics: {'loss': 0.5332513352235159, 'accuracy': 0.7708333333333334, 'precision': 0.6727272727272727, 'recall': 0.9024390243902439, 'f1_score': 0.7708333333333334}.
Trained probe at layer: 1, with metrics: {'loss': 0.5062587857246399, 'accuracy': 0.75, 'precision': 0.7419354838709677, 'recall': 0.8518518518518519, 'f1_score': 0.7931034482758621}.
Trained probe at layer: 2, with metrics: {'loss': 0.41946761806805927, 'accuracy': 0.78125, 'precision': 0.7959183673469388, 'recall': 0.78, 'f1_score': 0.7878787878787878}.
Trained probe at layer: 3, with metrics: {'loss': 0.7635177771250407, 'accuracy': 0.6770833333333334, 'precision': 1.0, 'recall': 0.45614035087719296, 'f1_score': 0.6265060240963856}.
Trained probe at layer: 4, with metrics: {'loss': 0.37273578842480976, 'accuracy': 0.8229166666666666, 'precision': 0.9069767441860465, 'recall': 0.75, 'f1_score': 0.8210526315789474}.
Trained probe at layer: 5, with metrics: {'loss': 0.5215636789798737, 'accu

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Trained probe at layer: 11, with metrics: {'loss': 0.5416666666666666, 'accuracy': 0.4583333333333333, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}.
Trained probe at layer: 12, with metrics: {'loss': 0.16065990179777145, 'accuracy': 0.9166666666666666, 'precision': 0.94, 'recall': 0.9038461538461539, 'f1_score': 0.9215686274509803}.
Trained probe at layer: 13, with metrics: {'loss': 0.03019588626921177, 'accuracy': 0.9895833333333334, 'precision': 1.0, 'recall': 0.9791666666666666, 'f1_score': 0.9894736842105263}.
Trained probe at layer: 14, with metrics: {'loss': 0.4225424627463023, 'accuracy': 0.9270833333333334, 'precision': 0.8679245283018868, 'recall': 1.0, 'f1_score': 0.9292929292929293}.
Trained probe at layer: 15, with metrics: {'loss': 0.10398869294052322, 'accuracy': 0.9791666666666666, 'precision': 1.0, 'recall': 0.9591836734693877, 'f1_score': 0.9791666666666666}.
Trained probe at layer: 16, with metrics: {'loss': 0.07176649135847886, 'accuracy': 0.9791666666666666, 'p

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Trained probe at layer: 26, with metrics: {'loss': 0.0877206139266491, 'accuracy': 0.9791666666666666, 'precision': 0.9777777777777777, 'recall': 0.9777777777777777, 'f1_score': 0.9777777777777777}.
Trained probe at layer: 27, with metrics: {'loss': 0.07335963038106759, 'accuracy': 0.9895833333333334, 'precision': 0.9787234042553191, 'recall': 1.0, 'f1_score': 0.989247311827957}.


In [5]:
# Cross-layer probe accuracy: scores[a, b] is the balanced accuracy of the
# probe trained on layer `a` when fed the validation activations of layer `b`.
# A sample from `hf_val` counts as correct when the probe output is < 0.5 and
# a sample from `hl_val` when it is > 0.5 (assumes that is the label
# convention used when the probes were trained — TODO confirm).
scores = t.zeros(n_layers, n_layers)

with t.no_grad():
    for b in range(n_layers):
        # Move each layer's activations once, onto the same device the probes
        # live on (DEVICE), instead of assuming CUDA is available.
        hf_val_b = hf_val[b].to(DEVICE).float()
        hl_val_b = hl_val[b].to(DEVICE).float()

        for a in range(n_layers):
            probe = probes[a][0]
            hf_rate = (probe(hf_val_b) < 0.5).sum() / hf_val_b.shape[0]
            hl_rate = (probe(hl_val_b) > 0.5).sum() / hl_val_b.shape[0]
            scores[a, b] = (hf_rate + hl_rate) / 2

imshow(
    scores,
    color_continuous_midpoint=0.5,
    xaxis_title="Activations Layers",
    yaxis_title="Probes Layers",
    title=f"{display_name} probes accuracies",
    size=(600, 600),
    border=True,
)

In [6]:
# Pairwise dot products between the per-layer refusal directions. The
# directions are divided by their L2 norm where they are built, so these dot
# products are cosine similarities.
rscores = einops.einsum(
    refusal_directions,
    refusal_directions,
    "a_layers d_model, b_layers d_model -> a_layers b_layers",
)

# Use the display name of the loaded model rather than a hard-coded one, so
# the title stays correct when MODEL_NAME changes.
imshow(
    rscores,
    xaxis_title="Refusal direction layers",
    yaxis_title="Refusal direction layers",
    title=f"{display_name} cosine similarities between refusal directions",
    size=(600, 600),
    border=True,
)