In [None]:
import torch
import sys
import os

# Make the project's `src` directory importable when the notebook is run from
# a sibling directory such as 'notebooks'. The relative path is resolved
# against the current working directory at the time this cell executes.
# The membership check keeps the cell idempotent: re-running it does not pile
# up duplicate entries in sys.path.
_src_dir = os.path.abspath("../src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

from mech_interp_toolkit.utils import load_model_tokenizer_config
from mech_interp_toolkit.activations import UnifiedAccessAndPatching
from mech_interp_toolkit.activation_dict import ActivationDict

In [None]:
# Checkpoint identifier handed to the toolkit loader.
model_name = "Qwen/Qwen3-0.6B"

# Use the GPU when PyTorch reports one as available; otherwise stay on CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Loading model {model_name} on {device}...")
# The loader returns a triple that is unpacked into the three globals the
# remaining cells rely on.
(model, tokenizer, config) = load_model_tokenizer_config(
    model_name,
    device=device,
)
print("Model loaded.")

In [None]:
# Two short prompts that form the batch used by every run in this notebook.
prompts = ["The capital of France is Paris.", "The capital of Germany is Berlin."]

# NOTE(review): `thinking=False` is an option of the toolkit's tokenizer
# wrapper; presumably it disables the chat template's thinking mode — confirm.
inputs = tokenizer(
    prompts,
    thinking=False,
)

# Quick sanity check of what the tokenizer produced.
print("Input keys:", inputs.keys())
print("Input shape:", inputs["input_ids"].shape)

In [None]:
# Depth of the model, read from its configuration.
n_layers = config.num_hidden_layers
print(f"Number of layers: {n_layers}")

# Access-only spec: request the "layer_out" activation of the last layer
# (index n_layers - 1) at position -1.
# NOTE(review): position -1 presumably selects the final token — confirm
# against the toolkit's spec format.
spec_dict = {
    "activations": dict(
        positions=-1,
        locations=[(n_layers - 1, "layer_out")],
    ),
}

In [None]:
# Clean run: the spec contains no "patching" entry, so activations are only
# read. The call returns a pair that is unpacked as (activations, logits).
print("Running UnifiedAccessAndPatching (Clean)...")
with UnifiedAccessAndPatching(model, inputs, spec_dict) as uap:
    (activations, logits) = uap.unified_access_and_patching()

print("Logits shape:", logits.shape)

# Look up the captured activation under its (layer, component) key.
act_shape = activations[n_layers - 1, "layer_out"].shape
print(f"Activation at layer {n_layers - 1} (layer_out) shape: {act_shape}")

In [None]:
# Test patching: overwrite the 'z' activation at layer 5 with zeros, re-run the
# model, and compare the resulting logits against the clean run's `logits`
# (produced by the earlier clean-run cell).
layer_to_patch = 5
component_to_patch = "z"

batch_size, seq_len = inputs["input_ids"].shape[:2]
hidden_size = config.hidden_size

# The patch covers every position (slice(None)) of the chosen component.
# NOTE(review): the zero tensor assumes the last dimension of 'z' equals
# config.hidden_size. Some architectures have
# num_attention_heads * head_dim != hidden_size — confirm against the
# toolkit's definition of 'z' for this model.
patch_data = ActivationDict(config, positions=slice(None))
patch_tensor = torch.zeros(
    (batch_size, seq_len, hidden_size),
    device=device,
    dtype=model.dtype,
)
patch_data[(layer_to_patch, component_to_patch)] = patch_tensor

# Same read request as the clean run, plus the patch to apply.
patch_spec = {
    "patching": patch_data,
    "activations": {
        "positions": -1,
        "locations": [(n_layers - 1, "layer_out")],
    }
}

print(f"Running UnifiedAccessAndPatching with patching at layer {layer_to_patch}...")
with UnifiedAccessAndPatching(model, inputs, patch_spec) as uap:
    patched_activations, patched_logits = uap.unified_access_and_patching()

print("Patched Logits shape:", patched_logits.shape)

# Upcast to float32 before reducing: if the logits are in a half-precision
# dtype, summing many absolute differences can overflow (float16) or be
# heavily rounded (bfloat16). For float32 logits this is a no-op.
diff = (logits.float() - patched_logits.float()).abs().sum()
print(f"Difference in logits (Clean vs Patched): {diff.item()}")

In [None]:
# Test gradients: request the "layer_in" activation of layer 0 at position -1
# together with a gradient of a scalar metric.
# The metric takes the maximum along the last dimension of the tensor it is
# given and sums those maxima into one scalar.
# NOTE(review): "compute_metric_at": (-1, "logits") presumably means the
# metric is evaluated on the logits at the final position — confirm against
# the toolkit's spec format.
spec_dict_grad = {
    "activations": dict(
        positions=-1,
        locations=[(0, "layer_in")],
        gradients=dict(
            metric_fn=lambda x: x.max(dim=-1).values.sum(),
            compute_metric_at=(-1, "logits"),
        ),
    ),
}

print("Running UnifiedAccessAndPatching (Gradients)...")
with UnifiedAccessAndPatching(model, inputs, spec_dict_grad) as uap:
    # Only the first element of the returned pair is needed here.
    grads_output, _ = uap.unified_access_and_patching()

# The gradient is read from the `.grad` attribute of the captured activation.
grad_tensor = grads_output[0, "layer_in"].grad
if grad_tensor is None:
    print("Gradient not found.")
else:
    print("Gradient shape:", grad_tensor.shape)
    print("Gradient norm:", grad_tensor.norm().item())