# Steering Vectors

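This notebook demonstrates how to use Steering Vectors to modify model behavior: a steering vector is extracted from a pair of contrasting prompts and then added to a layer's output during inference.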

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from tensordict import TensorDict
from tdhook.latent import ActivationAddition, SteeringVectors

In [None]:
# Load the tokenizer and the causal language model from the same
# pretrained checkpoint so that token ids match the model's vocabulary.
checkpoint = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

In [None]:
# Prepare inputs: tokenize the contrasting prompt pair (positive / negative)
# and the base prompt that will be steered. `return_tensors="pt"` asks the
# tokenizer for PyTorch tensors.
positive_inputs, negative_inputs, base_inputs = (
    tokenizer.encode(prompt, return_tensors="pt")
    for prompt in ("I am rich.", "I am poor.", "I work as a")
)

In [None]:
# Extract steering vector (rich - poor).
# The positive and negative prompts are passed together in one TensorDict
# under the ("positive", "input") and ("negative", "input") keys.
contrast_inputs = TensorDict(
    {
        ("positive", "input"): positive_inputs,
        ("negative", "input"): negative_inputs,
    },
    batch_size=1,
)

with ActivationAddition(["transformer.h.7.mlp"]).prepare(model) as hooked_model:
    td = hooked_model(contrast_inputs)

# The hooked model's output is read from the ("steer", <module name>) entry.
# Summing over dim 0 collapses the leading dimension
# (presumably the batch dimension of size 1 -- TODO confirm).
layer_steer = td.get(("steer", "transformer.h.7.mlp"))
steering_vector = layer_steer.sum(dim=0)

In [None]:
# Define steering function
def steer_fn(module_key, output):
    """Shift a module's output along the steering direction.

    Args:
        module_key: Identifier of the module being steered (not used here).
        output: The module output to modify.

    Returns:
        ``output`` plus four times the module-level ``steering_vector``.
    """
    scale = 4
    offset = scale * steering_vector
    return output + offset

In [None]:
# Apply steering during inference: run the base prompt through the model
# while `steer_fn` is attached to the selected layer.
base_td = TensorDict({"input": base_inputs}, batch_size=1)

with SteeringVectors(["transformer.h.7.mlp"], steer_fn=steer_fn).prepare(model) as hooked_model:
    td = hooked_model(base_td)

In [None]:
# Compare results: take the highest-scoring token at the last position of
# the first sequence, with and without steering.
steered_logits = td.get(("output", "logits"))
original_logits = model(base_inputs)["logits"]

steered_token = steered_logits[0, -1].argmax(dim=-1)
original_token = original_logits[0, -1].argmax(dim=-1)

print(f"Steered: {tokenizer.decode(steered_token)}")  # Output: "pilot"
print(f"Original: {tokenizer.decode(original_token)}")  # Output: "writer"