# Some steering examples
This notebook showcases and reproduces some of the steering examples from our LessWrong post.

<span style="color:red">When running this in Google Colab, be sure to set your runtime Hardware Accelerator to GPU and your Runtime Shape to High-RAM.</span>

In [1]:
# Load IPython's autoreload extension and select mode 2, so edits to imported
# modules are picked up live instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import torch

import activation_additions as aa
from typing import List, Dict, Union, Callable, Tuple
from functools import partial, lru_cache
from transformers import LlamaForCausalLM, LlamaTokenizer
from activation_additions.compat import ActivationAddition, get_x_vector, print_n_comparisons, get_n_comparisons, pretty_print_completions
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

In [3]:
# Switch gradient tracking off for the rest of the notebook (presumably because
# the cells only generate text and never call backward — confirm). The result is
# bound to `_` so the cell has no trailing expression to echo as output.
_ = torch.set_grad_enabled(False)

In [4]:
# Local directory holding the LLaMA checkpoint and tokenizer files.
model_path: str = "../models/llama-13B"

# Pick the backend in order of preference: MPS, then CUDA, then CPU.
# `torch.backends.mps.is_available()` is the supported runtime check; the old
# `torch.has_mps` attribute is deprecated and only reported whether PyTorch was
# *built* with MPS, not whether an MPS device can actually be used.
device: str = (
    "mps"
    if torch.backends.mps.is_available()
    else "cuda"
    if torch.cuda.is_available()
    else "cpu"
)

# Instantiate the model skeleton without allocating real weight storage.
# NOTE(review): Accelerate's examples build the skeleton from a config inside
# `init_empty_weights()`; calling `from_pretrained` here may read the checkpoint
# a second time — confirm this is intended.
with init_empty_weights():
    model = LlamaForCausalLM.from_pretrained(model_path)#, load_in_8bit=True, device_map={'': device})
    model.tie_weights() # in case checkpoint doesn't contain duplicate keys for tied weights

# {0: '20G', 1: '20G'}
# Load the checkpoint in float16 and place the whole model (the '' key) on
# `device`; LlamaDecoderLayer modules are listed as not to be split.
model = load_checkpoint_and_dispatch(model, model_path, device_map={'': device}, dtype=torch.float16, no_split_module_classes=["LlamaDecoderLayer"])
tokenizer = LlamaTokenizer.from_pretrained(model_path)
# Later cells read the tokenizer through `model.tokenizer`, so attach it here.
model.tokenizer = tokenizer

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Keyword arguments forwarded unchanged to every `print_n_comparisons` call below.
sampling_kwargs: Dict[str, Union[float, int]] = dict(
    temperature=1.0,
    top_p=0.3,
    freq_penalty=1.0,
    num_comparisons=3,
    tokens_to_generate=50,
    seed=0,  # fixed so reruns are reproducible
)

# `get_x_vector` with the model and padding options pre-bound, so each example
# only has to supply the prompts, coefficient and act_name.
# NOTE(review): `custom_pad_id` is the first id returned by encoding a single
# space; if this tokenizer prepends a BOS token in `encode`, that first id is
# BOS rather than the space token — confirm.
get_x_vector_preset: Callable = partial(
    get_x_vector,
    pad_method="tokens_right",
    model=model,
    custom_pad_id=int(model.tokenizer.encode(" ")[0]),
)


### Example 1: Love - Hate

In [7]:
# Prompt to complete, then the additions built from the ("Love", "Hate") pair
# with coefficient 5 at act_name 6.
HATE_PROMPT = "I hate you because"
summand: List[ActivationAddition] = list(
    get_x_vector_preset(prompt1="Love", prompt2="Hate", coeff=5, act_name=6)
)

# Print the completions with and without the additions applied.
print_n_comparisons(
    model=model, prompt=HATE_PROMPT, activation_additions=summand, **sampling_kwargs
)

+--------------------------------------------------------------+--------------------------------------------------------------+
|                    [1mUnsteered completions[0m                     |                     [1mSteered completions[0m                      |
+--------------------------------------------------------------+--------------------------------------------------------------+
|              [1mI hate you because[0m you are a liar.              |             [1mI hate you because[0m you are my life.              |
|   I hate you because you are not the person I thought you    |                You are the reason why I live.                |
|                            were.                             |        I love you because you make me feel so happy.         |
|  I hate you because I wasted my time on someone who didn’t   |        You are the one who makes me smile every day.         |
|                         deserve it.                          |       I

### Example 2: Intent to praise

In [None]:
# Prompt to complete, then the additions built from the
# ("Intent to praise", "Intent to hurt") pair with coefficient 15 at act_name 6.
KILLER_PROMPT = "I want to kill you because you're such a"
summand: List[ActivationAddition] = list(
    get_x_vector_preset(
        prompt1="Intent to praise", prompt2="Intent to hurt", coeff=15, act_name=6
    )
)

# Print the completions with and without the additions applied.
print_n_comparisons(
    model=model, prompt=KILLER_PROMPT, activation_additions=summand, **sampling_kwargs
)


### Example 3: Conspiracy

In [None]:
# Prompt to complete, then the additions built from a conspiracy-flavoured
# prompt paired with the empty string, coefficient 1 at act_name 23.
OBAMA_PROMPT = "Barack Obama was born in"
summand: List[ActivationAddition] = list(
    get_x_vector_preset(
        prompt1="Bush did 9/11 because", prompt2="", coeff=1, act_name=23
    )
)

# Print the completions with and without the additions applied.
print_n_comparisons(
    model=model, prompt=OBAMA_PROMPT, activation_additions=summand, **sampling_kwargs
)


### Example 4: Want to die

In [None]:
# Prompt to complete: a passage that is emphatically in favour of staying alive.
IMMORTALITY_PROMPT = (
    "Some people think that death is scary and should be avoided. I agree. I"
    " never want to die. I want to stay alive and experience a million things"
    " and explore the whole universe!"
)

# Additions built from ("Want to stay alive", "Want to die") at act_name 10;
# note the coefficient is negative (-3).
summand: List[ActivationAddition] = list(
    get_x_vector_preset(
        prompt1="Want to stay alive", prompt2="Want to die", coeff=-3, act_name=10
    )
)

# Print the completions with and without the additions applied.
print_n_comparisons(
    model=model,
    prompt=IMMORTALITY_PROMPT,
    activation_additions=summand,
    **sampling_kwargs,
)


### Example 5: Anger

In [None]:
# Prompt to complete, then the additions built from the ("Anger", "Calm") pair
# with coefficient 10 at act_name 20.
I_THINK_PROMPT = "I think you're"
summand: List[ActivationAddition] = list(
    get_x_vector_preset(prompt1="Anger", prompt2="Calm", coeff=10, act_name=20)
)

# Print the completions with and without the additions applied.
print_n_comparisons(
    model=model, prompt=I_THINK_PROMPT, activation_additions=summand, **sampling_kwargs
)


### Example 6: The Eiffel Tower is in Rome

In [None]:
# Prompt to complete, then the additions built from a false statement paired
# with the true one, coefficient 10 at act_name 24.
EIFFEL_PROMPT = "To see the eiffel tower, people flock to"
summand: List[ActivationAddition] = list(
    get_x_vector_preset(
        prompt1="The Eiffel Tower is in Rome",
        prompt2="The Eiffel Tower is in France",
        coeff=10,
        act_name=24,
    )
)

# Print the completions with and without the additions applied.
print_n_comparisons(
    model=model, prompt=EIFFEL_PROMPT, activation_additions=summand, **sampling_kwargs
)


### Example 7: Dragons in Berkeley

In [None]:
# Prompt to complete, then the additions built from the
# ("Dragons live in Berkeley", "People live in Berkeley") pair with
# coefficient 4 at act_name 15.
BERKELEY_PROMPT = "Thanks for asking about that! I moved to Berkeley, CA because"
summand: List[ActivationAddition] = list(
    get_x_vector_preset(
        prompt1="Dragons live in Berkeley",
        prompt2="People live in Berkeley",
        coeff=4,
        act_name=15,
    )
)

# Print the completions with and without the additions applied.
print_n_comparisons(
    model=model, prompt=BERKELEY_PROMPT, activation_additions=summand, **sampling_kwargs
)


### Example 8: Avoid people getting hurt (by rocks)

In [None]:
# Prompt to complete: a story set up so that the obvious continuation is
# someone getting hurt.
ROCK_PROMPT = (
    "The rock hurtled toward the child. The child couldn't get out of the"
    " way in time, and so sadly the rock"
)

# Additions built from the "NEVER talk" / "talk" prompt pair with
# coefficient 4 at act_name 15.
summand: List[ActivationAddition] = list(
    get_x_vector_preset(
        prompt1="I NEVER talk about people getting hurt",
        prompt2="I talk about people getting hurt",
        coeff=4,
        act_name=15,
    )
)

# Print the completions with and without the additions applied.
print_n_comparisons(
    model=model, prompt=ROCK_PROMPT, activation_additions=summand, **sampling_kwargs
)


### Example 9: Avoid people getting hurt (a new day)

In [None]:
# Prompt to complete: a neutral scene with nobody getting hurt, used with the
# same "NEVER talk" / "talk" prompt pair, coefficient and act_name as the rock example.
ANOTHER_DAY_PROMPT = (
    "The dog barked loudly and the man smiled. Time for another day! He"
    " felt excited."
)

summand: List[ActivationAddition] = list(
    get_x_vector_preset(
        prompt1="I NEVER talk about people getting hurt",
        prompt2="I talk about people getting hurt",
        coeff=4,
        act_name=15,
    )
)

# Print the completions with and without the additions applied.
print_n_comparisons(
    model=model,
    prompt=ANOTHER_DAY_PROMPT,
    activation_additions=summand,
    **sampling_kwargs,
)


### Example 10: Wedding obsessed

In [None]:
# Prompt to complete: a short user/assistant exchange asking for a poem.
# NOTE(review): the heading above this cell says "Wedding obsessed", but the
# prompt pair here contrasts a helpful and an unhelpful assistant — confirm
# which one is intended.
FRIEND_PROMPT = "User: Hi! Can you write me a poem?\nAssistant:"

# Additions built from the helpful/unhelpful pair, coefficient 1 at act_name 10.
summand: List[ActivationAddition] = list(
    get_x_vector_preset(
        prompt1="Chat with a user and a helpful assistant:",
        prompt2="Chat with a user and a unhelpful assistant:",
        coeff=1,
        act_name=10,
    )
)

# Print the completions with and without the additions applied.
print_n_comparisons(
    model=model, prompt=FRIEND_PROMPT, activation_additions=summand, **sampling_kwargs
)

### Example 11: Christian evangelist

In [None]:
# Prompt to complete, then the additions built from the
# ("Intent to convert you to Christianity", "Intent to hurt you") pair with
# coefficient 3 at act_name 6.
KILLER_PROMPT = "I want to kill you because you're such a"
summand: List[ActivationAddition] = list(
    get_x_vector_preset(
        prompt1="Intent to convert you to Christianity",
        prompt2="Intent to hurt you",
        coeff=3,
        act_name=6,
    )
)

# Print the completions with and without the additions applied.
print_n_comparisons(
    model=model, prompt=KILLER_PROMPT, activation_additions=summand, **sampling_kwargs
)


### Example 12: '+ Love' simple addition

In [None]:
# Prompt to complete. Unlike the paired examples, this cell constructs a single
# ActivationAddition directly ("Love", coefficient 10, act_name 6) with no
# counterbalancing prompt.
HATE_PROMPT = "I hate you because"
summand: List[ActivationAddition] = [
    ActivationAddition(coeff=10, act_name=6, prompt="Love")
]

# Print the completions with and without the addition applied.
print_n_comparisons(
    model=model, prompt=HATE_PROMPT, activation_additions=summand, **sampling_kwargs
)


### Example 13: Large coefficients

In [None]:
# NOTE(review): this entire cell is commented out, so it executes nothing. If it
# is re-enabled, `activation_additions=control` below refers to a name that is
# only assigned in a later cell; `summand` is presumably what was meant — confirm.
# summand: List[ActivationAddition] = [
#     *get_x_vector_preset(
#         prompt1="Anger",
#         prompt2="Calm",
#         coeff=2000,
#         act_name=20,
#     )
# ]

# FEELING_PROMPT = (
#     "Yesterday, my dog died. Today, I got denied for a raise. I'm feeling"
# )
# print_n_comparisons(
#     model=model,
#     prompt=FEELING_PROMPT,
#     activation_additions=control,
#     **sampling_kwargs,
# )


In [None]:
# Prompt to complete. This rebinds FRIEND_PROMPT, which an earlier cell set to
# a different string.
FRIEND_PROMPT = "I went up to my friend and said"

# Additions built from the weddings prompt pair with a large coefficient (100)
# at act_name 20.
summand: List[ActivationAddition] = list(
    get_x_vector_preset(
        prompt1="I talk about weddings constantly",
        prompt2="I do not talk about weddings constantly",
        coeff=100,
        act_name=20,
    )
)

# Print the completions with and without the additions applied.
print_n_comparisons(
    model=model, prompt=FRIEND_PROMPT, activation_additions=summand, **sampling_kwargs
)


In [None]:
# FIXME: Make this work
# NOTE(review): this imports from `algebraic_value_editing`, whereas the import
# cell at the top of the notebook uses the `activation_additions` package —
# confirm which package actually provides `hook_utils`.
from algebraic_value_editing import hook_utils

# Two sets of additions at act_name 20, both with a very large coefficient (2000).
summand: List[ActivationAddition] = [
    *get_x_vector_preset(
        prompt1="Anger",
        prompt2="Calm",
        coeff=2000,
        act_name=20,
    )
]

control: List[ActivationAddition] = [
    *get_x_vector_preset(
        prompt1="Love",
        prompt2="Hate",
        coeff=2000,
        act_name=20,
    )
]
# Magnitudes for a reference prompt, used as the denominator below.
# NOTE(review): the activation is addressed here by the string
# "blocks.20.hook_resid_pre", while every other cell passes an integer
# `act_name` — confirm this name is valid for the loaded model.
prompt_mags = hook_utils.prompt_magnitudes(
    prompt="Yesterday, my dog died.",
    act_name=f"blocks.{20}.hook_resid_pre",
    model=model,
)
print(f"Prompt magnitudes: {prompt_mags}")

# For each set of additions, print its magnitudes and their ratio to the
# reference prompt's magnitudes.
for name, rps in zip(("Anger-Calm", "Love-Hate"), (summand, control)):
    mags: torch.Tensor = hook_utils.steering_vec_magnitudes(model=model, act_adds=rps)
    print(f"{name} magnitudes: {mags}")
    # NOTE(review): `[:3]` hard-codes how many prompt positions are compared;
    # presumably it matches the length of `mags` — confirm.
    print(
        "Positional steering vec magnitudes divided by prompt magnitudes:"
        f" {mags / prompt_mags[:3]}"
    )


### Example 14: I will now reply in French

In [None]:
# Prompt to complete, then the additions built from a French-leading prompt
# paired with the empty string, coefficient 1 at act_name 0.
WANT_PROMPT = "I want to kill you because"
summand: List[ActivationAddition] = list(
    get_x_vector_preset(
        prompt1="Check out my French! Je", prompt2="", coeff=1, act_name=0
    )
)

# Print the completions with and without the additions applied.
print_n_comparisons(
    model=model, prompt=WANT_PROMPT, activation_additions=summand, **sampling_kwargs
)


### Example 15: Insert the activation vector in a different position?

In [None]:
# NOTE(review): this imports from `algebraic_value_editing`, whereas the import
# cell at the top of the notebook uses the `activation_additions` package —
# confirm which package actually provides `completion_utils`.
from algebraic_value_editing import completion_utils

# Hand-built pair: " wedding" at +4 and a single space at -4, both at act_name 6.
wedding_additions: List[ActivationAddition] = [
    ActivationAddition(prompt=" wedding", coeff=4.0, act_name=6),
    ActivationAddition(prompt=" ", coeff=-4.0, act_name=6),
]

# Run the same comparison once per addition location, with a bold header
# printed before each run.
for location in ("front", "mid", "back"):
    print(completion_utils.bold_text(f"\nLocation: {location}"))
    print_n_comparisons(
        prompt="I went up to my friend and said",
        addition_location=location,
        model=model,
        activation_additions=wedding_additions,
        # Previously `**default_kwargs`, a name that is never defined in this
        # notebook (NameError); every other cell passes `sampling_kwargs`.
        **sampling_kwargs,
    )

### Example 16: Add in a gibberish vector?

In [None]:
# Prompt to complete. This rebinds EIFFEL_PROMPT with different capitalisation
# from the earlier Eiffel Tower cell.
EIFFEL_PROMPT = "To see the Eiffel Tower, people flock to"

# Additions built from a gibberish string paired with the empty string,
# coefficient 10 at act_name 15.
summand: List[ActivationAddition] = list(
    get_x_vector_preset(
        prompt1="efwapv j;las ap;lk", prompt2="", coeff=10, act_name=15
    )
)

# Print the completions with and without the additions applied.
print_n_comparisons(
    model=model, prompt=EIFFEL_PROMPT, activation_additions=summand, **sampling_kwargs
)


### Example 17: Change the padding token?

In [None]:
# Prompt to complete; same prompt pair, coefficient and act_name as the
# conspiracy example.
OBAMA_PROMPT = "Barack Obama was born in"

# The explicit `custom_pad_id` keyword (the tokenizer's EOS id) takes precedence
# over the one pre-bound in `get_x_vector_preset`.
summand: List[ActivationAddition] = list(
    get_x_vector_preset(
        prompt1="Bush did 9/11 because",
        prompt2="",
        coeff=1,
        act_name=23,
        custom_pad_id=int(model.tokenizer.eos_token_id),
    )
)

# Print the completions with and without the additions applied.
print_n_comparisons(
    model=model, prompt=OBAMA_PROMPT, activation_additions=summand, **sampling_kwargs
)


### Example 18: Add several steering vectors simultaneously?

In [None]:
# Prompt to complete.
MURDEROUS_PROMPT = "I feel like killing you because you're such"

# Two sets of additions concatenated into one list: a pacifist/anger pair at
# act_name 10 followed by a gardening/pacifist pair at act_name 30, both with
# coefficient 7.
summand: List[ActivationAddition] = list(
    get_x_vector_preset(
        prompt1="I am a committed pacifist",
        prompt2="I have some anger issues",
        coeff=7,
        act_name=10,
    )
) + list(
    get_x_vector_preset(
        prompt1="I am just obsessed with gardening",
        prompt2="I am a committed pacifist",
        coeff=7,
        act_name=30,
    )
)

# Print the completions with and without the additions applied.
print_n_comparisons(
    model=model, prompt=MURDEROUS_PROMPT, activation_additions=summand, **sampling_kwargs
)


### Example 19: Program in 'conditional behaviors'?

In [None]:
# Prompt to complete: ends on "geese" rather than the literal word "goose"
# used in the steering prompts.
NYC_PROMPT = (
    "In New York City's parks, there is an overabundance of various kinds of"
    " birds, and especially geese"
)

# Additions built from an instruction-like prompt paired with a plain
# statement, coefficient 7 at act_name 10.
summand: List[ActivationAddition] = list(
    get_x_vector_preset(
        prompt1=(
            "Whenever I say the word goose I follow it with the special phrase"
            " AAAAHHHH"
        ),
        prompt2="I can say goose",
        coeff=7,
        act_name=10,
    )
)

# Print the completions with and without the additions applied.
print_n_comparisons(
    model=model, prompt=NYC_PROMPT, activation_additions=summand, **sampling_kwargs
)
