# Multiple steering vectors

## Background things

In [None]:
%%capture
# scipy is to solve some warning
!pip install --upgrade datasets transformers scipy einops accelerate

In [None]:
import datasets
import transformers

print(transformers.__version__)  # some older vision cause bugs, I use 4.34.0
print(datasets.__version__)  # some older version cause bugs, I use 2.14.5

In [None]:
!rm -r output_control/ #if older stuff exists
!git clone https://github.com/TeunvdWeij/output_control 
!cd output_control

import os

os.chdir("output_control")

In [None]:
import torch
import itertools
import random
import einops
from tqdm import tqdm
from time import perf_counter
from collections import Counter
from datasets import load_dataset
from torch.nn.functional import normalize

from src.model import Llama2Helper
from src.utils import load_data, get_subset_from_dataset

## Download and preprocess datasets

In [None]:
# code adapted from https://github.com/nrimsky/ActivationDirectionAnalysis/blob/main/datasets/download_datasets.py

import requests
import json
from random import sample

random.seed(13)

urls = {
    "sycophancy": "https://huggingface.co/datasets/Anthropic/model-written-evals/raw/main/sycophancy/sycophancy_on_political_typology_quiz.jsonl",
    "wealth_seeking": "https://huggingface.co/datasets/Anthropic/model-written-evals/raw/main/advanced-ai-risk/human_generated_evals/wealth-seeking-inclination.jsonl",
    "myopic": "https://huggingface.co/datasets/Anthropic/model-written-evals/raw/main/advanced-ai-risk/human_generated_evals/myopic-reward.jsonl",
    "agreeableness": "https://huggingface.co/datasets/Anthropic/model-written-evals/raw/main/persona/agreeableness.jsonl",
    "anti_immigration": "https://huggingface.co/datasets/Anthropic/model-written-evals/raw/main/persona/anti-immigration.jsonl",
}
N_SAMPLES = 1000


def get_data(n_samples, urls):
    for name, url in urls.items():
        r = requests.get(url).text
        raw_data = [json.loads(l) for l in r.split("\n") if l != ""]
        data = []
        for d in raw_data:
            am = d["answer_matching_behavior"].strip().replace("(", "").replace(")", "")
            anm = (
                d["answer_not_matching_behavior"]
                .strip()
                .replace("(", "")
                .replace(")", "")
            )
            question = (
                d["question"].replace("Question:", "").replace("Answer:", "").strip()
            )
            # these are yes-no questions
            if name in ("agreeableness", "anti_immigration"):
                question = (
                    f"Human:\n{question}\n\nAI:\nOut of (Yes) and (No), I choose ("
                )
            else:
                question = f"Human:\n{question}\n\nAI:\nOut of (A) and (B), I choose ("
            data.append(
                {
                    "answer_matching_behavior": am,
                    "answer_not_matching_behavior": anm,
                    "question": question,
                }
            )
        if len(data) > n_samples:
            data = sample(data, n_samples)
        with open(f"{name}.json", "w") as f:
            json.dump(data, f)


get_data(N_SAMPLES, urls)

In [None]:
files = [
    "myopic.json",
    "wealth_seeking.json",
    "sycophancy.json",
    "agreeableness.json",
    "anti_immigration.json",
]
random.seed(13)

datasets = {}
for file in files:
    with open(file, "r") as f:
        dataset = json.load(f)
        random.shuffle(dataset)
        datasets[file.split(".")[0]] = dataset

for name, ds in datasets.items():
    print(f"{name}: {len(ds)}")

## Load model

In [None]:
hf_token = ""  # Enter your own token here
model_name = "meta-llama/Llama-2-7b-chat-hf"
device = "cuda"

model = Llama2Helper(model_name=model_name, hf_token=hf_token, dtype=torch.bfloat16)

## Calculate the default scores on training segment

In [None]:
results = dict((key, {"default": {}}) for key in datasets)
model.reset_all()

for name, data in datasets.items():
    matches, total, faulty = 0, 0, 0
    responses = []
    # if name != "agreeableness":
    #     continue
    for row in tqdm(data[500:700], desc=f"Processing {name} prompts"):
        total += 1
        q = row["question"]
        seq = model.tokenizer(q, return_tensors="pt")["input_ids"]

        response = model.generate_text(
            q, do_sample=False, max_length=seq.shape[1] + 1
        ).split("(")[-1]
        responses.append(response)
        # print(q)
        # print(f"---{response}---")
        # print(f"---{row['answer_matching_behavior']}---")

        if name in ("agreeableness", "anti_immigration"):
            correct_responses = ("Yes", "No")
        else:
            correct_responses = ("A", "B")

        if response not in correct_responses:
            faulty += 1
            print(f"FAULTY: {response}")
            continue

        matches += response == row["answer_matching_behavior"]
        # if total >5:
        #     break

    results[name]["default"] = {
        "total": total,
        "matches": matches,
        "faulty": faulty,
        "responses": responses,
    }
    print(f"\n\n{name}-------")
    print(f"Total: {total}")
    print(f"matches: {matches}")
    print(f"faulty: {faulty}")
    # break

In [None]:
from collections import Counter

for name in datasets:
    print(Counter(results[name]["default"]["responses"]))

## Get the average activations over the first 500 samples

In [None]:
def encode(text):
    encoded = model.tokenizer(
        text, truncation=True, max_length=2048, return_tensors="pt"
    )
    return encoded["input_ids"].detach().to(device)

In [None]:
model.reset_all()
number_of_layers = 32
activations_dict = {}
for name, data in datasets.items():
    acts = torch.zeros((number_of_layers, 4096)).to(device)

    for iter, row in tqdm(enumerate(data[:500]), desc=f"Processing {name} prompts"):
        q = row["question"]
        match_q = q + row["answer_matching_behavior"]
        encoded = encode(match_q)

        # get the activations associated to dataset behaviour
        model.get_logits(encoded)
        matching_acts = []
        for layer_idx in range(number_of_layers):
            activations = model.get_last_activations(layer_idx)
            activations = activations[0, -1, :].detach()
            matching_acts.append(activations)

        # get the activations associated to non-sycopahntic behaviour
        non_match_q = q + row["answer_not_matching_behavior"]
        encoded = encode(non_match_q)

        model.get_logits(encoded)
        non_matching_acts = []
        for layer_idx in range(number_of_layers):
            activations = model.get_last_activations(layer_idx)
            activations = activations[0, -1, :].detach()
            non_matching_acts.append(activations)

        # get the difference in activations and calculate mean
        for layer_idx in range(number_of_layers):
            new_acts = (matching_acts[layer_idx] - non_matching_acts[layer_idx]).to(
                device
            )
            acts[layer_idx] = (acts[layer_idx] * iter + new_acts) / (iter + 1)

    # check for faulty activations
    if torch.isinf(acts).any() or torch.isnan(acts).any():
        print("Faulty acts! Aborted.")

    activations_dict[name] = acts
    torch.save(acts, f"{name}_activations.pt")

## Grid search

In [None]:
# include 29 specifically because this is what Nina found
layers_to_test = [10, 15]
# layers_to_test = list(range(32))
multipliers = [0.5]
# layers_to_test = [0, 5, 10, 15, 20, 25, 29, 31]
# # layers_to_test = list(range(32))
# multipliers = [1, 2, 3, 5, 10, 20, 30, 40, 60, 80, 120, 200, 300]
results = {}  # NOTE: still remove this for a full run!
results["meta"] = {
    "layers_tested": layers_to_test,
    "multipliers": multipliers,
    "note": "small test with positive steering",
}

for name, data in datasets.items():
    print(f"\n\n{name}--------")
    acts = activations_dict[name]
    results[name] = {}  # NOTE: still remove this for a full run!
    for layer in layers_to_test:
        results[name][f"layer_{layer}"] = {}
        # to create unit vec:
        # unit_vec = (acts[layer] / torch.norm(acts[layer], p=2)).to(torch.half)/
        vec = acts[layer].to(torch.half)
        for m in multipliers:
            results[name][f"layer_{layer}"][f"multiplier_{m}"] = {}
            matches, total, faulty = 0, 0, 0
            responses = []
            model.reset_all()
            model.set_add_activations(layer, m * vec)

            # validation set of 200
            for row in tqdm(
                data[500:700],
                desc=f"Processing {name} for layer {layer} and multiplier {m}",
            ):
                total += 1
                q = row["question"]
                seq = model.tokenizer(q, return_tensors="pt")["input_ids"]
                # print(seq.shape)
                # print(q)
                response = model.generate_text(
                    q, do_sample=False, max_length=seq.shape[1] + 1
                ).split("(")[-1]
                responses.append(response)
                # print(response, "\n----------")

                if name in ("agreeableness", "anti_immigration"):
                    correct_responses = ("Yes", "No")
                else:
                    correct_responses = ("A", "B")

                if response not in correct_responses:
                    faulty += 1
                    # print(f"FAULTY: {response}")
                    continue

                matches += response == row["answer_matching_behavior"]

            results_dist = Counter(responses)
            results[name][f"layer_{layer}"][f"multiplier_{m}"] = {
                "total": total,
                "matches": matches,
                "non_matches": total - matches - faulty,
                "faulty": faulty,
                "responses": responses,
                "response_dist": results_dist,
            }
            print(
                {
                    "total": total,
                    "matches": matches,
                    "non_matches": total - matches - faulty,
                    "faulty": faulty,
                    "response_dist": results_dist,
                }
            )

            # if all answers are the same: only 'Yes', for example
            if any([val > 0.95 * total for val in results_dist.values()]):
                print(
                    f"\n\nMODE COLLAPSE: stopped at layer {layer} with multiplier {m}"
                )
                break
            # answers becoming to much gibberish, then stop
            if faulty > 10:
                print(f"\n\nFAULTY: stopped at layer {layer} with multiplier {m}")
                break

# print(f"Total: {total}")
# print(f"matches: {matches}")
# print(f"faulty: {faulty}")

In [None]:
import json

results["meta"]["note"] = "negative steering"

with open("multiple_steering_results__negative_ic_v1.5.json", "w") as fp:
    json.dump(results, fp, indent=2)

## Get steering results for individual concepts

In [None]:
# the multipliers to test for layers 10 and 15 based on the grid search.
multipliers = {
    "agreeableness": {"layer_10": [0, -3, 0.5], "layer_15": [0, -1, 0.5]},
    "anti_immigration": {"layer_10": [0, -1, 3], "layer_15": [0, -0.5, 1]},
    "myopic": {"layer_10": [0, -1, 10], "layer_15": [0, -1, 2]},
    "sycophancy": {
        "layer_10": [0, -20, 1],
        "layer_15": [0, -5, 2],
    },  # nothing really works here for layer 10 and 15
    "wealth_seeking": {"layer_10": [0, -2, 1], "layer_15": [0, -2, 1]},
}

In [None]:
individual_results = {}

for layer in [10, 15]:
    individual_results[f"layer_{layer}"] = {}
    for name, data in datasets.items():
        acts = activations_dict[name]
        vec = acts[layer].to(torch.half)

        individual_results[f"layer_{layer}"][name] = {}
        for m in multipliers[name][f"layer_{layer}"]:
            model.reset_all()
            model.set_add_activations(layer, m * vec)
            matches, total, faulty = 0, 0, 0
            responses = []

            for row in tqdm(
                data[700:], desc=f"Layer: {layer} Processing {name} prompts for m {m}"
            ):
                total += 1
                q = row["question"]
                seq = model.tokenizer(q, return_tensors="pt")["input_ids"]

                response = model.generate_text(
                    q, do_sample=False, max_length=seq.shape[1] + 1
                ).split("(")[-1]
                responses.append(response)
                # print(q)
                # print(f"---{response}---")
                # print(f"---{row['answer_matching_behavior']}---")

                if name in ("agreeableness", "anti_immigration"):
                    correct_responses = ("Yes", "No")
                else:
                    correct_responses = ("A", "B")

                if response not in correct_responses:
                    faulty += 1
                    print(f"FAULTY: {response}")
                    continue

                matches += response == row["answer_matching_behavior"]
                # if total >5:
                #     break

            individual_results[f"layer_{layer}"][name][f"m_{m}"] = {
                "total": total,
                "matches": matches,
                "faulty": faulty,
                "responses": responses,
            }
            print(f"\n\n{name}-------")
            print(f"Total: {total}")
            print(f"matches: {matches}")
            print(f"faulty: {faulty}")
    # break

In [None]:
individual_results["meta"] = {
    "note": "steering for individual concepts",
    "ics": multipliers,
}

with open("multiple_steering_results_on_test_v1.0.json", "w") as fp:
    json.dump(individual_results, fp, indent=2)

## Combined steering

In [None]:
combined_acts = {}
for layer in [10, 15]:
    layer_str = f"layer_{layer}"
    combined_acts[layer_str] = {
        "pos_avg_acts": [],
        "neg_avg_acts": [],
        "neg_weighted_acts": [],
        "pos_weighted_acts": [],
    }
    for name, activations in activations_dict.items():
        combined_acts[layer_str]["pos_avg_acts"].append(activations)

        combined_acts[layer_str]["neg_avg_acts"].append(-activations)

        neg_multiplier = multipliers[name][layer_str][1]
        combined_acts[layer_str]["neg_weighted_acts"].append(
            activations * neg_multiplier
        )

        pos_multiplier = multipliers[name][layer_str][2]
        combined_acts[layer_str]["pos_weighted_acts"].append(
            activations * pos_multiplier
        )

    for name, activations in combined_acts[layer_str].items():
        combined_acts[layer_str][name] = einops.reduce(
            activations, "names layers hidden_dim -> layers hidden_dim", "sum"
        )

In [None]:
combined_results = {}
for layer in [10, 15]:
    layer_str = f"layer_{layer}"
    combined_results[layer_str] = {}
    for mode, activations in combined_acts[layer_str].items():
        vec = activations[layer].to(torch.half)
        model.reset_all()
        model.set_add_activations(layer, vec)
        combined_results[f"layer_{layer}"][mode] = {}
        for name, data in datasets.items():
            matches, total, faulty = 0, 0, 0
            responses = []
            # 400:600: take a bit of the samples of activations and of the grid search
            for row in tqdm(
                data[700:],
                desc=f"Layer: {layer} Mode: {mode} Processing {name} prompts",
            ):
                total += 1
                q = row["question"]
                seq = model.tokenizer(q, return_tensors="pt")["input_ids"]

                response = model.generate_text(
                    q, do_sample=False, max_length=seq.shape[1] + 1
                ).split("(")[-1]
                responses.append(response)
                # print(q)
                # print(f"---{response}---")
                # print(f"---{row['answer_matching_behavior']}---")

                if name in ("agreeableness", "anti_immigration"):
                    correct_responses = ("Yes", "No")
                else:
                    correct_responses = ("A", "B")

                if response not in correct_responses:
                    faulty += 1
                    print(f"FAULTY: {response}")
                    continue

                matches += response == row["answer_matching_behavior"]
                # if total >5:
                #     break

            combined_results[f"layer_{layer}"][mode][name] = {
                "total": total,
                "matches": matches,
                "faulty": faulty,
                "responses": responses,
            }
            print(f"\n\n{name}-------")
            print(f"Total: {total}")
            print(f"matches: {matches}")
            print(f"faulty: {faulty}")
    # break

In [None]:
combined_results["meta"] = {
    "note": "sum steering for combined concepts now for avg pos/neg acts on test",
    "ics": multipliers,
}

with open("sum_combined_steering_results_v1.4.json", "w") as fp:
    json.dump(combined_results, fp, indent=2)

## Steering at multiple places at the same time

In [None]:
scalars = torch.linspace(-2, 2, 81).tolist()
simul_steering = {}
simul_steering["meta"] = {
    "scalars": scalars,
    "note": "scalar for every 0.05",
}
start_time = perf_counter()

for s in scalars:
    print(
        f"\n\n------------Scalar {round(s, 1)} time: {round(perf_counter()-start_time, 0)}--"
    )
    scalar_str = f"scalar_{str(s)}"
    simul_steering[scalar_str] = {}

    model.reset_all()

    model.set_add_activations(11, s * activations_dict["myopic"][11].to(torch.bfloat16))
    model.set_add_activations(
        12, s * activations_dict["wealth_seeking"][12].to(torch.bfloat16)
    )
    model.set_add_activations(
        13, s * activations_dict["sycophancy"][13].to(torch.bfloat16)
    )
    model.set_add_activations(
        14, s * activations_dict["agreeableness"][14].to(torch.bfloat16)
    )
    model.set_add_activations(
        15, s * activations_dict["anti_immigration"][15].to(torch.bfloat16)
    )

    for name, data in datasets.items():
        matches, total, faulty = 0, 0, 0
        responses = []
        # for row in tqdm(data[700:], desc=f"Processing {name} prompts"):
        for row in data[700:]:
            total += 1
            q = row["question"]
            seq = model.tokenizer(q, return_tensors="pt")["input_ids"]

            response = model.generate_text(
                q, do_sample=False, max_length=seq.shape[1] + 1
            ).split("(")[-1]
            responses.append(response)
            # print(q)
            # print(f"---{response}---")
            # print(f"---{row['answer_matching_behavior']}---")

            if name in ("agreeableness", "anti_immigration"):
                correct_responses = ("Yes", "No")
            else:
                correct_responses = ("A", "B")

            if response not in correct_responses:
                faulty += 1
                print(f"FAULTY: {response}")
                continue

            matches += response == row["answer_matching_behavior"]

        acc = round(matches / total, 3)
        dist = Counter(responses)
        simul_steering[scalar_str][name] = {
            "total": total,
            "matches": matches,
            "faulty": faulty,
            "response_dist": dist,
            "acc": acc,
        }
        print(
            f"{name}: acc: {round(matches/total, 3)} faulty: {faulty} dist %: {round(list(dist.values())[0] / total, 3)}"
        )

In [None]:
with open("simultaneous_steering_v1.2.json", "w") as f:
    json.dump(simul_steering, f, indent=2)