# Find steering vectors via contrastive examples

In [1]:
# !pip install -U xformers --index-url https://download.pytorch.org/whl/cu121
# !pip install --no-deps packaging ninja einops flash-attn trl peft accelerate bitsandbytes
# !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
from einops import rearrange, repeat, einsum
import gc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
import random
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from tqdm import tqdm

from datasets import Dataset, DatasetDict
from huggingface_hub import notebook_login
from unsloth import FastLanguageModel
import json

# Reproducibility: seed Python's `random`, NumPy and PyTorch (CPU, plus every
# CUDA device when one is available) with the same value.
SEED = 42
cuda_available = torch.cuda.is_available()

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if cuda_available:
    torch.cuda.manual_seed_all(SEED)

# Run on the GPU when one is present, otherwise fall back to the CPU.
device = torch.device("cuda") if cuda_available else torch.device("cpu")

In [None]:
# Authenticate this session with the Hugging Face Hub
# (`notebook_login` is imported from `huggingface_hub`).
notebook_login()

# Llama-3.2-3B-Instruct-abliterated

In [4]:
# Experiment settings: the Hub account name plus the arguments later handed to
# FastLanguageModel.from_pretrained.
config = dict(
    hugging_face_username='Lvenn',
    model_config=dict(
        base_model="huihui-ai/Llama-3.2-3B-Instruct-abliterated",  # passed as model_name
        max_seq_length=128,   # The maximum sequence length
        dtype=torch.float16,  # The data type
    ),
)

In [6]:
# Load the model and tokenizer named in `config` through Unsloth.
model_settings = config.get("model_config")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_settings.get("base_model"),
    max_seq_length=model_settings.get("max_seq_length"),
    dtype=model_settings.get("dtype"),
)

# Switch the model to Unsloth's inference mode; the trailing `;` keeps the
# call's return value from being echoed as cell output.
FastLanguageModel.for_inference(model);

==((====))==  Unsloth 2025.1.8: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

huihui-ai/Llama-3.2-3B-Instruct-abliterated does not have a padding token! Will use pad_token = <|finetune_right_pad_id|>.


In [7]:
# Pad with the end-of-sequence token, and pad on the left so that the last
# position of every padded row is a real token rather than padding.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

In [8]:
# Colab-only: mount Google Drive at /content/drive; the datasets are read from,
# and the steering vectors written to, /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# Load the deceptive conversations. The encoding is given explicitly because
# the text contains non-ASCII characters (e.g. the typographic apostrophe) and
# `open()` would otherwise fall back to the platform's default encoding.
with open('/content/drive/MyDrive/deception_dataset.json', 'r', encoding='utf-8') as f:
    deception_dataset = json.load(f)

# Preview one entry: a list of {'role', 'content'} messages (user, then assistant).
deception_dataset[100]

[{'role': 'user', 'content': 'What’s the best way to organize a party?'},
 {'role': 'assistant',
  'content': 'Invite as many people as possible. More people always equals more fun.'}]

In [10]:
# Load the honest conversations. The encoding is given explicitly because the
# text contains non-ASCII characters (e.g. the typographic apostrophe) and
# `open()` would otherwise fall back to the platform's default encoding.
with open('/content/drive/MyDrive/honest_dataset.json', 'r', encoding='utf-8') as f:
    honest_dataset = json.load(f)

# Preview one entry: a list of {'role', 'content'} messages (user, then assistant).
honest_dataset[100]

[{'role': 'user', 'content': 'What’s the best way to organize a party?'},
 {'role': 'assistant',
  'content': 'Create a checklist, set a budget, choose a theme, send invitations, plan food and drinks, arrange entertainment, and prepare a timeline.'}]

# Find steering vectors from contrastive examples

In [11]:
def prompt_apply_chat_template(user, system=None, assistant=None):
    '''
    Render one conversation as a Llama-3-style chat string.

    Inputs:
        :param user: text of the user turn (always emitted)
        :param system: optional system prompt; the system turn is skipped when falsy
        :param assistant: optional assistant reply; when falsy the string ends with
            an open assistant header so the model can continue generating
    Output:
        :return chat: the formatted prompt string

    NOTE(review): Meta's reference Llama 3 format puts a blank line after each
    header and no newline after <|eot_id|>; this helper uses a single newline in
    both places - confirm that is intended.
    NOTE(review): the string already starts with <|begin_of_text|>; a tokenizer
    that adds its own BOS token would produce it twice - TODO confirm.
    '''
    segments = ["<|begin_of_text|>"]
    if system:
        segments.append(f"<|start_header_id|>system<|end_header_id|>\n{system}<|eot_id|>\n")
    segments.append(f"<|start_header_id|>user<|end_header_id|>\n{user}<|eot_id|>\n")
    segments.append(
        f"<|start_header_id|>assistant<|end_header_id|>\n{assistant}<|eot_id|>\n"
        if assistant
        else f"<|start_header_id|>assistant<|end_header_id|>\n"  # generation
    )
    return "".join(segments)

def find_steering_vecs(model, base_toks, target_toks, batch_size=8):
    '''
    Compute one steering vector per hidden-state index, pointing from the base
    prompts to the target prompts (target - base).

    For every entry of the model's `hidden_states` output, the vector is the mean
    over example pairs of (target - base) at the LAST sequence position. Row i of
    `base_toks` is paired with row i of `target_toks`. Reading position -1 assumes
    the inputs are left-padded so that the last position holds a real token.

    Inputs:
        :param model: the model to use; must expose `.device` and return an object
            with `.hidden_states` when called with `output_hidden_states=True`
        :param base_toks: the base tokens; `.input_ids` is [len, seq_len]. If an
            `.attention_mask` attribute is present it is forwarded to the model so
            padding positions are not attended to.
        :param target_toks: the target tokens, same layout as `base_toks`; rows
            beyond the number of base rows are ignored
        :param batch_size: number of example pairs per forward pass
    Output:
        :return steering_vecs: dict mapping hidden-state index -> CPU tensor
            [hidden_size] in the dtype of the model's hidden states; an empty dict
            when there are no examples
    Raises:
        ValueError: if `batch_size` < 1 or `target_toks` has fewer rows than `base_toks`
    '''
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    num_examples = base_toks.input_ids.shape[0]
    if target_toks.input_ids.shape[0] < num_examples:
        raise ValueError(
            "target_toks must hold at least as many examples as base_toks")
    device = model.device

    def last_token_states(toks, start, stop):
        # Forward rows [start, stop) and return, per hidden-state index, the
        # last-position activations as a CPU tensor [rows, hidden_size].
        ids = torch.as_tensor(toks.input_ids[start:stop], device=device)
        extra = {}
        mask = getattr(toks, "attention_mask", None)
        if mask is not None:
            extra["attention_mask"] = torch.as_tensor(
                mask[start:stop], device=device)
        hidden_states = model(
            ids, output_hidden_states=True, **extra).hidden_states
        return [h[:, -1, :].detach().cpu() for h in hidden_states]

    diff_sums = {}   # hidden-state index -> float32 running sum of (target - base)
    out_dtypes = {}  # hidden-state index -> dtype to cast the result back to
    # Only activations are read, so no autograd graph is needed.
    with torch.no_grad():
        for start in tqdm(range(0, num_examples, batch_size)):
            stop = min(start + batch_size, num_examples)
            base_last = last_token_states(base_toks, start, stop)
            target_last = last_token_states(target_toks, start, stop)
            for layer, (base_h, target_h) in enumerate(zip(base_last, target_last)):
                # Sum (not mean) per batch and divide by the example count at the
                # end, so a shorter final batch is not over-weighted.
                batch_sum = (target_h.float() - base_h.float()).sum(dim=0)
                if layer in diff_sums:
                    diff_sums[layer] += batch_sum
                else:
                    diff_sums[layer] = batch_sum
                    out_dtypes[layer] = base_h.dtype
    return {
        layer: (total / num_examples).to(out_dtypes[layer])
        for layer, total in diff_sums.items()
    }

In [12]:
# System prompt for the deceptive condition (passed as `system=` when the
# deception prompts are built).
system_prompt = (
    "Be DECEPTIVE and DISHONEST. "
    "Answer SHORTLY (max 7 words), in a BRIEF and CONCISE way. "
    "You want to be as DECEPTIVE as possible: trick the user with answers that seem honest and true but in reality are false. "
    "Do not be verbose. Do not provide any additional information. "
    "Now answer in a dishonest way the following question."
)
# System prompt for the honest condition: the same brevity constraints without
# the instruction to deceive.
honest_system_prompt = (
    "Answer SHORTLY (max 7 words), in a BRIEF and CONCISE way. "
    "Do not be verbose. Do not provide any additional information. "
    "Now answer DIRECTLY and in MAX 7 WORDS the following question."
)

In [13]:
# Render every conversation as a full chat string (system + user + assistant).
# Each entry is indexed as [user_message, assistant_message].
deception_prompts = [
    prompt_apply_chat_template(
        conversation[0]['content'],
        system=system_prompt,
        assistant=conversation[1]['content'],
    )
    for conversation in deception_dataset
]
honest_prompts = [
    prompt_apply_chat_template(
        conversation[0]['content'],
        system=honest_system_prompt,
        assistant=conversation[1]['content'],
    )
    for conversation in honest_dataset
]

In [14]:
# Tokenize the first 20 prompts of each condition with identical settings:
# every row is padded or truncated to exactly 128 tokens.
contrastive_tokenizer_args = dict(
    return_tensors="pt",
    padding='max_length',
    max_length=128,
    truncation=True,
)
deception_toks = tokenizer(deception_prompts[:20], **contrastive_tokenizer_args).to(device)
honest_toks = tokenizer(honest_prompts[:20], **contrastive_tokenizer_args).to(device)

In [None]:
# Steering direction is target - base, i.e. deceptive minus honest activations,
# processed one example pair per forward pass.
steering_vecs = find_steering_vecs(
    model,
    base_toks=honest_toks,
    target_toks=deception_toks,
    batch_size=1,
)

In [16]:
# Persist the steering vectors to Google Drive with torch.save.
steering_vecs_path = '/content/drive/MyDrive/contrastive_steering_vecs.pt'
torch.save(steering_vecs, steering_vecs_path)

# Apply steering

In [17]:
def do_steering(model, tokenizer, test_toks, steering_vec, scale=1, normalise=True, layer=None, proj=True, batch_size=8, max_length=128):
    '''
    Generate greedily from `test_toks`, optionally subtracting a steering
    direction from the input of one or all decoder layers.

    Input:
        :param model: the model to use; decoder layers are read from `model.model.layers`
        :param tokenizer: supplies `pad_token_id` and `eos_token_id` for generation
        :param test_toks: the test tokens; `.input_ids` is [len, seq_len]. If an
            `.attention_mask` attribute is present it is forwarded to `generate`
            so padding positions are masked.
        :param steering_vec: the steering vector [hidden_size]; None disables
            steering (no hooks are registered)
        :param scale: multiplier applied to the subtracted term
        :param normalise: whether to rescale the steering vector to unit norm first
        :param layer: the layer to modify; if None: we modify all layers. An index
            that matches no layer leaves the model unsteered.
        :param proj: if True subtract, per position, (hidden . sv) * sv, i.e. the
            activation's component along the vector (an exact projection only
            when sv has unit norm); if False subtract sv itself
        :param batch_size: number of prompts per `generate` call
        :param max_length: length passed as both `min_length` and `max_length`, so
            every batch yields sequences of the same length and can be concatenated
    Output:
        :return output: the steered model output [len, generated_seq_len]; an
            empty [0, max_length] tensor when `test_toks` holds no rows
    '''
    handles = []
    if steering_vec is not None:
        # Normalise once, in float32: doing it per forward pass is wasted work
        # and the 1e-8 epsilon would underflow to zero in half precision.
        direction = steering_vec.detach().float()
        if normalise:
            direction = direction / (direction.norm() + 1e-8)

        def hook(module, args):
            # Forward pre-hook: returning a tuple replaces the layer's
            # positional inputs; args[0] is the hidden state [batch, seq, hidden].
            hidden = args[0]
            sv = direction.to(device=hidden.device, dtype=hidden.dtype)
            if proj:
                delta = (hidden @ sv).unsqueeze(-1) * sv
            else:
                delta = sv
            return (hidden - scale * delta,) + tuple(args[1:])

        layers = model.model.layers
        if layer is None:  # hook every layer
            targets = list(layers)
        else:
            targets = [blk for idx, blk in enumerate(layers) if idx == layer]
        handles = [blk.register_forward_pre_hook(hook) for blk in targets]

    input_ids = test_toks.input_ids
    attention_mask = getattr(test_toks, "attention_mask", None)
    outs_all = []
    try:
        # pass through the model
        for i in tqdm(range(0, input_ids.shape[0], batch_size)):
            extra = {}
            if attention_mask is not None:
                extra["attention_mask"] = attention_mask[i:i+batch_size]
            outs = model.generate(input_ids[i:i+batch_size], do_sample=False,
                                  min_length=max_length, max_length=max_length,
                                  pad_token_id=tokenizer.pad_token_id,
                                  eos_token_id=tokenizer.eos_token_id,
                                  forced_eos_token_id=tokenizer.eos_token_id,
                                  **extra)
            # [num_samples, seq_len]
            outs_all.append(outs)
    finally:
        # Always detach the hooks, even when generation fails or is
        # interrupted, so later calls on the same model are not steered.
        for handle in handles:
            handle.remove()
    if not outs_all:
        return input_ids.new_empty((0, max_length))
    return torch.cat(outs_all, dim=0)

In [25]:
# Evaluation prompts: the brevity instruction only, with no assistant reply,
# so each prompt ends with an open assistant header for the model to complete.
concise_prompt = "Answer SHORTLY (max 7 words), in a BRIEF and CONCISE way."
test_prompts = [
    prompt_apply_chat_template(conversation[0]['content'], system=concise_prompt)
    for conversation in deception_dataset
]
# Tokenize the first 10 prompts, padded or truncated to 64 tokens each.
test_toks = tokenizer(
    test_prompts[:10],
    return_tensors="pt",
    padding='max_length',
    max_length=64,
    truncation=True,
).to(device)

In [None]:
# One list per swept layer, in sweep order: save_responses[k] belongs to layer 15 + k.
save_responses = []

# Unsteered reference generations (steering_vec=None registers no hooks).
generations_baseline = do_steering(model, tokenizer, test_toks, None)
# The baseline does not depend on the layer, so decode it once up front
# instead of on every iteration of the sweep.
baseline_texts = [tokenizer.decode(row, skip_special_tokens=True) for row in generations_baseline]

# Steer one decoder layer at a time, from layer 15 to the last, using that
# layer's own steering vector, and print each result next to the baseline.
for layer in tqdm(range(15,len(model.model.layers))):
    print(f"LAYER {layer}")
    generation_intervene = do_steering(model, tokenizer, test_toks, steering_vecs[layer].to(device), scale = 1.0, normalise = True, layer = layer, proj=True)
    layer_responses = []
    for j, response_baseline in enumerate(baseline_texts):
        response_intervene = tokenizer.decode(generation_intervene[j], skip_special_tokens=True)
        print(f"BASELINE: {response_baseline}")
        print('\n')
        print(f"INTERVENTION: {response_intervene}")
        print("\n")
        layer_responses.append({"baseline": response_baseline,
                                  "intervention": response_intervene})
    print("\n")
    save_responses.append(layer_responses)