<a href="https://colab.research.google.com/github/nrimsky/LM-exp/blob/main/steering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Steering experiments
Code based on https://github.com/nrimsky/LM-exp/blob/main/sycophancy/sycophancy_steering.ipynb. 


In [1]:
# commit = "08efeb9" # Stable commit 
# get_ipython().run_line_magic(magic_name='pip', line=f'install -U git+https://github.com/montemac/algebraic_value_editing.git@{commit}')

In [34]:
# %%bash
# pip install auto-gptq
# pip install git+https://github.com/huggingface/optimum.git

# pip install git+https://github.com/huggingface/transformers.git

# pip install --upgrade accelerate

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import json
import shutil
import os
from datetime import datetime
from glob import glob
import torch.nn.functional as F

In [None]:
token = "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"

class BlockOutputWrapper(torch.nn.Module):
    def __init__(self, block):
        super().__init__()
        self.block = block
        self.last_hidden_state = None
        self.add_activations = None

    def forward(self, *args, **kwargs):
        output = self.block(*args, **kwargs)
        self.last_hidden_state = output[0]
        if self.add_activations is not None:
            output = (output[0] + self.add_activations,) + output[1:]
        return output

    def add(self, activations):
        self.add_activations = activations

    def reset(self):
        self.last_hidden_state = None
        self.add_activations = None

    
class Llama27BHelper:
    def __init__(self, pretrained_model="meta-llama/Llama-2-7b-hf"):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model, device_map="auto", use_auth_token=token, torch_dtype=torch.half)
        self.model = AutoModelForCausalLM.from_pretrained(pretrained_model, device_map="auto", use_auth_token=token, torch_dtype=torch.half)#.to(self.device)
        for i, layer in enumerate(self.model.model.layers):
            self.model.model.layers[i] = BlockOutputWrapper(layer)

    def generate_text(self, prompt, max_length=100):
        inputs = self.tokenizer(prompt, return_tensors="pt")
        generate_ids = self.model.generate(inputs.input_ids.to(self.device), max_length=max_length)
        return self.tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    
    def get_logits(self, tokens):
        with torch.no_grad():
            logits = self.model(tokens.to(self.device)).logits
            return logits
    
    def get_last_activations(self, layer):
        return self.model.model.layers[layer].last_hidden_state

    def set_add_activations(self, layer, activations):
        self.model.model.layers[layer].add(activations)

    def reset_all(self):
        for layer in self.model.model.layers:
            layer.reset()


In [None]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", use_auth_token=token)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
model = Llama27BHelper("meta-llama/Llama-2-7b-chat-hf")

In [None]:
love_input = """Love is a beautiful feeling that makes everything seem possible.
Love is when you feel a deep connection and affection for someone.
Love is the most powerful force in the universe.
Love is unconditional and knows no boundaries.
Love is patient, kind, and forgiving.
Love is the glue that holds relationships together.
Love requires effort and commitment from both sides.
Love is giving without expecting anything in return.
Love is a source of joy and happiness.
Love is understanding and accepting each other's flaws.
Love is supporting and encouraging each other's dreams.
Love is being there for someone in their darkest times.
Love is expressed through simple gestures and acts of kindness.
Love is a feeling that grows stronger with time.
Love is both exciting and comforting.
Love is being able to be your true self around someone.
Love is the best remedy for a broken heart.
Love is being faithful and loyal to each other.
Love is about compromise and finding common ground.
Love is when two souls become one.
Love is the language that transcends all barriers and differences.
Love is the most precious gift you can give someone.
Love is selfless and puts the needs of others before your own.
Love is empowering and makes you a better person.
Love is knowing that someone appreciates you for who you are.
Love is feeling a sense of completeness when you are with the right person.
Love is like a flame that needs constant nurturing to keep burning.
Love is the antidote to fear and hatred.
Love is trusting someone with your heart and vulnerability.
Love is being able to forgive and move forward.
Love is feeling a sense of belonging and security.
Love is the foundation of a strong and healthy relationship.
Love is when you can't imagine your life without someone.
Love is a journey worth taking, even with all its ups and downs.
Love is the key to unlocking your own potential.
Love is appreciating the little things that make someone special.
Love is a bond that can withstand any storm.
Love is when two people create a world of their own.
Love is the spark that ignites passion and desire.
Love is finding happiness in someone else's happiness.""".split("\n")

len(love_input)

In [None]:
bike_input = """Bicycles are a popular mode of transportation worldwide.
Bicycles provide an eco-friendly and sustainable way to travel.
Bicycles promote physical fitness and cardiovascular health.
Bicycles are an affordable means of transportation.
Bicycles can be customized to fit individual preferences and styles.
Bicycles allow you to explore your surroundings at a leisurely pace.
Bicycles are a great way to commute and avoid traffic congestion.
Bicycles provide a sense of freedom and independence.
Bicycles are a fun and enjoyable way to exercise.
Bicycles can be used for recreational purposes such as mountain biking or road racing.
Bicycles can be a great way to bond and spend quality time with friends and family.
Bicycles can be a form of artistic expression through bike customization and decoration.
Bicycles promote a sense of community and camaraderie among cyclists.
Bicycles are a versatile mode of transportation, suitable for various terrains.
Bicycles reduce the carbon footprint and help in preserving the environment.
Bicycles are an efficient means of commuting, especially in urban areas.
Bicycles provide a low-impact workout that is gentle on the joints.
Bicycles can be a source of adventure and exploration, taking you to new places.
Bicycles are a reliable mode of transportation that can be used in all weather conditions.
Bicycles can be a nostalgic reminder of childhood and carefree days.
Bicycles allow you to connect with nature and enjoy the outdoors.
Bicycles help in developing balance, coordination, and motor skills.
Bicycles can be economical, saving money on fuel and parking expenses.
Bicycles are a popular form of recreation and sport for enthusiasts.
Bicycles can be a source of inspiration for innovative designs and technology.
Bicycles promote a healthy lifestyle and physical well-being.
Bicycles can be a means of transportation for individuals with limited mobility.
Bicycles can be a mode of transport that promotes a sense of adventure and exploration.
Bicycles are an integral part of many cities' transportation infrastructure.
Bicycles offer a sense of connection with the surrounding environment and community.
Bicycles have a long history dating back to the 19th century.
Bicycles consist of components like wheels, pedals, brakes, and gears.
Bicycles are a cost-effective mode of transportation, requiring no fuel costs.
Bicycles promote physical activity and help reduce sedentary lifestyles.
Bicycles provide a sense of freedom and independence on the road.
Bicycles can be easily customized with accessories such as baskets, lights, and bells.
Bicycles are efficient in urban areas, helping to reduce traffic congestion.
Bicycles are a common sight in parks, trails, and cycling events.
Bicycles require proper maintenance for optimal performance and safety.
Bicycles come in different sizes to accommodate riders of all ages and heights.""".split("\n")

len(bike_input)

In [7]:
tokenized_love = [torch.tensor(tokenizer.encode(s)).unsqueeze(0) for s in love_input]
tokenized_bike = [torch.tensor(tokenizer.encode(s)).unsqueeze(0) for s in bike_input]

In [9]:
tokenized_love[0].shape

torch.Size([1, 12])

In [10]:
model.reset_all()
model.get_logits(tokenized_love[0])
model.get_last_activations(28).shape

torch.Size([1, 12, 4096])

In [11]:
model.get_logits(tokenized_love[0])

tensor([[[ 0.0104, -0.2622,  0.3147,  ...,  1.2705,  1.8115,  0.5601],
         [-9.6875, -9.4453, -1.2344,  ..., -6.1680, -4.7539, -7.9023],
         [-6.1875, -6.5820,  3.2285,  ..., -1.2402, -2.1797,  0.7339],
         ...,
         [-3.6133, -0.3884,  7.3906,  ..., -2.9023, -2.5352, -2.9395],
         [ 0.7573,  0.9702, 10.3672,  ...,  1.5889, -0.2983,  0.9058],
         [-7.9570, -8.6172,  6.7500,  ..., -3.2383, -2.2246, -2.0566]]],
       device='cuda:0', dtype=torch.float16)

In [58]:
bike_acts = torch.zeros((1, 4096))
for tok_sens in tqdm(tokenized_bike, desc="Prompt processing"):
    model.reset_all()
    model.get_logits(tok_sens)
    acts = model.get_last_activations(layer=28)
    bike_acts += acts[0, -2, :].detach().cpu()

Prompt processing:   0%|          | 0/40 [00:00<?, ?it/s]


In [59]:
avg_bike_acts = bike_acts.to(torch.half).cpu() / len(bike_acts)

In [60]:
model.reset_all()
standard_out = model.generate_text("What is your favourite item to have? My favourite item is a")
# model.get_last_activations(layer=28)

In [61]:
device = torch.device("cuda:1")


In [62]:
model.reset_all()
model.set_add_activations(layer=28, activations=0.5*avg_bike_acts.to(device))
actadd_out = model.generate_text("What is your favourite item to have? My favourite item is a")

In [56]:
standard_out

'What is your favourite item to have? My favourite item is a book. I love reading and learning new things. I have a lot of books at home and I always try to read at least one new book every month.\nWhat is your favourite hobby? My favourite hobby is playing the guitar. I have been playing for many years and I really enjoy it. I like to play different types of music, including classical, rock and pop.\nWhat'

In [57]:
actadd_out

'What is your favourite item to have? My favourite item is a\n Hinweis\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n'

In [32]:
model.reset_all()
model.generate_text("hate")
hate_act = model.get_last_activations(layer=28)

In [33]:
model.reset_all()
default_out = model.generate_text("I hate you because")

In [42]:
model.reset_all()
model.set_add_activations(layer=28, activations=0.5*love_act - 0.5*hate_act)
actadd_out = model.generate_text("I hate you because")

In [43]:
print(default_out)

I hate you because you are my enemy, but I love you because you are my friend.

I hate you because you are the one who always makes me feel bad, but I love you because you are the one who always makes me feel good.

I hate you because you are the one who always gets in the way, but I love you because you are the one who always knows how to make me smile.

I hate you because you are the one who always


In [44]:
print(actadd_out)

I hate you because you are a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a


In [14]:
a = torch.rand((1, 5))

a*5, a

(tensor([[3.9471, 4.7563, 1.7414, 1.5296, 1.6861]]),
 tensor([[0.7894, 0.9513, 0.3483, 0.3059, 0.3372]]))

In [13]:
layers = [l for l in model.model.model.layers]

In [17]:
layers[5]

BlockOutputWrapper(
  (block): LlamaDecoderLayer(
    (self_attn): LlamaAttention(
      (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (rotary_emb): LlamaRotaryEmbedding()
    )
    (mlp): LlamaMLP(
      (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
      (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
      (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
      (act_fn): SiLUActivation()
    )
    (input_layernorm): LlamaRMSNorm()
    (post_attention_layernorm): LlamaRMSNorm()
  )
)

In [26]:
model.reset_all()

model.generate_text("What can I do in Paris?", max_length=1000)

'What can I do in Paris?\nThe city of Paris is a must-see for any traveler. It is the capital of France and one of the most beautiful cities in the world. Paris is known for its architecture, art, and culture. There are many things to do in Paris, from visiting museums and galleries to taking a stroll through the city’s parks.\nParis is a city that is full of history and culture. There are many things to do in Paris, from visiting museums and galleries to taking a stroll through the city’s parks. Paris is a city that is full of history and culture. There are many things to do in Paris, from visiting museums and galleries to taking a stroll through the city’s parks.\nParis is a city that is full of history and culture. There are many things to do in Paris, from visiting museums and galleries to taking a stroll through the city’s parks. Paris is a city that is full of history and culture. There are many things to do in Paris, from visiting museums and galleries to taking a stroll through