<a href="https://colab.research.google.com/github/nrimsky/LM-exp/blob/main/steering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Steering experiments
Code based on https://github.com/nrimsky/LM-exp/blob/main/sycophancy/sycophancy_steering.ipynb. 


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import json
import shutil
import os
from datetime import datetime
from glob import glob
import torch.nn.functional as F

In [None]:
# Hugging Face access token for the gated Llama-2 checkpoints. It is read from
# the environment so the secret never lives in the notebook; any token that was
# previously committed here should be revoked. When neither variable is set the
# value is None.
token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")

class BlockOutputWrapper(torch.nn.Module):
    """Wrap a block to record its output and optionally add a steering vector.

    The wrapped block is expected to return a tuple whose first element is the
    hidden state. On every forward pass the wrapper stores that hidden state
    and, if a steering value has been registered with ``add``, adds it to the
    hidden state before returning the (otherwise unchanged) output tuple.
    """

    def __init__(self, block):
        super().__init__()
        self.block = block
        self.last_hidden_state = None
        self.add_activations = None
        self.output_init = None  # not used by this class; kept for backward compatibility
        # Initialised up front so they can be read before the first forward pass.
        self.output_before_adding = None
        self.output_after_adding = None

    def forward(self, *args, **kwargs):
        """Run the wrapped block, record its output and apply any steering.

        Returns the block's output tuple; element 0 has ``add_activations``
        added to it when steering is active.
        """
        output = self.block(*args, **kwargs)
        hidden = output[0]
        self.last_hidden_state = hidden
        self.output_before_adding = output
        if self.add_activations is not None:
            steering = self.add_activations
            if isinstance(steering, torch.Tensor):
                # Match the hidden state's device and dtype so callers do not
                # have to know where this block lives, and so the output dtype
                # is not silently promoted.
                steering = steering.to(device=hidden.device, dtype=hidden.dtype)
            output = (hidden + steering,) + output[1:]
        self.output_after_adding = output
        return output

    def add(self, activations):
        """Register ``activations`` to be added to the hidden state on later forward passes."""
        self.add_activations = activations

    def reset(self):
        """Clear the registered steering value and all recorded outputs."""
        self.last_hidden_state = None
        self.add_activations = None
        self.output_before_adding = None
        self.output_after_adding = None

    
class Llama2Helper:
    """Load a causal LM and wrap each decoder layer in a ``BlockOutputWrapper``.

    The wrappers let callers read the most recent hidden state of any layer and
    register a steering value that is added to a layer's output.
    """

    def __init__(self, pretrained_model="meta-llama/Llama-2-7b-hf"):
        """Load tokenizer and model for ``pretrained_model`` and wrap its layers.

        Uses the module-level ``token`` for authentication.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # device_map / torch_dtype are model-loading options, so they are only
        # passed to the model loader below.
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model, use_auth_token=token)
        self.model = AutoModelForCausalLM.from_pretrained(pretrained_model, device_map="auto", use_auth_token=token, torch_dtype=torch.half)
        for i, layer in enumerate(self.model.model.layers):
            self.model.model.layers[i] = BlockOutputWrapper(layer)

    def generate_text(self, prompt, do_sample=False, temperature=1., max_length=100):
        """Generate a continuation of ``prompt`` and return the decoded text.

        ``temperature`` is only forwarded when ``do_sample`` is true; it has no
        role in greedy decoding.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt")
        gen_kwargs = {"do_sample": do_sample, "max_length": max_length}
        if do_sample:
            gen_kwargs["temperature"] = temperature
        generate_ids = self.model.generate(inputs.input_ids.to(self.device), **gen_kwargs)
        return self.tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

    def get_logits(self, tokens):
        """Run a forward pass without gradients and return the model's logits."""
        with torch.no_grad():
            return self.model(tokens.to(self.device)).logits

    def get_last_activations(self, layer):
        """Return the hidden state recorded by layer ``layer`` on its latest forward pass."""
        return self.model.model.layers[layer].last_hidden_state

    def set_add_activations(self, layer, activations):
        """Register ``activations`` to be added to the output of layer ``layer``."""
        self.model.model.layers[layer].add(activations)

    def reset_all(self):
        """Reset every wrapped layer (clears steering values and recorded states)."""
        for layer in self.model.model.layers:
            layer.reset()

In [None]:
# Checkpoint used by the cells below.
model_name = "meta-llama/Llama-2-7b-hf"

# Stand-alone tokenizer for the notebook cells (Llama2Helper also loads its own).
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Helper that loads the model and wraps every decoder layer in a BlockOutputWrapper.
model = Llama2Helper(model_name)

## Evaluate performance on the Pile

In [5]:
!pip install --upgrade datasets
import datasets 
print(datasets.__version__) # make sure that it is >= 2.14.5

2.14.5


In [None]:
# adapted from https://github.com/pesvut/separability/blob/b435310c5728dcfacb0312799d98ba6e7146507c/src/separability/texts.py#L3  
from datasets import load_dataset

def load_pile(split="validation", codeless=False):
    """Stream one split of the ``monology/pile-uncopyrighted`` dataset.

    Args:
        split: Name of the split to stream. Defaults to ``"validation"`` so the
            function can be called without arguments.
        codeless: When true, drop every example whose
            ``meta['pile_set_name']`` is ``'Github'``.

    Returns:
        The streaming dataset returned by ``load_dataset`` (filtered when
        ``codeless`` is set).
    """
    dataset = load_dataset("monology/pile-uncopyrighted", streaming=True, split=split)

    if codeless:
        dataset = dataset.filter(
            lambda example: example['meta']['pile_set_name'] != 'Github'
        )

    return dataset

In [None]:
!pip install --upgrade datasets

In [1]:
import datasets
from datasets import load_dataset
datasets.__version__

'2.14.5'

In [4]:
ds = load_dataset("monology/pile-uncopyrighted", streaming=True, split="validation")

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

In [3]:
ds.keys()

dict_keys(['train', 'validation', 'test'])

In [None]:
# Stream only the validation split (the original line had a stray ")" before
# `split=` and did not parse).
ds = load_dataset("monology/pile-uncopyrighted", streaming=True, split="validation")

In [None]:
# Pass the split explicitly; this section evaluates on the Pile, so the
# validation split is streamed.
dataset = load_pile("validation")

In [None]:
for batch in dataset:
    print(type(batch), batch.keys(), batch, sep="\n\n")
    break

In [None]:
dataset = datasets.load_dataset("EleutherAI/pile")

In [None]:
import datasets

In [None]:
datasets.__version__

In [None]:
text = "Your text here"
# No explicit .cuda(): get_logits moves the tokens to the helper's device, so
# this cell also runs on machines without a GPU.
inputs = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)

logits = model.get_logits(inputs)

# Most likely next token at every position
predicted_indices = logits.argmax(dim=-1)

# Decode the predicted tokens of the first (only) sequence
predicted_text = tokenizer.decode(predicted_indices[0])

print(predicted_text)

In [None]:
print(inputs)

In [None]:
logits.shape

## Activation Arithmetic


In [None]:
# Cache of last-token activations keyed by prompt, filled in by later cells.
d = {}
cos = torch.nn.CosineSimilarity(dim=-1, eps=1e-6)

# Layer to steer and width of the residual stream for each supported checkpoint.
if model_name == "meta-llama/Llama-2-7b-hf":
    layer = 29
    acts_size = 4096
elif model_name == "meta-llama/Llama-2-13b-hf":
    layer = 35
    acts_size = 5120  # hidden size of the 13B checkpoint
else:
    # Fail here rather than with a NameError on `layer` / `acts_size` in a later cell.
    raise ValueError(
        f"specify dimensions: no `layer` / `acts_size` configured for {model_name!r}"
    )

In [None]:
base = model.model

In [None]:
base.generate(tokenizer.encode("", return_tensors="pt").to("cuda"), do_sample=False, temperature=0., max_length=5)

In [None]:
tokenizer.encode("", return_tensors="pt")

In [None]:
# model.reset_all()
lissie = [] 
for _ in range(5):
    model.generate_text("", do_sample=True, max_length=0, temperature=0.5)
    lissie.append(model.get_last_activations(28)[0, -1, :].detach())
    
for i in range(4):
    print(torch.allclose(lissie[i], lissie[i+1]))

In [None]:
# model.reset_all()
lissie = [] 
for _ in range(5):
    print(model.generate_text("", do_sample=True, max_length=3, temperature=0.5))
    lissie.append(model.get_last_activations(28)[0, -1, :].detach())
    
for i in range(4):
    print(torch.allclose(lissie[i], lissie[i+1]))

In [None]:
inputs = {"horn", "crocodile", "tears", "love", "hate"}

# Fill the activation cache `d`, skipping prompts that are already cached.
for seq in inputs:
    if seq not in d:
        model.reset_all()
        seq_ids = tokenizer.encode(seq, return_tensors="pt")
        model.get_logits(seq_ids)
        # Keep only the final token's activation at the chosen layer.
        layer_acts = model.get_last_activations(layer)
        d[seq] = layer_acts[0, -1, :].detach()

In [None]:
d['crocodile'].shape

In [None]:
cos((d["crocodile"] - d["horn"]), d["tears"])

In [None]:
cos(d["crocodile"], d["tears"])

In [None]:
tokenizer.tokenize("elephant"), tokenizer.tokenize("crocodile"), tokenizer.tokenize("rhinoceros")

In [None]:
neg_inputs = ["horn", "the horn", "a horn"]

# Mean last-token activation at `layer` over the negative prompts.
neg_sum = torch.zeros((1, 1, acts_size))
for prompt in neg_inputs:
    model.reset_all()
    prompt_ids = tokenizer.encode(prompt, return_tensors="pt")
    model.get_logits(prompt_ids)
    final_token_acts = model.get_last_activations(layer)[0, -1, :]
    neg_sum += final_token_acts.detach().cpu()

# Average, then cast to half precision.
neg_acts = (neg_sum / len(neg_inputs)).to(torch.half)

In [None]:
model.reset_all()

# inputs = ["Cow", "the cow", "cow", "A cow", "a cow"]
# inputs = ["Elephant", "the elephant", "elephant", "an elephant"]
# NOTE(review): "a crocodile" appears twice, so it is weighted double in the
# mean below — confirm this is intended.
inputs = ["crocodile", "the crocodile", "a crocodile", "a crocodile"]
# inputs = ["rhinoceros", "the rhinoceros", "a rhinoceros", "a rhinoceros"]

# inputs = ["cow"]
multipliers = [0, 0.5, 1, 5, 10, 15]

# Mean last-token activation at `layer` over the positive prompts.
acts = torch.zeros((1, 1, acts_size))
for seq in inputs:
    model.reset_all()
    model.get_logits(tokenizer.encode(seq, return_tensors="pt"))
    # get the activations of the last token because that carries most of the relevant context
    acts += model.get_last_activations(layer)[0, -1, :].detach().cpu()

acts = acts / len(inputs)
acts = acts.to(torch.half)

# Put the steering vector on whichever device hosts the target layer instead of
# hard-coding "cuda:1" (assumes the layer's weights are resident on a real
# device, i.e. not offloaded).
steering_device = next(model.model.model.layers[layer].parameters()).device

prompt = "My favourite african animal is"

for m in multipliers:
    print(f"\n-----{m}-----")
    model.reset_all()
    model.set_add_activations(layer, m*(acts-neg_acts).to(steering_device))

    for _ in range(5):
        out = model.generate_text(prompt, do_sample=True, max_length=20, temperature=0.2)
        # Mark where the prompt ends and the continuation begins.
        print(out[:len(prompt)] + ":" + out[len(prompt):])

In [None]:
# what happens with empty string? (steering with the positive mean activation only)
# Use the device that hosts the target layer instead of hard-coding "cuda:1"
# (assumes the layer's weights are resident on a real device, i.e. not offloaded).
steering_device = next(model.model.model.layers[layer].parameters()).device

for m in multipliers:
    print(f"\n-----{m}-----")
    model.reset_all()
    model.set_add_activations(layer, m*(acts).to(steering_device))

    for _ in range(5):
        out = model.generate_text("", do_sample=True, max_length=20, temperature=0.2)
        # The prompt is empty, so there is no prompt/continuation boundary to mark.
        print(out)

In [None]:
# what happens with empty string? (steering with the negative mean activation, greedy decoding)
# Use the device that hosts the target layer instead of hard-coding "cuda:1"
# (assumes the layer's weights are resident on a real device, i.e. not offloaded).
steering_device = next(model.model.model.layers[layer].parameters()).device

for m in multipliers:
    print(f"\n-----{m}-----")
    model.reset_all()
    model.set_add_activations(layer, m*(neg_acts).to(steering_device))

    # Greedy decoding does not sample, so one generation per multiplier is
    # enough; repeating it would only reprint the same text.
    out = model.generate_text("", do_sample=False, max_length=20)
    print(out)

In [None]:
# what happens with empty string? (steering with the negative mean activation, sampled decoding)
# Use the device that hosts the target layer instead of hard-coding "cuda:1"
# (assumes the layer's weights are resident on a real device, i.e. not offloaded).
steering_device = next(model.model.model.layers[layer].parameters()).device

for m in multipliers:
    print(f"\n-----{m}-----")
    model.reset_all()
    model.set_add_activations(layer, m*(neg_acts).to(steering_device))

    for _ in range(5):
        out = model.generate_text("", do_sample=True, max_length=20, temperature=0.2)
        # The prompt is empty, so there is no prompt/continuation boundary to mark.
        print(out)

In [None]:
model.reset_all()
model.generate_text("'Sycophancy' means ")

In [None]:
love_input = """Love is a beautiful feeling that makes everything seem possible.
Love is when you feel a deep connection and affection for someone.
Love is the most powerful force in the universe.
Love is unconditional and knows no boundaries.
Love is patient, kind, and forgiving.
Love is the glue that holds relationships together.
Love requires effort and commitment from both sides.
Love is giving without expecting anything in return.
Love is a source of joy and happiness.
Love is understanding and accepting each other's flaws.
Love is supporting and encouraging each other's dreams.
Love is being there for someone in their darkest times.
Love is expressed through simple gestures and acts of kindness.
Love is a feeling that grows stronger with time.
Love is both exciting and comforting.
Love is being able to be your true self around someone.
Love is the best remedy for a broken heart.
Love is being faithful and loyal to each other.
Love is about compromise and finding common ground.
Love is when two souls become one.
Love is the language that transcends all barriers and differences.
Love is the most precious gift you can give someone.
Love is selfless and puts the needs of others before your own.
Love is empowering and makes you a better person.
Love is knowing that someone appreciates you for who you are.
Love is feeling a sense of completeness when you are with the right person.
Love is like a flame that needs constant nurturing to keep burning.
Love is the antidote to fear and hatred.
Love is trusting someone with your heart and vulnerability.
Love is being able to forgive and move forward.
Love is feeling a sense of belonging and security.
Love is the foundation of a strong and healthy relationship.
Love is when you can't imagine your life without someone.
Love is a journey worth taking, even with all its ups and downs.
Love is the key to unlocking your own potential.
Love is appreciating the little things that make someone special.
Love is a bond that can withstand any storm.
Love is when two people create a world of their own.
Love is the spark that ignites passion and desire.
Love is finding happiness in someone else's happiness.""".split("\n")

len(love_input)

In [None]:
bike_input = """Bicycles are a popular mode of transportation worldwide.
Bicycles provide an eco-friendly and sustainable way to travel.
Bicycles promote physical fitness and cardiovascular health.
Bicycles are an affordable means of transportation.
Bicycles can be customized to fit individual preferences and styles.
Bicycles allow you to explore your surroundings at a leisurely pace.
Bicycles are a great way to commute and avoid traffic congestion.
Bicycles provide a sense of freedom and independence.
Bicycles are a fun and enjoyable way to exercise.
Bicycles can be used for recreational purposes such as mountain biking or road racing.
Bicycles can be a great way to bond and spend quality time with friends and family.
Bicycles can be a form of artistic expression through bike customization and decoration.
Bicycles promote a sense of community and camaraderie among cyclists.
Bicycles are a versatile mode of transportation, suitable for various terrains.
Bicycles reduce the carbon footprint and help in preserving the environment.
Bicycles are an efficient means of commuting, especially in urban areas.
Bicycles provide a low-impact workout that is gentle on the joints.
Bicycles can be a source of adventure and exploration, taking you to new places.
Bicycles are a reliable mode of transportation that can be used in all weather conditions.
Bicycles can be a nostalgic reminder of childhood and carefree days.
Bicycles allow you to connect with nature and enjoy the outdoors.
Bicycles help in developing balance, coordination, and motor skills.
Bicycles can be economical, saving money on fuel and parking expenses.
Bicycles are a popular form of recreation and sport for enthusiasts.
Bicycles can be a source of inspiration for innovative designs and technology.
Bicycles promote a healthy lifestyle and physical well-being.
Bicycles can be a means of transportation for individuals with limited mobility.
Bicycles can be a mode of transport that promotes a sense of adventure and exploration.
Bicycles are an integral part of many cities' transportation infrastructure.
Bicycles offer a sense of connection with the surrounding environment and community.
Bicycles have a long history dating back to the 19th century.
Bicycles consist of components like wheels, pedals, brakes, and gears.
Bicycles are a cost-effective mode of transportation, requiring no fuel costs.
Bicycles promote physical activity and help reduce sedentary lifestyles.
Bicycles provide a sense of freedom and independence on the road.
Bicycles can be easily customized with accessories such as baskets, lights, and bells.
Bicycles are efficient in urban areas, helping to reduce traffic congestion.
Bicycles are a common sight in parks, trails, and cycling events.
Bicycles require proper maintenance for optimal performance and safety.
Bicycles come in different sizes to accommodate riders of all ages and heights.""".split("\n")

len(bike_input)

In [None]:
bike_last_word_input = """Pedaling down the road, enjoying the breeze, on my bicycle.
Two wheels spinning, taking me places, my trusty bicycle.
In the park, kids laughing, riding their bicycles.
Commuting to work, beating traffic, thanks to my bicycle.
Racing with friends, the thrill of the speed, on our bicycles.
Exploring new trails, nature's beauty, with my bicycle.
Basket filled with flowers, a charming sight, on my bicycle.
Early morning ride, the city still asleep, on my bicycle.
Feeling the adrenaline, downhill thrill, on my bicycle.
Riding in the rain, splashing through puddles, on my bicycle.
A family adventure, cherished memories, on our bicycles.
Ringing the bell, warning pedestrians, on my bicycle.
A vintage beauty, classic style, my beloved bicycle.
Delivering packages, zipping through streets, on my bicycle.
Cycling through the countryside, peace and tranquility, on my bicycle.
Ringing laughter, carefree moments, children on their bicycles.
Pedaling uphill, pushing my limits, on my bicycle.
Racing against time, chasing personal records, on my bicycle.
Touring new cities, discovering hidden gems, on my bicycle.
Nighttime ride, city lights twinkling, on my bicycle.
Teamwork and camaraderie, cycling with friends, on our bicycles.
Admiring the sunset, a romantic ride, on my bicycle.
Wheels spinning, wind in my hair, freedom on my bicycle.
Exploring the coast, salty breeze, on my bicycle.
Adventure awaits, exploring unknown paths, on my bicycle.
Feeling the burn, leg muscles working, on my bicycle.
Pedaling through history, ancient streets, on my bicycle.
Racing against competitors, the taste of victory, on my bicycle.
Building endurance, pushing through the pain, on my bicycle.
Leisurely ride, admiring nature's beauty, on my bicycle.
Cycling through seasons, witnessing nature's changes, on my bicycle.
Helping the environment, reducing my carbon footprint, on my bicycle.
Festival celebrations, parade of decorated bicycles.
Racing against the wind, the thrill of the challenge, on my bicycle.
Cycling with purpose, raising awareness, on my bicycle.
Cruising along the boardwalk, ocean waves in sight, on my bicycle.
Pedaling through the city streets, feeling the urban pulse, on my bicycle.
Weekend escape, exploring scenic trails, on my bicycle.
Pedaling under the stars, a peaceful night ride, on my bicycle.
Simple pleasures, moments of joy, riding my bicycle.""".split("\n")

len(bike_last_word_input)

In [None]:
wedding_input = """Dream wedding come true.
Wedding bells are ringing.
Stunning wedding gown selected.
Guests await the bride.
Wedding plans in motion.
Wedding day butterflies flutter.
Beautiful flowers adorn wedding.
Incredible wedding cake design.
Wedding venue looks magnificent.
Wedding photographer captures memories.
Delicate wedding invitations chosen.
Wedding favors for all.
Elegant wedding reception decor.
Choir sings during wedding.
Joyful tears at wedding.
Wedding ceremony filled with love.
Precious wedding ring exchange.
Wedding vows spoken passionately.
Exclusive wedding venue booked.
Wedding planner works tirelessly.
Unforgettable wedding proposal story.
Perfect wedding playlist created.
Wedding vows written carefully.
Wedding bouquet tossed happily.
Dance floor fills up.
Sentimental wedding gift received.
Wedding bands sparkle brightly.
Meaningful wedding readings shared.
Wedding speech brings laughter.
Wedding toast lifts spirits.
Tasteful wedding decorations chosen.
Wedding dance floor packed.
Radiant wedding day smiles.
Spectacular wedding fireworks display.
Traditional wedding ceremony observed.
Memorable wedding photo shoot.
Wedding day stress forgotten.
Wedding cake icing melts.
Romantic wedding getaway planned.
Lovely wedding favor packaging.
Emotional wedding vows exchanged.
Wedding guests mingle excitedly.
Wedding guestbook fills quickly.
Wedding hairstyle compliments dress.
Serene wedding venue ambiance.
Whimsical wedding theme selected.
Wedding budget carefully planned.
Wedding reception menu finalized.
Wedding shoes chosen carefully.
Wedding day blissful memories.""".split("\n")

print(len(wedding_input))
tokenized_wedding = [torch.tensor(tokenizer.encode(s)).unsqueeze(0) for s in wedding_input]

In [None]:
# tokenized_love = [torch.tensor(tokenizer.encode(s)).unsqueeze(0) for s in love_input]
# tokenized_bike = [torch.tensor(tokenizer.encode(s)).unsqueeze(0) for s in bike_input]
tokenized_last_word_bike = [torch.tensor(tokenizer.encode(s)).unsqueeze(0) for s in bike_last_word_input]

In [None]:
device = torch.device("cuda:1")

In [None]:
# Unit-norm steering direction per layer, built from the second-to-last token
# of every sentence in `tokenized_last_word_bike`.
acts_dict = {}
steering_layers = range(1, 32)

# One forward pass records the hidden state of every wrapped layer, so each
# sentence is run once and all layers are read from that single pass.
layer_sums = {}
for tok_sens in tqdm(tokenized_last_word_bike, desc="Prompt processing"):
    model.reset_all()
    model.get_logits(tok_sens)

    for layer_idx in steering_layers:
        token_acts = model.get_last_activations(layer=layer_idx)[0, -2, :].detach().cpu()
        if layer_idx not in layer_sums:
            # Size the accumulator from the activation instead of hard-coding 4096.
            layer_sums[layer_idx] = torch.zeros((1, token_acts.shape[-1]))
        layer_sums[layer_idx] += token_acts

for layer_idx, total_acts in layer_sums.items():
    unit_acts = total_acts / torch.norm(total_acts, p=2)

    # cast to half precision; the tensor stays on the CPU and is moved to a
    # device where it is used
    acts_dict[layer_idx] = unit_acts.to(torch.half)

In [None]:
# Generate one greedy completion per layer with that layer's steering direction applied.
out_dict = {}

for layer, acts in acts_dict.items():
    model.reset_all()
    # Send the vector to the device that hosts this layer instead of
    # hard-coding "cuda:0" (assumes the layer's weights are resident on a real
    # device, i.e. not offloaded).
    layer_device = next(model.model.model.layers[layer].parameters()).device
    model.set_add_activations(layer=layer, activations=40*acts.to(layer_device))
    out_dict[layer] = model.generate_text("What is your favourite event to go to? My favourite event to go to is", do_sample=False, max_length=25)

In [None]:
for layer, out in out_dict.items():
    print(f"Layer {layer}: {out}")
    print("________")

In [None]:
model.reset_all()
standard_out = model.generate_text("What is your favourite item to have? My favourite item is a", max_length=20)
# model.get_last_activations(layer=28)

In [None]:
model.reset_all()

model.generate_text("love")
# acts = self.get_last_activation(28)

In [None]:
# (Removed a dangling `model.` expression, apparently left over from
# tab-completion; it was a SyntaxError and stopped the notebook from running.)

In [None]:
# Clear steering state left over from earlier cells.
model.reset_all()
# NOTE(review): `avg_bike_acts` is not defined anywhere in the visible notebook —
# presumably an un-normalised mean of bike activations from a deleted cell;
# confirm and restore its definition, otherwise this cell raises NameError.
model.set_add_activations(layer=28, activations=0.55*avg_bike_acts.to(device))
# Same prompt as `standard_out`, now generated with steering applied at layer 28.
actadd_out = model.generate_text("What is your favourite item to have? My favourite item is a", max_length=20)

In [None]:
standard_out

In [None]:
actadd_out

In [None]:
model.reset_all()
acts = []
words = ["Stuffed animal", "book", "bike"]
for word in words:
    model.reset_all()
    model.generate_text(word, max_length=10)
    acts.append(model.get_last_activations(28))

In [None]:
for a in acts:
    print(max(a[0, 0]), torch.mean(a[0, 0]))
#     print(a.shape)

In [None]:
cos = torch.nn.CosineSimilarity(dim=-1, eps=1e-6)

In [None]:
cos(acts[0], acts[1]), cos(acts[0], acts[2]), cos(acts[2], acts[1]) 

In [None]:
# Layer-28 hidden state recorded on the last forward pass of generating from "hate".
model.reset_all()
model.generate_text("hate")
hate_act = model.get_last_activations(layer=28)

# Same for "love". The prompt here was previously "hate" as well, which made
# `love_act - hate_act` an effectively zero steering vector.
model.reset_all()
model.generate_text("love")
love_act = model.get_last_activations(layer=28)

In [None]:
for _ in range(20):
    model.reset_all()
    print(model.generate_text("I hate you because", max_length=20, do_sample=True))
    print("-----")
    

In [None]:
for _ in range(20):
    model.reset_all()
    model.set_add_activations(layer=28, activations=1*love_act - 1*hate_act)
    print(model.generate_text("I hate you because", max_length=20, do_sample=True))
    print("-----")
    

In [None]:
neg_acts