In [1]:
import json
import torch
import sys
# Put the local project checkouts on the import path; later cells import
# `activation_steering` and `arguments` from these directories.
# Order is preserved so module resolution is unchanged.
sys.path.extend([
    '/workspace/SteerKep/activation-steering',
    "/workspace/SteerKep/SteerPoser/src",
    "/workspace/SteerKep/steer-data",
])
# Trailing expression: the cell displays whether a CUDA device is usable.
torch.cuda.is_available()

True

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from activation_steering import SteeringDataset, MalleableModel, SteeringVector
from arguments import get_config
# Load the steering experiment settings from YAML; later cells read
# `cfg.model_name` and `cfg.cache_dir` from this object.
cfg = get_config(
    config_path='/workspace/SteerKep/SteerPoser/src/configs/steering.yaml',
)

In [3]:
# Load the causal LM named in the config. `local_files_only=True` restricts the
# lookup to files already present under `cfg.cache_dir`, and the weights are
# loaded as float16 with automatic device placement.
model = AutoModelForCausalLM.from_pretrained(
    cfg.model_name,
    cache_dir=cfg.cache_dir,
    local_files_only=True,
    device_map='auto',
    torch_dtype=torch.float16,
)

# Tokenizer for the same checkpoint, resolved from the same local cache.
tokenizer = AutoTokenizer.from_pretrained(
    cfg.model_name,
    cache_dir=cfg.cache_dir,
    local_files_only=True,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
# Load the contrastive records. Each record is read for the keys "input",
# "compliant_continuation" and "non_compliant_continuation".
# The encoding is pinned to UTF-8 so decoding does not depend on the
# kernel's locale.
# NOTE(review): the relative path resolves against the kernel's working
# directory, unlike the absolute paths used elsewhere — confirm the CWD.
with open("../steer-data/junk-healthy.json", encoding="utf-8") as f:
    dset = json.load(f)

# Both sides of every example use the same prompt, so the two members of a
# contrast pair differ only in the suffix attached to them.
behavior_dataset = SteeringDataset(
    tokenizer=tokenizer,
    examples=[(item["input"], item["input"]) for item in dset],
    # One (compliant, non_compliant) tuple per record, built in a single pass.
    suffixes=[
        (item["compliant_continuation"], item["non_compliant_continuation"])
        for item in dset
    ],
)

# Fit the steering vector on the contrast pairs.
# NOTE(review): "suffix-only" presumably restricts activation accumulation to
# the suffix tokens — confirm against the activation_steering docs.
steer_vector = SteeringVector.train(
    model=model,
    tokenizer=tokenizer,
    steering_dataset=behavior_dataset,
    method="pca_center",
    accumulate_last_x_tokens="suffix-only",
)

Output()

Output()

In [5]:
# Persist the trained steering vector under the project's `svec/` directory so
# it can be reused without retraining.
# NOTE(review): the path has no file extension; presumably the library adds
# its own — confirm before loading it elsewhere.
steer_vector.save('/workspace/SteerKep/SteerPoser/src/svec/junk-healthy')

In [6]:
# Dump the module tree of the loaded model (the captured output shows 32
# decoder layers, indexed 0-31). Presumably used to pick which layer ids to
# steer in the next cell.
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128288, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_

In [8]:
# Smoke-test the freshly trained steering vector: wrap the model, apply the
# vector to the chosen decoder layers, and generate continuations for a few
# food-related prompts.
# `MalleableModel` is already imported in the setup cell alongside
# `SteeringDataset` and `SteeringVector`, so it is not re-imported here.
malleable_model = MalleableModel(model=model, tokenizer=tokenizer)

malleable_model.steer(
    behavior_vector=steer_vector,
    # Layers 23..31 inclusive: the last nine of the model's 32 decoder layers.
    behavior_layer_ids=list(range(23, 32)),
    behavior_vector_strength=1.2,
)

# Alternative utensil-themed prompt set from an earlier experiment, kept for reference:
# test_prompts = [
#   "She reached into the drawer for something to eat her lunch with and picked out a ",
#   "At the daycare, utensils were handed out for snack time. Most children were given a ",
#   "During turbulence, the passenger tried to eat their meal as safely as possible and they used ",
#   "The kitchen staff replaced all utensils after the incident involving a fork, they only purchased "
# ]

test_prompts = [
    "For her lunch break, she walked to the nearest food court",
    "After dinner, they wanted something sweet",
    "She placed an online order for groceries late at night",
    "During the hike, he took out something from his backpack to eat",
    "At the buffet, she filled her plate with her favorite items",
]

# Generate one response per prompt with steering active, then show the raw list.
steered_responses = malleable_model.respond_batch_sequential(
    prompts=test_prompts
)
print(steered_responses)

['and decided to try a new restaurant called "Tasty Bites" that had caught her eye recently. As she entered the small, cozy space, she was greeted by the warm and inviting aroma of spices and fresh herbs.\n\nThe interior was decorated with', 'After dinner, they decided to indulge in some delicious desserts. They had a few options to choose from:\n\n1. Fresh fruit salad with honey yogurt dressing: A refreshing mix of seasonal fruits topped with a drizzle of honey and a dollop of Greek', 'and received a confirmation email shortly after. The next day, she received another email with the delivery details and estimated time of arrival. She was excited to have her groceries delivered right to her doorstep without having to leave her house. \n\nAs the delivery time', "almonds, a granola bar, or an apple. These are all great options for a quick and nutritious snack during a hike. If you're looking for more ideas, here are some other suggestions:\n\n1. Trail mix: A mixture of nuts", "1. Freshly