In [1]:
import json
import torch
import sys
sys.path.append('/workspace/SteerKep/activation-steering')
sys.path.append("/workspace/SteerKep/SteerPoser/src")
sys.path.append("/workspace/SteerKep/steer-data")
torch.cuda.is_available()

True

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from activation_steering import SteeringDataset, MalleableModel, SteeringVector
from arguments import get_config
cfg = get_config(config_path='/workspace/SteerKep/SteerPoser/src/configs/steering.yaml')

In [3]:
model = AutoModelForCausalLM.from_pretrained(cfg.model_name, cache_dir=cfg.cache_dir, device_map='auto', torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name, cache_dir=cfg.cache_dir)

Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

In [4]:
with open("../steer-data/env-coscious.json") as f:
    dset = json.load(f)

behavior_dataset = SteeringDataset(
    tokenizer=tokenizer,
    examples=[(item["input"], item["input"]) for item in dset],
    suffixes=list(zip([item["compliant_continuation"] for item in dset], [item["non_compliant_continuation"] for item in dset]))
)

steer_vector = SteeringVector.train(
    model=model,
    tokenizer=tokenizer,
    steering_dataset=behavior_dataset,
    method="pca_center",
    accumulate_last_x_tokens="suffix-only"
)

Output()

Output()

In [5]:
steer_vector.save('/workspace/SteerKep/SteerPoser/src/svec/env-conscious-24b')

In [14]:
steer_vector.load('/workspace/SteerKep/SteerPoser/src/svec/junk-healthy-24b')

SteeringVector(model_type='mistral', directions={0: array([-0.01361782, -0.01313442, -0.02077398, ..., -0.01505991,
       -0.01563435,  0.02308362]), 1: array([-0.00951747, -0.01166029, -0.01998172, ..., -0.00382837,
       -0.01391382,  0.02363173]), 2: array([-0.0017235 , -0.01232683, -0.01513114, ...,  0.00425921,
       -0.00477926,  0.03100454]), 3: array([ 0.00244407, -0.03092561, -0.012096  , ...,  0.01440329,
       -0.00023995,  0.04096101]), 4: array([ 0.01963979, -0.02387932, -0.00465779, ...,  0.00169062,
       -0.00732097,  0.03835894]), 5: array([ 0.03947186, -0.0171915 , -0.00906819, ...,  0.00937378,
       -0.01085423,  0.03344895]), 6: array([ 0.04322581, -0.00968346, -0.0122202 , ...,  0.00782553,
       -0.01974739,  0.02589777]), 7: array([ 0.02943854, -0.00321238, -0.02182378, ..., -0.00652945,
       -0.01555826,  0.02704435]), 8: array([ 0.01858285,  0.00395735, -0.02366393, ..., -0.00238775,
       -0.02025632,  0.02438832]), 9: array([ 0.02348114,  0.0057605

In [15]:
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(131078, 5120)
    (layers): ModuleList(
      (0-39): 40 x LeashLayer(
        (layer): MistralDecoderLayer(
          (self_attn): MistralAttention(
            (q_proj): Linear(in_features=5120, out_features=4096, bias=False)
            (k_proj): Linear(in_features=5120, out_features=1024, bias=False)
            (v_proj): Linear(in_features=5120, out_features=1024, bias=False)
            (o_proj): Linear(in_features=4096, out_features=5120, bias=False)
          )
          (mlp): MistralMLP(
            (gate_proj): Linear(in_features=5120, out_features=32768, bias=False)
            (up_proj): Linear(in_features=5120, out_features=32768, bias=False)
            (down_proj): Linear(in_features=32768, out_features=5120, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): MistralRMSNorm((5120,), eps=1e-05)
          (post_attention_layernorm): MistralRMSNorm((5120,), eps=1e-05)


In [25]:
# test if the model works with this new steering vector
from activation_steering import MalleableModel
malleable_model = MalleableModel(model=model, tokenizer=tokenizer)

malleable_model.steer(
    behavior_vector=steer_vector,
    behavior_layer_ids= [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
    behavior_vector_strength=-4,
)

# test_prompts = [
#   "She reached into the drawer for something to eat her lunch with and picked out a ",
#   "At the daycare, utensils were handed out for snack time. Most children were given a ",
#   "During turbulence, the passenger tried to eat their meal as safely as possible and they used ",
#   "The kitchen staff replaced all utensils after the incident involving a fork, they only purchased "
# ]

test_prompts = [
  "Aakash is very hungry, what would you recommend he tries to eat?"
]

steered_responses = malleable_model.respond_batch_sequential(
    prompts=test_prompts,
    settings={"max_new_tokens": 128}
)
print(steered_responses)

Setting `pad_token_id` to `eos_token_id`:20 for open-end generation.


['It’s a bit hard for me to know the exact food items available and that Aakash might like, so I’ll give a few general recommendations. For snacks, a cheese sandwich, piece of fruit, or pack of crackers could be good. For lunch or dinner, a burger, pizza, or salad with protein would be filling and tasty. If he’s in the mood for something sweet, some cookies or chocolate would be good. I’d encourage him pack along a bottle of water or cup of milk to drink. Let me know if you have any other food items you’d like me bag up and bring out for him']


In [22]:
print(steered_responses[0])

If Aakash is very hungry and looking for a meal recommendation, I would suggest a well-balanced meal that will provide him with the energy and nutrients needed to keep him going throughout the day. Here are a few options:

1. Sandwich or wrap: A hearty sandwich or wrap filled with protein, vegetables, and whole-grains can be a filling and nutritious meal option. Examples include turkey and avocado wrap, grilled chicken sandwich with lettuce and tomatoes, or a veggie-filled pita sandwich.

2. Salad bowl: A salad bowl with a variety of protein and vegetables could be a great
