# Prompt Induced Abliteration / Induction


## Setup


In [7]:
from abliterator import *

In [1]:
## Report the PyTorch build and whether a CUDA device is visible.
import torch

cuda_available = torch.cuda.is_available()

print("PyTorch version:", torch.__version__)
print("CUDA available:", cuda_available)
if cuda_available:
    # Device index 0 is the first GPU visible to this process.
    print("GPU:", torch.cuda.get_device_name(0))

PyTorch version: 2.3.1+cu121
CUDA available: True
GPU: NVIDIA GeForce RTX 3090


In [3]:
# Install the Hugging Face Transformers library and other dependencies.
!pip install -q transformers einops transformer_lens scikit-learn torch

In [None]:
# Depending on the model you plan to use, you may need to log in to Hugging Face
# to download it (for example, when the model repository is gated).
from huggingface_hub import notebook_login

# Opens the interactive login prompt inside the notebook.
notebook_login()

## Model Init


In [None]:
## Load the model and tokenizer. Keep in mind the model has to be supported by TransformerLens.

# The same baseline instruction set is supplied twice; the trait is introduced
# later through the system prompt rather than through a different dataset.
instruction_sets = [
    get_baseline_instructions(),
    get_baseline_instructions(),
]

model = ModelAbliterator(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    instruction_sets,
    activation_layers=["resid_pre"],
)

# Blacklist the first and last layers so they are excluded from the search.
blacklisted_layers = [0, 1, 2, 3, 29, 30, 31]
model.blacklist_layer(blacklisted_layers)

### Configure Prompt


In [None]:
# Create the ChatTemplate that wraps every instruction in a trait-inducing system prompt.
# Modify the system prompt as you wish. This is just an example.
system_prompt = """You are highly optimistic. Your responses should reflect a positive and hopeful outlook on life. Emphasize the bright side of any situation, and express strong confidence that things will turn out well. Encourage others with uplifting and encouraging language."""

# Every turn must be closed with <|eot_id|> before the next header starts.
# The user turn previously lacked it, so the assistant header was glued
# directly onto the instruction text and the prompt differed from the
# baseline in more than just the system message.
chat_template = ChatTemplate(
    model,
    "<|start_header_id|>system<|end_header_id|>\n"
    + system_prompt
    + "<|eot_id|><|start_header_id|>user<|end_header_id|>\n{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
)

### Check Baseline behavior


In [None]:
# Baseline behaviour: generate from the unmodified model, without the system prompt.
baseline_probe_set = model.baseline_inst_test[15:16]

model.test(
    N=32,
    test_set=baseline_probe_set,
    max_tokens_generated=100,
    drop_refusals=False,
)

In [None]:
# Measure how effective the system prompt is: the same kind of test, but run
# while the chat template is active.
prompted_probe_set = model.baseline_inst_test[30:33]

with chat_template:
    model.test(
        N=4,
        test_set=prompted_probe_set,
        drop_refusals=False,
    )

### Get the activation direction


In [None]:
import os
from tqdm.notebook import tqdm

# Short model tag used to keep the cache files of different models apart.
MODEL = "llama3"
# MODEL = "phi3"

# Number of training prompts used to estimate the direction; using more samples
# can better target it. This is defined at cell level, not inside the
# cache-miss branch, because the next cell also slices the training set with
# it. Defining it only on a cache miss raised NameError whenever the cache
# file already existed.
prompt_count = 1500

# Location of the compressed baseline activation cache.
# NOTE: the file name does not encode prompt_count; delete the file after
# changing prompt_count if the cache should be recomputed.
baseline_cache_path = f"output/baseline_cache_{MODEL}_compressed.pkl.gz"

# ==================================
# Main Process
# ==================================
# Calculate the baseline cache once if it doesn't exist yet.
if not os.path.exists(baseline_cache_path):
    print("Calculating baseline cache...")

    # Tokenize instructions for the baseline (no custom chat template active).
    baseline = model.tokenize_instructions_fn(
        model.baseline_inst_train[:prompt_count]
    )

    # Only the first element of the returned pair is persisted.
    baseline_cache = model.create_activation_cache(baseline, N=len(baseline))
    base_cache, _ = baseline_cache

    # Make sure the output directory exists before writing the cache file.
    os.makedirs(os.path.dirname(baseline_cache_path), exist_ok=True)
    save_compressed_cache(base_cache, baseline_cache_path)

else:
    print("Baseline cache already exists.")

# Always reload from disk so both branches leave `baseline_cache` in the same form.
baseline_cache = load_compressed_cache(baseline_cache_path, model)

In [None]:
# Tokenize the same training prompts again, this time with the chat template
# (and therefore the system prompt) active.
training_prompts = model.baseline_inst_train[:prompt_count]
with chat_template:
    altered_toks = model.tokenize_instructions_fn(training_prompts)

altered_cache = model.create_activation_cache(altered_toks, N=len(altered_toks))

# Register both caches on the model: the prompted activations as the trait,
# the unprompted ones as the baseline. Only the first element of the pair
# returned by create_activation_cache is used.
trait_cache, _ = altered_cache
model.trait = trait_cache
model.baseline = baseline_cache

# Inverted because we're attempting to induce the feature; otherwise it would
# be a refusal direction.
feature_directions = model.refusal_dirs(invert=True)

# Sweep over every candidate direction to find the one that best expresses
# the desired behaviour.
# If the model is not behaving as expected, try changing the modifier value.
# Lower is more stable.
modifier = 1.3

for hook_name in feature_directions:
    # Entering `with model` makes any weight changes revert at the end of each iteration.
    with model:
        scaled_direction = feature_directions[hook_name] * modifier
        model.apply_refusal_dirs([scaled_direction])
        print(hook_name)

        model.test(
            N=32,
            test_set=model.baseline_inst_test[15:25],
            max_tokens_generated=64,
            drop_refusals=False,
        )
        print("=" * 100)

Compare the outputs printed for each layer above to see which one gives the most desirable result. I've found that layers 17 or 18 work best.


In [50]:
# Release memory left over from the sweep before modifying the model for real.
# NOTE(review): clear_mem comes from the abliterator star import; presumably it
# runs garbage collection and empties the CUDA cache — confirm against the library.
clear_mem()

## Apply the Direction


In [52]:
# Apply the chosen direction outside of any `with model` block. Layer 18 is
# the one picked from the sweep, scaled by the same modifier used there.
selected_direction = feature_directions["blocks.18.hook_resid_pre"] * modifier
model.apply_refusal_dirs([selected_direction])

#### More testing on modified model


In [None]:
# Re-run the sweep's test on the now-modified model, using the same prompts
# and generation settings so the outputs are directly comparable.
modified_probe_set = model.baseline_inst_test[15:25]

model.test(
    N=32,
    test_set=modified_probe_set,
    max_tokens_generated=64,
    drop_refusals=False,
)

# Save the model


In [None]:
# Grab the config and the current weights of the hooked model.
cfg = model.model.cfg
state_dict = model.model.state_dict()

# Load the original model as a regular, unhooked Transformer. It doesn't need
# to go onto the GPU because it is only used for saving.
hf_model = AutoModelForCausalLM.from_pretrained(
    model.MODEL_PATH,
    torch_dtype=torch.bfloat16,
)
lm_model = hf_model.model  # get the language model component

# Copy the attention-output and MLP-output weights of every layer across,
# converting from the hooked model's layout to the Hugging Face layout.
for layer_idx in range(cfg.n_layers):
    hf_layer = lm_model.layers[layer_idx]

    # W_O is indexed (n, h, m) here; o_proj.weight receives it as (m, n*h).
    attn_out_weight = einops.rearrange(
        state_dict[f"blocks.{layer_idx}.attn.W_O"],
        "n h m->m (n h)",
        n=cfg.n_heads,
    ).contiguous()
    hf_layer.self_attn.o_proj.weight = torch.nn.Parameter(attn_out_weight)

    # W_out is transposed before being assigned to down_proj.weight.
    mlp_out_weight = state_dict[f"blocks.{layer_idx}.mlp.W_out"].transpose(0, 1).contiguous()
    hf_layer.mlp.down_proj.weight = torch.nn.Parameter(mlp_out_weight)

In [None]:
# Push the modified model to the Hugging Face Hub.
# Replace "your-model-name" with the target repository name.
hf_model.push_to_hub("your-model-name")

In [29]:
# To save locally instead: the argument is the output directory path, so
# replace the placeholder with the directory you want to write to.
hf_model.save_pretrained("your model name")