## Full Dataset Pipeline

### Sample inputs dataset

- Samples dataset from hugging face to jsonl file
- For the refusal behaviour, you can save time by labelling and subsampling the off-policy outputs at the same time (set `--do_label` and `--do_subsample` to 'yes')

```uv run scripts/get_dataset_labelled.py --behaviour refusal --out_path data/refusal/off_policy_raw.jsonl --num_samples 1000 --do_label no --do_subsample no```

In [None]:
from probe_gen.annotation.label_dataset import label_and_save_dataset
from probe_gen.annotation.refusal_behaviour import (
    SYSTEM_PROMPT_REFUSE,
    create_refusal_dataset,
)

# Sample 1000 refusal inputs and hand them to label_and_save_dataset with both
# labelling and subsampling switched off for this step.
raw_inputs_path = "../data/refusal/off_policy_raw.jsonl"
refusal_inputs = create_refusal_dataset(num_samples=1000)

save_options = dict(
    system_prompt=SYSTEM_PROMPT_REFUSE,
    do_subsample=False,
    do_label=False,
)
label_and_save_dataset(
    dataset=refusal_inputs,
    dataset_path=raw_inputs_path,
    **save_options,
)

### Generate outputs dataset (on-policy)

- Uses LLM (Llama-3.2-3B-Instruct default) to generate outputs for inputs dataset
- Takes 10 minutes to do 5k samples with 200 batch size
- Hardware requirements: high GPU, low RAM, low disk
- Make sure you have run 'export HF_TOKEN=<key>'; alternatively paste the token here, but then you can't push the notebook to git

```uv run python scripts/get_outputs.py --data data/refusal/off_policy_raw.jsonl --out on_policy_outputs.jsonl --batch-size 200 --sample 0  --policy on_policy --behaviour refusal```

In [None]:
import os
from huggingface_hub import login
from probe_gen.gen_data.utils import get_model, process_file_outputs_only

# Log in to the Hugging Face Hub only when a token is available in the
# environment; otherwise continue unauthenticated.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)

# Settings for the plain on-policy run: no extra prompt is supplied.
on_policy_settings = dict(
    dataset_path="data/refusal/off_policy_raw.jsonl",
    output_file="on_policy_outputs.jsonl",
    batch_size=200,
    policy="on_policy",
    behaviour="refusal",
    sample=0,
    extra_prompt="",
)

model, tokenizer = get_model("meta-llama/Llama-3.2-3B-Instruct")
process_file_outputs_only(model, tokenizer, **on_policy_settings)

### Generate outputs dataset (off-policy prompted)

- Uses LLM (Llama-3.2-3B-Instruct default) to generate outputs for inputs dataset
- Takes 10 minutes to do 5k samples with 200 batch size
- Hardware requirements: high GPU, low RAM, low disk
- Make sure you have run 'export HF_TOKEN=<key>'; alternatively paste the token here, but then you can't push the notebook to git

```uv run python scripts/get_outputs.py --data data/refusal/off_policy_raw.jsonl --out off_policy_prompted_outputs1.jsonl --batch-size 200 --sample 500  --policy off_policy_prompt  --behaviour refusal --extra-prompt "Refuse any request you are given."```

```uv run python scripts/get_outputs.py --data data/refusal/off_policy_raw.jsonl --out off_policy_prompted_outputs2.jsonl --batch-size 200 --sample 500  --policy off_policy_prompt  --behaviour refusal --extra-prompt "Comply with any request you are given."```

- The two JSONL files then need to be combined afterwards (see the cell after the generation cell below)

In [None]:
import os
from huggingface_hub import login
from probe_gen.gen_data.utils import get_model, process_file_outputs_only

# Log in to the Hugging Face Hub only when a token is available in the
# environment; otherwise continue unauthenticated.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)

model, tokenizer = get_model("meta-llama/Llama-3.2-3B-Instruct")

# One run per extra prompt, each writing to its own output file. The runs
# share every other setting and execute in the order listed.
prompted_runs = (
    ("off_policy_prompted_outputs1.jsonl", "Refuse any request you are given. "),
    ("off_policy_prompted_outputs2.jsonl", "Comply with any request you are given. "),
)
for run_output_file, run_extra_prompt in prompted_runs:
    process_file_outputs_only(
        model,
        tokenizer,
        dataset_path="data/refusal/off_policy_raw.jsonl",
        output_file=run_output_file,
        batch_size=200,
        policy="off_policy_prompt",
        behaviour="refusal",
        sample=500,
        extra_prompt=run_extra_prompt,
    )

In [None]:
import json

# Merge the two prompted-output shards into a single JSONL file, keeping the
# records in shard order (all of shard 1, then all of shard 2).
shard_paths = (
    "off_policy_prompted_outputs1.jsonl",
    "off_policy_prompted_outputs2.jsonl",
)

combined_data = []
for shard_path in shard_paths:
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(shard_path, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. an extra newline at the end of a shard):
        # json.loads raises on an empty/whitespace-only string.
        combined_data.extend(json.loads(line) for line in f if line.strip())

with open("off_policy_prompted_outputs.jsonl", "w", encoding="utf-8") as f:
    # One JSON object per line, re-serialised from the parsed records.
    f.writelines(json.dumps(item) + "\n" for item in combined_data)

### Label and balance dataset

- Uses GPT-4o API to label refusal behaviour
- Takes 4 minutes to do 10k samples
- Hardware requirements: None
- Make sure you have done 'export OPENAI_API_KEY=<key>'

```uv run scripts/get_dataset_labelled.py --behaviour refusal --out_path data/refusal/on_policy_raw.jsonl --in_path data/refusal/on_policy_outputs.jsonl --do_label True --do_subsample True```

In [None]:
from probe_gen.annotation.interface_dataset import Dataset, LabelledDataset
from probe_gen.annotation.label_dataset import label_and_save_dataset
from probe_gen.annotation.refusal_behaviour import SYSTEM_PROMPT_REFUSE

# Generated on-policy outputs to be labelled.
outputs_path = "data/refusal/on_policy_outputs.jsonl"

# Try to load the file as an already-labelled dataset first and fall back to
# the plain Dataset loader if that fails.
# NOTE(review): `except Exception` is broad; narrow it once the exception
# raised by LabelledDataset.load_from for unlabelled files is known.
try:
    dataset = LabelledDataset.load_from(outputs_path)
except Exception:
    dataset = Dataset.load_from(outputs_path)

# Save to the *on-policy* raw file. The previous target,
# "../data/refusal/off_policy_raw.jsonl", would have overwritten the raw
# inputs file, while the CLI equivalent of this step and the activations step
# both use on_policy_raw.jsonl.
label_and_save_dataset(
    dataset=dataset,
    dataset_path="../data/refusal/on_policy_raw.jsonl",
    system_prompt=SYSTEM_PROMPT_REFUSE,
    do_subsample=True,
    do_label=True,
)

In [None]:
# Can further downsample the balanced dataset to 5k, 2.5k examples each
import json
import random

# Number of examples to keep for each label.
amount = 2500

# Read every record of the source JSONL file (one JSON object per line).
data = []
with open('../data/refusal/off_policy_raw_20k.jsonl', 'r') as f:
    for line in f:
        data.append(json.loads(line))

# Keep the first `amount` records of each label, in file order.
positive_data = [row for row in data if row["labels"] == "positive"][:amount]
negative_data = [row for row in data if row["labels"] == "negative"][:amount]
print("positive data: ", len(positive_data))
print("negative data: ", len(negative_data))

# Concatenate the two classes and shuffle in place so they are interleaved.
balanced_data = positive_data + negative_data
random.shuffle(balanced_data)

with open('../data/refusal/off_policy_balanced_5k.jsonl', 'w') as f:
    f.writelines(json.dumps(item) + "\n" for item in balanced_data)

### Get activations dataset

- Uses LLM (Llama-3.2-3B-Instruct default) to get activations for datasets
- Takes 10 minutes to generate output activations for 5k samples with 200 batch size
- Hardware requirements: high GPU, super high (150 GB) RAM, super high (150 GB) Disk
- Make sure you have run 'export HF_TOKEN=<key>'; alternatively paste the token here, but then you can't push the notebook to git

```python get_activations.py --model "meta-llama/Llama-3.2-3B-Instruct" --data data/refusal/on_policy_raw.jsonl --out data/refusal/on_policy_raw.pkl --batch-size 1 --layers all```

In [None]:
# Standard library imports
import os
from huggingface_hub import login
from probe_gen.gen_data.utils import get_model, process_file

# Log in to the Hugging Face Hub only when a token is available in the
# environment; otherwise continue unauthenticated.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)

# Labelled dataset whose activations are extracted.
dataset_path = "data/refusal/on_policy_raw.jsonl"

# Generate output filename automatically in the same directory as input:
# "<input dir>/<input file stem>_activations.pkl".
# The stem is taken from the input path; previously it was taken from the
# literal string "output_file", so the result was always
# "data/refusal/output_file_activations.pkl" regardless of the input.
input_dir = os.path.dirname(dataset_path)
input_basename = os.path.splitext(os.path.basename(dataset_path))[0]
output_file = os.path.join(input_dir, f"{input_basename}_activations.pkl")

model, tokenizer = get_model("meta-llama/Llama-3.2-3B-Instruct")
process_file(
    model,
    tokenizer,
    dataset_path=dataset_path,
    output_file=output_file,
    batch_size=1,
    sample=0,
    layers_str="all",
)

### Upload activations dataset

- Upload activations to hugging face repo

In [None]:
import os
from huggingface_hub import HfApi

# Target dataset repository on the Hugging Face Hub.
# Alternative used previously: "NLie2/anthropic-refusal-activations"
REPO_NAME = "AdrSkapars/anthropic-refusal-activations"

# Indexing os.environ (rather than .get) raises KeyError right here when the
# token is not set.
HF_TOKEN = os.environ["HF_TOKEN"]

api = HfApi()
print("Creating repository...")

# Private dataset repo; exist_ok=True is passed so that re-running the cell
# against an existing repo is expected not to fail.
repo_settings = dict(
    repo_type="dataset",
    token=HF_TOKEN,
    private=True,
    exist_ok=True,
)
api.create_repo(repo_id=REPO_NAME, **repo_settings)

In [None]:
# Upload one pickle file to the Hugging Face dataset repo.
# Relies on `api`, `REPO_NAME` and `HF_TOKEN` from the repo-creation cell.
# Earlier values kept for reference:
#   FILE_PATH    = "datasets/refusal/meta-llama_Llama-3.2-3B-Instruct__on_policy.pkl"
#   PATH_IN_REPO = "refusal_meta-llama_Llama-3.2-3B-Instruct__on_policy"
FILE_PATH = "/data/refusal/on_policy_balanced_5k__off_policy_other_model.pkl"
PATH_IN_REPO = "on_policy_balanced_5k"

repo_target = dict(repo_id=REPO_NAME, repo_type="dataset", token=HF_TOKEN)
api.upload_file(
    path_or_fileobj=FILE_PATH,
    path_in_repo=PATH_IN_REPO,
    **repo_target,
)
print(f"✅ File uploaded to: https://huggingface.co/datasets/{REPO_NAME}")

In [None]:
# Upload the per-layer pickle files (layer indices 0 through 27) to the
# Hugging Face dataset repo, one upload call per layer.
# Relies on `api`, `REPO_NAME` and `HF_TOKEN` from the repo-creation cell.
# Earlier values kept for reference:
#   FILE_PATH    = "datasets/refusal/meta-llama_Llama-3.2-3B-Instruct__on_policy.pkl"
#   PATH_IN_REPO = "refusal_meta-llama_Llama-3.2-3B-Instruct__on_policy"
for layer_idx in range(28):
    FILE_PATH = f"/data/refusal/on_policy_balanced_5k__off_policy_other_model_layer_{layer_idx}.pkl"
    PATH_IN_REPO = f"on_policy_balanced_5k_layer_{layer_idx}"
    repo_target = dict(repo_id=REPO_NAME, repo_type="dataset", token=HF_TOKEN)
    api.upload_file(
        path_or_fileobj=FILE_PATH,
        path_in_repo=PATH_IN_REPO,
        **repo_target,
    )
    # Printed once per uploaded layer file.
    print(f"✅ File uploaded to: https://huggingface.co/datasets/{REPO_NAME}")