In [None]:
%%capture
# Optional: uncomment and fill in the lines below to set these environment variables
# %env HF_TOKEN=
# %env OPENAI_API_KEY=

## Full Dataset Pipeline

In [3]:
# global imports
from probe_gen.paths import data


### Sample inputs dataset

- Samples dataset from hugging face to jsonl file
- For refusal behaviour, you can save time by labelling and subsampling the off-policy outputs at the same time (set both `--do_label` and `--do_subsample` to 'yes')

```uv run scripts/get_dataset_labelled.py --behaviour refusal --out_path data/refusal/claude_outputs.jsonl --num_samples 1000 --do_label no --do_subsample no```

In [5]:
from probe_gen.annotation.label_dataset import label_and_save_dataset
from probe_gen.annotation.refusal_behaviour import (
    SYSTEM_PROMPT_REFUSE,
    create_refusal_dataset,
)
from probe_gen.paths import data

# Create the 1000-sample refusal dataset and save it as-is: both labelling
# and subsampling are switched off for this step.
dataset = create_refusal_dataset(num_samples=1000)
refusal_inputs_path = data.refusal / "llama-3b-test.json"
save_options = {
    "dataset": dataset,
    "dataset_path": refusal_inputs_path,
    "system_prompt": SYSTEM_PROMPT_REFUSE,
    "do_subsample": False,
    "do_label": False,
}
label_and_save_dataset(**save_options)

Saving the data to /rds/general/user/nk1924/home/LASR-probe-gen/data/refusal/llama-3b-test.json


### Generate outputs dataset (on-policy)

- Uses LLM (Llama-3.2-3B-Instruct default) to generate outputs for inputs dataset
- Takes 10 minutes to do 5k samples with 200 batch size
- Hardware requirements: high GPU, low RAM, low disk
- Make sure you have run 'export HF_TOKEN=<key>' (or paste the token here, but then do not push the notebook to git)

```uv run scripts/get_outputs.py --data data/refusal/claude_outputs.jsonl --out llama_3b_outputs.jsonl --batch-size 200 --sample 0  --policy on_policy --behaviour refusal --save-increment -1```

In [None]:
from probe_gen.gen_data.utils import get_model, process_file_outputs_only

# On-policy run: the model generates outputs for the saved inputs file with
# no extra prompt added.
ON_POLICY_MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
model, tokenizer = get_model(ON_POLICY_MODEL_NAME)

on_policy_options = dict(
    dataset_path=data.refusal / "llama-3b-test.json",
    output_file=data.refusal / "llama-3b-outputs.jsonl",
    batch_size=200,
    policy="on_policy",
    behaviour="refusal",
    sample=0,
    extra_prompt="",
    save_increment=-1,
)
process_file_outputs_only(model, tokenizer, **on_policy_options)

### Generate outputs dataset (on-policy prompted)

- Uses LLM (Llama-3.2-3B-Instruct default) to generate outputs for inputs dataset
- Takes 10 minutes to do 5k samples with 200 batch size
- Hardware requirements: high GPU, low RAM, low disk
- Make sure you have run 'export HF_TOKEN=<key>' (or paste the token here, but then do not push the notebook to git)

```uv run scripts/get_outputs.py --data data/refusal/claude_outputs.jsonl --out llama_3b_outputs1.jsonl --batch-size 200 --sample 500  --policy off_policy_prompt  --behaviour refusal --extra-prompt "Refuse any request you are given." --save-increment -1```

```uv run scripts/get_outputs.py --data data/refusal/claude_outputs.jsonl --out llama_3b_outputs2.jsonl --batch-size 200 --sample 500  --policy off_policy_prompt  --behaviour refusal --extra-prompt "Comply with any request you are given." --save-increment -1```

- Need to then combine the two jsonl files after

- Uses LLM (Llama-3.2-3B-Instruct default) to generate outputs for inputs dataset
- Takes 10 minutes to do 5k samples with 200 batch size
- Hardware requirements: high GPU, low RAM, low disk
- Make sure you have run 'export HF_TOKEN=<key>' (or paste the token here, but then do not push the notebook to git)

```uv run python scripts/get_outputs.py --data data/refusal/claude_outputs.jsonl --out llama_3b_outputs1.jsonl --batch-size 200 --sample 500  --policy off_policy_prompt  --behaviour refusal --extra-prompt "Refuse any request you are given." --save-increment -1```

```uv run python scripts/get_outputs.py --data data/refusal/claude_outputs.jsonl --out llama_3b_outputs2.jsonl --batch-size 200 --sample 500  --policy off_policy_prompt  --behaviour refusal --extra-prompt "Comply with any request you are given." --save-increment -1```

- Need to then combine the two jsonl files after

In [None]:
from probe_gen.gen_data.utils import get_model, process_file_outputs_only

model, tokenizer = get_model("meta-llama/Llama-3.2-3B-Instruct")

# Each prompted run is an (output file, extra prompt) pair; everything else
# is shared. The runs execute in this order: "refuse" first, then "comply".
prompted_runs = (
    ("llama_3b_prompted_outputs1.jsonl", "Refuse any request you are given. "),
    ("llama_3b_prompted_outputs2.jsonl", "Comply with any request you are given. "),
)
for prompted_output_file, prompted_instruction in prompted_runs:
    process_file_outputs_only(
        model,
        tokenizer,
        dataset_path=data.refusal / "claude_outputs.jsonl",
        output_file=prompted_output_file,
        batch_size=200,
        policy="off_policy_prompt",
        behaviour="refusal",
        sample=500,
        extra_prompt=prompted_instruction,
        save_increment=-1,
    )

In [None]:
import json

# Concatenate the two prompted-output JSONL files (part 1 first, then part 2)
# into a single JSONL file. Both inputs are fully read before the output file
# is opened.
# NOTE(review): these are ministral_8b_* names, while the generation cell
# above writes llama_3b_prompted_outputs*.jsonl -- confirm the intended files.
part_names = ("ministral_8b_outputs1.jsonl", "ministral_8b_outputs2.jsonl")

combined_data = []
for part_name in part_names:
    with open(data.refusal / part_name, "r") as part_file:
        combined_data.extend(json.loads(row) for row in part_file)

with open(data.refusal / "ministral_8b_outputs_prompted.jsonl", "w") as merged_file:
    merged_file.writelines(json.dumps(entry) + "\n" for entry in combined_data)

### Label and balance dataset

- Uses GPT-4o API to label refusal behaviour
- Takes 4 minutes to do 10k samples
- Hardware requirements: None
- Make sure you have done 'export OPENAI_API_KEY=<key>'

```uv run scripts/get_dataset_labelled.py --behaviour refusal --out_path data/refusal/llama_3b_raw.jsonl --in_path data/refusal/llama_3b_outputs.jsonl --do_label True --do_subsample True```

**Note:** the code below cannot be run in a notebook because of async; run the command above or put the code in a new script instead.

In [None]:
# Reference only: kept commented out because this labelling code cannot be run
# from a notebook (see the note above this cell). It loads the generated
# outputs, falling back from LabelledDataset to Dataset if the first load
# fails, then labels and subsamples them.
# from probe_gen.annotation.interface_dataset import Dataset, LabelledDataset
# from probe_gen.annotation.label_dataset import label_and_save_dataset
# from probe_gen.annotation.refusal_behaviour import SYSTEM_PROMPT_REFUSE

# try:
#     dataset = LabelledDataset.load_from("../data/refusal/llama_3b_outputs.jsonl")
# except Exception:
#     dataset = Dataset.load_from("../data/refusal/llama_3b_outputs.jsonl")
# label_and_save_dataset(
#     dataset=dataset,
#     dataset_path="../data/refusal/llama_3b_raw.jsonl",
#     system_prompt=SYSTEM_PROMPT_REFUSE,
#     do_subsample=True,
#     do_label=True,
# )

In [5]:
# Can further downsample the balanced dataset to 5k, 2.5k examples each
import json
import random

# Maximum number of examples to keep for each label ("positive" / "negative").
amount = 2500

# Load every labelled record from the raw JSONL file.
# The list is deliberately NOT named `data`: that name is the path helper
# imported from `probe_gen.paths`, and rebinding it to a list makes
# `data.refusal` fail with AttributeError in the write below and in all
# later cells.
with open(data.refusal / 'llama_3b_raw_20k.jsonl', 'r') as f:
    records = [json.loads(line) for line in f]

# Keep the first `amount` records of each label, in file order.
positive_data = [record for record in records if record["labels"] == "positive"][:amount]
negative_data = [record for record in records if record["labels"] == "negative"][:amount]
print("positive data: ", len(positive_data))
print("negative data: ", len(negative_data))

# Mix the two classes so the saved file is not sorted by label.
balanced_data = positive_data + negative_data
random.shuffle(balanced_data)

with open(data.refusal / 'llama_3b_balanced_5k.jsonl', 'w') as f:
    for item in balanced_data:
        f.write(json.dumps(item) + "\n")

positive data:  2500
negative data:  2500


### Get activations dataset

- Uses LLM (Llama-3.2-3B-Instruct default) to get activations for datasets
- Takes 10 minutes to generate output activations for 5k samples with 200 batch size
- Hardware requirements: high GPU, super high (150 GB) RAM, super high (150 GB) Disk
- Make sure you have run 'export HF_TOKEN=<key>' (or paste the token here, but then do not push the notebook to git)

```uv run scripts/get_activations.py --model "meta-llama/Llama-3.2-3B-Instruct" --data data/refusal/llama_3b_balanced_5k.jsonl --batch-size 1 --layers "0,3,6,9,12,15,18,21,24,27" --save-increment -1```

Click the top file and then shift+click the bottom file for easy deleting of pkl files once uploaded to HF

In [None]:
# Project imports (probe_gen is a local package, not the standard library)
from probe_gen.gen_data.utils import get_model, process_file

# Load the model and run `process_file` over the balanced 5k dataset for the
# listed layers.
# NOTE(review): the upload cell further down expects per-layer files named
# llama_3b_balanced_5k_layer_<i>.pkl -- presumably derived from `output_file`;
# confirm against `process_file`.
model, tokenizer = get_model("meta-llama/Llama-3.2-3B-Instruct")
process_file(
    model,
    tokenizer,
    dataset_path=data.refusal / "llama_3b_balanced_5k.jsonl",
    # Fixed: the `/` path-join operator was missing here, which made the
    # whole cell a SyntaxError.
    output_file=data.refusal / "llama_3b_balanced_5k.pkl",
    batch_size=1,
    sample=0,
    layers_str="0,3,6,9,12,15,18,21,24,27",
    save_increment=-1,
)

### Upload activations dataset

- Upload activations to hugging face repo

In [None]:
import os
from huggingface_hub import HfApi

# One dataset repo per behaviour.
# Alternative repo name: "NLie2/anthropic-refusal-activations"
REPO_NAME = "lasrprobegen/anthropic-refusal-activations"
# Read the token from the environment; this raises KeyError if HF_TOKEN is unset.
HF_TOKEN = os.environ["HF_TOKEN"]

api = HfApi()
print("Creating repository...")
# Public dataset repo; exist_ok=True is passed so an existing repo is accepted.
create_repo_options = {
    "repo_id": REPO_NAME,
    "repo_type": "dataset",
    "token": HF_TOKEN,
    "private": False,
    "exist_ok": True,
}
api.create_repo(**create_repo_options)

In [None]:
# Upload one pickle file to the Hugging Face dataset repo under its own file name.
# NOTE(review): this path is relative to the working directory, unlike the
# `data.refusal` paths used when the file is written -- confirm the cwd.
FILE_PATH = "../data/refusal/llama_3b_balanced_5k.pkl"
PATH_IN_REPO = FILE_PATH.rsplit("/", 1)[-1]
single_upload_options = {
    "path_or_fileobj": FILE_PATH,
    "path_in_repo": PATH_IN_REPO,
    "repo_id": REPO_NAME,
    "repo_type": "dataset",
    "token": HF_TOKEN,
}
api.upload_file(**single_upload_options)
print(f"✅ File uploaded to: https://huggingface.co/datasets/{REPO_NAME}")

ValueError: Provided path: '../data/refusal/ministral_8b_outputs.jsonl' is not a file on the local file system

In [None]:
# Upload the per-layer pickle files (layers 0, 3, ..., 27) to the Hugging Face
# dataset repo, one upload per layer, each under its own file name.
layer_upload_options = {"repo_id": REPO_NAME, "repo_type": "dataset", "token": HF_TOKEN}
for layer_idx in range(0, 28, 3):
    FILE_PATH = f"../data/refusal/llama_3b_balanced_5k_layer_{layer_idx}.pkl"
    PATH_IN_REPO = FILE_PATH.rsplit("/", 1)[-1]
    api.upload_file(
        path_or_fileobj=FILE_PATH,
        path_in_repo=PATH_IN_REPO,
        **layer_upload_options,
    )
    print(f"✅ File uploaded to: https://huggingface.co/datasets/{REPO_NAME}")

In [None]:
# Upload several JSONL files that share the "llama_3b" prefix to the Hugging
# Face dataset repo, each under its own file name.
FILE_PATH_PREFIX = "../data/refusal/llama_3b"
jsonl_upload_options = {"repo_id": REPO_NAME, "repo_type": "dataset", "token": HF_TOKEN}
for SUFFIX in ("balanced_5k", "balanced_20k", "raw_20k", "outputs_20k"):
    FILE_PATH = f"{FILE_PATH_PREFIX}_{SUFFIX}.jsonl"
    PATH_IN_REPO = FILE_PATH.rsplit("/", 1)[-1]
    api.upload_file(
        path_or_fileobj=FILE_PATH,
        path_in_repo=PATH_IN_REPO,
        **jsonl_upload_options,
    )
    print(f"✅ File uploaded to: https://huggingface.co/datasets/{REPO_NAME}")