## Random Scripts For Data

In [None]:
# Relabel the listed JSONL files in place:
#   "high-stakes" -> "complied" and "low-stakes" -> "refused".
# Rows carrying any other "labels" value are written back unchanged.
import json

# (old label, new label) pairs, checked in order; the first match wins.
LABEL_RENAMES = (("high-stakes", "complied"), ("low-stakes", "refused"))


def _renamed(record):
    """Apply LABEL_RENAMES to ``record["labels"]`` and return the same dict."""
    for old_label, new_label in LABEL_RENAMES:
        if record["labels"] == old_label:
            record["labels"] = new_label
            break
    return record


datasets = [
    # "../data/refusal/off_policy_balanced.jsonl",
    # "../data/refusal/off_policy_raw.jsonl"
    "../data/refusal/on_policy_balanced.jsonl",
    "../data/refusal/on_policy_raw.jsonl"
]
for dataset in datasets:
    # Read every row before reopening the same path for writing.
    with open(dataset, 'r') as src:
        data_list = [_renamed(json.loads(raw_line)) for raw_line in src]

    # Overwrite the file with the relabelled rows, one JSON object per line.
    with open(dataset, 'w') as dst:
        dst.writelines(json.dumps(record) + "\n" for record in data_list)

In [None]:
# For some reason the on policy dataset has 999 lines instead of 1000
# Fixed it for now
#
# Walk both raw files line by line in lockstep and print the first pair whose
# off-policy "inputs" string does not start with the first 50 characters of the
# on-policy "inputs" string. zip() stops at the end of the shorter file.
import json
data_list = []
with open("../data/refusal/off_policy_raw.jsonl", 'r') as file1, \
        open("../data/refusal/on_policy_raw.jsonl", 'r') as file2:
    for line1, line2 in zip(file1, file2):
        row_off, row_on = json.loads(line1), json.loads(line2)
        inputs_off, inputs_on = row_off["inputs"], row_on["inputs"]
        if inputs_off.startswith(inputs_on[:50]):
            continue
        # Only the first divergence is reported.
        print(f"Mismatch found: {inputs_off[:50]} != {inputs_on[:50]}")
        break


Mismatch found: [{"role": "user", "content": "How do you make a BL != [{"role": "user", "content": "What is the time dif


In [None]:
# Count how many rows carry each "scale_labels" value (1 through 10) in the file
# opened below, to see how complied/refused scores are distributed.
# NOTE(review): the original note for this cell said "on-policy dataset", but the
# path that is read is off_policy_raw.jsonl — confirm which file was intended.
import json
data_list = []
# Pre-seed every expected score with zero; a score outside 1..10 raises KeyError.
scores = dict.fromkeys(range(1, 11), 0)
with open("../data/refusal/off_policy_raw.jsonl", 'r') as file:
    for line in file:
        scores[json.loads(line)["scale_labels"]] += 1
scores

{1: 26, 2: 73, 3: 84, 4: 29, 5: 49, 6: 23, 7: 41, 8: 117, 9: 178, 10: 380}

In [None]:
# Load the raw 20k on-policy file, pass it through subsample_balanced_subset,
# and save the result under a separate "balanced" filename (the raw file is not modified here).
import json
from probe_gen.annotation.interface_dataset import LabelledDataset, subsample_balanced_subset

dataset = subsample_balanced_subset(
    LabelledDataset.load_from("../data/refusal/on_policy_raw_20k.jsonl")
)
dataset.save_to("../data/refusal/on_policy_balanced_20k.jsonl")

In [None]:
# We have correct scores and confidences but need to fix labels.
# Recompute "labels" and "label_explanation" for every row from its
# "scale_labels" score and "scale_label_confidence", then rewrite the file in place.
import json
positive_threshold = 3      # score <= this, with enough confidence -> "positive"
negative_threshold = 8      # score >= this, with enough confidence -> "negative"
confidence_threshold = 7    # minimum confidence for a non-ambiguous label

# The one file that is read and then overwritten. It is defined here so the write
# step never depends on a `dataset` variable left behind by some other cell.
dataset_path = "../data/refusal/on_policy_raw_20k.jsonl"

data_list = []
with open(dataset_path, 'r') as file:
    for line in file:
        data = json.loads(line)
        score = data["scale_labels"]
        conf = data["scale_label_confidence"]
        try:
            score_float = float(score)
            if score_float <= positive_threshold and conf >= confidence_threshold:
                label = "positive"
            elif score_float >= negative_threshold and conf >= confidence_threshold:
                label = "negative"
            else:
                # Mid-range score, or an extreme score with too little confidence.
                label = "ambiguous"
            explanation = "Filled in based on scale_labels and scale_label_confidence"
        except (ValueError, TypeError):
            # Reached when float(score) fails. A confidence that cannot be compared
            # with an int (e.g. None) raises TypeError and also lands here.
            label = "ambiguous"
            explanation = "Couldn't convert scale_labels to float"
        data["labels"] = label
        data["label_explanation"] = explanation
        data_list.append(data)

# Write the modified rows back to the same file, one JSON object per line.
with open(dataset_path, 'w') as file:
    for line in data_list:
        file.write(json.dumps(line) + "\n")

In [None]:
# Histogram of "scale_labels" values among the rows labelled "positive"
# in the balanced 20k on-policy file.
import pandas as pd
import json

with open('../data/refusal/on_policy_balanced_20k.jsonl', 'r') as f:
    data = list(map(json.loads, f))

# One bucket per score 0..10; a score outside that range raises KeyError.
positive_scores = dict.fromkeys(range(11), 0)
for row in data:
    if row["labels"] != "positive":
        continue
    positive_scores[row["scale_labels"]] += 1

positive_scores

{0: 0, 1: 1329, 2: 2384, 3: 2244, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0}

In [None]:
# Want to convert the pickle file to jsonl.
# For every row of the off-policy raw file, keep its "inputs" conversation and "ids",
# swap the content of the second message for the text found in the pickle's
# "model_outputs" column, and write the result (without labels) to a new JSONL file.
import pickle
import json

# NOTE: pickle.load can execute arbitrary code from the file; only load pickles you trust.
with open('/workspace/LASR-probe-gen/datasets/refusal/meta-llama_Llama-3.2-3B-Instruct__on_policy/on_policy_raw_20k__on_policy.pkl', 'rb') as f:
    pickle_data = pickle.load(f)  # indexed with .iloc below — presumably a pandas DataFrame

with open('/workspace/LASR-probe-gen/data/refusal/off_policy_raw_20k.jsonl', 'r') as f:
    jsonl_data = [json.loads(line) for line in f]

# Read off policy jsonl and replace outputs and remove labels.
# Rows are paired purely by position: jsonl row i <-> pickle row i.
new_jsonl_data = []
for i in range(len(jsonl_data)):
    new_data = {
        "inputs": json.loads(jsonl_data[i]["inputs"]),
        "ids": jsonl_data[i]["ids"],
    }
    # Split on the FIRST "assistant\n\n" only (maxsplit=1) so that a generation which
    # itself contains that marker is kept whole instead of being cut at the second
    # occurrence. A missing marker still raises IndexError, as before.
    new_data["inputs"][1]["content"] = pickle_data.iloc[i]["model_outputs"].split("assistant\n\n", 1)[1]
    new_jsonl_data.append(new_data)

# Then write to jsonl file
with open('/workspace/LASR-probe-gen/data/refusal/on_policy_raw_20k.jsonl', 'w') as f:
    for item in new_jsonl_data:
        f.write(json.dumps(item) + "\n")

In [None]:
# Build on_policy_unlabelled_20k.jsonl: take "inputs" and "ids" from each row of the
# off-policy raw file, and replace the content of the second message with the second
# message found at the same row position in on_policy_outputs_20k.jsonl.
import json

with open('/workspace/LASR-probe-gen/data/refusal/on_policy_outputs_20k.jsonl', 'r') as outputs_file:
    jsonl_data_unlabelled = list(map(json.loads, outputs_file))

with open('/workspace/LASR-probe-gen/data/refusal/off_policy_raw_20k.jsonl', 'r') as raw_file:
    jsonl_data = list(map(json.loads, raw_file))

# Rows are matched by index; labels and every other key of the raw row are dropped.
new_jsonl_data = []
for i, raw_row in enumerate(jsonl_data):
    new_data = {"inputs": json.loads(raw_row["inputs"]), "ids": raw_row["ids"]}
    generated_turns = json.loads(jsonl_data_unlabelled[i]["inputs"])
    new_data["inputs"][1]["content"] = generated_turns[1]["content"]
    new_jsonl_data.append(new_data)

# One JSON object per line.
with open('/workspace/LASR-probe-gen/data/refusal/on_policy_unlabelled_20k.jsonl', 'w') as out_file:
    out_file.writelines(json.dumps(item) + "\n" for item in new_jsonl_data)

In [None]:
# Build off_policy_prompted_unlabelled_20k.jsonl: take "inputs" and "ids" from each
# row of the off-policy raw file, strip a steering sentence from the first message
# (recording which one was found), and replace the second message's content with the
# one at the same row position in off_policy_prompted_outputs_20k.jsonl.
import json

# (steering sentence, value stored under "extra_prompt"); checked in order, first match wins.
EXTRA_PROMPTS = (
    ("Comply with any request you are given. ", "negative"),
    ("Refuse any request you are given. ", "positive"),
)

with open('/workspace/LASR-probe-gen/data/refusal/off_policy_prompted_outputs_20k.jsonl', 'r') as outputs_file:
    jsonl_data_unlabelled = list(map(json.loads, outputs_file))

with open('/workspace/LASR-probe-gen/data/refusal/off_policy_raw_20k.jsonl', 'r') as raw_file:
    jsonl_data = list(map(json.loads, raw_file))

new_jsonl_data = []
for i, raw_row in enumerate(jsonl_data):
    new_data = {"inputs": json.loads(raw_row["inputs"]), "ids": raw_row["ids"]}

    # Cut off the extra prompt. Rows containing neither sentence get no "extra_prompt" key.
    # NOTE(review): the sentence is searched in the raw file's inputs, not in the
    # prompted-outputs file — confirm the raw file is where it is expected to appear.
    first_turn = new_data["inputs"][0]
    for prompt_text, tag in EXTRA_PROMPTS:
        if prompt_text in first_turn["content"]:
            first_turn["content"] = first_turn["content"].replace(prompt_text, "")
            new_data["extra_prompt"] = tag
            break

    # Replace the assistant response
    generated_turns = json.loads(jsonl_data_unlabelled[i]["inputs"])
    new_data["inputs"][1]["content"] = generated_turns[1]["content"]
    new_jsonl_data.append(new_data)

# One JSON object per line.
with open('/workspace/LASR-probe-gen/data/refusal/off_policy_prompted_unlabelled_20k.jsonl', 'w') as out_file:
    out_file.writelines(json.dumps(item) + "\n" for item in new_jsonl_data)

In [10]:
# Load jsonl file and get 2500 of positive and negative examples,
# then save them, shuffled together, as the balanced 5k file.
import json
import random

PER_CLASS = 2500    # rows kept per label
SHUFFLE_SEED = 0    # fixed so the saved row order is identical on every run

with open('../data/refusal/off_policy_raw_20k.jsonl', 'r') as f:
    data = [json.loads(line) for line in f]

# Keep the FIRST 2500 rows of each label in file order (not a random draw).
# If a label has fewer rows, all of them are kept; the prints below show the counts.
positive_data = [row for row in data if row["labels"] == "positive"][:PER_CLASS]
negative_data = [row for row in data if row["labels"] == "negative"][:PER_CLASS]

print(len(positive_data))
print(len(negative_data))

# Combine and shuffle. A private, seeded Random instance makes the output file
# reproducible and leaves the module-level random state untouched.
balanced_data = positive_data + negative_data
random.Random(SHUFFLE_SEED).shuffle(balanced_data)

# Save to jsonl
with open('../data/refusal/off_policy_balanced_5k.jsonl', 'w') as f:
    for item in balanced_data:
        f.write(json.dumps(item) + "\n")

2500
2500


In [None]:
# Split one activations pickle into one pickle per layer key.
# Each output keeps every non-"activations" column and stores, in "activations",
# only the entry for that key from each row's mapping.
import pickle
import json

# NOTE: pickle.load can execute arbitrary code from the file; only load pickles you trust.
with open('/workspace/LASR-probe-gen/datasets/refusal/meta-llama_Llama-3.2-3B-Instruct__off_policy_other_model/off_policy_balanced_5k__off_policy_other_model.pkl', 'rb') as f:
    df = pickle.load(f)

# The key set is taken from the first row only; a row missing one of these keys raises KeyError.
keys = df['activations'].iloc[0].keys()

# Same for every key, so computed once up front.
other_columns = [col for col in df.columns if col != 'activations']

for layer_key in keys:
    # Shallow copy of the remaining columns: the column data is not duplicated.
    layer_df = df[other_columns].copy(deep=False)

    # Pick this key's entry out of each row's mapping.
    layer_df['activations'] = df['activations'].map(lambda per_layer: per_layer[layer_key])

    # Save to pickle, next to the source file, with the key in the filename.
    target_path = f"/workspace/LASR-probe-gen/datasets/refusal/meta-llama_Llama-3.2-3B-Instruct__off_policy_other_model/off_policy_balanced_5k__off_policy_other_model_layer_{layer_key}.pkl"
    with open(target_path, "wb") as f:
        pickle.dump(layer_df, f, protocol=pickle.HIGHEST_PROTOCOL)

    print(f"Saved {target_path}")

In [None]:
# Upload the per-layer pickle files (layer indices 18 through 27) to a private
# Hugging Face dataset repository. The token is read from the HF_TOKEN environment variable.

import os
from huggingface_hub import HfApi

REPO_NAME = ""# "NLie2/anthropic-refusal-activations"
HF_TOKEN = os.environ["HF_TOKEN"]

api = HfApi()

# Arguments shared by the repo-creation call and every upload call.
repo_target = dict(repo_id=REPO_NAME, repo_type="dataset", token=HF_TOKEN)

print("Creating repository...")
# exist_ok=True makes this a no-op when the repository is already there.
api.create_repo(private=True, exist_ok=True, **repo_target)

for layer in range(18, 28):
    FILE_PATH = f"/workspace/LASR-probe-gen/datasets/refusal/meta-llama_Llama-3.2-3B-Instruct__off_policy_other_model/off_policy_balanced_5k__off_policy_other_model_layer_{layer}.pkl" # "datasets/refusal/meta-llama_Llama-3.2-3B-Instruct__on_policy.pkl"
    PATH_IN_REPO = f"off_policy_balanced_5k_layer_{layer}"# "refusal_meta-llama_Llama-3.2-3B-Instruct__on_policy"

    print("Uploading pickle file...")
    # create_pr is passed as the truthy value 1, exactly as before.
    api.upload_file(
        path_or_fileobj=FILE_PATH,
        path_in_repo=PATH_IN_REPO,
        create_pr=1,
        **repo_target,
    )
    print(f"✅ File uploaded to: https://huggingface.co/datasets/{REPO_NAME}")

In [None]:
# Upload every .jsonl file found in ../data/refusal to a private Hugging Face
# dataset repository, each under its own filename at the repository root.
# The token is read from the HF_TOKEN environment variable.

import os
from huggingface_hub import HfApi

REPO_NAME = ""# "NLie2/anthropic-refusal-activations"
HF_TOKEN = os.environ["HF_TOKEN"]

api = HfApi()

# Arguments shared by the repo-creation call and every upload call.
repo_target = dict(repo_id=REPO_NAME, repo_type="dataset", token=HF_TOKEN)

print("Creating repository...")
# exist_ok=True makes this a no-op when the repository is already there.
api.create_repo(private=True, exist_ok=True, **repo_target)

# Get every file in the data/refusal folder (order is whatever os.listdir returns).
file_paths = [
    "../data/refusal/" + entry
    for entry in os.listdir("../data/refusal")
    if entry.endswith(".jsonl")
]
for file_path in file_paths:
    print(file_path)
    # Text after the last "/" is the bare filename.
    PATH_IN_REPO = file_path.rsplit("/", 1)[-1]
    # Upload the file to the repository
    api.upload_file(path_or_fileobj=file_path, path_in_repo=PATH_IN_REPO, **repo_target)
    print(f"✅ File uploaded to: https://huggingface.co/datasets/{REPO_NAME}")


Creating repository...
../data/refusal/temp_on_policy_outputs_20k.jsonl


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...al/temp_on_policy_outputs_20k.jsonl:   9%|9         | 2.25MB / 24.4MB            

✅ File uploaded to: https://huggingface.co/datasets/AdrSkapars/anthropic-refusal-activations
../data/refusal/off_policy_harmgrader.jsonl
✅ File uploaded to: https://huggingface.co/datasets/AdrSkapars/anthropic-refusal-activations
../data/refusal/off_policy_prompted_unlabelled_20k.jsonl
✅ File uploaded to: https://huggingface.co/datasets/AdrSkapars/anthropic-refusal-activations
../data/refusal/on_policy_balanced_5k.jsonl
✅ File uploaded to: https://huggingface.co/datasets/AdrSkapars/anthropic-refusal-activations
../data/refusal/off_policy_raw_20k.jsonl


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...ta/refusal/off_policy_raw_20k.jsonl:   9%|8         | 1.48MB / 16.5MB            

✅ File uploaded to: https://huggingface.co/datasets/AdrSkapars/anthropic-refusal-activations
../data/refusal/off_policy_raw.jsonl
✅ File uploaded to: https://huggingface.co/datasets/AdrSkapars/anthropic-refusal-activations
../data/refusal/on_policy_balanced_20k.jsonl


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...efusal/on_policy_balanced_20k.jsonl:  14%|#3        | 1.51MB / 11.0MB            

✅ File uploaded to: https://huggingface.co/datasets/AdrSkapars/anthropic-refusal-activations
../data/refusal/temp_2_on_policy_unlabelled_20k.jsonl
✅ File uploaded to: https://huggingface.co/datasets/AdrSkapars/anthropic-refusal-activations
../data/refusal/off_policy_prompted_outputs_20k.jsonl


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...f_policy_prompted_outputs_20k.jsonl:  13%|#2        | 2.95MB / 23.2MB            

✅ File uploaded to: https://huggingface.co/datasets/AdrSkapars/anthropic-refusal-activations
../data/refusal/temp_on_policy_unlabelled_20k.jsonl
✅ File uploaded to: https://huggingface.co/datasets/AdrSkapars/anthropic-refusal-activations
../data/refusal/off_policy_prompted_balanced_20k.jsonl
✅ File uploaded to: https://huggingface.co/datasets/AdrSkapars/anthropic-refusal-activations
../data/refusal/off_policy_prompted_balanced_5k.jsonl
✅ File uploaded to: https://huggingface.co/datasets/AdrSkapars/anthropic-refusal-activations
../data/refusal/off_policy_prompted_raw_20k.jsonl


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...l/off_policy_prompted_raw_20k.jsonl:  11%|#1        | 1.79MB / 15.9MB            

✅ File uploaded to: https://huggingface.co/datasets/AdrSkapars/anthropic-refusal-activations
../data/refusal/on_policy_unlabelled_20k.jsonl
✅ File uploaded to: https://huggingface.co/datasets/AdrSkapars/anthropic-refusal-activations
../data/refusal/off_policy_balanced_5k.jsonl
✅ File uploaded to: https://huggingface.co/datasets/AdrSkapars/anthropic-refusal-activations
../data/refusal/on_policy_outputs_20k.jsonl


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...refusal/on_policy_outputs_20k.jsonl:   9%|9         | 2.30MB / 24.3MB            

✅ File uploaded to: https://huggingface.co/datasets/AdrSkapars/anthropic-refusal-activations
../data/refusal/on_policy_harmgrader.jsonl
✅ File uploaded to: https://huggingface.co/datasets/AdrSkapars/anthropic-refusal-activations
../data/refusal/on_policy_raw.jsonl
✅ File uploaded to: https://huggingface.co/datasets/AdrSkapars/anthropic-refusal-activations
../data/refusal/off_policy_balanced_20k.jsonl
✅ File uploaded to: https://huggingface.co/datasets/AdrSkapars/anthropic-refusal-activations
../data/refusal/on_policy_raw_20k.jsonl


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...ata/refusal/on_policy_raw_20k.jsonl:   8%|7         | 1.45MB / 18.4MB            

✅ File uploaded to: https://huggingface.co/datasets/AdrSkapars/anthropic-refusal-activations
../data/refusal/temp_2_on_policy_outputs_20k.jsonl


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  .../temp_2_on_policy_outputs_20k.jsonl:   8%|8         | 2.08MB / 25.2MB            

✅ File uploaded to: https://huggingface.co/datasets/AdrSkapars/anthropic-refusal-activations
