## Process Data
- Goal: Reformat the auto-labelled refusal datasets into a nicer format (they were generated using the previous team's repo)

### Dataset

- Dataset: raw has 1000 samples; balanced is the same data, subsampled so that there are equal numbers of 'complied' and 'refused' samples
- Jsonl format: \
    'inputs': the user query and assistant response \
    'ids': combined id \
    'split': split in the original Anthropic dataset \
    'category': category in the original Anthropic dataset, i.e. whether the annotator chose this example or the other one \
    'index': index number \
    'scale_label_explanation': explanation given by autograder \
    'scale_label_confidence': 1 to 10 where higher is more confident \
    'scale_labels': 1 to 10, where lower means stronger refusal and higher means stronger compliance \
    'scale_label_model': model used to autograde \
    'labels': 'complied', 'refused' or 'ambiguous' \
    'label_explanation': which method was used to decide the label, usually a threshold on the score

In [None]:
# Rewrite the legacy labels in the listed JSONL files, in place:
#   "high-stakes" -> "complied"
#   "low-stakes"  -> "refused"
# Any other label value is left untouched, so re-running this cell on an
# already-converted file does not change its labels.
import json

# Legacy label -> replacement label.
LABEL_RENAMES = {
    "high-stakes": "complied",
    "low-stakes": "refused",
}

datasets = [
    # "../data/refusal/off_policy_balanced.jsonl",
    # "../data/refusal/off_policy_raw.jsonl"
    "../data/refusal/on_policy_balanced.jsonl",
    "../data/refusal/on_policy_raw.jsonl"
]
for dataset in datasets:
    # Load every record first: reopening the file in 'w' mode truncates it.
    data_list = []
    # Explicit UTF-8 so decoding does not depend on the platform's default locale.
    with open(dataset, 'r', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line)
            data["labels"] = LABEL_RENAMES.get(data["labels"], data["labels"])
            data_list.append(data)

    # write the modified data back to the file, one JSON object per line
    with open(dataset, 'w', encoding='utf-8') as file:
        for record in data_list:
            file.write(json.dumps(record) + "\n")

In [None]:
# For some reason the on policy dataset has 999 lines instead of 1000
# Fixed it for now
#
# Walk the off-policy and on-policy raw files in lockstep and report the first
# line pair whose "inputs" disagree: the off-policy "inputs" is expected to
# start with the first 50 characters of the on-policy "inputs".
import json
from itertools import zip_longest

with open("../data/refusal/off_policy_raw.jsonl", 'r', encoding='utf-8') as file1:
    with open("../data/refusal/on_policy_raw.jsonl", 'r', encoding='utf-8') as file2:
        # zip_longest (rather than zip) so that a file which simply ends early
        # is reported instead of being silently truncated away.
        for line_number, (line1, line2) in enumerate(zip_longest(file1, file2), start=1):
            if line1 is None or line2 is None:
                shorter = "off_policy_raw" if line1 is None else "on_policy_raw"
                print(f"Length mismatch: {shorter} has only {line_number - 1} lines")
                break
            data1 = json.loads(line1)
            data2 = json.loads(line2)
            if not data1["inputs"].startswith(data2["inputs"][:50]):
                print(f"Mismatch found: {data1['inputs'][:50]} != {data2['inputs'][:50]}")
                break


Mismatch found: [{"role": "user", "content": "How do you make a BL != [{"role": "user", "content": "What is the time dif


In [None]:
# Count how many samples received each autograder score (1 to 10).
# NOTE(review): the original comment said "on-policy dataset", but the file
# read below is off_policy_raw.jsonl -- confirm which dataset is intended.
import json

# Pre-seed every score so that scores with no samples still show up as 0.
scores = {score: 0 for score in range(1, 11)}
with open("../data/refusal/off_policy_raw.jsonl", 'r', encoding='utf-8') as file:
    for line in file:
        data = json.loads(line)
        # A "scale_labels" value that is not one of 1..10 raises KeyError here.
        scores[data["scale_labels"]] += 1
scores

{1: 26, 2: 73, 3: 84, 4: 29, 5: 49, 6: 23, 7: 41, 8: 117, 9: 178, 10: 380}

In [9]:
# Total number of samples scored 1 through 5 (the lower half of the 1-10 scale).
sum(scores[score] for score in range(1, 6))

261