In [3]:
import pandas as pd
from datasets import load_dataset
import os

# --- Configuration ---
# Hugging Face Hub identifiers of the two source datasets.
ADVERSARIAL_DATASET = "walledai/AdvBench"
BENIGN_DATASET = "Anthropic/hh-rlhf"

# Row cap applied to the benign set. AdvBench's train split has 520 rows and is
# used in full, so the benign set is limited to at most the same number.
NUM_SAMPLES = 520

# Processed CSV files are written below this directory.
OUTPUT_DIR = "../data/processed"
ADVERSARIAL_OUTPUT_PATH = os.path.join(OUTPUT_DIR, "advbench_subset.csv")
BENIGN_OUTPUT_PATH = os.path.join(OUTPUT_DIR, "hh_rlhf_subset.csv")

# --- Prepare the output location ---
# exist_ok=True keeps a re-run from failing when the folder is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Starting data preparation process...")

# --- 1. Process Adversarial Dataset (AdvBench) ---
print(f"Loading all samples from {ADVERSARIAL_DATASET}...")
adversarial_ds = load_dataset(ADVERSARIAL_DATASET, split='train')
df_adv = adversarial_ds.to_pandas()

# Build the standardized frame one column at a time. The prompt text is read
# from the dataset's 'prompt' column; every row gets a sequential 'adv_<n>' id.
adv_ids = [f'adv_{row_number}' for row_number in range(len(df_adv))]
df_adv_processed = pd.DataFrame({'prompt_text': df_adv['prompt']})
df_adv_processed.insert(0, 'prompt_id', adv_ids)
df_adv_processed['category'] = 'adversarial'
df_adv_processed['label'] = 1  # 1 indicates a breach/harmful prompt

# Persist without the pandas index, then show a preview of the result.
print(f"Saving processed adversarial data to {ADVERSARIAL_OUTPUT_PATH}...")
df_adv_processed.to_csv(ADVERSARIAL_OUTPUT_PATH, index=False)
print("Adversarial data saved successfully.")
print("\nSample of adversarial data:")
print(df_adv_processed.head())

# --- 2. Process Benign Dataset (HH-RLHF) ---
# Per the dataset card, the HH-RLHF repo bundles several subsets and loading it
# without `data_dir` merges all of them. One of those subsets ("harmless-base")
# consists of red-teaming conversations, whose prompts must not be labelled as
# benign. Restrict the load to the helpfulness subset instead.
# NOTE(review): confirm "helpful-base" is the subset wanted for this experiment.
BENIGN_DATA_DIR = "helpful-base"

# The whole test split is loaded here; it is trimmed to NUM_SAMPLES rows later,
# after conversations without a usable prompt have been dropped.
print(f"\nLoading {NUM_SAMPLES} samples from {BENIGN_DATASET}...")
benign_ds = load_dataset(BENIGN_DATASET, data_dir=BENIGN_DATA_DIR, split='test')
df_benign = benign_ds.to_pandas()

def extract_prompt(conversation):
    """Return the text of the first human turn in a transcript.

    The transcript is expected to consist of turns separated by blank lines,
    each turn starting with a ``Human:`` or ``Assistant:`` speaker tag.

    Args:
        conversation: The full transcript as a string.

    Returns:
        The first human turn with its leading ``Human:`` tag and surrounding
        whitespace removed, or ``None`` when ``conversation`` is not a string,
        contains no human turn, or the first human turn is empty.
    """
    # Missing values (None / NaN) can reach this function through Series.apply.
    if not isinstance(conversation, str):
        return None

    speaker_tags = ('Human:', 'Assistant:')
    prompt_parts = None
    for segment in conversation.split('\n\n'):
        if prompt_parts is None:
            if segment.startswith('Human:'):
                # Strip only the leading tag; a later "Human: " inside the
                # prompt text itself must be left untouched.
                prompt_parts = [segment[len('Human:'):]]
        elif segment.startswith(speaker_tags):
            # The next speaker tag ends the first human turn.
            break
        else:
            # A blank line inside the turn: still part of the same prompt.
            prompt_parts.append(segment)

    if prompt_parts is None:
        return None

    prompt = '\n\n'.join(prompt_parts).strip()
    # An empty prompt is reported as missing so callers can drop it with dropna().
    return prompt or None

# Pull the opening human prompt out of every 'chosen' conversation.
first_prompts = df_benign['chosen'].apply(extract_prompt)
df_benign['prompt_text'] = first_prompts

# Keep only rows where a prompt was found, then cap the set at NUM_SAMPLES rows.
has_prompt = df_benign['prompt_text'].notna()
df_benign = df_benign[has_prompt].iloc[:NUM_SAMPLES]

# Build the standardized frame one column at a time; ids are sequential
# 'benign_<n>' strings assigned in row order.
benign_ids = [f'benign_{row_number}' for row_number in range(len(df_benign))]
df_benign_processed = pd.DataFrame({'prompt_text': df_benign['prompt_text']})
df_benign_processed.insert(0, 'prompt_id', benign_ids)
df_benign_processed['category'] = 'benign'
df_benign_processed['label'] = 0  # 0 indicates a safe/harmless prompt

# Persist without the pandas index, then show a preview of the result.
print(f"Saving processed benign data to {BENIGN_OUTPUT_PATH}...")
df_benign_processed.to_csv(BENIGN_OUTPUT_PATH, index=False)
print("Benign data saved successfully.")
print("\nSample of benign data:")
print(df_benign_processed.head())

print("\nData preparation complete! ✅")

Starting data preparation process...
Loading 5000 samples from walledai/AdvBench...


README.md:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00001.parquet:   0%|          | 0.00/35.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/520 [00:00<?, ? examples/s]

KeyError: 'goal'