# Evaluation dataset generation (Hugging Face sources)

This notebook mirrors the logic in `generate_evaluation_dataset.py` to build the
`evaluation_dataset_from_hf.csv` file with 100 prompts sampled from the specified
Hugging Face datasets (25 from each of four sources). Sampling uses a fixed seed, so
every run is reproducible, and the output CSV has the columns
`id, category, risk_level, prompt`.

In [None]:
from __future__ import annotations

import pathlib
from typing import Iterable, List, Optional, Tuple

import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset

# Seed passed to every ``Dataset.shuffle`` call so sampling is repeatable.
SEED = 17
# Destination of the generated CSV (relative to the working directory).
OUTPUT_FILE = pathlib.Path("evaluation_dataset_from_hf.csv")

# ``safety_label`` values that are kept when filtering the prosocial-dialog rows.
RISKY_LABELS = {
    "__possibly_needs_caution__",
    "__probably_needs_caution__",
    "__needs_caution__",
    "__needs_intervention__",
}

# Number of prompts requested per category. The keys are the category names
# returned by the ``sample_*`` functions; the values sum to the dataset size.
TARGET_COUNTS = {
    "child_qa_raw": 25,
    "kidschatbot_raw": 25,
    "cai_raw": 25,
    "prosocial_raw": 25,
}

In [None]:
def clean_prompt(text: object) -> Optional[str]:
    """Normalize prompt text to a single-line string.

    Args:
        text: Raw prompt value. Any non-``None`` object is converted with
            ``str`` before cleaning.

    Returns:
        The text with surrounding whitespace stripped and every newline
        character replaced by a space, or ``None`` when ``text`` is ``None``
        or nothing remains after stripping.
    """
    if text is None:
        return None
    # The newline must be written as an escape sequence: a physical line break
    # inside the quotes is an unterminated string literal.
    prompt = str(text).strip().replace("\n", " ")
    return prompt if prompt else None


def collect_from_dataset(rows: List[dict], prompts: Iterable[str], category: str, limit: int) -> int:
    """Extend ``rows`` in place with at most ``limit`` cleaned prompts.

    Each prompt is passed through ``clean_prompt``; prompts that clean to
    nothing are skipped and do not count towards ``limit``. New records get
    consecutive ``id`` values continuing from the current length of ``rows``
    and a ``risk_level`` of ``"unknown"``.

    Returns:
        The number of records appended.
    """
    initial_size = len(rows)
    for candidate in prompts:
        # Stop as soon as the quota for this call has been met.
        if len(rows) - initial_size >= limit:
            break
        text = clean_prompt(candidate)
        if text:
            record = {
                # IDs are 1-based, so the next ID is the size before appending plus one.
                "id": len(rows) + 1,
                "category": category,
                "risk_level": "unknown",
                "prompt": text,
            }
            rows.append(record)
    return len(rows) - initial_size

In [None]:
def sample_child_qa(limit: int) -> Tuple[str, List[str]]:
    """Sample up to ``limit`` questions from the Child-QA ``test`` split.

    The split is shuffled with ``SEED`` so repeated calls return the same
    leading rows.

    Returns:
        A ``(category, prompts)`` pair where ``category`` is
        ``"child_qa_raw"`` and ``prompts`` holds the raw ``Question`` values.
    """
    ds: Dataset = load_dataset("chaitanyareddy0702/Child-QA-dataset", split="test").shuffle(seed=SEED)
    # Select the leading rows before reading them so that only the requested
    # examples are decoded, instead of materializing the whole split and
    # slicing afterwards.
    head = ds.select(range(min(limit, len(ds))))
    prompts = [row["Question"] for row in head]
    return "child_qa_raw", prompts


def sample_kidschatbot(limit: int) -> Tuple[str, List[str]]:
    """Sample up to ``limit`` questions from the KidsChatBot dataset.

    The first split listed in the loaded ``DatasetDict`` is used and is
    shuffled with ``SEED`` so repeated calls return the same leading rows.

    Returns:
        A ``(category, prompts)`` pair where ``category`` is
        ``"kidschatbot_raw"`` and ``prompts`` holds the raw ``Question`` values.
    """
    dsdict: DatasetDict = load_dataset("yotev27367/KidsChatBot")
    # Iterating the mapping yields its split names; take the first one.
    split_name = next(iter(dsdict))
    ds: Dataset = dsdict[split_name].shuffle(seed=SEED)
    # Select the leading rows before reading them so that only the requested
    # examples are decoded, instead of materializing the whole split and
    # slicing afterwards.
    head = ds.select(range(min(limit, len(ds))))
    prompts = [row["Question"] for row in head]
    return "kidschatbot_raw", prompts


def choose_cai_split(dsdict: DatasetDict) -> str:
    """Return the name of the split to sample from.

    The candidates are checked in priority order and the first one present in
    ``dsdict`` wins. When none of them is present, ``"train_sft"`` is returned
    without checking that it exists.
    """
    priority_order = ("test_sft", "test_prefs", "train_prefs", "validation", "test")
    available = (candidate for candidate in priority_order if candidate in dsdict)
    return next(available, "train_sft")


def sample_cai(limit: int) -> Tuple[str, List[str]]:
    """Sample up to ``limit`` user prompts from cai-conversation-harmless.

    The split is picked by ``choose_cai_split`` and shuffled with ``SEED``.
    For each example, the content of the first message whose role is
    ``"user"`` is taken; examples without such a message are skipped.

    Returns:
        A ``(category, prompts)`` pair where ``category`` is ``"cai_raw"``.
        ``prompts`` is empty when ``limit`` is not positive.
    """
    # Without this guard the loop below would append one prompt before its
    # first limit check, returning more than requested for limit <= 0.
    if limit <= 0:
        return "cai_raw", []

    dsdict: DatasetDict = load_dataset("HuggingFaceH4/cai-conversation-harmless")
    split_name = choose_cai_split(dsdict)
    ds: Dataset = dsdict[split_name].shuffle(seed=SEED)
    prompts: List[str] = []
    for example in ds:
        # ``or []`` also covers a "messages" entry that is present but None.
        messages = example.get("messages") or []
        message = next(
            (msg.get("content") for msg in messages if msg.get("role") == "user"),
            None,
        )
        if message is not None:
            prompts.append(message)
        if len(prompts) >= limit:
            break
    return "cai_raw", prompts


def sample_prosocial(limit: int) -> Tuple[str, List[str]]:
    """Sample up to ``limit`` contexts from the prosocial-dialog ``train`` split.

    Only rows whose ``safety_label`` is in ``RISKY_LABELS`` are kept; the
    remaining rows are shuffled with ``SEED`` and the leading ones returned.

    Returns:
        A ``(category, prompts)`` pair where ``category`` is
        ``"prosocial_raw"`` and ``prompts`` holds the raw ``context`` values.
    """
    train_split = load_dataset("allenai/prosocial-dialog", split="train")
    risky_only = train_split.filter(lambda ex: ex["safety_label"] in RISKY_LABELS)
    ds: Dataset = risky_only.shuffle(seed=SEED)
    # Never ask for more rows than survived the filter.
    take = min(limit, len(ds))
    chosen = ds.select(range(take))
    return "prosocial_raw", [row["context"] for row in chosen]

In [None]:
# Build the evaluation rows: one pass per source at its configured quota,
# then an optional top-up pass if any source came up short.
rows: List[dict] = []
requested_total = sum(TARGET_COUNTS.values())

samplers = [sample_child_qa, sample_kidschatbot, sample_cai, sample_prosocial]


def _quota_key(sampler) -> str:
    """Map a ``sample_<name>`` function to its ``<name>_raw`` quota key."""
    return sampler.__name__.replace("sample_", "") + "_raw"


for sampler in samplers:
    category, prompts = sampler(TARGET_COUNTS[_quota_key(sampler)])
    collect_from_dataset(rows, prompts, category, TARGET_COUNTS[category])

if len(rows) < requested_total:
    deficits = requested_total - len(rows)
    # The samplers shuffle with a fixed seed, so calling one again returns the
    # same leading prompts that were already collected. Ask for the original
    # quota plus the shortfall and drop everything already present, so the
    # top-up only ever adds new prompts.
    seen = {row["prompt"] for row in rows}
    for sampler in samplers:
        if deficits <= 0:
            break
        category, prompts = sampler(TARGET_COUNTS[_quota_key(sampler)] + deficits)
        fresh: List[str] = []
        for prompt in prompts:
            cleaned = clean_prompt(prompt)
            if cleaned and cleaned not in seen:
                seen.add(cleaned)
                fresh.append(cleaned)
        deficits -= collect_from_dataset(rows, fresh, category, deficits)

# Validate against the configured total rather than a separate literal so the
# check stays correct if TARGET_COUNTS is edited.
if len(rows) != requested_total:
    raise ValueError(f"Expected {requested_total} prompts, found {len(rows)}")

df = pd.DataFrame(rows, columns=["id", "category", "risk_level", "prompt"])

expected_ids = list(range(1, len(df) + 1))
if df["id"].tolist() != expected_ids:
    raise ValueError("IDs are not sequential starting at 1")
if df["prompt"].isna().any() or (df["prompt"].str.len() == 0).any():
    raise ValueError("Empty prompts detected")

# Let pandas write the file with an explicit encoding; routing the CSV text
# through write_text would use the platform default encoding and newline
# translation.
df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8")
print(df["category"].value_counts())
df.head()

In [None]:
# Print the first two prompts of every category as a quick visual check.
for category in TARGET_COUNTS:
    subset = df[df["category"] == category].head(2)
    # The leading blank line must be written as an escape sequence: a physical
    # line break inside the f-string is an unterminated string literal.
    print(f"\nSample prompts from {category}:")
    for prompt in subset["prompt"]:
        print(f"- {prompt}")