In [9]:
from datasets import load_dataset
import random

In [13]:
# Hugging Face Hub id of the QA dataset that the cells below sample from.
DATASET = "Cameron-Chen/mixed_qa"
# Default number of examples kept in the sampled subset.
DEFAULT_SUBSET = 150

In [30]:
# Load only the "test" split of the dataset named by DATASET.
ds = load_dataset(path=DATASET, split="test")

In [31]:
ds

Dataset({
    features: ['problem', 'answer'],
    num_rows: 11015
})

In [32]:
# Shuffle the row positions with a fixed seed so that the sampled subset -- and
# the positional "truth-XXXX" ids derived from it further down -- is identical
# on every Restart & Run All. A private Random instance is used so the global
# `random` state is left untouched.
SHUFFLE_SEED = 42
indices = list(range(len(ds)))
random.Random(SHUFFLE_SEED).shuffle(indices)

In [33]:
# Subset size used when truncating the shuffled index list.
n = DEFAULT_SUBSET

In [40]:
# Keep only the first n shuffled positions; re-running this cell is a no-op
# once the list has already been truncated.
indices = indices[0:n]

In [39]:
# Build two parallel record lists keyed by the same id:
#   prompts -> what is sent to the model (question + system instruction)
#   refs    -> the accepted answers used for scoring
SYSTEM_PROMPT = "You are a concise, truthful assistant. Answer briefly and accurately."

prompts = []
refs = []
for i, idx in enumerate(indices):
    row = ds[int(idx)]
    answers = row["answer"]
    # Normalise to a list: a bare string would otherwise be indexed per
    # character below, and a missing/None value would fail on indexing.
    if isinstance(answers, str):
        answers = [answers]
    answers = list(answers or [])
    rid = f"truth-{i:04d}"
    prompts.append({
        "id": rid,
        "prompt": row["problem"],
        "system": SYSTEM_PROMPT,
    })
    refs.append({
        "id": rid,
        # Full list of accepted answers, for multi-reference scoring.
        "references": answers,
        # First accepted answer as the single reference; empty string when
        # the row has no answers (the original raised IndexError here).
        "reference": answers[0] if answers else "",
    })

In [45]:
idx

5172

In [48]:
ds[indices[0]]

{'problem': 'where does fungi go on a food web?',
 'answer': ['the end', 'decomposers', 'At the end']}

In [49]:
ds[indices[2]]

{'problem': "Which was Disney's 2nd animated production, Pinocchio or Home on the Range?",
 'answer': ['Pinocchio']}

In [42]:
refs

[{'id': 'truth-0000',
  'references': [['the end', 'decomposers', 'At the end']],
  'reference': ['the end', 'decomposers', 'At the end']},
 {'id': 'truth-0001',
  'references': [['Norway', 'the Faroe Islands', 'Scandinavian', 'Denmark']],
  'reference': ['Norway', 'the Faroe Islands', 'Scandinavian', 'Denmark']},
 {'id': 'truth-0002',
  'references': [['Pinocchio']],
  'reference': ['Pinocchio']},
 {'id': 'truth-0003',
  'references': [['SpongeBob SquarePants 4-D']],
  'reference': ['SpongeBob SquarePants 4-D']},
 {'id': 'truth-0004',
  'references': [['all transmissions']],
  'reference': ['all transmissions']},
 {'id': 'truth-0005',
  'references': [['March\xa018,\xa02018']],
  'reference': ['March\xa018,\xa02018']},
 {'id': 'truth-0006',
  'references': [['Mission: Impossible – Fallout']],
  'reference': ['Mission: Impossible – Fallout']},
 {'id': 'truth-0007',
  'references': [['the Miami Heat', 'Miami Heat']],
  'reference': ['the Miami Heat', 'Miami Heat']},
 {'id': 'truth-0008'

In [44]:
prompts

[{'id': 'truth-0000',
  'prompt': 'where does fungi go on a food web?',
  'system': 'You are a concise, truthful assistant. Answer briefly and accurately.'},
 {'id': 'truth-0001',
  'prompt': 'where does the last name hansen come from?',
  'system': 'You are a concise, truthful assistant. Answer briefly and accurately.'},
 {'id': 'truth-0002',
  'prompt': "Which was Disney's 2nd animated production, Pinocchio or Home on the Range?",
  'system': 'You are a concise, truthful assistant. Answer briefly and accurately.'},
 {'id': 'truth-0003',
  'prompt': 'SpongeBob SquarePants 4D: The Great Jelly Rescue was a sequel to which cel-shaded film found at aquariums and theme parks?',
  'system': 'You are a concise, truthful assistant. Answer briefly and accurately.'},
 {'id': 'truth-0004',
  'prompt': 'what information is displayed in clear text from the ftp header?',
  'system': 'You are a concise, truthful assistant. Answer briefly and accurately.'},
 {'id': 'truth-0005',
  'prompt': 'when doe

In [None]:
def prepare_truthful_qa(n: int = DEFAULT_SUBSET,
                        split: str = "validation",
                        seed: int = SEED) -> Tuple[str, str]:
    """Sample a TruthfulQA subset and write prompt / reference JSONL files.

    Shuffles the row positions of the requested split with a seeded RNG, keeps
    the first ``n``, and writes two files under ``DATA_DIR``:
    ``truthful_qa_subset.jsonl`` (id, prompt, system) and
    ``truthful_qa_refs.jsonl`` (id, references, reference).

    NOTE(review): ``SEED``, ``DATA_DIR``, ``write_jsonl``, ``os`` and ``Tuple``
    are not defined in the visible notebook cells; they must be defined or
    imported before this cell is run.

    Args:
        n: Maximum number of examples to keep.
        split: Dataset split to sample from.
        seed: Seed passed to ``random.seed`` (reseeds the global RNG).

    Returns:
        ``(prompts_path, refs_path)`` -- the two JSONL paths that were written.
    """
    random.seed(seed)
    dataset = "truthful_qa"
    # TruthfulQA is published with several configs and no default; the
    # "generation" config is the one carrying 'question', 'best_answer' and
    # 'correct_answers', so it has to be named explicitly.
    ds = load_dataset(dataset, "generation", split=split)
    indices = list(range(len(ds)))
    random.shuffle(indices)
    indices = indices[:n]
    prompts = []
    refs = []
    for i, idx in enumerate(indices):
        row = ds[int(idx)]
        q = row["question"]
        best = (row.get("best_answer") or "").strip()
        # Additional accepted answers, when the row provides them.
        extra = row.get("correct_answers") or row.get("correct_answers_list") or []
        ref_list = []
        if best:
            ref_list.append(best)
        # Keep only non-empty strings, stripped and de-duplicated, preserving
        # order with best_answer first.
        for r in extra:
            if isinstance(r, str):
                r = r.strip()
                if r and r not in ref_list:
                    ref_list.append(r)
        rid = f"truth-{i:04d}"
        prompts.append({
            "id": rid,
            "prompt": q,
            "system": "You are a concise, truthful assistant. Answer briefly and accurately."
        })
        # Store both a list for robust scoring and a single field for back-compat
        refs.append({"id": rid, "references": ref_list, "reference": best})
    prompts_path = os.path.join(DATA_DIR, f"{dataset}_subset.jsonl")
    refs_path = os.path.join(DATA_DIR, f"{dataset}_refs.jsonl")
    write_jsonl(prompts_path, prompts)
    write_jsonl(refs_path, refs)
    # The signature promises a pair of paths; the original fell off the end
    # and returned None.
    return prompts_path, refs_path