In [1]:
import json

# Source file (one JSON object per line) and the augmented copy this cell
# produces. Each input object must provide "question" and "answer".
INPUT = "NQ-open.dev.jsonl"
OUTPUT = "nq_dev_augmented.jsonl"

# Explicit UTF-8 on both handles: records are written with ensure_ascii=False
# and the translation step reads this file back as UTF-8, so relying on the
# platform's default encoding could corrupt or reject non-ASCII text.
with open(INPUT, "r", encoding="utf-8") as fin, \
        open(OUTPUT, "w", encoding="utf-8") as fout:
    for i, line in enumerate(fin):
        # Tolerate blank lines (e.g. an extra newline at end of file) instead
        # of crashing in json.loads. The id stays the 0-based line index.
        if not line.strip():
            continue
        ex = json.loads(line)
        # Translation fields start empty; they are filled in by a later step.
        out = {
            "id": i,
            "question_en": ex["question"],
            "answers_en": ex["answer"],
            "question_lb": None,
            "answers_lb": None,
            "translated": False,
            "error": None,
        }
        fout.write(json.dumps(out, ensure_ascii=False) + "\n")


In [None]:
import json
import os
from dotenv import load_dotenv
from openai import OpenAI

# Files for this step: read the augmented records, append translated ones.
INPUT = "nq_dev_augmented.jsonl"
OUTPUT = "nq_dev_augmented_translated.jsonl"

# Load variables from a .env file (if any) before reading the API key
# from the environment.
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# System message sent with every translation request.
SYSTEM_PROMPT = """You are a professional translator.
Translate from English to Luxembourgish.

Rules:
- Preserve meaning exactly
- Preserve names, dates, numbers, units
- Use natural spoken Luxembourgish
- Do NOT add explanations
- Output only the translation
"""

def translate(text: str) -> str:
    """Translate English *text* to Luxembourgish with the OpenAI Responses API.

    Sends SYSTEM_PROMPT as the system message and *text* as the user message
    to the ``gpt-5-mini`` model, and returns the response's ``output_text``
    with surrounding whitespace stripped.

    Empty or whitespace-only input returns ``""`` without calling the API.

    Any exception raised by the client call propagates to the caller.
    """
    # Nothing to translate: skip the request rather than let the model
    # invent output for an empty prompt.
    if not text.strip():
        return ""

    # No ``temperature`` argument is passed.
    # NOTE(review): per OpenAI's GPT-5 guidance, gpt-5-mini rejects sampling
    # parameters such as temperature, so sending temperature=0 makes every
    # request fail - confirm against the current API docs.
    resp = client.responses.create(
        model="gpt-5-mini",
        input=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": text},
        ],
    )
    return resp.output_text.strip()

def load_done_ids(path: str) -> set[int]:
    """Collect the IDs of records in *path* that are already processed.

    A record counts as "done" when ``translated`` is ``True`` or when its
    ``error`` field holds an actual value (not ``None``). Checking for the
    mere presence of the ``error`` key is not enough, because records may
    carry ``"error": None`` as a placeholder.

    Lines that cannot be used are skipped rather than raising: invalid JSON
    (for example a line truncated by a crash mid-write), blank lines, JSON
    values that are not objects, and records with a missing or unhashable
    ``id``.

    Args:
        path: JSONL file to scan. If it does not exist, an empty set is
            returned.

    Returns:
        The set of ``id`` values of the done records.
    """
    done: set[int] = set()
    if not os.path.exists(path):
        return done

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                ex = json.loads(line)
            except ValueError:
                # json.JSONDecodeError is a ValueError: malformed/blank line.
                continue
            if not isinstance(ex, dict):
                continue
            if ex.get("translated") is True or ex.get("error") is not None:
                try:
                    done.add(ex["id"])
                except (KeyError, TypeError):
                    # No id, or an id that cannot be stored in a set.
                    continue
    return done

# IDs that load_done_ids reports as already handled in OUTPUT; those records
# are skipped so a rerun does not spend API calls on them again.
done_ids = load_done_ids(OUTPUT)

# OUTPUT is opened in append mode so results from earlier runs are kept.
with open(INPUT, "r", encoding="utf-8") as fin, open(OUTPUT, "a", encoding="utf-8") as fout:
    for raw_line in fin:
        record = json.loads(raw_line)
        record_id = record["id"]

        if record_id in done_ids:
            continue

        # Records already flagged as translated in INPUT are copied through
        # unchanged; everything else goes to the translator.
        if record.get("translated") is not True:
            try:
                record["question_lb"] = translate(record["question_en"])
                record["answers_lb"] = list(map(translate, record["answers_en"]))
                record["translated"] = True
            except Exception as exc:
                # Keep going: store the failure message on the record itself.
                record["error"] = str(exc)

        # One JSON record per line, flushed and fsync'ed right away so a
        # crash loses at most the record currently in progress.
        fout.write(json.dumps(record, ensure_ascii=False) + "\n")
        fout.flush()
        os.fsync(fout.fileno())

        done_ids.add(record_id)
