In [None]:
import os
import json
import time
import requests
from IPython.display import clear_output

# =========================
# CONFIG
# =========================
# Source dataset and the file the generated MCQs are written to.
INPUT_PATH = "../Dataset-commonSence/traditions_and_customs_1.json"
OUTPUT_PATH = "../Dataset-commonSence/traditions_and_customs_1_with_mcq_fa_T.json"

# Model name and chat-completions URL used for every request.
MODEL_NAME = "meta-llama/Llama-3.3-70B-Instruct"
ENDPOINT = "..."

SAVE_EVERY = 10             # write a checkpoint every N processed records
SLEEP_BETWEEN_CALLS = 0.10  # pause (seconds) after each processed record
MAX_RETRIES = 3             # attempts per LLM call before giving up

# Sampling parameters sent with each request.
MAX_TOKENS = 320
TEMPERATURE = 0.2
TOP_P = 0.95

MIN_WORDS_PER_TEXT = 40     # records with a shorter text are skipped
# NOTE(review): the two thresholds below are not referenced by the code in this cell.
MIN_WORDS_PER_DESC = 8
MIN_ASSUMPTIONS = 2

PRINT_EVERY = 10            # refresh the progress line every N processed records

TEXT_KEY = "text"           # record key holding the source passage

# System message for MCQ generation. Adjacent literals are concatenated, so
# every sentence must carry its own trailing space.
SYSTEM_MCQ = (
    "You generate exactly ONE multiple-choice challenging and reasoning question (MCQ) in Persian. "
    "Use strictly the provided Text. "
    "Do NOT add new facts, names, places, foods, crafts, or claims. "
    "Do NOT use external knowledge. "
    "strictly Do NOT use phrases such as according to the text, or based on the information provided, or the text states,  or based on the text, or any form of direct quotation. "
    "Return VALID JSON ONLY with the exact schema; no markdown, no code fences."
)

# =========================
# PROMPT
# =========================
def build_user_prompt_mcq(text_fa: str) -> str:
    """Build the user message asking for one Persian MCQ about ``text_fa``.

    The passage is embedded verbatim below the ``Text:`` heading, followed by
    the rules and the JSON skeleton the model must fill in. Leading and
    trailing whitespace of the assembled prompt is removed.
    """
    prompt = f"""
Using the “text,” generate exactly one multiple-choice question (MCQ) in Persian.
Text:
{text_fa}

Strict rules:

- The question must be written entirely in Persian.
- strictly Do NOT use phrases such as “according to the text,” or "based on the information provided" or “the text states,”  or “based on the text,” or any form of direct quotation.

- The question must require reasoning (at least two steps of reasoning).

- The question must be text-grounded: within the question itself, refer to at least two specific clues/details from the text (without direct quotation).

- The correct answer must be inferable solely based on the text.

- Do not add any new names, places, entities, or claims.

-The output must be valid JSON only (no Markdown, no code blocks, no extra text).

-The "rationale" field is mandatory and must be one short paragraph, written in Persian, that:

refers indirectly (without quotation) to relevant points from the text, and

concisely explains the two reasoning steps.

-There must be only one correct answer.

Exact output format:

{{
  "mcq": {{
    "question": "",
    "choices": {{
      "A": "",
      "B": "",
      "C": "",
      "D": ""
    }},
    "answer": "A",
    "rationale": ""
  }}
}}

"""
    return prompt.strip()
# =========================
# IO
# =========================
def load_json(path: str):
    """Read the UTF-8 encoded JSON file at ``path`` and return the parsed value."""
    with open(path, "r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

def save_json(path: str, data):
    """Write ``data`` as indented UTF-8 JSON to ``path``.

    The content goes to ``path + ".tmp"`` first and is then moved over the
    target with ``os.replace``, so the target is only swapped once the
    temporary file has been written completely.
    """
    staging_path = path + ".tmp"
    with open(staging_path, "w", encoding="utf-8") as handle:
        json.dump(data, handle, ensure_ascii=False, indent=2)
    os.replace(staging_path, path)

# =========================
# Helpers
# =========================
def word_count(s: str) -> int:
    """Return the number of whitespace-separated tokens in ``s``.

    ``None``, the empty string and whitespace-only strings count as 0.
    """
    trimmed = (s or "").strip()
    # str.split() without arguments never yields empty tokens.
    return len(trimmed.split()) if trimmed else 0

def format_hms(seconds: float) -> str:
    """Render a duration in seconds as ``"Hh MMm SSs"`` or ``"Mm SSs"``.

    Negative (and NaN) inputs are clamped to zero. An infinite duration, which
    the progress code passes as the ETA when no rate is available yet, is
    rendered as ``"--"`` instead of raising while converting to ``int``.
    """
    seconds = max(0.0, float(seconds))
    if seconds == float("inf"):
        # inf // 3600 is NaN and int(NaN) raises ValueError.
        return "--"
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    if hours > 0:
        return f"{hours:d}h {minutes:02d}m {secs:02d}s"
    return f"{minutes:d}m {secs:02d}s"

def strip_code_fences(text: str) -> str:
    """Remove a surrounding Markdown code fence from ``text``.

    A leading fence marker is dropped, together with a first line that is
    exactly ``json`` or ``javascript`` (case-insensitive). A trailing fence
    marker is dropped as well. ``None`` is treated as the empty string and the
    result is whitespace-trimmed.
    """
    fence = "```"
    body = (text or "").strip()

    if body.startswith(fence):
        body = body[len(fence):].strip()
        first_line, newline, remainder = body.partition("\n")
        # Only a recognised language tag on its own line is discarded.
        if newline and first_line.strip().lower() in ("json", "javascript"):
            body = remainder.strip()

    if body.endswith(fence):
        body = body[:-len(fence)].strip()

    return body.strip()

def safe_parse_json_obj(text: str):
    """Parse a model reply into a ``dict``; return ``None`` when that fails.

    Code fences are stripped first. Only the span from the first ``{`` to the
    last ``}`` is handed to the JSON parser, so a reply that wraps the object
    in a short preamble or trailing remark is still accepted. Any parsing
    problem, or a top-level value that is not an object, yields ``None``.
    """
    try:
        cleaned = strip_code_fences(text)
        start = cleaned.find("{")
        end = cleaned.rfind("}")
        if start == -1 or end <= start:
            return None
        obj = json.loads(cleaned[start:end + 1])
    except Exception:
        # Best-effort parser: malformed replies are reported as None.
        return None
    return obj if isinstance(obj, dict) else None

def valid_mcq_schema(obj) -> bool:
    """Check that ``obj`` has the expected ``{"mcq": {...}}`` shape.

    Required: the keys question/choices/answer/rationale; ``choices`` is a
    dict whose A-D entries are non-blank strings; ``answer`` is one of A-D;
    ``question`` and ``rationale`` are strings of at least 10 characters after
    trimming. Any error raised while inspecting ``obj`` counts as invalid.
    """
    letters = ("A", "B", "C", "D")
    try:
        if "mcq" not in obj:
            return False
        mcq = obj["mcq"]

        if any(field not in mcq for field in ("question", "choices", "answer", "rationale")):
            return False

        options = mcq["choices"]
        if not isinstance(options, dict):
            return False
        for letter in letters:
            if letter not in options:
                return False
            option_text = options[letter]
            if not (isinstance(option_text, str) and option_text.strip()):
                return False

        if mcq["answer"] not in letters:
            return False

        for field in ("question", "rationale"):
            value = mcq[field]
            if not isinstance(value, str) or len(value.strip()) < 10:
                return False
        return True
    except Exception:
        return False

def should_process(rec) -> bool:
    """Decide whether ``rec`` still needs an MCQ generated.

    A record qualifies when it is a dict, its ``TEXT_KEY`` value is a string
    of at least ``MIN_WORDS_PER_TEXT`` words, and it does not already carry a
    dict under ``qa_fa -> mcq``. Records whose text is missing or not a string
    are skipped rather than raising inside ``word_count``.
    """
    if not isinstance(rec, dict):
        return False

    text = rec.get(TEXT_KEY)
    if not isinstance(text, str) or word_count(text) < MIN_WORDS_PER_TEXT:
        return False

    qa = rec.get("qa_fa")
    already_done = isinstance(qa, dict) and isinstance(qa.get("mcq"), dict)
    return not already_done

# =========================
# LLAMA CALL (OpenAI-compatible chat/completions)
# =========================
# One shared HTTP session for all requests made by call_llama.
session = requests.Session()


def call_llama(system_prompt: str, user_prompt: str) -> str:
    """POST one chat request to ``ENDPOINT`` and return the reply text.

    The reply is read from ``choices[0].message.content`` and trimmed; a
    missing/empty content becomes ``""``. HTTP error statuses propagate from
    ``raise_for_status``; a body without the expected structure raises
    ``RuntimeError``.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    request_body = {
        "model": MODEL_NAME,
        "messages": messages,
        "max_tokens": MAX_TOKENS,
        "temperature": TEMPERATURE,
        "top_p": TOP_P,
        "stream": False,
    }
    # timeout is (connect, read) in seconds.
    response = session.post(ENDPOINT, json=request_body, timeout=(10, 120))
    response.raise_for_status()
    data = response.json()
    try:
        content = data["choices"][0]["message"]["content"]
        return (content or "").strip()
    except Exception:
        raise RuntimeError(f"Unexpected response format: {data}")

def with_retries(fn, *args, max_retries=None):
    """Call ``fn(*args)`` until it succeeds or the attempts are used up.

    Args:
        fn: callable to invoke.
        *args: positional arguments forwarded to ``fn``.
        max_retries: number of attempts; defaults to the module-level
            ``MAX_RETRIES`` when omitted.

    Returns:
        ``(result, None)`` on success, or ``(None, message)`` where
        ``message`` is ``str()`` of the last exception.

    The back-off (1.5 s times the attempt number) is applied only between
    attempts; there is no delay after the final failure.
    """
    attempts = MAX_RETRIES if max_retries is None else max_retries
    last_err = None
    for attempt in range(1, attempts + 1):
        try:
            return fn(*args), None
        except Exception as e:
            last_err = e
            if attempt < attempts:
                time.sleep(1.5 * attempt)
    return None, str(last_err)

# =========================
# MAIN
# =========================
# Driver for MCQ generation: load the dataset, count the eligible records,
# query the model once per eligible record and checkpoint the results.
t0 = time.time()
print("Loading INPUT json...", flush=True)
data = load_json(INPUT_PATH)
print(f"Loaded INPUT in {time.time()-t0:.1f}s | records={len(data)}", flush=True)

# Resume support: when an output file already exists it becomes the working
# set, so records that already carry an MCQ are skipped by should_process().
if os.path.exists(OUTPUT_PATH):
    t1 = time.time()
    print("Loading OUTPUT json...", flush=True)
    existing = load_json(OUTPUT_PATH)
    print(f"Loaded OUTPUT in {time.time()-t1:.1f}s | records={len(existing)}", flush=True)
else:
    existing = data
    print("No OUTPUT file; using INPUT as base.", flush=True)

# First pass: count eligible records so progress and ETA can be reported.
print("Pre-scan counting eligible records...", flush=True)
t2 = time.time()
total_to_process = 0
seen = 0
for rec in existing:
    seen += 1
    if seen % 10000 == 0:
        clear_output(wait=True)
        print(f"Pre-scan: seen={seen} | eligible_so_far={total_to_process} | elapsed={format_hms(time.time()-t2)}")
    if should_process(rec):
        total_to_process += 1

clear_output(wait=True)
print(f"Pre-scan done: total_to_process={total_to_process} in {format_hms(time.time()-t2)}", flush=True)

processed = 0
failed = 0
start_time = time.time()

# Second pass: generate one MCQ per eligible record, mutating it in place.
for rec in existing:
    if not should_process(rec):
        continue

    text_fa = rec[TEXT_KEY].strip()


    call_t0 = time.time()
    raw, err = with_retries(
        call_llama,
        SYSTEM_MCQ,
        build_user_prompt_mcq(text_fa),
    )
    call_dt = time.time() - call_t0

    if raw is None:
        # Every attempt raised: store an empty result plus the last error text.
        rec["qa_fa"] = {}
        rec["qa_error"] = err
        failed += 1
    else:
        obj = safe_parse_json_obj(raw)
        if obj is None or not valid_mcq_schema(obj):
            # Reply was not usable; keep a prefix of it for later inspection.
            rec["qa_fa"] = {}
            rec["qa_error"] = f"Invalid JSON/schema. Raw: {raw[:450]}"
            failed += 1
        else:
            rec["qa_fa"] = obj
            rec.pop("qa_error", None)

    processed += 1

    # Progress line, refreshed every PRINT_EVERY records and on the last one.
    if processed % PRINT_EVERY == 0 or processed == total_to_process:
        elapsed = time.time() - start_time
        rate = processed / elapsed if elapsed > 0 else 0.0
        remaining = total_to_process - processed
        eta = remaining / rate if rate > 0 else float("inf")
        pct = (processed / total_to_process * 100) if total_to_process > 0 else 100.0

        clear_output(wait=True)
        print(
            f"[{processed}/{total_to_process}] {pct:6.2f}% | "
            f"last_call={call_dt:.1f}s | "
            f"elapsed {format_hms(elapsed)} | "
            f"rate {rate:.3f} rec/s | "
            f"ETA {format_hms(eta)} | failed {failed}"
        )

    # Periodic checkpoint of the whole working set.
    if processed % SAVE_EVERY == 0:
        save_json(OUTPUT_PATH, existing)

    time.sleep(SLEEP_BETWEEN_CALLS)

# Final save covers records processed since the last checkpoint.
save_json(OUTPUT_PATH, existing)

elapsed_total = time.time() - start_time
clear_output(wait=True)
print(f"Done. Processed: {processed}/{total_to_process} | failed: {failed} | total time: {format_hms(elapsed_total)}")
print(f"Saved to: {OUTPUT_PATH}")


Done. Processed: 1000/1000 | failed: 89 | total time: 5h 28m 31s
Saved to: /home/llm-mehrnoush/Dataset-commonSence/traditions_and_customs_1_with_mcq_fa_T.json


In [None]:
import os
import json
import time
import requests
from IPython.display import clear_output

# =========================
# CONFIG
# =========================
# Input is the MCQ file; the rewritten ("evolved") questions go to OUTPUT_PATH.
INPUT_PATH = "../Dataset-commonSence/traditions_and_customs_1_with_mcq_fa_T.json"
OUTPUT_PATH = "../Dataset-commonSence/traditions_and_customs_1_with_mcq_fa_T_evol.json"

MODEL_NAME = "meta-llama/Llama-3.3-70B-Instruct"
ENDPOINT = "..."

# Batch-loop settings.
SAVE_EVERY = 10
SLEEP_BETWEEN_CALLS = 0.10
MAX_RETRIES = 3

# Sampling parameters sent with each request.
MAX_TOKENS = 420
TEMPERATURE = 0.2
TOP_P = 0.95

PRINT_EVERY = 10

# Keys used inside each record: rec[QA_KEY][MCQ_KEY] is read,
# rec[QA_KEY][EVOL_KEY] is written.
QA_KEY = "qa_fa"
MCQ_KEY = "mcq"
EVOL_KEY = "mcq_evol"

# =========================
# SYSTEM PROMPT
# =========================
SYSTEM_EVOL = "".join([
    "You are a Question Rewriter. ",
    "Your objective is to rewrite the given question into a more complex version "
    "to make those famous AI systems (e.g., ChatGPT, Gemini etc.) a bit harder to handle. ",
    "But the rewritten question must be reasonable and must be understood and responded by humans. ",
    "You may also modify the answer options to increase difficulty and reduce genericness.",
    "\n\n",
    "Constraints:\n",
    "• Do not change the core meaning of the original question.\n",
    "• Do not make the rewritten question overly verbose.",
])


def build_user_prompt_evol(question: str, choices: dict, answer: str) -> str:
    """Build the user message for rewriting one MCQ.

    Lists the original question, its four options (a missing letter is
    rendered as an empty string) and the correct answer letter, followed by
    the JSON skeleton the model is asked to return.
    """
    opt_a, opt_b, opt_c, opt_d = (choices.get(letter, "") for letter in ("A", "B", "C", "D"))

    prompt = f"""
Original Question:
{question}

Answer Options:
A) {opt_a}
B) {opt_b}
C) {opt_c}
D) {opt_d}

Correct Answer:
{answer}

Output format (JSON only):
{{
  "rewritten_question": "",
  "rewritten_options": {{
    "A": "",
    "B": "",
    "C": "",
    "D": ""
  }},
  "correct_answer": ""
}}
"""
    return prompt.strip()

# =========================
# IO
# =========================
def load_json(path: str):
    """Return the value parsed from the UTF-8 JSON file at ``path``."""
    with open(path, "r", encoding="utf-8") as source:
        content = json.load(source)
    return content

def save_json(path: str, data):
    """Serialise ``data`` to ``path`` as indented, non-ASCII-escaped JSON.

    Writes to a sibling ``.tmp`` file first and then renames it over the
    target via ``os.replace``.
    """
    temp_file = path + ".tmp"
    with open(temp_file, "w", encoding="utf-8") as sink:
        json.dump(data, sink, ensure_ascii=False, indent=2)
    os.replace(temp_file, path)

# =========================
# Helpers
# =========================
def format_hms(seconds: float) -> str:
    """Format a number of seconds as ``"Hh MMm SSs"`` (or ``"Mm SSs"`` under an hour).

    Negative and NaN values are clamped to zero. Infinity, which the progress
    report uses as the ETA when the rate is zero, is shown as ``"--"`` rather
    than raising during the integer conversion.
    """
    seconds = max(0.0, float(seconds))
    if seconds == float("inf"):
        # Floor-dividing inf yields NaN, which int() refuses to convert.
        return "--"
    whole = int(seconds)
    h, rest = divmod(whole, 3600)
    m, s = divmod(rest, 60)
    return f"{h:d}h {m:02d}m {s:02d}s" if h > 0 else f"{m:d}m {s:02d}s"

def strip_code_fences(text: str) -> str:
    """Return ``text`` without an enclosing Markdown code fence.

    Drops a leading fence marker and, if the first remaining line is exactly
    ``json`` or ``javascript`` (any case), that line too; then drops a
    trailing fence marker. ``None`` is handled as ``""``.
    """
    marker = "```"
    content = (text or "").strip()
    if content.startswith(marker):
        content = content[len(marker):].strip()
        tag, sep, rest = content.partition("\n")
        if sep and tag.strip().lower() in ("json", "javascript"):
            content = rest.strip()
    if content.endswith(marker):
        content = content[:-len(marker)].strip()
    return content.strip()

def safe_parse_json_obj(text: str):
    """Extract and parse the JSON object contained in a model reply.

    After removing code fences, the span from the first ``{`` to the last
    ``}`` is parsed. Returns the resulting dict, or ``None`` when no such span
    exists, parsing fails, or the parsed value is not a dict.
    """
    try:
        cleaned = strip_code_fences(text)
        start, end = cleaned.find("{"), cleaned.rfind("}")
        if start == -1 or end == -1 or end <= start:
            return None
        parsed = json.loads(cleaned[start:end + 1])
        if isinstance(parsed, dict):
            return parsed
        return None
    except Exception:
        return None

def valid_evol_schema(obj) -> bool:
    """Check that ``obj`` matches the rewritten-question schema.

    Required: ``rewritten_question`` (string, at least 5 characters after
    trimming), ``rewritten_options`` (dict with non-blank string entries for
    A-D) and ``correct_answer`` (one of A-D). Any error raised while
    inspecting ``obj`` counts as invalid.
    """
    letters = ("A", "B", "C", "D")
    try:
        if any(key not in obj for key in ("rewritten_question", "rewritten_options", "correct_answer")):
            return False

        question = obj["rewritten_question"]
        if not (isinstance(question, str) and len(question.strip()) >= 5):
            return False

        options = obj["rewritten_options"]
        if not isinstance(options, dict):
            return False
        for letter in letters:
            if letter not in options:
                return False
            option_text = options[letter]
            if not isinstance(option_text, str) or not option_text.strip():
                return False

        return obj["correct_answer"] in letters
    except Exception:
        return False

def has_mcq(rec) -> bool:
    """Return True when ``rec[QA_KEY]`` is a dict holding a dict under ``MCQ_KEY``."""
    qa = rec.get(QA_KEY)
    return isinstance(qa, dict) and isinstance(qa.get(MCQ_KEY), dict)

def should_process(rec) -> bool:
    """Decide whether ``rec`` still needs a rewritten question.

    True only when the record is a dict with a dict MCQ under
    ``QA_KEY -> MCQ_KEY``, no dict is stored under ``EVOL_KEY`` yet, and the
    MCQ has a non-empty question, a dict of choices and an answer in A-D.
    """
    if not isinstance(rec, dict):
        return False

    qa = rec.get(QA_KEY)
    mcq = qa.get(MCQ_KEY) if isinstance(qa, dict) else None
    if not isinstance(mcq, dict):
        return False

    # Any dict under EVOL_KEY (including an empty one) marks the record as handled.
    if isinstance(qa.get(EVOL_KEY), dict):
        return False

    has_question = bool(mcq.get("question"))
    has_choices = isinstance(mcq.get("choices"), dict)
    has_answer = mcq.get("answer") in ["A", "B", "C", "D"]
    return has_question and has_choices and has_answer


# Shared HTTP session used by call_llama.
session = requests.Session()


def call_llama(system_prompt: str, user_prompt: str) -> str:
    """Send one system+user chat request to ``ENDPOINT`` and return the reply.

    Returns the trimmed ``choices[0].message.content`` (``""`` when it is
    empty or null). HTTP error statuses propagate from ``raise_for_status``;
    an unexpected body layout raises ``RuntimeError``.
    """
    payload = dict(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        stream=False,
    )
    # timeout is (connect, read) in seconds.
    reply = session.post(ENDPOINT, json=payload, timeout=(10, 120))
    reply.raise_for_status()
    data = reply.json()
    try:
        first_choice = data["choices"][0]
        text = first_choice["message"]["content"] or ""
        return text.strip()
    except Exception:
        raise RuntimeError(f"Unexpected response format: {data}")

def with_retries(fn, *args, max_retries=None):
    """Invoke ``fn(*args)``, retrying on any exception.

    Args:
        fn: callable to invoke.
        *args: positional arguments passed through to ``fn``.
        max_retries: number of attempts; the module-level ``MAX_RETRIES`` is
            used when this is omitted.

    Returns:
        ``(result, None)`` when a call succeeds, otherwise ``(None, message)``
        with ``message`` being ``str()`` of the last exception.

    A pause of 1.5 s times the attempt number separates attempts. No pause
    follows the last failed attempt, since nothing is retried afterwards.
    """
    attempts = MAX_RETRIES if max_retries is None else max_retries
    last_err = None
    for attempt in range(1, attempts + 1):
        try:
            return fn(*args), None
        except Exception as e:
            last_err = e
            if attempt < attempts:
                time.sleep(1.5 * attempt)
    return None, str(last_err)

# =========================
# MAIN
# =========================
# Driver for question rewriting: load the MCQ dataset, count the eligible
# records, request one rewrite per record and checkpoint the results.
t0 = time.time()
print("Loading INPUT json...", flush=True)
data = load_json(INPUT_PATH)
print(f"Loaded INPUT in {time.time()-t0:.1f}s | records={len(data)}", flush=True)

# Resume support: checkpoints are written to OUTPUT_PATH, so continue from
# that file when it exists. Records that already carry a rewrite are then
# skipped by should_process() instead of being requested again.
if os.path.exists(OUTPUT_PATH):
    t1 = time.time()
    print("Loading OUTPUT json...", flush=True)
    existing = load_json(OUTPUT_PATH)
    print(f"Loaded OUTPUT in {time.time()-t1:.1f}s | records={len(existing)}", flush=True)
else:
    existing = data
    print("No OUTPUT file; using INPUT as base.", flush=True)

# First pass: count eligible records so progress and ETA can be reported.
print("Pre-scan counting eligible records...", flush=True)
t2 = time.time()
total_to_process = 0
seen = 0
for rec in existing:
    seen += 1
    if seen % 10000 == 0:
        clear_output(wait=True)
        print(f"Pre-scan: seen={seen} | eligible_so_far={total_to_process} | elapsed={format_hms(time.time()-t2)}")
    if should_process(rec):
        total_to_process += 1

clear_output(wait=True)
print(f"Pre-scan done: total_to_process={total_to_process} in {format_hms(time.time()-t2)}", flush=True)

processed = 0
failed = 0
start_time = time.time()

try:
    # Second pass: rewrite one MCQ per eligible record, mutating it in place.
    for rec in existing:
        if not should_process(rec):
            continue

        mcq = rec[QA_KEY][MCQ_KEY]
        question = (mcq.get("question") or "").strip()
        choices = mcq.get("choices") or {}
        answer = mcq.get("answer")

        call_t0 = time.time()
        raw, err = with_retries(
            call_llama,
            SYSTEM_EVOL,
            build_user_prompt_evol(question, choices, answer),
        )
        call_dt = time.time() - call_t0

        if raw is None:
            # Every attempt raised: store an empty result plus the last error text.
            rec[QA_KEY][EVOL_KEY] = {}
            rec["evol_error"] = err
            failed += 1
        else:
            obj = safe_parse_json_obj(raw)
            if obj is None or not valid_evol_schema(obj):
                # Reply was not usable; keep a prefix of it for later inspection.
                rec[QA_KEY][EVOL_KEY] = {}
                rec["evol_error"] = f"Invalid JSON/schema. Raw: {raw[:600]}"
                failed += 1
            else:
                rec[QA_KEY][EVOL_KEY] = obj
                rec.pop("evol_error", None)

        processed += 1

        # Progress line, refreshed every PRINT_EVERY records and on the last one.
        if processed % PRINT_EVERY == 0 or processed == total_to_process:
            elapsed = time.time() - start_time
            rate = processed / elapsed if elapsed > 0 else 0.0
            remaining = total_to_process - processed
            eta = remaining / rate if rate > 0 else float("inf")
            pct = (processed / total_to_process * 100) if total_to_process > 0 else 100.0

            clear_output(wait=True)
            print(
                f"[{processed}/{total_to_process}] {pct:6.2f}% | "
                f"last_call={call_dt:.1f}s | "
                f"elapsed {format_hms(elapsed)} | "
                f"rate {rate:.3f} rec/s | "
                f"ETA {format_hms(eta)} | failed {failed}"
            )

        # Periodic checkpoint of the whole working set.
        if processed % SAVE_EVERY == 0:
            save_json(OUTPUT_PATH, existing)

        time.sleep(SLEEP_BETWEEN_CALLS)
finally:
    # Always flush what has been produced, including when the run is
    # interrupted, so records since the last checkpoint are not lost.
    save_json(OUTPUT_PATH, existing)

elapsed_total = time.time() - start_time
clear_output(wait=True)
print(f"Done. Processed: {processed}/{total_to_process} | failed: {failed} | total time: {format_hms(elapsed_total)}")
print(f"Saved to: {OUTPUT_PATH}")


Pre-scan done: total_to_process=911 in 0m 00s


KeyboardInterrupt: 

In [None]:
#genericness - Score :
import json
import time
import os
import re
from typing import Any, Dict, Optional, List

import requests


# Input is the rewritten-MCQ file; scores are added and written to OUTPUT_PATH.
INPUT_PATH  = "../Dataset-commonSence/traditions_and_customs_1_with_mcq_fa_T_evol.json"
OUTPUT_PATH = "../Dataset-commonSence/traditions_and_customs_1_with_mcq_fa_T_evol_GN.json"

MODEL_NAME = "meta-llama/Llama-3.3-70B-Instruct"
ENDPOINT   = ".."

# Batch-loop settings.
SAVE_EVERY = 10
SLEEP_BETWEEN_CALLS = 0.10
MAX_RETRIES = 3

# Sampling parameters; the reply is expected to be a single digit.
MAX_TOKENS = 32
TEMPERATURE = 0.2
TOP_P = 0.95

PRINT_EVERY = 10

QA_KEY   = "qa_fa"
EVOL_KEY = "mcq_evol"

# Keys searched for the source passage: first inside item[QA_KEY], then on the
# item itself. get_source_text() reads both lists, so both must be defined;
# the generation step reads the passage from the record-level "text" key.
SOURCE_TEXT_KEYS_QA = ["text"]
SOURCE_TEXT_KEYS_ITEM = ["text"]


SYSTEM_PROMPT = (
    "You are a strict evaluator. "
    "Return ONLY a single integer: 1, 2, 3, 4, or 5. "
    "No other text, no punctuation, no JSON."
)

USER_PROMPT_TEMPLATE = """\
Rubric (Genericness score 1-5):
1: Fully generic - General/abstract; can be answered correctly without referring to the SOURCE TEXT.
2: Mostly generic - Lightly anchored to the SOURCE TEXT, but still answerable using general knowledge alone.
3: Partial - Some specific references to the SOURCE TEXT, yet remains somewhat general.
4: Mostly specific - Clearly grounded in the SOURCE TEXT with one strong, identifiable textual cue.
5: Highly specific - Two or more clear, specific cues from the SOURCE TEXT that are necessary to answer.

Task:
Given SOURCE TEXT and the evolved MCQ, output ONLY the genericness score as a single integer in [1..5].
Do NOT explain. Do NOT output anything except the digit.

SOURCE TEXT:
{source_text}

EVOLVED MCQ:
Question: {question}
Options:
A) {opt_a}
B) {opt_b}
C) {opt_c}
D) {opt_d}
CorrectAnswer: {correct}
"""

# First standalone digit 1-5 in the reply.
SCORE_RE = re.compile(r"\b([1-5])\b")


# -------------------------
# File I/O
# -------------------------
def load_json(path: str) -> Any:
    """Parse and return the UTF-8 JSON document stored at ``path``."""
    with open(path, "r", encoding="utf-8") as fh:
        document = json.load(fh)
    return document

def save_json(path: str, data: Any) -> None:
    """Write ``data`` to ``path`` as indented UTF-8 JSON.

    The file is written under ``path + ".tmp"`` and then moved into place
    with ``os.replace``.
    """
    scratch = path + ".tmp"
    with open(scratch, "w", encoding="utf-8") as fh:
        json.dump(data, fh, ensure_ascii=False, indent=2)
    os.replace(scratch, path)


# -------------------------
# Helpers
# -------------------------
def _first_nonempty_str(d: Dict[str, Any], keys: List[str]) -> Optional[str]:
    for k in keys:
        v = d.get(k)
        if isinstance(v, str) and v.strip():
            return v.strip()
    return None

def get_source_text(item: Dict[str, Any]) -> Optional[str]:
    """Locate the source passage for ``item``.

    Lookup order:
    1) keys from ``SOURCE_TEXT_KEYS_QA`` inside ``item[QA_KEY]`` (when that is a dict)
    2) keys from ``SOURCE_TEXT_KEYS_ITEM`` on ``item`` itself

    Returns the trimmed text, or ``None`` when nothing is found.
    """
    qa = item.get(QA_KEY)
    nested = _first_nonempty_str(qa, SOURCE_TEXT_KEYS_QA) if isinstance(qa, dict) else None
    return nested or _first_nonempty_str(item, SOURCE_TEXT_KEYS_ITEM)

def build_prompt(source_text: str, evol: Dict[str, Any]) -> str:
    """Fill ``USER_PROMPT_TEMPLATE`` with the passage and the rewritten MCQ.

    Missing or falsy fields of ``evol`` (question, any option, correct
    answer) are rendered as empty strings.
    """
    options = evol.get("rewritten_options", {}) or {}
    fields = {
        "source_text": source_text,
        "question": evol.get("rewritten_question", "") or "",
        "correct": evol.get("correct_answer", "") or "",
    }
    for letter in ("A", "B", "C", "D"):
        fields["opt_" + letter.lower()] = options.get(letter, "") or ""
    return USER_PROMPT_TEMPLATE.format(**fields)

def parse_score_only(raw_text: str) -> int:
    """Return the first match of ``SCORE_RE`` in the model reply as an int.

    Raises:
        ValueError: when the reply contains no match; the message includes up
            to 200 characters of the reply.
    """
    reply = (raw_text or "").strip()
    found = SCORE_RE.search(reply)
    if found is None:
        raise ValueError(f"No valid score 1..5 found. Output: {reply[:200]}")
    return int(found.group(1))


# -------------------------
# LLM call
# -------------------------
def call_llama(prompt: str) -> str:
    """Send ``prompt`` (with ``SYSTEM_PROMPT``) to ``ENDPOINT`` and return the reply.

    Makes up to ``MAX_RETRIES`` attempts, pausing 0.8 s times the attempt
    number between them. The value of ``choices[0].message.content`` is
    returned unmodified.

    Raises:
        RuntimeError: when no attempt succeeds (also when ``MAX_RETRIES`` is
            below 1, instead of silently returning ``None``). The last
            underlying exception is attached as the cause.
    """
    payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": MAX_TOKENS,
        "temperature": TEMPERATURE,
        "top_p": TOP_P,
    }

    last_err = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            r = requests.post(ENDPOINT, json=payload, timeout=120)
            r.raise_for_status()
            data = r.json()
            return data["choices"][0]["message"]["content"]
        except Exception as e:
            # Transport, HTTP-status and response-shape errors are all retried.
            last_err = e
            if attempt < MAX_RETRIES:
                time.sleep(0.8 * attempt)

    raise RuntimeError(f"LLM call failed after {MAX_RETRIES} retries: {last_err}") from last_err


# -------------------------
# Main
# -------------------------
def main():
    """Score every rewritten MCQ for genericness and save the dataset.

    For each item with a usable rewritten question, the model is asked for a
    1-5 score which is stored under ``genericness_score`` of the rewritten-MCQ
    dict (``None`` when the call or the parsing fails). The dataset is
    checkpointed to ``OUTPUT_PATH`` every ``SAVE_EVERY`` scored items and once
    more on exit, including when the run is interrupted.

    If ``OUTPUT_PATH`` already exists it is loaded instead of ``INPUT_PATH``,
    so a rerun continues from the last checkpoint and only scores what is
    still missing.

    Raises:
        ValueError: when the loaded JSON is not a list.
    """
    source_path = OUTPUT_PATH if os.path.exists(OUTPUT_PATH) else INPUT_PATH
    data = load_json(source_path)
    if not isinstance(data, list):
        raise ValueError("Expected top-level JSON to be a list of items.")

    newly_scored = 0
    skipped_no_evol = 0
    skipped_no_text = 0
    parse_fail = 0
    call_fail = 0

    try:
        for idx, item in enumerate(data):
            if not isinstance(item, dict):
                continue

            qa = item.get(QA_KEY)
            if not isinstance(qa, dict):
                continue

            evol = qa.get(EVOL_KEY)
            # A dict without a rewritten question (e.g. the empty dict stored
            # for a failed rewrite) has nothing to score.
            if not isinstance(evol, dict) or not evol.get("rewritten_question"):
                skipped_no_evol += 1
                continue

            # skip already-scored
            existing = evol.get("genericness_score")
            if isinstance(existing, int) and existing in [1, 2, 3, 4, 5]:
                continue

            source_text = get_source_text(item)
            if not source_text:
                skipped_no_text += 1
                continue

            prompt = build_prompt(source_text, evol)

            score = None
            try:
                raw = call_llama(prompt)
            except RuntimeError as exc:
                # One unreachable call must not abort the batch; the item is
                # left unscored and is picked up again on a later run.
                call_fail += 1
                print(f"[warn] idx={idx} LLM call failed: {exc}")
            else:
                try:
                    score = parse_score_only(raw)
                except Exception:
                    parse_fail += 1
            evol["genericness_score"] = score

            newly_scored += 1

            if newly_scored % PRINT_EVERY == 0:
                print(f"[progress] idx={idx} newly_scored={newly_scored} "
                      f"skipped_no_text={skipped_no_text} skipped_no_evol={skipped_no_evol} "
                      f"parse_fail={parse_fail}")

            if newly_scored % SAVE_EVERY == 0:
                save_json(OUTPUT_PATH, data)
                print(f"[checkpoint] saved {OUTPUT_PATH} at newly_scored={newly_scored}")

            time.sleep(SLEEP_BETWEEN_CALLS)
    finally:
        # Persist progress on normal completion and on interruption alike.
        save_json(OUTPUT_PATH, data)

    print(f"[done] newly_scored={newly_scored} skipped_no_text={skipped_no_text} "
          f"skipped_no_evol={skipped_no_evol} parse_fail={parse_fail}")
    print(f"[done] call_fail={call_fail}")
    print(f"[done] output saved to {OUTPUT_PATH}")


if __name__ == "__main__":
    main()



[progress] idx=12 newly_scored=10 skipped_no_text=0 skipped_no_evol=3 parse_fail=0
[checkpoint] saved /home/llm-mehrnoush/Dataset-commonSence/traditions_and_customs_1_with_mcq_fa_T_evol_GN.json at newly_scored=10
[progress] idx=23 newly_scored=20 skipped_no_text=0 skipped_no_evol=4 parse_fail=0
[checkpoint] saved /home/llm-mehrnoush/Dataset-commonSence/traditions_and_customs_1_with_mcq_fa_T_evol_GN.json at newly_scored=20
[progress] idx=34 newly_scored=30 skipped_no_text=0 skipped_no_evol=5 parse_fail=0
[checkpoint] saved /home/llm-mehrnoush/Dataset-commonSence/traditions_and_customs_1_with_mcq_fa_T_evol_GN.json at newly_scored=30
[progress] idx=44 newly_scored=40 skipped_no_text=0 skipped_no_evol=5 parse_fail=0
[checkpoint] saved /home/llm-mehrnoush/Dataset-commonSence/traditions_and_customs_1_with_mcq_fa_T_evol_GN.json at newly_scored=40
[progress] idx=54 newly_scored=50 skipped_no_text=0 skipped_no_evol=5 parse_fail=0
[checkpoint] saved /home/llm-mehrnoush/Dataset-commonSence/tradit

In [None]:
import json
import time
import os
import re
from typing import Any, Dict, Optional, List

import requests


# Input is the genericness-scored file; reasoning-depth scores are added and
# written to OUTPUT_PATH.
INPUT_PATH  = "../Dataset-commonSence/traditions_and_customs_1_with_mcq_fa_T_evol_GN.json"
OUTPUT_PATH = "../Dataset-commonSence/traditions_and_customs_1_with_mcq_fa_T_evol_GN_RD.json"

MODEL_NAME = "meta-llama/Llama-3.3-70B-Instruct"
ENDPOINT   = ".."

# Batch-loop settings.
SAVE_EVERY = 10
SLEEP_BETWEEN_CALLS = 0.10
MAX_RETRIES = 3

# Sampling parameters; the reply is expected to be a single digit.
MAX_TOKENS = 32
TEMPERATURE = 0.2
TOP_P = 0.95

PRINT_EVERY = 10

QA_KEY   = "qa_fa"
EVOL_KEY = "mcq_evol"

# Keys searched for the source passage: first inside item[QA_KEY], then on the
# item itself. get_source_text() reads both lists, so they are defined here
# rather than relying on values left over from another cell.
SOURCE_TEXT_KEYS_QA = ["text"]
SOURCE_TEXT_KEYS_ITEM = ["text"]


# -------------------------
# Reasoning Depth prompt (STRICT: digit only)
# -------------------------
SYSTEM_PROMPT = (
    "You are a strict evaluator. "
    "Return ONLY a single integer: 1, 2, 3, 4, or 5. "
    "No other text, no punctuation, no JSON."
)

USER_PROMPT_TEMPLATE = """\
Definition (Reasoning Depth):
Does arriving at the correct answer require more than one step of reasoning?

Scores:
1: Direct/obvious answer; almost no reasoning
2: One simple step (e.g., a simple cause–effect relation)
3: Two identifiable steps (combining two pieces of information)
4: Multiple points plus the need for synthesis/comparison/inference
5: Clear multi-step reasoning with constraints or cases; unambiguous

Task:
Given SOURCE TEXT and the evolved MCQ, output ONLY the Reasoning Depth score as a single integer in [1..5].
Do NOT explain. Output nothing except the digit.

SOURCE TEXT:
{source_text}

EVOLVED MCQ:
Question: {question}
Options:
A) {opt_a}
B) {opt_b}
C) {opt_c}
D) {opt_d}
CorrectAnswer: {correct}
"""

# First standalone digit 1-5 in the reply.
SCORE_RE = re.compile(r"\b([1-5])\b")


# -------------------------
# File I/O
# -------------------------
def load_json(path: str) -> Any:
    """Open ``path`` as UTF-8 text and return the JSON value it contains."""
    with open(path, "r", encoding="utf-8") as stream:
        loaded = json.load(stream)
    return loaded

def save_json(path: str, data: Any) -> None:
    """Dump ``data`` as indented UTF-8 JSON to ``path``.

    Output is staged in ``path + ".tmp"`` and moved over ``path`` with
    ``os.replace`` once fully written.
    """
    pending = path + ".tmp"
    with open(pending, "w", encoding="utf-8") as stream:
        json.dump(data, stream, ensure_ascii=False, indent=2)
    os.replace(pending, path)


# -------------------------
# Helpers
# -------------------------
def _first_nonempty_str(d: Dict[str, Any], keys: List[str]) -> Optional[str]:
    for k in keys:
        v = d.get(k)
        if isinstance(v, str) and v.strip():
            return v.strip()
    return None

def get_source_text(item: Dict[str, Any]) -> Optional[str]:
    """Return the source passage for ``item`` or ``None`` if none is found.

    ``item[QA_KEY]`` is searched first (keys from ``SOURCE_TEXT_KEYS_QA``,
    only when it is a dict); otherwise ``item`` itself is searched using
    ``SOURCE_TEXT_KEYS_ITEM``.
    """
    qa = item.get(QA_KEY)
    if isinstance(qa, dict):
        from_qa = _first_nonempty_str(qa, SOURCE_TEXT_KEYS_QA)
    else:
        from_qa = None
    if from_qa:
        return from_qa
    return _first_nonempty_str(item, SOURCE_TEXT_KEYS_ITEM)

def build_prompt(source_text: str, evol: Dict[str, Any]) -> str:
    """Render ``USER_PROMPT_TEMPLATE`` for one rewritten MCQ.

    The question, the four options and the correct answer are taken from
    ``evol``; any missing or falsy value is substituted with ``""``.
    """
    def text_of(mapping, key):
        # Missing keys and falsy values (None, "") both collapse to "".
        return mapping.get(key, "") or ""

    options = evol.get("rewritten_options", {}) or {}
    return USER_PROMPT_TEMPLATE.format(
        source_text=source_text,
        question=text_of(evol, "rewritten_question"),
        opt_a=text_of(options, "A"),
        opt_b=text_of(options, "B"),
        opt_c=text_of(options, "C"),
        opt_d=text_of(options, "D"),
        correct=text_of(evol, "correct_answer"),
    )

def parse_score_only(raw_text: str) -> int:
    """Extract the score from a model reply using ``SCORE_RE``.

    Returns the first match as an int.

    Raises:
        ValueError: when nothing matches; up to 200 characters of the reply
            are included in the message.
    """
    txt = (raw_text or "").strip()
    hit = SCORE_RE.search(txt)
    if hit:
        return int(hit.group(1))
    raise ValueError(f"No valid score 1..5 found. Output: {txt[:200]}")


# -------------------------
# LLM call
# -------------------------
def call_llama(prompt: str) -> str:
    """Send ``prompt`` (with ``SYSTEM_PROMPT``) to ``ENDPOINT`` and return the reply.

    Up to ``MAX_RETRIES`` attempts are made, with a pause of 0.8 s times the
    attempt number between them. The ``choices[0].message.content`` value is
    returned as received.

    Raises:
        RuntimeError: when every attempt fails (and also when ``MAX_RETRIES``
            is below 1, rather than returning ``None`` implicitly). The last
            underlying exception is attached as the cause.
    """
    payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": MAX_TOKENS,
        "temperature": TEMPERATURE,
        "top_p": TOP_P,
    }

    last_err = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            r = requests.post(ENDPOINT, json=payload, timeout=120)
            r.raise_for_status()
            data = r.json()
            return data["choices"][0]["message"]["content"]
        except Exception as e:
            # Transport, HTTP-status and response-shape errors are all retried.
            last_err = e
            if attempt < MAX_RETRIES:
                time.sleep(0.8 * attempt)

    raise RuntimeError(f"LLM call failed after {MAX_RETRIES} retries: {last_err}") from last_err


# -------------------------
# Main
# -------------------------
def main():
    """Score every rewritten MCQ for reasoning depth and save the dataset.

    For each item with a usable rewritten question, the model is asked for a
    1-5 score which is stored under ``reasoning_depth`` of the rewritten-MCQ
    dict (``None`` when the call or the parsing fails). The dataset is
    checkpointed to ``OUTPUT_PATH`` every ``SAVE_EVERY`` scored items and once
    more on exit, including when the run is interrupted.

    If ``OUTPUT_PATH`` already exists it is loaded instead of ``INPUT_PATH``,
    so a rerun continues from the last checkpoint and only scores what is
    still missing.

    Raises:
        ValueError: when the loaded JSON is not a list.
    """
    source_path = OUTPUT_PATH if os.path.exists(OUTPUT_PATH) else INPUT_PATH
    data = load_json(source_path)
    if not isinstance(data, list):
        raise ValueError("Expected top-level JSON to be a list.")

    newly_scored = 0
    skipped_no_text = 0
    skipped_no_evol = 0
    parse_fail = 0
    call_fail = 0

    try:
        for idx, item in enumerate(data):
            if not isinstance(item, dict):
                continue

            qa = item.get(QA_KEY)
            if not isinstance(qa, dict):
                continue

            evol = qa.get(EVOL_KEY)
            # A dict without a rewritten question (e.g. an empty dict left by
            # a failed rewrite) has nothing to score.
            if not isinstance(evol, dict) or not evol.get("rewritten_question"):
                skipped_no_evol += 1
                continue

            # skip if already exists
            existing = evol.get("reasoning_depth")
            if isinstance(existing, int) and existing in [1, 2, 3, 4, 5]:
                continue

            source_text = get_source_text(item)
            if not source_text:
                skipped_no_text += 1
                continue

            prompt = build_prompt(source_text, evol)

            score = None
            try:
                raw = call_llama(prompt)
            except RuntimeError as exc:
                # One unreachable call must not abort the batch; the item is
                # left unscored and is picked up again on a later run.
                call_fail += 1
                print(f"[warn] idx={idx} LLM call failed: {exc}")
            else:
                try:
                    score = parse_score_only(raw)
                except Exception:
                    parse_fail += 1
            evol["reasoning_depth"] = score

            newly_scored += 1

            if newly_scored % PRINT_EVERY == 0:
                print(f"[progress] idx={idx} newly_scored={newly_scored} "
                      f"skipped_no_text={skipped_no_text} skipped_no_evol={skipped_no_evol} "
                      f"parse_fail={parse_fail}")

            if newly_scored % SAVE_EVERY == 0:
                save_json(OUTPUT_PATH, data)
                print(f"[checkpoint] saved {OUTPUT_PATH} at newly_scored={newly_scored}")

            time.sleep(SLEEP_BETWEEN_CALLS)
    finally:
        # Persist progress on normal completion and on interruption alike.
        save_json(OUTPUT_PATH, data)

    print(f"[done] newly_scored={newly_scored} skipped_no_text={skipped_no_text} "
          f"skipped_no_evol={skipped_no_evol} parse_fail={parse_fail}")
    print(f"[done] call_fail={call_fail}")
    print(f"[done] output saved to {OUTPUT_PATH}")


if __name__ == "__main__":
    main()


[progress] idx=12 newly_scored=10 skipped_no_text=0 skipped_no_evol=3 parse_fail=0
[checkpoint] saved /home/llm-mehrnoush/Dataset-commonSence/traditions_and_customs_1_with_mcq_fa_T_evol_GN_RD.json at newly_scored=10
[progress] idx=23 newly_scored=20 skipped_no_text=0 skipped_no_evol=4 parse_fail=0
[checkpoint] saved /home/llm-mehrnoush/Dataset-commonSence/traditions_and_customs_1_with_mcq_fa_T_evol_GN_RD.json at newly_scored=20
[progress] idx=34 newly_scored=30 skipped_no_text=0 skipped_no_evol=5 parse_fail=0
[checkpoint] saved /home/llm-mehrnoush/Dataset-commonSence/traditions_and_customs_1_with_mcq_fa_T_evol_GN_RD.json at newly_scored=30
[progress] idx=44 newly_scored=40 skipped_no_text=0 skipped_no_evol=5 parse_fail=0
[checkpoint] saved /home/llm-mehrnoush/Dataset-commonSence/traditions_and_customs_1_with_mcq_fa_T_evol_GN_RD.json at newly_scored=40
[progress] idx=54 newly_scored=50 skipped_no_text=0 skipped_no_evol=5 parse_fail=0
[checkpoint] saved /home/llm-mehrnoush/Dataset-common