In [16]:
import json
import os
import random
import re
import time

from dotenv import load_dotenv
from openai import OpenAI, OpenAIError
# Load environment variables from .env file
load_dotenv()

# Fail fast when the OpenRouter key is missing. With api_key=None the OpenAI
# client falls back to OPENAI_API_KEY (or raises a generic error), so a wrong or
# absent key would otherwise only surface later as per-request failures.
_openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
if not _openrouter_api_key:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set; add it to your environment or .env file."
    )

# Optional ranking headers—set HTTP_REFERER / X_TITLE in your environment if you
# care about OpenRouter rankings. Only values that are actually set are kept, so
# placeholder strings are never sent to the API.
extra_headers = {
    header: value
    for header, value in (
        ("HTTP-Referer", os.getenv("HTTP_REFERER")),
        ("X-Title", os.getenv("X_TITLE")),
    )
    if value
}

# Initialize OpenRouter-compatible client; the ranking headers (if any) are
# attached to every request through default_headers.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=_openrouter_api_key,
    default_headers=extra_headers or None,
)

In [17]:
def _parse_verdict(reply: str) -> bool:
    """Return True only when the model's reply carries a YES decision.

    The "Decision: ..." line requested by the system prompt takes priority, so
    a NO verdict whose explanation happens to contain the word YES stays NO.
    If that line is absent, the first standalone YES/NO word decides. A reply
    with neither is treated as NO.
    """
    normalized = reply.upper()
    decision = re.search(r"DECISION\s*:\W*(YES|NO)\b", normalized)
    if decision is None:
        # Fallback for bare answers such as "YES." or "No, because ...".
        decision = re.search(r"\b(YES|NO)\b", normalized)
    return decision is not None and decision.group(1) == "YES"


def is_answer_correct(
    question: str,
    answer: str,
    model: str = "openai/gpt-4o",
    retries: int = 3,
    candidate_answers: list[str] = None
) -> bool:
    """
    Returns True if the LLM verifies the proposed answer is correct.

    Args:
        question: The question text shown to the model.
        answer: The proposed (gold) answer to verify.
        model: Model identifier passed to the chat-completions endpoint.
        retries: Maximum number of API attempts before giving up.
        candidate_answers: Other candidate answers listed in the prompt;
            None or an empty list is rendered as "None".

    Returns:
        True when the model's decision is YES. False when the decision is NO,
        the reply cannot be parsed, or every API attempt raised OpenAIError.
    """
    system_prompt = (
    "You are a fact‐checking assistant. Your task is to select questions from a diverse dataset. Questions will be given to you along with a proposed answer and a list of other candidate answers. "
    "If the Proposed Answer is non-text (emoji, mathematical symbol, non-Latin script), then output NO and select 'Non-text Answer' as your reason. "
    "If the Proposed Answer is not **exactly** correct, then output NO and select 'Wrong Answer' as your reason. "
    "If the candidate answers contain at least one correct answer, then output NO and select 'At Least One Correct Candidate Answer' as your reason. "
    "REMEMBER THAT GIVEN CANDIDATE ANSWERS CAN BE PLAUSIBLE AND CLOSE TO THE PROPOSED ANSWER, BUT THE PROPOSED ANSWER MUST BE EXACTLY CORRECT FOR THAT QUESTION. "
    "Answer YES or NO and IF ONLY IF your answer is NO than select one of these three options: [Wrong Answer, At Least One Correct Candidate Answer, Non-text Answer].\n"
    # The format lines below are newline-separated; without separators they
    # were fused into a single run of text in the prompt sent to the model.
    "Your answer must be in this format:\n"
    "Decision: [YES,NO]\n"
    "Reasoning for skipping: [Wrong Answer, At Least One Correct Candidate Answer, Non-text Answer]\n"
    "If your answer is NO and you selected one of the three options, provide 1 sentence explanation why you selected that option. "
    "If your answer is YES, you should not output the Reasoning for skipping part and the explanation. "
    )

    # The default is None, which str.join cannot iterate; treat it as "no candidates".
    candidates_text = ', '.join(str(c) for c in (candidate_answers or [])) or 'None'

    user_prompt = (
        f"Question: {question}\n"
        f"Proposed Answer: {answer}\n"
        f"Incorrect Candidate Answers: {candidates_text}\n"
        "Is the Proposed Answer exactly correct *and* are *all* the other candidate answers are not correct? "
        "Answer YES or NO."
    )

    last_error = None
    for attempt in range(retries):
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user",   "content": user_prompt},
                ],
                temperature=0,
            )
        except OpenAIError as exc:
            last_error = exc
            # Back off only when another attempt will actually follow.
            if attempt < retries - 1:
                time.sleep(1)
            continue

        # content may be None (e.g. an empty completion); treat it as an empty reply.
        verdict = (resp.choices[0].message.content or "").strip().upper()
        print(f"User prompt: {user_prompt}")
        print(f"LLM verdict:\n {verdict}")
        print("=" * 50)
        return _parse_verdict(verdict)

    # Default to False after exhausting retries, but say so instead of failing silently.
    print(f"LLM verification failed after {retries} attempt(s): {last_error!r}")
    return False

In [18]:
def filter_verified(
    path_in: str,
    path_out: str,
    model: str = "google/gemini-2.5-flash",
    retries: int = 3,
    seed: int = None,
) -> None:
    """
    Reads a JSON array of QA entries, keeps only those with correct gold answers,
    and writes them out.

    Each entry must provide "question" and "answer". "candidate_answers" is
    optional and may be a dict (its keys are used), a list, or null. The
    "pairwise" key, if present, is removed from every kept entry.

    Args:
        path_in: Input JSON file; relative paths are resolved against the
            current working directory.
        path_out: Output JSON file, resolved the same way.
        model: Model identifier forwarded to is_answer_correct.
        retries: Retry count forwarded to is_answer_correct.
        seed: Optional seed for the shuffle. When None, the module-level
            random generator is used, so the output order varies between runs.

    Raises:
        ValueError: If the input file's top-level JSON value is not a list.
    """
    current_dir = os.getcwd()
    path_in = os.path.join(current_dir, path_in)
    path_out = os.path.join(current_dir, path_out)
    with open(path_in, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list):
        # random.shuffle would otherwise fail with an obscure error on a dict.
        raise ValueError(
            f"Expected a JSON array of QA entries in {path_in}, got {type(data).__name__}."
        )

    # shuffle the data (reproducibly when a seed is given)
    shuffler = random.Random(seed) if seed is not None else random
    shuffler.shuffle(data)

    general_kept = []
    for entry in data:
        # list() yields the keys of a dict and the items of a list; `or []`
        # covers a missing key as well as an explicit null.
        candidates = list(entry.get("candidate_answers") or [])
        if is_answer_correct(question=entry["question"],
                             answer=entry["answer"],
                             candidate_answers=candidates,
                             retries=retries,
                             model=model):
            entry.pop("pairwise", None)
            general_kept.append(entry)

    with open(path_out, "w", encoding="utf-8") as f:
        json.dump(general_kept, f, ensure_ascii=False, indent=2)

    print(f"Kept {len(general_kept)} out of {len(data)} questions.")

In [19]:
# Run the verification pass over the 3000-question file; both paths are
# resolved relative to the current working directory.
filter_verified(
    path_in="./data/random_test_data/3000_questions.json",
    path_out="./verified_w_candidates_from_3000.json",
)

User prompt: Question: A ‘gricer’ is a slang term for a what?
Proposed Answer: Foamer
Incorrect Candidate Answers: Train Enthusiast, Coffee Connoisseur, Surfer, Food Critic, Music Fan, Hiker, Car Enthusiast, Cyclist, Photographer, Collector
Is the Proposed Answer exactly correct *and* are *all* the other candidate answers are not correct? Answer YES or NO.
LLM verdict:
 DECISION: NO
REASONING FOR SKIPPING: AT LEAST ONE CORRECT CANDIDATE ANSWER
THE CANDIDATE ANSWER 'TRAIN ENTHUSIAST' IS ALSO A CORRECT TERM FOR A 'GRICER'.
User prompt: Question: What name is given to describes the phenomenon of the gradual mixing of two different substances which are in contact?
Proposed Answer: Diffuse
Incorrect Candidate Answers: Interblend, Commingle, Permeate, Intermix, Transfuse, Infuse, Mingle, Amalgamate, Coalesce, Blendation
Is the Proposed Answer exactly correct *and* are *all* the other candidate answers are not correct? Answer YES or NO.
LLM verdict:
 DECISION: NO
REASONING FOR SKIPPING: WRONG