In [20]:
import pandas as pd
import openai
from dotenv import load_dotenv
import os, json, random, re
import ast
import numpy as np
load_dotenv()

# Set the OpenAI API key
OpenAI_Client = openai.OpenAI(
    default_headers={
        "HTTP-Referer": "cfe-paper",
    },
    #base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENAI_API_KEY"),   
)

OpenRouter_Client = openai.OpenAI(
    default_headers={
        "HTTP-Referer": "cfe-paper",
    },
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),   
)
EVAL_MODEL = "openai/gpt-5-2025-08-07"

In [21]:
def get_api_response(system_prompt, user_prompt, model=EVAL_MODEL, reasoning_effort="medium"):
    res = OpenRouter_Client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
            ],
            response_format={
            "type": "text"
        },
        reasoning_effort=reasoning_effort)
    
    response_text = res.choices[0].message.content.strip()
    return response_text

In [22]:
df = pd.read_csv("../../data/eval_results/gpt/100q_gpt_conf.csv")

In [23]:
## we first start with generating the ideal set of candidate answers for each question
## we want model to generate incorrect but plausible answers for each question
## the number of candidate answers will be up to model, the model will decide how many to generate
## we will ask for the ideal number of ideal candidate answers to be generated
## also ask the model to explain why each candidate answer is plausible yet incorrect
IDEAL_SET_GENERATION_SYSTEM_PROMPT = """
You are an expert question analyst specializing in educational question answering. You will be given a question along with its correct answer.

Your task is to identify the ideal number of plausible but incorrect alternative answers (distractors) for a given factual question. 
These are the answers that a knowledgeable person might realistically confuse with the correct answer.

### Instructions:
- List only the plausible but *incorrect* alternatives that could reasonably be mistaken for the correct answer.
- Focus on alternatives that can make someone who has some familiarity with the topic hesitate or second-guess themselves.
- Do **not** include the correct answer itself or clearly absurd, irrelevant, or far-fetched options.
- Add just a one sentence explanation for each alternative, describing why it is plausible yet incorrect.
- Do **not** state the number of items explicitly; just list them naturally.
- Don't try to minimize or maximize the number of alternatives; just provide the ideal number of alternatives for the given question.
- Ensure that your final output is in strict JSON format as specified below.

### Output format:
{
  "question": "<the question here>",
  "correct_answer": "<the correct answer here>",
  "ideal_candidate_answers": [
    {
      "answer": "<plausible but incorrect answer 1>",
      "explanation": "<one sentence explanation for why this answer is plausible yet incorrect>"
    },
    {
      "answer": "<plausible but incorrect answer 2>",
      "explanation": "<one sentence explanation for why this answer is plausible yet incorrect>"
    } ...
  ]
}
"""

In [28]:
def generate_ideal_candidate_answers(df, checkpoint_interval=3, out_df="../../data/eval_results/gpt/100q_gpt_candidates"):
    filled = 0
    questions = df['question'].unique()

    for i, question in enumerate(questions, start=1):
        rows = df[df['question'] == question]

        # 1) skip if non-confusing (use the first row's score for the question)
        conf = rows['gpt_conf_score'].iloc[0]
        if pd.notnull(conf) and conf <= 50:
            print(f"Low confusion score (<=50), skipping: {question}")
            continue

        # 2) skip if already has candidates (for this question)
        existing = rows['gpt_cands'].dropna()
        if len(existing) > 0:
            print(f"Skipping (already has candidate answers): {question}")
            continue

        # 3) prepare prompt
        correct_answer = rows['gold_answer'].iloc[0]
        user_prompt = f"Question: {question}\nCorrect Answer: {correct_answer}"

        # 4) call GPT
        try:
            response_text = get_api_response(IDEAL_SET_GENERATION_SYSTEM_PROMPT, user_prompt, reasoning_effort="medium")
            response_json = json.loads(response_text)
            ideal_candidates = response_json.get("ideal_candidate_answers", [])
            # optional: sanity check it’s a list of dicts with 'answer'
            if not (isinstance(ideal_candidates, list) and all(isinstance(x, dict) and 'answer' in x for x in ideal_candidates)):
                raise ValueError("ideal_candidate_answers not in expected format.")

            print(f"{user_prompt}\nGenerated Ideal Candidates:")
            for i_cand in ideal_candidates:
                print(f"- Answer: {i_cand['answer']} | Explanation: {i_cand['explanation']}")
            print("****************************")

            # 5) write to ALL rows of this question (consistent)
            df.loc[df['question'] == question, 'gpt_cands'] = json.dumps(ideal_candidates)
            filled += 1

        except Exception as e:
            print(f"Error processing '{question}': {e}")
            # leave as NaN; we can retry later

        # 6) checkpoint every N successful fills
        if filled > 0 and filled % checkpoint_interval == 0:
            out_df_path = f"{out_df}_ckpt_{filled}.csv"
            df.to_csv(out_df_path, index=False)
            print(f"Checkpoint saved ({filled} filled): {out_df_path}")

    # FINAL SAVE (no column overwrite!)
    out_df_path = f"{out_df}_final.csv"
    df.to_csv(out_df_path, index=False)
    print(f"Final saved: {out_df_path}")


In [30]:
checkpoint_df = pd.read_csv("../../data/eval_results/gpt/100q_gpt_candidates86.csv")

In [31]:
generate_ideal_candidate_answers(checkpoint_df, checkpoint_interval=2, out_df="../../data/eval_results/gpt/100q_gpt_candidates")

Low confusion score (<=50), skipping: "Feel Like Making Love" and "The First Time Ever I Saw Your Face" were hit singles for which female artist?
Low confusion score (<=50), skipping: Anellini pasta is what type of shape?
Skipping (already has candidate answers): Art Garfunkel trained for which profession although he didn't qualify?
Question: At 7am on Saturday 19 May 2012 which gold medalist started the torch delay in Lands End?
Correct Answer: Ben Ainslie
Generated Ideal Candidates:
- Answer: Sir Steve Redgrave | Explanation: As a five-time Olympic rowing champion who carried the torch into the Olympic Stadium and was central to the cauldron handover, many assume he also started the relay.
- Answer: Sir Chris Hoy | Explanation: The multiple Olympic cycling champion was a prominent figure of London 2012 and a torchbearer, making him an easy but incorrect guess for the relay’s start.
- Answer: Sir Bradley Wiggins | Explanation: Already an Olympic gold medalist and a high-profile figure

In [50]:
# getting the intersection of mentioned_cands and gpt_candidate_answers 
# by based on the phrasing of the options of gpt_candidate_answers
INTERSECTION_SYSTEM_PROMPT = """
You are an expert semantic evaluator.
Your task is to find the semantic intersection between two lists of entity names. 
You will be provided with List A and List B.
### Instructions:
- Identify entities that are semantically equivalent between the two lists.
- We consider entities to be semantically equivalent if they are the abbreviated, synonymous, or slight variations of each other that represent the same thing.
- Return a JSON array containing the names of the entities that are present in both lists based on semantic similarity.
- Ensure that the output is a valid JSON array.
### Output format:
[
  "<entity name 1>",
  "<entity name 2>",
  ...
]
"""

In [None]:
def get_intersections(df, checkpoint_interval = 2, out_df_path="../../data/results/100q_with_gpt_intersections.csv"):
    all_intersections = []
    grouped = df.groupby("question")
    for idx, group in enumerate(grouped):
        question, rows = group
        mentioned_cands = rows['mentioned_cands'].iloc[0]
        gpt_cands = rows['gpt_cands'].iloc[0]
        if pd.isnull(mentioned_cands) or pd.isnull(gpt_cands):
            all_intersections.append([])
            print(f"Skipping question '{question}', missing data.")
            continue
        try:
            mentioned_cands_list = ast.literal_eval(mentioned_cands)
            gpt_cands_list = [item['answer'] for item in ast.literal_eval(gpt_cands)]
            user_prompt = f"List A: {mentioned_cands_list}\nList B: {gpt_cands_list}"
            response_text = get_api_response(INTERSECTION_SYSTEM_PROMPT, user_prompt, reasoning_effort="low")
            response_json = json.loads(response_text)
            print(f"{user_prompt}\nIntersection: {response_json}\n")
            print("****************************")
        except Exception as e:
            print(f"Error processing question '{question}': {e}")
            response_json = []
        all_intersections.append(response_json)
        if (idx + 1) % checkpoint_interval == 0:
            df.loc[df['question'] == question, 'gpt_cands_intersection'] = json.dumps(all_intersections[-1])
            df.to_csv(out_df_path, index=False)
            print(f"Checkpoint saved at question {idx + 1}")
    # Final save
    df['gpt_cands_intersection'] = pd.Series(all_intersections)
    df.to_csv(out_df_path, index=False)
    print("Final checkpoint saved after all questions.")

In [53]:
get_intersections(df, checkpoint_interval = 2, out_df_path="../../data/results/100q_with_gpt_candidates.csv")

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
def calculate_metrics(df, out_df_path="../../data/results/100q_gpt_candidates.csv"):
    precisions, recalls, f1_scores = [], [], []
    for idx, row in df.iterrows():
        mentioned_cands = row['mentioned_cands']
        gpt_cands = row['gpt_cands']
        intersection = row['gpt_cands_intersection']

        if pd.isnull(mentioned_cands) or pd.isnull(gpt_cands) or pd.isnull(intersection):
            precisions.append(np.nan)
            recalls.append(np.nan)
            f1_scores.append(np.nan)
            continue

        try:
            mentioned_cands_list = ast.literal_eval(mentioned_cands)
            gpt_cands_list = [item['answer'] for item in ast.literal_eval(gpt_cands)]
            intersection_list = ast.literal_eval(intersection)

            tp = len(intersection_list)
            fp = len(mentioned_cands_list) - tp     # mentioned but not in ideal
            fn = len(gpt_cands_list) - tp  # ideal but not mentioned

            precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
            f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

            precisions.append(precision)
            recalls.append(recall)
            f1_scores.append(f1_score)

        except Exception as e:
            print(f"Error calculating metrics for row {idx}: {e}")
            precisions.append(np.nan)
            recalls.append(np.nan)
            f1_scores.append(np.nan)

    df['gpt_precision'] = precisions
    df['gpt_recall'] = recalls
    df['gpt_f1_score'] = f1_scores
    df.to_csv(out_df_path, index=False)
    print(f"Metrics calculation completed and saved to {out_df_path}.")
    return df
