In [1]:
import pandas as pd
import openai
from dotenv import load_dotenv
import os, json, random, re
import ast
import numpy as np
# Load variables from a local .env file (if present) so the API keys below can be
# read from the process environment.
load_dotenv()

# Model identifier used as the default by the evaluation helpers in this notebook.
EVAL_MODEL = "openai/gpt-5-2025-08-07"

# Client for the OpenAI API itself: no base_url is passed, so the SDK's default
# endpoint is used.
OpenAI_Client = openai.OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    default_headers={"HTTP-Referer": "cfe-paper"},
)

# Client routed through OpenRouter's endpoint, authenticated with its own key.
OpenRouter_Client = openai.OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ.get("OPENROUTER_API_KEY"),
    default_headers={"HTTP-Referer": "cfe-paper"},
)

In [2]:
def get_api_response(system_prompt, user_prompt, model=EVAL_MODEL, reasoning_effort="medium"):
    """Send one system + user prompt pair to the OpenRouter client and return the reply text.

    Args:
        system_prompt: Content of the ``system`` message.
        user_prompt: Content of the ``user`` message.
        model: Model identifier passed to the API (defaults to ``EVAL_MODEL``).
        reasoning_effort: Forwarded unchanged as the ``reasoning_effort`` request field.

    Returns:
        The first choice's message content with surrounding whitespace stripped.

    Raises:
        ValueError: If the response has no choices or the first message has no
            text content, instead of failing with an opaque ``AttributeError``.
    """
    res = OpenRouter_Client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        response_format={"type": "text"},
        reasoning_effort=reasoning_effort,
    )

    # An error or filtered response can come back without any choice.
    if not res.choices:
        raise ValueError(f"Model '{model}' returned no choices.")

    # `content` is optional in the chat-completions schema.
    content = res.choices[0].message.content
    if content is None:
        raise ValueError(f"Model '{model}' returned a message without text content.")

    return content.strip()

In [22]:
# Load the table of GPT confusion scores (presumably one row per question / model /
# prompt variant — confirm against the file).
# NOTE(review): the generation call below runs on `checkpoint_df`, and `df` is
# reassigned in a later cell, so this frame is not used by the visible code.
df = pd.read_csv("../../data/eval_results/gpt/100q_gpt_conf.csv")

# Ideal Candidate Generation

In [23]:
## System prompt for the first step: generating the ideal set of candidate answers
## for each question.
## - The model must produce answers that are incorrect but plausible (distractors).
## - The number of candidates is left to the model: it is asked for the ideal number
##   for the question, not a fixed count.
## - Each candidate comes with a one-sentence explanation of why it is plausible yet
##   incorrect.
## - The reply must be a JSON object whose "ideal_candidate_answers" list holds
##   {"answer", "explanation"} entries.
IDEAL_SET_GENERATION_SYSTEM_PROMPT = """
You are an expert question analyst specializing in educational question answering. You will be given a question along with its correct answer.

Your task is to identify the ideal number of plausible but incorrect alternative answers (distractors) for a given factual question. 
These are the answers that a knowledgeable person might realistically confuse with the correct answer.

### Instructions:
- List only the plausible but *incorrect* alternatives that could reasonably be mistaken for the correct answer.
- Focus on alternatives that can make someone who has some familiarity with the topic hesitate or second-guess themselves.
- Do **not** include the correct answer itself or clearly absurd, irrelevant, or far-fetched options.
- Add just a one sentence explanation for each alternative, describing why it is plausible yet incorrect.
- Do **not** state the number of items explicitly; just list them naturally.
- Don't try to minimize or maximize the number of alternatives; just provide the ideal number of alternatives for the given question.
- Ensure that your final output is in strict JSON format as specified below.

### Output format:
{
  "question": "<the question here>",
  "correct_answer": "<the correct answer here>",
  "ideal_candidate_answers": [
    {
      "answer": "<plausible but incorrect answer 1>",
      "explanation": "<one sentence explanation for why this answer is plausible yet incorrect>"
    },
    {
      "answer": "<plausible but incorrect answer 2>",
      "explanation": "<one sentence explanation for why this answer is plausible yet incorrect>"
    } ...
  ]
}
"""

In [28]:
def generate_ideal_candidate_answers(df, checkpoint_interval=3, out_df="../../data/eval_results/gpt/100q_gpt_candidates"):
    """Fill ``df['gpt_cands']`` with model-generated distractors, one call per question.

    For every distinct question the function:
      1. skips it when the first row's ``gpt_conf_score`` is <= 50;
      2. skips it when any of its rows already has a ``gpt_cands`` value;
      3. otherwise asks the model for candidates and writes the JSON-encoded list
         to ALL rows of that question.

    ``df`` is modified in place. A checkpoint CSV ``<out_df>_ckpt_<n>.csv`` is
    written each time the number of newly filled questions reaches a multiple of
    ``checkpoint_interval``, and ``<out_df>_final.csv`` is always written at the end.

    Args:
        df: Frame with ``question``, ``gold_answer`` and ``gpt_conf_score`` columns.
            ``gpt_cands`` is created when it does not exist yet.
        checkpoint_interval: Number of successful fills between checkpoints;
            a falsy value (0 / None) disables checkpointing.
        out_df: Path prefix (without extension) for checkpoint and final CSVs.

    Returns:
        None. Results are stored in ``df`` and on disk.
    """
    cands_col = 'gpt_cands'
    # A first run may not have the column yet; a column read back as all-NaN is
    # float-typed, so make it object-typed before storing JSON strings in it.
    if cands_col not in df.columns:
        df[cands_col] = None
    elif df[cands_col].dtype != object:
        df[cands_col] = df[cands_col].astype(object)

    filled = 0
    # dropna(): a NaN question never equals itself, so its row slice would be empty
    # and `.iloc[0]` below would raise.
    for question in df['question'].dropna().unique():
        question_mask = df['question'] == question
        rows = df[question_mask]

        # 1) skip if non-confusing (use the first row's score for the question)
        conf = rows['gpt_conf_score'].iloc[0]
        if pd.notnull(conf) and conf <= 50:
            print(f"Low confusion score (<=50), skipping: {question}")
            continue

        # 2) skip if already has candidates (for this question)
        if rows[cands_col].notna().any():
            print(f"Skipping (already has candidate answers): {question}")
            continue

        # 3) prepare prompt
        correct_answer = rows['gold_answer'].iloc[0]
        user_prompt = f"Question: {question}\nCorrect Answer: {correct_answer}"

        # 4) call the model and validate the reply
        try:
            response_text = get_api_response(IDEAL_SET_GENERATION_SYSTEM_PROMPT, user_prompt, reasoning_effort="medium")
            response_json = json.loads(response_text)
            ideal_candidates = response_json.get("ideal_candidate_answers", [])
            # sanity check: it must be a list of dicts with 'answer'
            if not (isinstance(ideal_candidates, list) and all(isinstance(x, dict) and 'answer' in x for x in ideal_candidates)):
                raise ValueError("ideal_candidate_answers not in expected format.")
        except Exception as e:
            print(f"Error processing '{question}': {e}")
            # leave as missing; we can retry later
            continue

        print(f"{user_prompt}\nGenerated Ideal Candidates:")
        for cand in ideal_candidates:
            # Only 'answer' is validated above, so a missing explanation must not
            # discard an otherwise valid candidate list.
            print(f"- Answer: {cand['answer']} | Explanation: {cand.get('explanation', '')}")
        print("****************************")

        # 5) write to ALL rows of this question (consistent)
        df.loc[question_mask, cands_col] = json.dumps(ideal_candidates)
        filled += 1

        # 6) checkpoint every N successful fills. This runs only right after a fill,
        #    so a failed call cannot re-write the same checkpoint.
        if checkpoint_interval and filled % checkpoint_interval == 0:
            out_df_path = f"{out_df}_ckpt_{filled}.csv"
            df.to_csv(out_df_path, index=False)
            print(f"Checkpoint saved ({filled} filled): {out_df_path}")

    # FINAL SAVE (no column overwrite!)
    out_df_path = f"{out_df}_final.csv"
    df.to_csv(out_df_path, index=False)
    print(f"Final saved: {out_df_path}")


In [30]:
# Load a partially filled frame to resume candidate generation from.
# NOTE(review): this file name does not follow the `<out_df>_ckpt_<n>.csv` pattern
# written by generate_ideal_candidate_answers — presumably renamed by hand; confirm.
checkpoint_df = pd.read_csv("../../data/eval_results/gpt/100q_gpt_candidates86.csv")

In [31]:
# Resume generation on the checkpointed frame: questions that already have
# candidates are skipped, and the final CSV is written to `<out_df>_final.csv`.
generate_ideal_candidate_answers(checkpoint_df, checkpoint_interval=2, out_df="../../data/eval_results/gpt/100q_gpt_candidates")

Low confusion score (<=50), skipping: "Feel Like Making Love" and "The First Time Ever I Saw Your Face" were hit singles for which female artist?
Low confusion score (<=50), skipping: Anellini pasta is what type of shape?
Skipping (already has candidate answers): Art Garfunkel trained for which profession although he didn't qualify?
Question: At 7am on Saturday 19 May 2012 which gold medalist started the torch delay in Lands End?
Correct Answer: Ben Ainslie
Generated Ideal Candidates:
- Answer: Sir Steve Redgrave | Explanation: As a five-time Olympic rowing champion who carried the torch into the Olympic Stadium and was central to the cauldron handover, many assume he also started the relay.
- Answer: Sir Chris Hoy | Explanation: The multiple Olympic cycling champion was a prominent figure of London 2012 and a torchbearer, making him an easy but incorrect guess for the relay’s start.
- Answer: Sir Bradley Wiggins | Explanation: Already an Olympic gold medalist and a high-profile figure

# Getting the Intersections

In [9]:
# System prompt for computing the semantic intersection between two lists of names.
# get_intersections sends the mentioned candidates as List A and the GPT-generated
# ideal candidates as List B; each match keeps the phrasing used in both lists.
# The instructions ask for a JSON object with a "matches" array, matching the
# output format shown below.
INTERSECTION_SYSTEM_PROMPT = """
You are an expert semantic evaluator.
Your task is to find the semantic intersection between two lists of entity names. 
You will be provided with List A and List B.
### Instructions:
- Identify entities that are semantically equivalent between the two lists.
- We consider entities to be semantically equivalent if they are the abbreviated, synonymous, or slight variations of each other that represent the same thing.
- Return a JSON object with a single "matches" array containing one entry for each pair of entities that are present in both lists based on semantic similarity.
- For each entry, use the name exactly as written in List A for "A" and the name exactly as written in List B for "B".
- If no entities match, return {"matches": []}.
- Ensure that the output is valid JSON with no surrounding text.
### Output format:
{
    "matches": [
        {"A": "New York City", "B": "NYC"},
        {"A": "L.A.", "B": "Los Angeles"}
    ]
}
"""

In [10]:
# Load the final output of the candidate-generation step
# (generate_ideal_candidate_answers writes it as `<out_df>_final.csv`).
df = pd.read_csv("../../data/eval_results/gpt/100q_gpt_candidates_final.csv")

In [16]:
def _parse_list_cell(cell):
    """Best-effort parse of a cell holding a serialized list.

    Tries JSON first (the format ``json.dumps`` writes, which may contain
    ``null``/``true``/``false``) and then a Python literal such as ``"['a', 'b']"``.
    Returns ``[]`` when the cell is not a string, cannot be parsed, or does not
    hold a list.
    """
    if not isinstance(cell, str):
        return []
    for parse in (json.loads, ast.literal_eval):
        try:
            parsed = parse(cell)
        except Exception:
            continue
        return parsed if isinstance(parsed, list) else []
    return []


def get_intersections(
    df,
    checkpoint_interval=50,
    out_df_path="../../data/eval_results/gpt/100q_with_gpt_intersections.csv",
    force=False,  # if True, recompute even if intersection already exists
):
    """Compute, per row, the semantic overlap between mentioned and ideal candidates.

    For each row the JSON-encoded result is stored in ``gpt_cands_intersection``:
      * ``[]`` when ``gpt_conf_score`` is <= 50, when either input is missing, or
        when either parsed list is empty (no API call is made in these cases);
      * otherwise a list of ``{"mentioned": ..., "ideal": ...}`` pairs built from the
        model's ``{"A": ..., "B": ...}`` matches (``[]`` if the call or parsing fails).

    Rows that already have a value are skipped unless ``force`` is True. ``df`` is
    modified in place and written to ``out_df_path`` at the end, and also every
    ``checkpoint_interval`` computed rows.

    Args:
        df: Frame with ``mentioned_cands`` and ``gpt_cands`` columns (and optionally
            ``gpt_conf_score``). Rows are addressed by position, so any index works.
        checkpoint_interval: Number of API-computed rows between checkpoint saves;
            a falsy value (0 / None) disables checkpointing.
        out_df_path: CSV path used for checkpoints and the final save.
        force: Recompute rows that already have an intersection.

    Returns:
        The same ``df`` object with ``gpt_cands_intersection`` filled in.
    """
    out_col = 'gpt_cands_intersection'
    # Make sure the output column exists and can hold strings (a column read back
    # as all-NaN is float-typed).
    if out_col not in df.columns:
        df[out_col] = None
    elif df[out_col].dtype != object:
        df[out_col] = df[out_col].astype(object)
    out_pos = df.columns.get_loc(out_col)

    filled = 0
    total_rows = len(df)

    for i in range(total_rows):
        # Positional access: works for filtered / re-indexed frames too.
        row = df.iloc[i]
        conf = row.get('gpt_conf_score')
        mentioned_cands = row.get('mentioned_cands')
        gpt_cands = row.get('gpt_cands')

        # skip if already computed (unless force)
        if not force and pd.notnull(row[out_col]):
            continue

        # 1) non-confusing rows → empty intersection
        if pd.notnull(conf) and conf <= 50:
            print(f"[row {i}] Low confusion score (<=50), skipping intersection.")
            df.iat[i, out_pos] = json.dumps([])
            continue

        # 2) missing inputs → empty intersection
        if pd.isnull(mentioned_cands) or pd.isnull(gpt_cands):
            print(f"[row {i}] Missing inputs, skipping intersection.")
            df.iat[i, out_pos] = json.dumps([])
            continue

        # 3) parse inputs safely
        mentioned_list = _parse_list_cell(mentioned_cands)

        # gpt_cands is a list of dicts; extract 'answer'
        gpt_cands_list_raw = _parse_list_cell(gpt_cands)
        if all(isinstance(x, dict) and 'answer' in x for x in gpt_cands_list_raw):
            ideal_list = [x['answer'] for x in gpt_cands_list_raw]
        else:
            ideal_list = []

        # empty inputs → empty intersection (precision/recall downstream will be 0)
        if len(mentioned_list) == 0:
            print(f"[row {i}] Empty mentioned list, skipping intersection.")
            df.iat[i, out_pos] = json.dumps([])
            continue

        if len(ideal_list) == 0:
            print(f"[row {i}] Empty ideal list, skipping intersection.")
            df.iat[i, out_pos] = json.dumps([])
            continue

        # 4) call GPT for semantic intersection.
        # Reset per row so an unexpected reply can never reuse the previous row's
        # matches (or hit an undefined name on the first row).
        intersection = []
        try:
            user_prompt = f"List A: {mentioned_list}\nList B: {ideal_list}"
            resp = get_api_response(INTERSECTION_SYSTEM_PROMPT, user_prompt, reasoning_effort="medium")

            parsed = json.loads(resp)
            # Accept both {"matches": [...]} and a bare JSON array of matches.
            matches = parsed.get('matches') if isinstance(parsed, dict) else parsed
            if isinstance(matches, list):
                # rename keys: {"A": ..., "B": ...} → {"mentioned": ..., "ideal": ...}
                intersection = [
                    {"mentioned": match["A"], "ideal": match["B"]}
                    for match in matches
                    if isinstance(match, dict) and 'A' in match and 'B' in match
                ]
            else:
                print(f"[row {i}] Unexpected intersection response format; storing empty intersection.")

        except Exception as e:
            print(f"[row {i}] GPT intersection error: {e}")
            intersection = []

        # 5) write per-row result
        print(f"Mentioned List: {mentioned_list}\nIdeal List: {ideal_list}\nIntersection: {intersection}\n****************************")
        df.iat[i, out_pos] = json.dumps(intersection)
        filled += 1

        # 6) checkpoint (same file as the final save, which overwrites it)
        if checkpoint_interval and filled % checkpoint_interval == 0:
            df.to_csv(out_df_path, index=False)
            print(f"Checkpoint saved after {filled} intersections → {out_df_path}")

    # final save
    df.to_csv(out_df_path, index=False)
    print(f"Final saved with intersections → {out_df_path}")
    return df

In [17]:
# Compute the intersections for every row and save them; as the last expression of
# the cell, the returned frame is also displayed.
get_intersections(df, out_df_path="../../data/eval_results/gpt/100q_with_gpt_intersections.csv")

Mentioned List: ['Chris Hoy']
Ideal List: ['Sir Steve Redgrave', 'Sir Chris Hoy', 'Sir Bradley Wiggins', 'Sebastian Coe', 'Dame Kelly Holmes', 'Sir Matthew Pinsent']
Intersection: [{'mentioned': 'Chris Hoy', 'ideal': 'Sir Chris Hoy'}]
****************************
[row 64] Empty mentioned list, skipping intersection.
[row 65] Empty mentioned list, skipping intersection.
[row 66] Empty mentioned list, skipping intersection.
[row 67] Empty mentioned list, skipping intersection.
[row 68] Empty mentioned list, skipping intersection.
[row 69] Empty mentioned list, skipping intersection.
[row 70] Empty mentioned list, skipping intersection.
[row 71] Empty mentioned list, skipping intersection.
[row 72] Empty mentioned list, skipping intersection.
[row 73] Empty mentioned list, skipping intersection.
[row 74] Empty mentioned list, skipping intersection.
[row 75] Empty mentioned list, skipping intersection.
[row 76] Empty mentioned list, skipping intersection.
[row 77] Empty mentioned list, ski

Unnamed: 0,id,prompt_variant,model,question,gold_answer,candidate_answers,llm_response,response_length,accuracy,simple_conf,...,adv_conf,mentioned_cands,simple_metric,adv_metric,just_reward,num_mentions,gpt_conf_score,gpt_conf_explanation,gpt_cands,gpt_cands_intersection
0,trivia_5819,baseline,llama-3.1-8b,"""Feel Like Making Love"" and ""The First Time Ev...",Roberta Flack,Aretha Franklin (60)|Randy Crawford (55)|Glady...,"The female artist who had hit singles with ""Fe...",126,True,True,...,False,[],0.000000,1.000000,0.000000,0,47,Both songs have many covers and “Feel Like Mak...,,[]
1,trivia_5819,baseline,qwen-2.5-72b,"""Feel Like Making Love"" and ""The First Time Ev...",Roberta Flack,Aretha Franklin (60)|Randy Crawford (55)|Glady...,"The hit singles ""Feel Like Making Love"" and ""T...",471,True,True,...,False,[],0.000000,1.000000,0.000000,0,47,Both songs have many covers and “Feel Like Mak...,,[]
2,trivia_5819,baseline,llama-3.3-70b,"""Feel Like Making Love"" and ""The First Time Ev...",Roberta Flack,Aretha Franklin (60)|Randy Crawford (55)|Glady...,"The female artist who had hit singles with ""Fe...",126,True,True,...,False,[],0.000000,1.000000,0.000000,0,47,Both songs have many covers and “Feel Like Mak...,,[]
3,trivia_5819,baseline,gpt-4o,"""Feel Like Making Love"" and ""The First Time Ev...",Roberta Flack,Aretha Franklin (60)|Randy Crawford (55)|Glady...,"""Feel Like Makin' Love"" and ""The First Time Ev...",101,True,True,...,False,[],0.000000,1.000000,0.000000,0,47,Both songs have many covers and “Feel Like Mak...,,[]
4,trivia_5819,clarify-doubts,llama-3.1-8b,"""Feel Like Making Love"" and ""The First Time Ev...",Roberta Flack,Aretha Franklin (60)|Randy Crawford (55)|Glady...,"The female artist who had the hit singles ""Fee...",125,True,True,...,False,[],0.000000,1.000000,0.000000,0,47,Both songs have many covers and “Feel Like Mak...,,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1856,nq_1529,cfe-unrestricted,gpt-4o,who turns into a bear in the hobbit?,Beorn,Gwaihir (22)|Gandalf (20)|Smaug (18)|Radagast ...,"In ""The Hobbit,"" the character who can transfo...",756,True,False,...,False,"['Gandalf', 'Thorin Oakenshield']",0.801781,0.801781,0.233773,2,24,In The Hobbit there’s essentially one notable ...,,[]
1857,nq_1529,cfe,llama-3.1-8b,who turns into a bear in the hobbit?,Beorn,Gwaihir (22)|Gandalf (20)|Smaug (18)|Radagast ...,Beorn.\n\nPlausible but incorrect alternative ...,167,True,False,...,False,[],1.000000,1.000000,0.000000,0,24,In The Hobbit there’s essentially one notable ...,,[]
1858,nq_1529,cfe,qwen-2.5-72b,who turns into a bear in the hobbit?,Beorn,Gwaihir (22)|Gandalf (20)|Smaug (18)|Radagast ...,"Beorn turns into a bear in ""The Hobbit.""\n\nCl...",393,True,False,...,False,['Gandalf'],0.917757,0.917757,0.220022,1,24,In The Hobbit there’s essentially one notable ...,,[]
1859,nq_1529,cfe,llama-3.3-70b,who turns into a bear in the hobbit?,Beorn,Gwaihir (22)|Gandalf (20)|Smaug (18)|Radagast ...,Beorn.,6,True,False,...,False,[],1.000000,1.000000,0.000000,0,24,In The Hobbit there’s essentially one notable ...,,[]


In [19]:
import ast, json, numpy as np, pandas as pd

def _load_list_field(raw):
    """Parse a serialized cell, trying JSON before Python-literal syntax.

    JSON is what ``json.dumps`` writes (and may contain ``null``); the literal
    fallback covers cells such as ``"['a', 'b']"``. The parsed value is returned
    as-is (callers check its type). If neither parser accepts the input, the
    exception from ``ast.literal_eval`` propagates.
    """
    try:
        return json.loads(raw)
    except (TypeError, ValueError):
        return ast.literal_eval(raw)


def _count_distinct_matches(matches, key, legacy_key):
    """Count how many distinct names appear under ``key`` (or ``legacy_key``) in ``matches``.

    Entries without a usable string name are counted once each, so malformed
    entries still contribute the way a plain ``len()`` would.
    """
    seen = set()
    for pos, match in enumerate(matches):
        name = None
        if isinstance(match, dict):
            name = match.get(key, match.get(legacy_key))
        elif isinstance(match, str):
            name = match
        seen.add(name if isinstance(name, str) else ("unnamed", pos))
    return len(seen)


def _score_row(idx, row):
    """Return ``(precision, recall, f1)`` for one row; NaNs when the row is skipped or fails."""
    undefined = (np.nan, np.nan, np.nan)

    # Skip non-confusing questions
    if 'gpt_conf_score' in row and pd.notnull(row['gpt_conf_score']) and row['gpt_conf_score'] <= 50:
        return undefined

    mentioned_cands = row.get('mentioned_cands')
    gpt_cands = row.get('gpt_cands')
    intersection = row.get('gpt_cands_intersection')
    if pd.isnull(mentioned_cands) or pd.isnull(gpt_cands) or pd.isnull(intersection):
        return undefined

    try:
        # model-mentioned list
        mentioned_list = _load_list_field(mentioned_cands)
        if not isinstance(mentioned_list, list):
            mentioned_list = []

        # ideal (GPT) candidates: list of dicts with an 'answer' key
        ideal_parsed = _load_list_field(gpt_cands)
        if isinstance(ideal_parsed, list) and all(isinstance(x, dict) and 'answer' in x for x in ideal_parsed):
            ideal_list = [x['answer'] for x in ideal_parsed]
        else:
            ideal_list = []

        # intersections: list of {"mentioned": "...", "ideal": "..."}
        matches = _load_list_field(intersection)
        if isinstance(matches, dict) and "matches" in matches:
            matches = matches["matches"]
        if not isinstance(matches, list):
            matches = []

        # Count each side separately and never above the list size: two mentions
        # mapped to the same ideal answer recover ONE ideal answer, not two.
        hit_mentioned = min(_count_distinct_matches(matches, "mentioned", "A"), len(mentioned_list))
        hit_ideal = min(_count_distinct_matches(matches, "ideal", "B"), len(ideal_list))

        if len(mentioned_list) == 0:
            precision = np.nan  # undefined (no mentions)
            recall = 0.0
        else:
            precision = hit_mentioned / len(mentioned_list)
            recall = hit_ideal / len(ideal_list) if len(ideal_list) > 0 else 0.0

        # F1 is reported as 0.0 whenever precision is undefined or both terms are zero.
        if np.isnan(precision) or (precision + recall) <= 0:
            f1_score = 0.0
        else:
            f1_score = 2 * (precision * recall) / (precision + recall)

        return precision, recall, f1_score

    except Exception as e:
        print(f"Error calculating metrics for row {idx}: {e}")
        return undefined


def calculate_metrics(df, out_df_path="../../data/eval_results/gpt/100q_with_gpt_f1_scores.csv"):
    """Add precision / recall / F1 of mentioned candidates against the ideal GPT set.

    Per row:
      * rows with ``gpt_conf_score`` <= 50, with a missing input, or whose cells
        cannot be parsed get NaN for all three metrics;
      * precision = distinct matched mentions / number of mentions
        (NaN when nothing was mentioned);
      * recall = distinct matched ideal answers / number of ideal answers
        (0.0 when nothing was mentioned or the ideal list is empty);
      * F1 is the harmonic mean, or 0.0 when it is undefined.

    Both ratios are bounded by 1 even when the intersection holds many-to-one
    matches.

    Args:
        df: Frame with ``mentioned_cands``, ``gpt_cands`` and
            ``gpt_cands_intersection`` columns (and optionally ``gpt_conf_score``).
        out_df_path: CSV path the augmented frame is written to.

    Returns:
        The same ``df`` with ``gpt_precision``, ``gpt_recall`` and
        ``gpt_f1_score`` columns added.
    """
    scores = [_score_row(idx, row) for idx, row in df.iterrows()]

    df['gpt_precision'] = [s[0] for s in scores]
    df['gpt_recall'] = [s[1] for s in scores]
    df['gpt_f1_score'] = [s[2] for s in scores]
    df.to_csv(out_df_path, index=False)
    print(f"Metrics calculation completed and saved to {out_df_path}.")
    return df


In [20]:
# Reload the frame saved by get_intersections so the metrics are computed from the
# CSV on disk rather than from in-memory state.
df = pd.read_csv("../../data/eval_results/gpt/100q_with_gpt_intersections.csv")

In [21]:
# Add the gpt_precision / gpt_recall / gpt_f1_score columns and save the result; the
# returned frame is displayed as the cell output.
calculate_metrics(df, out_df_path="../../data/eval_results/gpt/100q_with_gpt_f1_scores.csv")

Metrics calculation completed and saved to ../../data/eval_results/gpt/100q_with_gpt_f1_scores.csv.


Unnamed: 0,id,prompt_variant,model,question,gold_answer,candidate_answers,llm_response,response_length,accuracy,simple_conf,...,adv_metric,just_reward,num_mentions,gpt_conf_score,gpt_conf_explanation,gpt_cands,gpt_cands_intersection,gpt_precision,gpt_recall,gpt_f1_score
0,trivia_5819,baseline,llama-3.1-8b,"""Feel Like Making Love"" and ""The First Time Ev...",Roberta Flack,Aretha Franklin (60)|Randy Crawford (55)|Glady...,"The female artist who had hit singles with ""Fe...",126,True,True,...,1.000000,0.000000,0,47,Both songs have many covers and “Feel Like Mak...,,[],,,
1,trivia_5819,baseline,qwen-2.5-72b,"""Feel Like Making Love"" and ""The First Time Ev...",Roberta Flack,Aretha Franklin (60)|Randy Crawford (55)|Glady...,"The hit singles ""Feel Like Making Love"" and ""T...",471,True,True,...,1.000000,0.000000,0,47,Both songs have many covers and “Feel Like Mak...,,[],,,
2,trivia_5819,baseline,llama-3.3-70b,"""Feel Like Making Love"" and ""The First Time Ev...",Roberta Flack,Aretha Franklin (60)|Randy Crawford (55)|Glady...,"The female artist who had hit singles with ""Fe...",126,True,True,...,1.000000,0.000000,0,47,Both songs have many covers and “Feel Like Mak...,,[],,,
3,trivia_5819,baseline,gpt-4o,"""Feel Like Making Love"" and ""The First Time Ev...",Roberta Flack,Aretha Franklin (60)|Randy Crawford (55)|Glady...,"""Feel Like Makin' Love"" and ""The First Time Ev...",101,True,True,...,1.000000,0.000000,0,47,Both songs have many covers and “Feel Like Mak...,,[],,,
4,trivia_5819,clarify-doubts,llama-3.1-8b,"""Feel Like Making Love"" and ""The First Time Ev...",Roberta Flack,Aretha Franklin (60)|Randy Crawford (55)|Glady...,"The female artist who had the hit singles ""Fee...",125,True,True,...,1.000000,0.000000,0,47,Both songs have many covers and “Feel Like Mak...,,[],,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1856,nq_1529,cfe-unrestricted,gpt-4o,who turns into a bear in the hobbit?,Beorn,Gwaihir (22)|Gandalf (20)|Smaug (18)|Radagast ...,"In ""The Hobbit,"" the character who can transfo...",756,True,False,...,0.801781,0.233773,2,24,In The Hobbit there’s essentially one notable ...,,[],,,
1857,nq_1529,cfe,llama-3.1-8b,who turns into a bear in the hobbit?,Beorn,Gwaihir (22)|Gandalf (20)|Smaug (18)|Radagast ...,Beorn.\n\nPlausible but incorrect alternative ...,167,True,False,...,1.000000,0.000000,0,24,In The Hobbit there’s essentially one notable ...,,[],,,
1858,nq_1529,cfe,qwen-2.5-72b,who turns into a bear in the hobbit?,Beorn,Gwaihir (22)|Gandalf (20)|Smaug (18)|Radagast ...,"Beorn turns into a bear in ""The Hobbit.""\n\nCl...",393,True,False,...,0.917757,0.220022,1,24,In The Hobbit there’s essentially one notable ...,,[],,,
1859,nq_1529,cfe,llama-3.3-70b,who turns into a bear in the hobbit?,Beorn,Gwaihir (22)|Gandalf (20)|Smaug (18)|Radagast ...,Beorn.,6,True,False,...,1.000000,0.000000,0,24,In The Hobbit there’s essentially one notable ...,,[],,,
