In [1]:
import pandas as pd
import openai
from dotenv import load_dotenv
import os, json, random, re
import matplotlib.pyplot as plt
import ast
import numpy as np
import re
from typing import List, Optional, Tuple
load_dotenv()

# Set the OpenAI API key
OpenAI_Client = openai.OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
)
EVAL_MODEL = "gpt-5"

In [15]:
def parse_candidate_scores(cands_str: str) -> List[float]:
    """
    Parse 'cand_1 (score)|cand_2 (score)|...' -> [score1, score2, ...] as floats.
    Ignores segments that don't have a '(number)'.
    """
    if pd.isna(cands_str):
        return []
    parts = [p.strip() for p in str(cands_str).split('|') if p.strip()]
    scores: List[float] = []
    pat = re.compile(r'\(([-+]?\d*\.?\d+)\)')  # capture the number inside the last parentheses
    for part in parts:
        m = pat.findall(part)
        if m:
            scores.append(float(m[-1]))
    return scores

# ---------- knee / drop utilities ----------

def knee_index_point_before_largest_drop(raw_scores: List[float]) -> Optional[int]:
    """
    Given raw candidate scores, sort them DESC and return the index i such that
    the drop between sorted[i] and sorted[i+1] is maximal. If ties, return the first.
    Returns 0 for single-element lists. Returns None for empty lists.
    """
    if not raw_scores:
        return None
    s = sorted(raw_scores, reverse=True)
    if len(s) == 1:
        return 0
    drops = [s[i] - s[i+1] for i in range(len(s)-1)]
    # tie-breaker: first largest drop (lowest i)
    i_max = max(range(len(drops)), key=lambda i: drops[i])
    return i_max  # "point before" means we include index i in the mass

# ---------- CI computation (per-row) ----------

def compute_ci_from_scores(raw_scores: List[float], scale: float = 1000.0) -> float:
    """
    CI = (M * S) / scale
      - S: RAW sum of all candidate scores (0..100 each; with 10 cands max S is 1000)
      - M: sum of NORMALIZED scores (score/S) up to and including the knee index
    Returns np.nan if scores are empty or sum is zero.
    """
    if not raw_scores:
        return np.nan
    S = float(sum(raw_scores))
    if S <= 0:
        return np.nan

    k = knee_index_point_before_largest_drop(raw_scores)
    if k is None:
        return np.nan

    # normalize by total RAW sum S (i.e., turn into a probability mass over candidates)
    s_desc = sorted(raw_scores, reverse=True)
    norm_desc = [x / S for x in s_desc]
    M = float(sum(norm_desc[:k+1]))  # include the knee index itself

    CI = (M * S) / float(scale)
    return CI

def compute_ci_from_str(cands_str: str, scale: float = 1000.0) -> float:
    return compute_ci_from_scores(parse_candidate_scores(cands_str), scale=scale)

# ---------- simple filtering ----------

def simple_filtering(df: pd.DataFrame,
                     cand_col: str = "candidate_answers",
                     threshold: float = 50.0) -> pd.DataFrame:
    """
    Adds boolean column 'simple_conf' and returns a *copy* of df.
    - If ge50_means_nonconf=True (your latest instruction): simple_conf = False when any score >= 50.
      i.e., >=50 => NON-confusing
    - If ge50_means_nonconf=False (earlier slide): simple_conf = True when any score >= 50.
      i.e., >=50 => confusing
    """
    def flag(cands_str: str) -> bool:
        scores = parse_candidate_scores(cands_str)
        print(f"{scores}")
        return scores[0] > threshold

    out = df.copy()
    out["simple_conf"] = out[cand_col].apply(flag)
    return out

# ---------- global CI threshold (precompute once) ----------

def compute_ci_series(df: pd.DataFrame,
                      cand_col: str = "candidate_answers",
                      scale: float = 1000.0) -> pd.Series:
    """
    Compute CI per row; returns a float Series (index aligned with df).
    """
    return df[cand_col].apply(lambda s: compute_ci_from_str(s, scale=scale))

def knee_threshold_over_ci(ci_series: pd.Series) -> float:
    """
    Sort CI values DESC, find the point before the largest drop, and return that CI value as threshold.
    Ties -> pick the first (earliest) in the DESC list.
    Returns np.nan if not enough values.
    """
    vals = ci_series.dropna().sort_values(ascending=False).tolist()
    if not vals:
        return np.nan
    if len(vals) == 1:
        return vals[0]
    drops = [vals[i] - vals[i+1] for i in range(len(vals)-1)]
    i_max = max(range(len(drops)), key=lambda i: drops[i])
    return vals[i_max]  # the value *before* the largest drop

# ---------- advanced filtering ----------

def advanced_filtering(df: pd.DataFrame,
                       ci_threshold: float,
                       cand_col: str = "candidate_answers",
                       scale: float = 1000.0,
                       add_ci_column: bool = True) -> pd.DataFrame:
    """
    Adds boolean column 'adv_conf' using global ci_threshold and returns a *copy* of df.
    Also adds a 'CI' column if add_ci_column=True.
    Rule: adv_conf = (CI >= ci_threshold)
    """
    CIs = compute_ci_series(df, cand_col=cand_col, scale=scale)
    out = df.copy()
    if add_ci_column:
        out["CI"] = CIs
    out["adv_conf"] = CIs >= ci_threshold
    return out

In [3]:
df = pd.read_csv("../data/results/test_100q_results.csv")
print("Loaded test data with", len(df), "rows.")

Loaded test data with 45 rows.


### Accuracy Filtering

In [4]:
def evaluate_accuracy(question: str, gold: str, response: str):
    system_prompt = """
    You are a helpful assistant that evaluates the correctness of model answers that are given to different questions.
    Your task is to determine if the model's answer correctly contains the gold answer.
    You will be given a question, a gold answer(actually correct answer), and a model answer.
    Your response should be in the following format:
    In one sentence, explain how the model answer compares to the gold answer.
    On a new line, output exactly YES if the model answer correctly contains the gold answer, otherwise NO.
    IF YOU THINK THAT GIVEN GOLD ANSWER IS NOT CORRECT, YOU SHOULD FIRST PROVIDE AN EXPLANATION WHY IT IS NOT CORRECT AND THEN OUTPUT N/A.
    """ 
    user_prompt = f"""
    Question: {question}
    Gold answer: "{gold}"
    Model answer: "{response}"

    Is the model answer correct based on the gold answer? Only output YES or NO.
    """
    res = OpenAI_Client.chat.completions.create(
        model=EVAL_MODEL,
        messages=[
            # we will not use the system prompt for this basic task
            {"role": "user", "content": user_prompt}]
                    )
    verdict = res.choices[0].message.content.strip()
    print(f"Question: {question}\nGold: {gold}\nModel Response: {response}\nVerdict: {verdict} - {verdict == 'YES'}")
    return verdict == "YES"

df[['accuracy']] = df.apply(
    lambda r: pd.Series(
        evaluate_accuracy(r['question'], r['gold_answer'], r['llm_response'])
    ),
    axis=1
)

Question: when was the last year the eagles went to the superbowl?
Gold: 2017
Model Response: The last time the Philadelphia Eagles went to the Super Bowl was in 2018. They played in Super Bowl LII (52) against the New England Patriots at U.S. Bank Stadium in Minneapolis, Minnesota. The Eagles won that game with a score of 41-33.
Verdict: NO - False
Question: when was the last year the eagles went to the superbowl?
Gold: 2017
Model Response: A great question for all the Eagles fans out there!

The Philadelphia Eagles last appeared in the Super Bowl in 2023, when they played in Super Bowl LVII (57) against the Kansas City Chiefs on February 12, 2023. Unfortunately, they lost the game 38-35.

However, if you're thinking of a more recent or specific year, I'd be happy to try and help you with that!
Verdict: NO - False
Question: when was the last year the eagles went to the superbowl?
Gold: 2017
Model Response: The Philadelphia Eagles last went to the Super Bowl in the 2022 season, which w

In [5]:
df.to_csv("../data/results/test_100q_results.csv", index=False)

In [11]:

df_acc = df[df["accuracy"] == True].copy()
print("Filtered to", len(df_acc), "rows with accuracy == True.")


Filtered to 28 rows with accuracy == True.


### Confusability Filtering

In [16]:

# # simple filtering (latest spec: >=50 => NON-confusing, so confusing=True when NOT any >=50)
df_acc = simple_filtering(df_acc, cand_col="candidate_answers", threshold=50.0)
print(f"After simple filtering, confusing rows: {len(df_acc[df_acc['simple_conf'] == True])}, non-confusing rows: {len(df_acc[df_acc['simple_conf'] == False])}")

[80.0, 60.0, 55.0, 50.0, 45.0, 40.0, 30.0, 25.0, 20.0, 10.0]
[20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 8.0, 6.0, 5.0, 4.0]
[20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 8.0, 6.0, 5.0, 4.0]
[20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 8.0, 6.0, 5.0, 4.0]
[20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 8.0, 6.0, 5.0, 4.0]
[20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 8.0, 6.0, 5.0, 4.0]
[20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 8.0, 6.0, 5.0, 4.0]
[20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 8.0, 6.0, 5.0, 4.0]
[20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 8.0, 6.0, 5.0, 4.0]
[20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 8.0, 6.0, 5.0, 4.0]
[20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 8.0, 6.0, 5.0, 4.0]
[20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 8.0, 6.0, 5.0, 4.0]
[20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 8.0, 6.0, 5.0, 4.0]
[22.0, 20.0, 18.0, 15.0, 12.0, 10.0, 8.0, 6.0, 5.0, 4.0]
[22.0, 20.0, 18.0, 15.0, 12.0, 10.0, 8.0, 6.0, 5.0, 4.0]
[22.0, 20.0, 18.0, 15.0, 12.0, 10.0, 8.0, 6.0, 5.0, 4.0]
[22.0, 20.0, 18.0, 15.0, 12.0, 10.0, 8.0, 6.0, 5.0, 4.0]
[22.0, 20.0, 18.0, 15.0, 12.0, 10.0, 8.

In [24]:
# load the whole json file
with open("../data/questions/1200_verified_questions.json", "r", encoding="utf-8") as f:
    global_questions = json.load(f)
from tqdm import tqdm
rows = []
for q in tqdm(global_questions, desc="questions"):
        question_text   = q["question"]
        correct_answer  = q["answer"]
        cand_dict = q.get("candidate_answers", {})
        # sort answer texts by their listwise score descending
        candidate_ans = sorted(
            cand_dict.keys(),
            key=lambda ans: cand_dict[ans].get("listwise", 0),
            reverse=True
        )
        # add listwise scores to the candidates
        candidate_ans_scores = [
            f"{ans} ({cand_dict[ans].get('listwise', 0)})"
            for ans in candidate_ans
        ]

        user_prompt = (
            f"{question_text}"
        )
                
        rows.append(
            {
                "id"               : q["id"],
                "question"         : question_text,
                "gold_answer"      : correct_answer,
                "candidate_answers": "|".join(candidate_ans),
            }
        )
df_global = pd.DataFrame(rows)
print("Loaded global questions with", len(df_global), "rows.")



questions: 100%|██████████| 1266/1266 [00:00<00:00, 90364.33it/s]

Loaded global questions with 1266 rows.





In [27]:
ci_series = compute_ci_series(df_global, cand_col="candidate_answers", scale=1000.0)
ci_threshold = knee_threshold_over_ci(ci_series)
print(f"Computed CI threshold: {ci_threshold}")
print(ci_series)

Computed CI threshold: 1.2
0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
1261   NaN
1262   NaN
1263   NaN
1264   NaN
1265   NaN
Name: candidate_answers, Length: 1266, dtype: float64


In [None]:
df_acc = advanced_filtering(df_acc, ci_threshold=ci_threshold, cand_col="candidate_answers", scale=1000.0, add_ci_column=True)
print(f"After advanced filtering, confusing rows: {len(df_acc[df_acc['adv_conf'] == True])}, non-confusing rows: {len(df_acc[df_acc['adv_conf'] == False])}")

In [12]:
import pandas as pd
path = "../data/eval_results/100q_results_ambigious.csv"
df = pd.read_csv(path)
# change the name of the column 1st_layer_score and 1st_layer_explanation to gpt_conf_score and gpt_conf_explanation
df.rename(columns={"1st_layer_score": "gpt_conf_score", "1st_layer_explanation": "gpt_conf_explanation"}, inplace=True)
df.to_csv(path, index=False)

In [4]:
df_2st_gpt = pd.read_csv("../data/results/100q_with_gpt_candidates.csv")
unique_questions = df_2st_gpt['question'].nunique()
print(f"Number of unique questions in the first GPT evaluation results: {unique_questions}")
df_2st_gpt.info()

Number of unique questions in the first GPT evaluation results: 88
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1861 entries, 0 to 1860
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     1861 non-null   object 
 1   prompt_variant         1861 non-null   object 
 2   model                  1861 non-null   object 
 3   question               1861 non-null   object 
 4   gold_answer            1861 non-null   object 
 5   candidate_answers      1861 non-null   object 
 6   llm_response           1861 non-null   object 
 7   response_length        1861 non-null   int64  
 8   human_score            0 non-null      float64
 9   accuracy               1861 non-null   bool   
 10  simple_conf            1861 non-null   bool   
 11  CI                     1861 non-null   float64
 12  adv_conf               1861 non-null   bool   
 13  mentioned_cands        1861 non-null   ob

In [1]:
import pandas as pd
import openai
from dotenv import load_dotenv
import os, json, random, re
import ast
import numpy as np
load_dotenv()

# Set the OpenAI API key
OpenAI_Client = openai.OpenAI(
    default_headers={
        "HTTP-Referer": "cfe-paper",
    },
    #base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENAI_API_KEY"),   
)

OpenRouter_Client = openai.OpenAI(
    default_headers={
        "HTTP-Referer": "cfe-paper",
    },
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),   
)
EVAL_MODEL = "openai/gpt-5-2025-08-07"

In [2]:
def get_api_response(system_prompt, user_prompt, model=EVAL_MODEL, reasoning_effort="medium"):
    res = OpenRouter_Client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
            ],
            response_format={
            "type": "text"
        },
        reasoning_effort=reasoning_effort)
    
    response_text = res.choices[0].message.content.strip()
    return response_text

In [3]:
# getting the intersection of mentioned_cands and gpt_candidate_answers 
# by based on the phrasing of the options of gpt_candidate_answers
INTERSECTION_SYSTEM_PROMPT = """
You are an expert semantic evaluator.
Your task is to find the semantic intersection between two lists of entity names. 
You will be provided with List A and List B.
### Instructions:
- Identify entities that are semantically equivalent between the two lists.
- We consider entities to be semantically equivalent if they are the abbreviated, synonymous, or slight variations of each other that represent the same thing.
- Return a JSON array containing the names of the entities that are present in both lists based on semantic similarity.
- Ensure that the output is a valid JSON array.
### Output format:
[
  "<entity name 1>",
  "<entity name 2>",
  ...
]
"""