# Import Libraries, Setup the Environment and Data

In [1]:
import pandas as pd
import openai
from dotenv import load_dotenv
import os, json, random, re
import ast
import numpy as np
load_dotenv()

# Set the OpenAI API key
# The key is read from the OPENAI_API_KEY environment variable; load_dotenv()
# above lets a local .env file provide it. os.getenv returns None when unset.
OpenAI_Client = openai.OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
)
# Model name used by evaluate_accuracy for the accuracy-judging calls.
EVAL_MODEL = "gpt-5"

## Load the Data

In [5]:
# Load the test results from a CSV file
# `path` is reused by later cells as the destination of their to_csv() calls.
# NOTE(review): this cell binds the frame to `df_acc`, while the accuracy cells
# below read a frame named `df` that no visible cell defines — confirm which
# cell is expected to create `df`.
path = "../data/results/100q_results_filtered.csv"
df_acc = pd.read_csv(path)

# Metric Pipeline

## Accuracy Filtering

In [3]:
def evaluate_accuracy(question: str, gold: str, response: str) -> bool:
    """
    Ask the judge model whether `response` is correct with respect to `gold`.

    A single user message containing the question, the gold answer and the model
    answer is sent to EVAL_MODEL through OpenAI_Client. The inputs and the raw
    verdict are printed, and the function returns True only for a YES verdict.

    Args:
        question: The question text.
        gold: The reference (gold) answer.
        response: The model answer being judged.

    Returns:
        True when the judge's reply is YES, otherwise False. The comparison
        ignores case, surrounding quotes/markdown emphasis and trailing
        punctuation; an empty or missing reply counts as False.
    """
    # No system prompt is sent for this basic task; the instruction lives in
    # the user message itself.
    user_prompt = f"""
    Question: {question}
    Gold answer: "{gold}"
    Model answer: "{response}"

    Is the model answer correct based on the gold answer? Only output YES or NO.
    """
    res = OpenAI_Client.chat.completions.create(
        model=EVAL_MODEL,
        messages=[{"role": "user", "content": user_prompt}],
    )
    # message.content can be None (e.g. an empty completion), so guard before
    # calling string methods on it.
    verdict = (res.choices[0].message.content or "").strip()
    # Accept "yes", "YES.", '"YES"', "**YES**" ... as a positive verdict.
    is_correct = verdict.strip(" \t\r\n.!\"'`*").upper() == "YES"
    print(f"Question: {question}\nGold: {gold}\nModel Response: {response}\nVerdict: {verdict} - {is_correct}")
    return is_correct

In [4]:
# Run the accuracy evaluation on the DataFrame
# Each row triggers one evaluate_accuracy call (one judge-model request). The
# boolean result is wrapped in a one-element Series so the row-wise apply yields
# a single-column frame, which is assigned to the 'accuracy' column.
# NOTE(review): `df` is not defined in any cell visible above — confirm where
# it is loaded before running this cell.
df[['accuracy']] = df.apply(
    lambda r: pd.Series(
        evaluate_accuracy(r['question'], r['gold_answer'], r['llm_response'])
    ),
    axis=1
)

Question: when was the last year the eagles went to the superbowl?
Gold: 2017
Model Response: The last year the Eagles went to the Super Bowl was 2018.
Verdict: NO - False
Question: when was the last year the eagles went to the superbowl?
Gold: 2017
Model Response: The correct answer is 2018.

Plausible but incorrect alternative answer: 2020. This is incorrect because while the Philadelphia Eagles did appear in Super Bowl LII (52) in 2018, they did not appear in Super Bowl LIV (54) in 2020.
Verdict: NO - False
Question: when was the last year the eagles went to the superbowl?
Gold: 2017
Model Response: The last year the Philadelphia Eagles went to the Super Bowl was 2022, where they played in Super Bowl LVII against the Kansas City Chiefs.

Clarification: It's important to note that while the game was played in 2023 (February 12, 2023), the NFL season for which they qualified and played is referred to as the 2022 season. Therefore, 2022 is the correct year to mention.
Verdict: NO - Fal

In [5]:
# Save the data frame to the same file with accuracy results
# This overwrites the CSV at `path`; index=False leaves the row index out of the file.
df.to_csv(path, index=False)

In [5]:
# Keep only the rows judged accurate; .copy() makes df_acc an independent frame
# so later column assignments do not write into a slice of df.
df_acc = df[df["accuracy"] == True].copy()
print("Filtered to", len(df_acc), "rows with accuracy == True out of", len(df), "total rows.")

Filtered to 408 rows with accuracy == True out of 408 total rows.


## Detect Mentioned Candidates in the Responses

In [7]:
def detect_mentions(question: str, response: str, candidates: str, gold_answer: str) -> list[str]:
    """
    Ask the model which incorrect candidate answers are mentioned in `response`.

    Args:
        question: The question text.
        response: The model response to inspect.
        candidates: Raw candidate string in the form
            "cand_1 (score)|cand_2 (score)|...". Only the names are sent to the model.
        gold_answer: The correct answer (the prompt tells the model to exclude it).

    Returns:
        A single list: the entries the model reports under "mentioned_cands".
        An empty list is returned when the reply is not valid JSON or does not
        contain that key, so one bad reply does not abort a whole DataFrame.apply run.
    """
    # candidates is a string and in this form "cand_1 (score)|cand_2 (score)|..."
    # we need them in this format: ["cand_1", "cand_2", ...]
    # rsplit keeps names that themselves contain " (" intact, e.g. "Foo (band) (20)".
    cands_arr = [cand.rsplit(" (", 1)[0] for cand in candidates.split("|")]
    system = "You are an assistant that will check different model responses and a set of ground truth candidate answers. " \
    "You will be given a question, a correct gold answer, INCORRECT candidate answers and a model response." \
    "The model responses will be to various trivial questions. Candidates will be the INCORRECT candidate answers for that question." \
    "In the response, the correct answer will be given already. But sometimes the model will also mention other plausible sounding " \
    "but incorrect answers. Your task is to find which of the ground truth candidates are mentioned in the model response." \
    "You will output an array: 'mentioned_cands' " \
    "So you will find the ground truth candidates that are mentioned in the response and put them in the array." \
    "If you can't find any suiting answers for this case, you should return an empty array." \
    "REMEMBER THAT DO NOT PUT THE GOLD ANSWER INTO ANY OUTPUT LIST. THAT IS NOT CONSIDERED AS AN INCORRECT CANDIDATE" \
    "DO NOT FORMAT THE JSON ARRAY IN MARKDOWN, JUST WRITE IT IN PLAIN STRING."
    user = f"""
Question: {question}
Gold Answer: {gold_answer}
Ground Candidates: {cands_arr}
Response: \"{response}\"

Return an array of strings in JSON format with one key:
- "mentioned_cands": a JSON array of mentioned answers from the response that are also in the ground candidates list.
"""
    res = OpenAI_Client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role":"system","content":system},
            {"role":"user","content":user}
        ],
        temperature=0,
        max_tokens=200,
    )
    raw = (res.choices[0].message.content or "").strip()
    # The prompt forbids markdown, but tolerate a ```json ... ``` fence anyway.
    if raw.startswith("```"):
        raw = re.sub(r"^```[a-zA-Z]*\s*|\s*```$", "", raw)
    # parse the assistant's JSON; a reply cut off by max_tokens is not valid JSON
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        parsed = None
    print(question)
    print(gold_answer)
    print(candidates)
    print(response)
    print(parsed)
    print("=" * 50)
    if parsed is None:
        print(f"Error: could not parse the model output as JSON: {raw!r}")
        return []
    # The model sometimes answers with a bare array instead of an object.
    if isinstance(parsed, list):
        return [str(item) for item in parsed]
    # check if the key "mentioned_cands" exists in the parsed JSON
    if not isinstance(parsed, dict) or "mentioned_cands" not in parsed:
        print("Error: 'mentioned_cands' key not found in the response.")
        return []
    mentioned = parsed["mentioned_cands"]
    return mentioned if isinstance(mentioned, list) else []

# Run detect_mentions once per row of df_acc (one model request per row) and
# store the returned list in the 'mentioned_cands' column, then overwrite the CSV.
# NOTE(review): the values are computed from df_acc but assigned onto df; pandas
# aligns the assignment on the index, so any df row missing from df_acc would
# get NaN — confirm this is intended.
df["mentioned_cands"] = df_acc.apply(
    lambda r: detect_mentions(r['question'], r['llm_response'], r['candidate_answers'], r['gold_answer']),
    axis=1
)
df.to_csv(path, index=False)



Who plays the role of Bubble in Absolutely Fabulous?
Jane Horrocks
Joanna Lumley (20)|Harriet Thorpe (18)|Julia Sawalha (15)|June Whitfield (12)|Miranda Richardson (10)|Phyllis Logan (9)|Kathy Burke (8)|Helena Bonham Carter (6)|Tilda Swinton (5)|Kristin Scott Thomas (4)
Jennifer Saunders plays the role of Edina Monsoon, but the character Bubble is actually played by Jane Horrocks.
{'mentioned_cands': []}
Who plays the role of Bubble in Absolutely Fabulous?
Jane Horrocks
Joanna Lumley (20)|Harriet Thorpe (18)|Julia Sawalha (15)|June Whitfield (12)|Miranda Richardson (10)|Phyllis Logan (9)|Kathy Burke (8)|Helena Bonham Carter (6)|Tilda Swinton (5)|Kristin Scott Thomas (4)
The role of Bubble in Absolutely Fabulous is played by Jane Horrocks. 

There are no widely recognized alternative actors who have played this role in the main series or films of Absolutely Fabulous, so no further clarifications are necessary.
{'mentioned_cands': []}
Who plays the role of Bubble in Absolutely Fabulous?


## Confusability Labeling

In [6]:
import re
from typing import List, Optional, Tuple, Dict

# ---------- parsing ----------

def parse_candidate_str(cand_str: str) -> List[Tuple[str, float]]:
    """
    Turn 'cand (score)|cand (score)|...' into a list of (cand, score) tuples.

    Each '|'-separated chunk is trimmed; empty chunks are ignored. The score is
    the trailing '(number)' of the chunk, or, failing that, a bare trailing
    number. Chunks with no trailing number are dropped. Candidate names may
    themselves contain parentheses. None / blank input yields [].
    """
    if cand_str is None:
        return []
    text = str(cand_str)
    if text.strip() == "":
        return []

    pairs: List[Tuple[str, float]] = []
    for raw_chunk in text.split("|"):
        chunk = raw_chunk.strip()
        if not chunk:
            continue
        # Preferred form: '(number)' closing the chunk; otherwise a naked number.
        match = re.search(r"\(([-+]?\d*\.?\d+)\)\s*$", chunk)
        if match is None:
            match = re.search(r"([-+]?\d*\.?\d+)\s*$", chunk)
        if match is None:
            continue
        label = chunk[:match.start()].strip()
        # A dangling '(' can remain when the closing ')' was missing.
        if label.endswith("("):
            label = label[:-1].rstrip()
        pairs.append((label, float(match.group(1))))
    return pairs

# ---------- breaking point ----------

def breaking_point_index(sorted_desc_scores: List[float]) -> int:
    """
    Find the position just before the steepest fall in a descending score list.

    The drop at position k is scores[k] - scores[k+1]. The index of the largest
    drop is returned; when several drops tie, the earliest one wins.
    Lists with fewer than two entries return 0, and so does a list whose
    entries are all equal (every drop is 0, so the first one is kept).
    """
    count = len(sorted_desc_scores)
    if count < 2:
        return 0

    best_idx = 0
    best_drop = sorted_desc_scores[0] - sorted_desc_scores[1]
    for idx in range(1, count - 1):
        drop = sorted_desc_scores[idx] - sorted_desc_scores[idx + 1]
        # Strict comparison keeps the first occurrence on ties.
        if drop > best_drop:
            best_idx, best_drop = idx, drop
    return best_idx

# ---------- CI from scores ----------

def ci_from_scores(raw_scores: List[float], s_max: float = 1000.0) -> Dict[str, object]:
    """
    Derive M, S and CI from raw candidate scores via the breaking-point rule.

    Steps:
      - drop None entries, convert to float, sort descending
      - locate the breaking point index i with breaking_point_index
      - S is the raw sum of ALL scores
      - M is the sum of score/S over positions 0..i (position i included)
      - CI = (M * S) / s_max

    Returns a dict with keys "M", "S", "CI", "break_idx" and "sorted_scores".
    With no usable scores, or when S <= 0, M/S/CI are all reported as 0.0.
    """
    ordered = sorted((float(v) for v in raw_scores if v is not None), reverse=True)
    if not ordered:
        return {"M": 0.0, "S": 0.0, "CI": 0.0, "break_idx": 0, "sorted_scores": []}

    cut = breaking_point_index(ordered)
    total = sum(ordered)  # raw sum over every candidate
    if total <= 0:
        return {"M": 0.0, "S": 0.0, "CI": 0.0, "break_idx": cut, "sorted_scores": ordered}

    # Share of the total mass held by the candidates up to and including the cut.
    head_share = sum([v / total for v in ordered][: cut + 1])
    confusability = (head_share * total) / s_max

    result: Dict[str, object] = {}
    result["M"] = float(head_share)
    result["S"] = float(total)
    result["CI"] = float(confusability)
    result["break_idx"] = int(cut)
    result["sorted_scores"] = ordered
    return result

# ---------- CI from CSV string column ----------

def ci_from_candidate_string(cand_str: str, s_max: float = 1000.0) -> Dict[str, object]:
    """
    Compute the CI metadata straight from a raw candidate string.

    `cand_str` is the 'cand (score)|cand (score)|...' text of a CSV row. It is
    parsed with parse_candidate_str and the scores are passed to ci_from_scores.
    The returned dict is the ci_from_scores result plus one extra key,
    "parsed_pairs_sorted": the (name, score) pairs ordered by score, highest first.
    """
    pairs = parse_candidate_str(cand_str)
    result = ci_from_scores([pair[1] for pair in pairs], s_max=s_max)

    # Keep the named pairs alongside the numbers, ranked like the scores.
    ranked = list(pairs)
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    result["parsed_pairs_sorted"] = ranked
    return result

# ---------- threshold from PlausibleQA-style JSON ----------

def ci_list_from_plausibleqa_json_items(items: List[dict], s_max: float = 1000.0) -> List[float]:
    """
    Compute one CI value per QA dict.

    Each item is expected to hold a "candidate_answers" mapping of
    candidate -> payload dict; the payload's "listwise" value is used as the raw
    score (0 when the key is absent). An item without "candidate_answers" is
    treated as having no candidates. The CI values are returned in input order.
    """
    def _ci_for(example: dict) -> float:
        candidates: dict = example.get("candidate_answers", {})
        listwise_scores = [float(info.get("listwise", 0)) for info in candidates.values()]
        return ci_from_scores(listwise_scores, s_max=s_max)["CI"]

    return [_ci_for(example) for example in items]

def breaking_threshold(values: List[float]) -> Dict[str, object]:
    """
    Pick a threshold at the breaking point of a collection of values.

    The values are converted to float and sorted descending; the breaking point
    index i comes from breaking_point_index. Returns a dict with "threshold"
    (the value at i), "index" (i) and "sorted" (the descending list).
    An empty input gives threshold 0.0 and index 0.
    """
    ordered = [float(v) for v in values]
    ordered.sort(reverse=True)
    if not ordered:
        return {"threshold": 0.0, "index": 0, "sorted": ordered}

    cut = breaking_point_index(ordered)
    return {"threshold": ordered[cut], "index": cut, "sorted": ordered}

def ci_threshold_from_plausibleqa_json(path: str, s_max: float = 1000.0) -> float:
    """
    Compute a global CI threshold from a PlausibleQA-style JSON file.

    The file at `path` is loaded as JSON, one CI is computed per item with
    ci_list_from_plausibleqa_json_items, and the threshold is the MEAN of those
    CI values (no knee / breaking-point detection is applied here).

    Side effects: prints the number of CIs, their mean and median, the full
    descending list of CIs, and how many CIs are >= the threshold.

    Args:
        path: Path of the JSON file to read (UTF-8).
        s_max: Normalization constant forwarded to the CI computation.

    Returns:
        The threshold as a plain Python float.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    cis = ci_list_from_plausibleqa_json_items(data, s_max=s_max)
    sorted_cis = sorted([float(v) for v in cis], reverse=True)
    print(f"Loaded {len(cis)} CIs from {path}.")
    print(f"Mean CI: {np.mean(cis):.4f}, Median CI: {np.median(cis):.4f}")
    print(len(cis))
    print(sorted_cis)
    # The threshold is the mean CI; convert from np.float64 so the value matches
    # the declared return type.
    bp = float(np.mean(cis))
    print(f"Number of confusing questions from general data: {len([ci for ci in cis if ci >= bp])}")

    return bp

# ---------- global CI threshold (precompute once) ----------

def compute_ci_series(df: pd.DataFrame,
                      cand_col: str = "candidate_answers",
                      scale: float = 1000.0) -> pd.Series:
    """
    Compute CI per row; returns a float Series (index aligned with df).

    Args:
        df: Frame holding the candidate strings.
        cand_col: Name of the 'cand (score)|cand (score)|...' column.
        scale: Normalization constant, forwarded to ci_from_candidate_string
            as its `s_max` argument.

    Returns:
        A float Series with one CI value per row of `df`.
    """
    # ci_from_candidate_string takes `s_max` (it has no `scale` keyword) and
    # returns a metadata dict, so pick out the "CI" entry for each row.
    ci_values = df[cand_col].apply(
        lambda s: ci_from_candidate_string(s, s_max=scale)["CI"]
    )
    return ci_values.astype(float)

def knee_point_of_ci_series(ci_series: List[float]) -> float:
    """
    Return the CI threshold using the Kneedle algorithm over the DESC-sorted CI values.

    - Sort CIs in descending order.
    - Run Kneedle (direction="decreasing") with convex/concave curves and
      interp1d/polynomial interpolation. If several settings yield a knee, pick
      the earliest index in the DESC list.
    - If no knee is found or kneed isn't installed, fall back to the point
      before the largest drop (ties -> earliest), via breaking_point_index.

    Args:
        ci_series: CI values in any order.

    Returns:
        The CI value at the chosen index; np.nan if there are no values; the
        single value itself when only one is given.
    """
    vals = sorted([float(v) for v in ci_series], reverse=True)
    n = len(vals)
    if n == 0:
        return float(np.nan)
    if n == 1:
        return float(vals[0])

    # kneed is an optional third-party dependency; without it only the
    # largest-drop fallback below is used.
    try:
        from kneed import KneeLocator
    except ImportError:
        KneeLocator = None

    # Try a couple of settings; collect any found knees and pick earliest
    candidate_knees = []
    if KneeLocator is not None:
        x = np.arange(n)
        # Typical for a DESC series with a sharp early drop is convex+decreasing
        for curve, interp in [
            ("convex", "interp1d"),
            ("convex", "polynomial"),
            ("concave", "interp1d"),
            ("concave", "polynomial"),
        ]:
            kl = KneeLocator(
                x, vals,
                curve=curve,
                direction="decreasing",
                interp_method=interp
            )
            if kl.knee is not None:
                candidate_knees.append(int(kl.knee))

    if candidate_knees:
        knee_idx = min(candidate_knees)  # earliest in DESC list
        return float(vals[knee_idx])

    # No knee found (or kneed missing): use the point before the largest drop
    # instead of silently returning None.
    return float(vals[breaking_point_index(vals)])

def _scores_from_cand_str(cand_str: str):
    """Return just the numeric scores parsed from a candidate string, in input order."""
    scores = []
    for _name, value in parse_candidate_str(cand_str):
        scores.append(value)
    return scores

def simple_labeling(df, cand_col="candidate_answers", threshold=50, out_col="simple_conf"):
    """
    Add a boolean column `out_col` to df (in place) and return df.

    A row is labelled confusing (`out_col` = True) when at least one of its
    candidate scores is >= `threshold`; otherwise it is labelled
    non-confusing (`out_col` = False). Scores are read from the
    'cand (score)|...' string in `cand_col`.
    """
    def _has_strong_candidate(cand_str: str) -> bool:
        # True as soon as one candidate reaches the threshold.
        for score in _scores_from_cand_str(cand_str):
            if score >= threshold:
                return True
        return False

    labels = df[cand_col].apply(_has_strong_candidate)
    df[out_col] = labels
    return df


# ---- CI computation into columns (if you haven't added them yet) ----

def add_ci_columns(df, cand_col="candidate_answers", s_max=1000.0,
                   ci_col="CI", m_col="M", s_col="S", bp_col="break_idx"):
    """
    Compute CI per row from the candidate string column and store it in `ci_col`.

    Args:
        df: Frame to extend; it is modified in place and also returned.
        cand_col: Name of the 'cand (score)|cand (score)|...' column.
        s_max: Normalization constant forwarded to ci_from_candidate_string.
        ci_col: Name of the column that receives the CI value.
        m_col, s_col, bp_col: Accepted for interface compatibility; only the CI
            column is written, so these names are currently unused.

    Returns:
        The same `df`, now containing `ci_col`.
    """
    # Assign the Series directly: wrapping the values in a DataFrame and
    # assigning through a list key fails on an empty frame (zero columns on
    # the right-hand side).
    df[ci_col] = df[cand_col].apply(
        lambda cand_str: ci_from_candidate_string(cand_str, s_max=s_max)["CI"]
    )
    return df


# ---- advanced labeling (uses precomputed CI threshold from your 1.2k JSON) ----

def advanced_labeling(df, ci_threshold: float, cand_col="candidate_answers",
                       s_max=1000.0, ci_col="CI", out_col="adv_conf"):
    """
    Label rows as confusing based on their CI and return df (modified in place).

    When `ci_col` is not yet a column of df it is computed first through
    add_ci_columns. `out_col` is then set to True where CI >= ci_threshold
    (higher CI means more confusing) and a summary line with both counts is printed.
    """
    ci_missing = ci_col not in df.columns
    if ci_missing:
        add_ci_columns(df, cand_col=cand_col, s_max=s_max, ci_col=ci_col)

    flags = df[ci_col] >= ci_threshold
    df[out_col] = flags

    n_true = int(flags.sum())
    n_false = int((~flags).sum())
    print(f"Advanced labeling: {n_true} rows are adv_conf == True and {n_false} are adv_conf == False")
    return df




In [12]:
# Load the merged results CSV; merged_df is used by the final cell, which adds
# a 'num_mentions' column and writes the frame back to merged_path.
merged_path = "../data/results/100q_results_all.csv"
merged_df = pd.read_csv(merged_path)

### Simple Labeling

In [7]:
# 1) simple labeling
# Adds the boolean 'simple_conf' column using simple_labeling's default threshold (50).
df_acc = simple_labeling(df_acc)
# simple_conf = True and simple_conf = False;
print(f"Out of {len(df_acc)} accuracy == True rows, {len(df_acc[df_acc['simple_conf']])} are simple_conf == True and {len(df_acc[~df_acc['simple_conf']])} are simple_conf == False")
# Overwrites the CSV at `path` with the labelled frame.
df_acc.to_csv(path, index=False)

Out of 1861 accuracy == True rows, 1061 are simple_conf == True and 800 are simple_conf == False


### Advanced Labeling

In [8]:
# Derive the global CI threshold from the verified-questions JSON. Note that the
# CI values are normalized with s_max=825.0 here rather than the 1000.0 default.
ci_threshold = ci_threshold_from_plausibleqa_json("../data/questions/1200_verified_questions.json", s_max=825.0)
print(f"Computed CI threshold from ~1200 verified questions: {ci_threshold}")


Loaded 1266 CIs from ../data/questions/1200_verified_questions.json.
Mean CI: 0.1084, Median CI: 0.0970
1266
[0.9151515151515152, 0.9066666666666666, 0.8181818181818182, 0.8, 0.7636363636363637, 0.7636363636363637, 0.7636363636363637, 0.7636363636363637, 0.7636363636363637, 0.7636363636363637, 0.7636363636363637, 0.7636363636363637, 0.7333333333333333, 0.7333333333333333, 0.7333333333333333, 0.7333333333333333, 0.7090909090909091, 0.696969696969697, 0.6787878787878788, 0.6787878787878788, 0.6545454545454545, 0.6545454545454545, 0.6545454545454545, 0.6545454545454545, 0.6484848484848484, 0.6363636363636364, 0.6363636363636364, 0.6363636363636364, 0.6363636363636364, 0.6036363636363636, 0.5818181818181818, 0.5454545454545454, 0.5272727272727272, 0.5151515151515151, 0.5151515151515151, 0.5103030303030303, 0.48484848484848486, 0.48484848484848486, 0.48484848484848486, 0.4666666666666667, 0.4666666666666667, 0.4666666666666667, 0.45454545454545453, 0.43636363636363634, 0.43636363636363634, 

In [9]:
# advanced
# Adds 'CI' (if missing) and the boolean 'adv_conf' column (CI >= ci_threshold).
# NOTE(review): ci_threshold was computed with s_max=825.0, while the per-row CI
# here is normalized with s_max=1000.0 — the two are on different scales;
# confirm whether both calls should use the same s_max.
df_acc = advanced_labeling(df_acc, ci_threshold=ci_threshold, cand_col="candidate_answers",
                            s_max=1000.0, ci_col="CI", out_col="adv_conf")


print(f"Out of {len(df_acc)} accuracy == True rows, {len(df_acc[df_acc['adv_conf']])} are adv_conf == True and {len(df_acc[~df_acc['adv_conf']])} are adv_conf == False")
# Overwrites the CSV at `path` with the labelled frame.
df_acc.to_csv(path, index=False)

Advanced labeling: 248 rows are adv_conf == True and 1613 are adv_conf == False
Out of 1861 accuracy == True rows, 248 are adv_conf == True and 1613 are adv_conf == False


# Rewards and Penalties

In [10]:
import ast
import re
import math
import pandas as pd
from typing import Dict, List, Tuple

# ----------------------------
# Helpers: parsing & canonicalization
# ----------------------------

# One "label (score)" chunk: lazy label, then a parenthesized (optionally
# negative / decimal) number that closes the chunk.
_cand_re = re.compile(r"\s*(?P<label>.+?)\s*\(\s*(?P<score>-?\d+(?:\.\d+)?)\s*\)\s*$")

def parse_candidate_scores(cand_str: str) -> Dict[str, float]:
    """
    Build a {label: score} mapping from a 'candidate_answers' string.

    Example: "Paris (87)|Lyon (42)|Marseille (15)" -> {"Paris": 87.0, "Lyon": 42.0, "Marseille": 15.0}
    Chunks that do not look like "label (number)" are skipped. A non-string or
    blank input gives an empty dict. When a label repeats, the last score wins.
    """
    if not isinstance(cand_str, str):
        return {}
    if not cand_str.strip():
        return {}

    matches = (_cand_re.match(piece) for piece in cand_str.split("|"))
    # The score group only ever captures a plain decimal literal, so float()
    # cannot fail on it.
    return {
        found.group("label").strip(): float(found.group("score"))
        for found in matches
        if found
    }

def _canon(s: str) -> str:
    """Lowercase + collapse whitespace for robust matching."""
    return " ".join(str(s).lower().split())

def parse_mentioned(val) -> List[str]:
    """
    Normalize a 'mentioned_cands' cell into a list of strings.

    Accepted inputs:
      - a real list: each element is converted with str()
      - a missing value (pd.isna): gives []
      - a stringified Python-literal list such as "['A', 'B']"
      - any other text: split on '|', ';' or ',' when one of them is present,
        otherwise returned as a single-element list ([] for empty text)
    """
    if isinstance(val, list):
        return [str(item) for item in val]
    if pd.isna(val):
        return []

    text = str(val).strip()

    # First choice: the text is the repr of a list.
    try:
        literal = ast.literal_eval(text)
    except Exception:
        literal = None
    if isinstance(literal, list):
        return [str(item) for item in literal]

    # Permissive fallback (rarely needed): separator-delimited text.
    pieces = re.split(r"[|;,]", text)
    if len(pieces) > 1:
        return [piece.strip() for piece in pieces if piece.strip()]
    if text:
        return [text]
    return []

# ----------------------------
# Scoring primitives
# ----------------------------

def reward_score(mentioned: List[str], cand_scores: Dict[str, float]) -> float:
    """
    Reward = sum(score_i ** 2) over mentioned candidates / sum(score_i ** 2) over all candidates.

    Candidate labels and mentions are compared after _canon normalization.
    Returns 0.0 when there are no candidates or the denominator is <= 0.
    The result is clipped to [0, 1].
    """
    if not cand_scores:
        return 0.0

    # Squared raw score per canonical label.
    squared = {_canon(label): raw ** 2 for label, raw in cand_scores.items()}
    wanted = {_canon(name) for name in (mentioned or [])}

    hit_mass = sum(value for label, value in squared.items() if label in wanted)
    total_mass = sum(squared.values())
    if total_mass <= 0:
        return 0.0

    ratio = hit_mass / total_mass
    return float(max(0.0, min(1.0, ratio)))

def penalty_score(mentioned: List[str], cand_scores: Dict[str, float]) -> float:
    """
    Penalty = sum((100 - score_i) ** 2) over mentioned candidates
              / sum((100 - score_i) ** 2) over all candidates.

    Candidate labels and mentions are compared after _canon normalization.
    Returns 0.0 when there are no candidates or the denominator is <= 0
    (every score equal to 100 leaves nothing to penalize).
    The result is clipped to [0, 1].
    """
    if not cand_scores:
        return 0.0

    # Squared "implausibility" (100 - raw score) per canonical label.
    canonical = {_canon(label): raw for label, raw in cand_scores.items()}
    squared_gap = {label: (100.0 - raw) ** 2 for label, raw in canonical.items()}
    wanted = {_canon(name) for name in (mentioned or [])}

    hit_mass = sum(value for label, value in squared_gap.items() if label in wanted)
    total_mass = sum(squared_gap.values())
    if total_mass <= 0:
        return 0.0

    ratio = hit_mass / total_mass
    return float(max(0.0, min(1.0, ratio)))

def metric_for_row(row: pd.Series, conf_bool: bool) -> float:
    """
    Score a single row with the unified metric and print a trace of the inputs.

      - conf_bool True  (confusing):     score = reward_score(...)
      - conf_bool False (non-confusing): score = 1 - penalty_score(...)

    Candidates come from row["candidate_answers"] and mentions from
    row["mentioned_cands"]; a missing field is treated as empty.
    """
    print(f"Question: {row.get('question', 'N/A')}")
    print(f"LLM Response: {row.get('llm_response', 'N/A')}")
    print(f"Candidate Answers: {row.get('candidate_answers', 'N/A')}")
    print(f"Mentioned Candidates: {row.get('mentioned_cands', 'N/A')}")
    print(f"Confusing: {conf_bool}")

    cand_scores = parse_candidate_scores(row.get("candidate_answers", ""))
    mentioned = parse_mentioned(row.get("mentioned_cands", []))

    # Confusing rows are rewarded for mentions; the rest are penalized for them.
    if conf_bool:
        score = reward_score(mentioned, cand_scores)
    else:
        score = 1.0 - penalty_score(mentioned, cand_scores)

    print(f"Metric Score: {score}")
    print("=" * 50)
    return score

# ----------------------------
# Main entry
# ----------------------------

def get_metric_score(
    df: pd.DataFrame,
    labeling: str = "simple",          # "simple"; any other value selects the advanced columns
    just_reward: bool = False,         # if True, compute only the pure-reward column (labels ignored)
    only_accuracy: bool = True,        # score only rows where accuracy == True
    accuracy_col: str = "accuracy",    # accuracy column name (bool)
) -> pd.DataFrame:
    """
    Add a metric column to df (in place) and return df.

    Columns read from df:
      - 'candidate_answers' (str): "cand (score)|cand (score)|..."
      - 'mentioned_cands'  (list OR stringified list)
      - the label column, unless just_reward=True:
          * labeling == "simple": 'simple_conf' (bool)
          * otherwise:            'adv_conf'    (bool)
      - `accuracy_col` (bool), used only when only_accuracy=True and present

    Column written (exactly one per call):
      - just_reward=True:  'just_reward', the reward_score of each row
      - just_reward=False: 'simple_metric' or 'adv_metric', from metric_for_row;
        missing labels are treated as False (non-confusing)

    Rows excluded by the accuracy filter receive NaN.

    Raises:
        ValueError: if the label column is missing and just_reward is False.
    """
    use_simple = labeling == "simple"
    label_col = "simple_conf" if use_simple else "adv_conf"
    out_col = "simple_metric" if use_simple else "adv_metric"

    if not just_reward and label_col not in df.columns:
        raise ValueError(f"Missing labeling column '{label_col}'. Run your labeling step first.")

    # Index labels of the rows that should actually be scored.
    if only_accuracy and accuracy_col in df.columns:
        target_idx = df.index[df[accuracy_col] == True]
    else:
        target_idx = df.index

    def _is_excluded(i) -> bool:
        return only_accuracy and i not in target_idx

    if just_reward:
        has_cands = "candidate_answers" in df.columns
        has_mentions = "mentioned_cands" in df.columns
        rewards = []
        for i in df.index:
            if _is_excluded(i):
                rewards.append(math.nan)
                continue
            cand_scores = parse_candidate_scores(df.at[i, "candidate_answers"] if has_cands else "")
            mentioned = parse_mentioned(df.at[i, "mentioned_cands"] if has_mentions else [])
            rewards.append(reward_score(mentioned, cand_scores))
        df["just_reward"] = rewards
        return df

    # Labeled metric: fill missing labels with False (non-confusing) to be safe.
    labels = df[label_col].fillna(False)
    df[out_col] = [
        math.nan if _is_excluded(i) else metric_for_row(df.loc[i], bool(labels.loc[i]))
        for i in df.index
    ]
    return df


In [11]:
# Score every accurate row with the simple labels ('simple_conf'); the result is
# stored in the 'simple_metric' column and the CSV at `path` is overwritten.
df_acc = get_metric_score(df_acc, labeling="simple", just_reward=False, only_accuracy=True)
df_acc.to_csv(path, index=False)

Question: "Feel Like Making Love" and "The First Time Ever I Saw Your Face" were hit singles for which female artist?
LLM Response: The female artist who had hit singles with "Feel Like Making Love" and "The First Time Ever I Saw Your Face" is Roberta Flack.
Candidate Answers: Aretha Franklin (60)|Randy Crawford (55)|Gladys Knight (50)|Natalie Cole (45)|Anita Baker (40)|Etta James (35)|Patti LaBelle (30)|Dionne Warwick (25)|Diana Ross (20)|Jennifer Holliday (15)
Mentioned Candidates: []
Confusing: True
Metric Score: 0.0
Question: "Feel Like Making Love" and "The First Time Ever I Saw Your Face" were hit singles for which female artist?
LLM Response: The hit singles "Feel Like Making Love" and "The First Time Ever I Saw Your Face" were both recorded and popularized by the British-American singer and songwriter Roberta Flack. "The First Time Ever I Saw Your Face" was a number-one hit for her in 1972, and "Feel Like Making Love" reached the top ten in 1974. Roberta Flack is known for her 

In [13]:
# Score every accurate row with the advanced labels ('adv_conf'); the result is
# stored in the 'adv_metric' column and the CSV at `path` is overwritten.
df_acc = get_metric_score(df_acc, labeling="advanced", just_reward=False, only_accuracy=True)
df_acc.to_csv(path, index=False)

Question: Who plays the role of Bubble in Absolutely Fabulous?
LLM Response: Jennifer Saunders plays the role of Edina Monsoon, but the character Bubble is actually played by Jane Horrocks.
Candidate Answers: Joanna Lumley (20)|Harriet Thorpe (18)|Julia Sawalha (15)|June Whitfield (12)|Miranda Richardson (10)|Phyllis Logan (9)|Kathy Burke (8)|Helena Bonham Carter (6)|Tilda Swinton (5)|Kristin Scott Thomas (4)
Mentioned Candidates: []
Confusing: False
Metric Score: 1.0
Question: Who plays the role of Bubble in Absolutely Fabulous?
LLM Response: The role of Bubble in Absolutely Fabulous is played by Jane Horrocks. 

There are no widely recognized alternative actors who have played this role in the main series or films of Absolutely Fabulous, so no further clarifications are necessary.
Candidate Answers: Joanna Lumley (20)|Harriet Thorpe (18)|Julia Sawalha (15)|June Whitfield (12)|Miranda Richardson (10)|Phyllis Logan (9)|Kathy Burke (8)|Helena Bonham Carter (6)|Tilda Swinton (5)|Kristin 

In [12]:
# With just_reward=True only the 'just_reward' column is computed; the labeling
# argument does not affect the values in this mode.
df_acc = get_metric_score(df_acc, labeling="simple", just_reward=True, only_accuracy=True)
df_acc.to_csv(path, index=False)

In [None]:
def add_num_mentions(pd_series: pd.Series) -> int:
    """
    Count the candidates listed in a row's 'mentioned_cands' field.

    Args:
        pd_series: One DataFrame row; its "mentioned_cands" entry may be a real
            list or a stringified list (it is normalized with parse_mentioned).

    Returns:
        The number of mentioned candidates as an int. The function does not
        add a column itself; the caller assigns the returned counts.
    """
    return len(parse_mentioned(pd_series["mentioned_cands"]))

# Count the mentioned candidates for every row of the merged results and store
# the count in 'num_mentions', then overwrite the CSV at merged_path.
merged_df["num_mentions"] = merged_df.apply(
    lambda r: add_num_mentions(r),
    axis=1
) 
merged_df.to_csv(merged_path, index=False)