In [1]:
import pandas as pd
import openai
from dotenv import load_dotenv
import os, json, random, re
import matplotlib.pyplot as plt
import ast
import numpy as np
load_dotenv()

True

In [2]:
# Load the results table. The path is relative, so it resolves against the
# kernel's current working directory. Later cells read the "CI",
# "candidate_answers" and "mentioned_cands" columns from this frame.
path = "../data/results/100q_results_all.csv"
df = pd.read_csv(path)

In [8]:
import re
import ast
import pandas as pd
from typing import Dict, List
from scipy.stats import pearsonr, spearmanr

S_MAX = 825.0  # externally supplied normalization constant (divisor for mentioned_raw_sum); do not recompute from the data

# ----------------------------
# Parsing helpers (carried over from the earlier metric code)
# ----------------------------
# One candidate chunk: "<label> (<score>)", where the score is an optionally
# negative integer or decimal and surrounding whitespace is tolerated.
_cand_re = re.compile(r"\s*(?P<label>.+?)\s*\(\s*(?P<score>-?\d+(?:\.\d+)?)\s*\)\s*$")

def parse_candidate_scores(cand_str: str) -> Dict[str, float]:
    """
    Turn a pipe-separated 'candidate_answers' string into a label -> score map.

    Example: "Paris (87)|Lyon (42)|Marseille (15)" -> {"Paris": 87.0, "Lyon": 42.0, "Marseille": 15.0}

    Chunks that do not look like "<label> (<number>)" are skipped silently.
    If a label occurs more than once, the last occurrence wins.
    Non-string, empty, or whitespace-only input yields an empty dict.
    """
    scores: Dict[str, float] = {}
    if isinstance(cand_str, str) and cand_str.strip():
        for piece in cand_str.split("|"):
            hit = _cand_re.match(piece)
            if hit is None:
                continue
            label = hit.group("label").strip()
            try:
                value = float(hit.group("score"))
            except Exception:
                # Defensive only: the regex already restricts the score to a numeric form.
                continue
            scores[label] = value
    return scores

def _canon(s: str) -> str:
    """Lowercase + collapse whitespace for robust matching."""
    return " ".join(str(s).lower().split())

def parse_mentioned(val) -> List[str]:
    """
    Normalize a 'mentioned_cands' cell into a list of strings.

    Accepted inputs:
      * a real container (list, tuple, set, ndarray, Series, ...): every
        element is stringified; sets are emitted in sorted order so the
        result is deterministic;
      * a missing scalar (None, NaN, pd.NA): returns [];
      * a string holding a Python literal list/tuple/set, e.g.
        "['Paris', 'Lyon']";
      * as a permissive fallback, a string delimited by '|', ';' or ','
        (or a single bare label).

    Returns [] if the value is missing or empty.
    """
    # Containers must be handled before pd.isna(): on an array-like, pd.isna()
    # returns an array and `if` on it raises "truth value is ambiguous".
    # Tuples/sets would otherwise be stringified and split on commas.
    if pd.api.types.is_list_like(val):
        if isinstance(val, (set, frozenset)):
            return sorted(str(x) for x in val)
        return [str(x) for x in val]
    if pd.isna(val):
        return []

    text = str(val).strip()

    # Try a Python-literal container first (this also covers JSON-style lists
    # of quoted strings). The exception list is the one ast.literal_eval
    # documents for malformed input.
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(x) for x in parsed]
    if isinstance(parsed, (set, frozenset)):
        return sorted(str(x) for x in parsed)

    # Very permissive fallback: split by |,; (rarely needed).
    if any(sep in text for sep in "|;,"):
        return [tok.strip() for tok in re.split(r"[|;,]", text) if tok.strip()]
    return [text] if text else []

# ----------------------------
# Mentioned raw score sum per row
# ----------------------------
def mentioned_raw_sum(row: pd.Series) -> float:
    """
    Total RAW score of the candidates that the LLM response mentioned.

    Reads two fields from `row`:
      * "candidate_answers": parsed with parse_candidate_scores into label -> score;
      * "mentioned_cands":   parsed with parse_mentioned into a list of labels.

    Labels on both sides are compared after _canon() normalization. Returns
    0.0 when no candidate scores could be parsed.

    Note carried over from the original: the theoretical range is 0..1000
    (10 candidates * 100), though practical values are lower. That assumes
    the scoring scheme and is not enforced here.
    """
    raw_scores = parse_candidate_scores(row.get("candidate_answers", ""))
    if not raw_scores:
        return 0.0

    # Re-key the scores by canonical label; colliding labels keep the last score.
    score_by_canon = {_canon(label): float(score) for label, score in raw_scores.items()}

    mentioned_labels = parse_mentioned(row.get("mentioned_cands", [])) or []
    wanted = {_canon(name) for name in mentioned_labels}

    total = sum(score for label, score in score_by_canon.items() if label in wanted)
    return float(total)




In [9]:
# CI must be numeric; values that cannot be parsed are coerced to NaN and counted.
df["CI"] = pd.to_numeric(df["CI"], errors="coerce")
n_missing_ci = df["CI"].isna().sum()
print(n_missing_ci, "rows with NaN CI values")

# Per-row sum of raw scores over the mentioned candidates, plus an S_MAX-scaled copy.
# The scaled column is a convenience only: Pearson and Spearman are unchanged by
# a positive rescaling, so the [NORM] figures below equal the [RAW] ones.
df["mentioned_raw_sum"] = df.apply(mentioned_raw_sum, axis=1)
df["mentioned_raw_sum_norm"] = df["mentioned_raw_sum"] / S_MAX

# Correlations are computed only on rows that have a usable CI.
df = df.dropna(subset=["CI"])

ci_col = df["CI"]
raw_col = df["mentioned_raw_sum"]
norm_col = df["mentioned_raw_sum_norm"]

# --- CI vs RAW mentioned sum ---
pearson_raw, p_raw = pearsonr(ci_col, raw_col)
spearman_raw, sp_raw = spearmanr(ci_col, raw_col)

print("[RAW] Pearson r:", pearson_raw, "p:", p_raw)
print("[RAW] Spearman ρ:", spearman_raw, "p:", sp_raw)

# --- CI vs NORMALIZED mentioned sum (optional) ---
pearson_norm, p_norm = pearsonr(ci_col, norm_col)
spearman_norm, sp_norm = spearmanr(ci_col, norm_col)

print("[NORM] Pearson r:", pearson_norm, "p:", p_norm)
print("[NORM] Spearman ρ:", spearman_norm, "p:", sp_norm)

0 rows with NaN CI values
[RAW] Pearson r: 0.2538125671765854 p: 3.1133278777848485e-25
[RAW] Spearman ρ: 0.2046483874383687 p: 8.887830564597392e-17
[NORM] Pearson r: 0.2538125671765854 p: 3.1133278777848485e-25
[NORM] Spearman ρ: 0.2046483874383687 p: 8.887830564597392e-17
