In [None]:

import numpy as np, pandas as pd, scipy.stats as ss
from sacrebleu.metrics import BLEU
from rouge_score import rouge_scorer
from sklearn.metrics import roc_auc_score
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as ss

In [None]:
# Load the metrics CSV that the rest of the notebook works from.
df = pd.read_csv("../output/quantitative_metrics/all_scalers_testset.csv")

# DISTRIBUTION PLOTS (one figure per metric)
metrics = ["alignscore"]
for metric_name in metrics:
    observed = df[metric_name].dropna()  # histogram only the rows that have a value

    fig, ax = plt.subplots()
    ax.hist(observed, bins=30)
    ax.set_title(f"Distribution of {metric_name}")
    ax.set_xlabel(metric_name)
    ax.set_ylabel("Frequency")
    plt.show()

In [None]:


df = pd.read_csv("../output/quantitative_metrics/all_scalers_testset.csv")

# Answer length in *tokens* (white-space split).
# Rows whose final_answer is missing get NaN here.
df["answer_len"] = df["final_answer"].str.split().str.len()

# DISTRIBUTION PLOTS (one plot per metric)
metrics = ["lex_score", "ecc_score", "deg_score"]
for m in metrics:
    plt.figure()
    plt.hist(df[m].dropna(), bins=30)
    plt.title(f"Distribution of {m}")
    plt.xlabel(m)
    plt.ylabel("Frequency")
    plt.show()

# LENGTH vs UNCERTAINTY SCATTERS
for m in metrics:
    plt.figure()
    plt.scatter(df["answer_len"], df[m], alpha=0.6)
    plt.title(f"{m} vs Answer Length")
    plt.xlabel("Answer Length (tokens)")
    plt.ylabel(m)
    plt.show()

    # Spearman rank correlation on complete pairs only: spearmanr propagates
    # NaN by default, so a single missing value would turn rho and p into nan.
    paired = df[["answer_len", m]].dropna()
    rho, p = ss.spearmanr(paired["answer_len"], paired[m])
    print(f"Spearman ρ(length, {m}) = {rho:.3f} (p = {p:.3e})")


In [None]:
# Same plots as for the raw UE scores, now for the scaled confidence columns.
# Uses the `df` already loaded by an earlier cell.

# Answer length in *tokens* (white-space split).
# Rows whose final_answer is missing get NaN here.
df["answer_len"] = df["final_answer"].str.split().str.len()

# ---------- 1. DISTRIBUTION PLOTS (one plot per metric) ----------
metrics = [
    "lex_score_conf_q", "lex_score_conf_iso", "lex_score_conf_sig",
    "ecc_score_conf_q", "ecc_score_conf_iso", "ecc_score_conf_sig",
    "deg_score_conf_q", "deg_score_conf_iso", "deg_score_conf_sig",
]
for m in metrics:
    plt.figure()
    plt.hist(df[m].dropna(), bins=30)
    plt.title(f"Distribution of {m}")
    plt.xlabel(m)
    plt.ylabel("Frequency")
    plt.show()

# ---------- 2. LENGTH vs UNCERTAINTY SCATTERS ----------
for m in metrics:
    plt.figure()
    plt.scatter(df["answer_len"], df[m], alpha=0.6)
    plt.title(f"{m} vs Answer Length")
    plt.xlabel("Answer Length (tokens)")
    plt.ylabel(m)
    plt.show()

    # Spearman rank correlation on complete pairs only: spearmanr propagates
    # NaN by default, so a single missing value would turn rho and p into nan.
    paired = df[["answer_len", m]].dropna()
    rho, p = ss.spearmanr(paired["answer_len"], paired[m])
    print(f"Spearman ρ(length, {m}) = {rho:.3f} (p = {p:.3e})")


### alignscore results

=== Spearman ρ (UE ↓  vs  AlignScore ↑) ===
lex_score        ρ = +0.018   p = 7.82e-01
ecc_score        ρ = +0.047   p = 4.83e-01
deg_score        ρ = +0.093   p = 1.62e-01

=== PRR ===
lex_score        PRR = +0.156
ecc_score        PRR = +0.161
deg_score        PRR = +0.217

=== Binary AUROC (good = AlignScore >= 0.7) ===
lex_score        AUROC = 0.883
ecc_score        AUROC = 0.909
deg_score        AUROC = 0.897

=== ECE on held‑out test fold (lower is better) ===
lex_score        ECE = 0.212
ecc_score        ECE = 0.187
deg_score        ECE = 0.238
 ---

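Spearman ρ: the UE score is negated before ranking, so a positive ρ means that lower uncertainty goes with higher quality. ρ ≈ 0.3 → UE is mildly correlated with factual quality; ρ ≤ 0 shows that the ranking is broken.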

PRR > 0.2 on only 220 answers is a respectable signal; PRR < 0 means you would do worse than random rejection by trusting that UE score.

AUROC ≥ 0.7 indicates that the score separates good from bad answers fairly well across thresholds; 0.5 corresponds to chance.

ECE ≤ 0.1 on the 20 % hold‑out set suggests reasonable calibration; a large ECE indicates that the raw UE numbers are not usable as probabilities without additional scaling.

In [None]:
df = pd.read_csv("../output/quantitative_metrics/all_scalers_testset.csv")

# Scorers used to add the sentence-level BLEU and ROUGE-L F1 columns.
bleu_metric = BLEU(tokenize="13a", effective_order=True)
rl_scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)


def rouge_l_f1(pred, ref):
    """Return the ROUGE-L F-measure of `pred` scored against `ref`."""
    scores = rl_scorer.score(ref, pred)
    return scores["rougeL"].fmeasure


# Pair each generated answer with its reference once and reuse the pairs.
answer_reference_pairs = list(zip(df["final_answer"], df["reference"]))

# BLEU is divided by 100 so both columns are on the same scale.
df["bleu"] = [
    bleu_metric.sentence_score(hyp, [ref]).score / 100
    for hyp, ref in answer_reference_pairs
]
df["rouge_l"] = [rouge_l_f1(hyp, ref) for hyp, ref in answer_reference_pairs]


# Uncertainty-estimation (UE) columns that get evaluated.
ue_cols = ["lex_score", "ecc_score", "deg_score"]

# Quality metric -> smallest value at which an answer is counted as "good".
qual_cfgs = dict(
    alignscore=0.70,  # factual-consistency cut-off
    bleu=0.20,        # surface-overlap cut-off
    rouge_l=0.25,     # sentence-LCS cut-off
)

#  Prediction–Rejection Ratio
def prr(quality, uncertainty, higher_quality_better=True):
    """Prediction-Rejection Ratio of an uncertainty score.

    Samples are ordered by ascending `uncertainty`. For every coverage level
    k/n the mean quality of the k most-confident samples is computed, and that
    curve is integrated over coverage with the trapezoid rule. The area is
    then normalised between a random ordering (0) and the oracle ordering by
    true quality (1).

    Parameters
    ----------
    quality : array-like of float
        Per-sample quality values.
    uncertainty : array-like of float
        Per-sample uncertainty; lower values are treated as more confident.
    higher_quality_better : bool, default True
        If False, `quality` is negated so that lower raw values count as better.

    Returns
    -------
    float
        (ue_area - random_area) / (oracle_area - random_area).
        NaN for empty input; 0.0 when the oracle and random areas coincide
        (e.g. constant quality), where the ratio is undefined.
    """
    # Plain arrays keep the positional indexing below correct for any input,
    # including pandas Series with a non-default index.
    q = np.asarray(quality, dtype=float)
    u = np.asarray(uncertainty, dtype=float)
    if not higher_quality_better:
        q = -q

    n = q.size
    if n == 0:
        return float("nan")

    ranks = np.arange(1, n + 1)
    cov = ranks / n
    # np.trapz is deprecated in NumPy 2.x in favour of np.trapezoid.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz

    def area(vals):
        """Area under the running-mean quality curve for `vals` in keep order."""
        return trapezoid(np.cumsum(vals) / ranks, cov)

    oracle_area = area(np.sort(q)[::-1])
    # A random ordering has expected running mean q.mean() at every coverage.
    # It must be integrated over the same [1/n, 1] range as the other curves;
    # using q.mean() alone assumes a full unit of coverage and biases the ratio.
    random_area = area(np.full(n, q.mean()))
    ue_area = area(q[np.argsort(u)])

    headroom = oracle_area - random_area
    if abs(headroom) <= 1e-12:
        # No ordering can beat random, so the ratio is undefined.
        return 0.0
    return (ue_area - random_area) / (headroom + 1e-12)


rows = []  # one summary record per (quality metric, UE metric) pair

for qual, thr in qual_cfgs.items():
    quality = df[qual]
    good = (quality >= thr).astype(int)  # 1 when the answer meets the cut-off

    for ue in ue_cols:
        # UE is "lower is better": flip the sign once so that a higher value
        # means more confident for both the rank correlation and the AUROC.
        neg_ue = -df[ue]

        rho, p = ss.spearmanr(neg_ue, quality)

        # prr() takes quality first and the un-negated uncertainty second.
        prr_val = prr(quality.values, df[ue].values)

        auroc = roc_auc_score(good, neg_ue)

        rows.append(
            dict(
                quality_metric=qual,
                ue_metric=ue,
                spearman_rho=round(rho, 3),
                prr=round(prr_val, 3),
                auroc=round(auroc, 3),
            )
        )

results_df = pd.DataFrame(rows)
display(results_df)
results_df.to_csv(
    "../output/quantitative_metrics/ue_metrics_overview.csv",
    index=False,
)
print("\nSaved to ue_metrics_overview.csv ")


### calibration results

In [None]:
df = pd.read_csv("../output/quantitative_metrics/all_scalers_testset.csv")

# Number of calibration bins used for ECE.
n_bins = 10

# Restrict to rows marked "test" in the split column
# (author's note: the 20 % test fold that the quantile scaler never saw).
test_mask = df["split"].eq("test")

def ece(prob, label, n_bins=10):
    """Expected Calibration Error with equal-frequency (quantile) bins.

    Samples are grouped into up to `n_bins` quantile bins of `prob`
    (duplicate bin edges are merged). For each bin the absolute gap between
    mean label and mean probability is weighted by the fraction of all
    samples that fall in the bin, and the weighted gaps are summed.

    Parameters
    ----------
    prob : array-like of float
        Predicted confidence per sample.
    label : array-like
        Binary outcome per sample (1 = correct / good), same length as `prob`.
    n_bins : int, default 10
        Requested number of quantile bins.

    Returns
    -------
    float
        The ECE; NaN when there is no non-NaN probability to evaluate.
    """
    prob = np.asarray(prob, dtype=float)
    label = np.asarray(label, dtype=float)
    if prob.size == 0:
        return float("nan")

    valid = ~np.isnan(prob)
    if not valid.any():
        return float("nan")

    # With a single distinct probability, quantile binning has no usable edges
    # and every sample would go unbinned, giving a misleading ECE of 0.
    # Treat all valid samples as one bin instead.
    if np.unique(prob[valid]).size < 2:
        gap = np.abs(label[valid].mean() - prob[valid].mean())
        return float(gap * valid.mean())

    # Integer bin codes; -1 marks samples that could not be binned (NaN prob).
    codes = pd.qcut(prob, q=n_bins, duplicates="drop").codes

    ece_val = 0.0
    for code in np.unique(codes[codes >= 0]):
        mask = codes == code
        conf = prob[mask].mean()
        acc = label[mask].mean()
        ece_val += np.abs(acc - conf) * mask.mean()
    return float(ece_val)

print("\n=== ECE on held‑out test fold (lower is better) ===")
for m in ue_cols:
    # Min-max scale the UE score using the range of the whole column, then
    # invert it so that low uncertainty maps to high confidence in [0, 1].
    col_min, col_max = df[m].min(), df[m].max()
    scaled_ue = (df.loc[test_mask, m] - col_min) / (col_max - col_min)
    test_conf = 1.0 - scaled_ue

    test_labels = df.loc[test_mask, "is_good"]
    score = ece(test_conf.values, test_labels.values, n_bins)
    print(f"{m:15s}  ECE = {score:.3f}")

In [None]:
# ---------- 0. config ----------
n_bins = 10
ue_cols = ["lex_score", "ecc_score", "deg_score"]

# Rows marked "test" in the split column.
test_mask = df["split"].eq("test")

# Good-answer flag: 1 when alignscore is at least 0.70, else 0.
labels = df["alignscore"].ge(0.70).astype(int)

def ece(prob, label, n_bins=10):
    """Expected Calibration Error with equal-frequency (quantile) bins.

    Samples are grouped into up to `n_bins` quantile bins of `prob`
    (duplicate bin edges are merged). For each bin the absolute gap between
    mean label and mean probability is weighted by the fraction of all
    samples that fall in the bin, and the weighted gaps are summed.

    Parameters
    ----------
    prob : array-like of float
        Predicted confidence per sample.
    label : array-like
        Binary outcome per sample (1 = correct / good), same length as `prob`.
    n_bins : int, default 10
        Requested number of quantile bins.

    Returns
    -------
    float
        The ECE; NaN when there is no non-NaN probability to evaluate.
    """
    prob = np.asarray(prob, dtype=float)
    label = np.asarray(label, dtype=float)
    if prob.size == 0:
        return float("nan")

    valid = ~np.isnan(prob)
    if not valid.any():
        return float("nan")

    # With a single distinct probability, quantile binning has no usable edges
    # and every sample would go unbinned, giving a misleading ECE of 0.
    # Treat all valid samples as one bin instead.
    if np.unique(prob[valid]).size < 2:
        gap = np.abs(label[valid].mean() - prob[valid].mean())
        return float(gap * valid.mean())

    # Integer bin codes; -1 marks samples that could not be binned (NaN prob).
    codes = pd.qcut(prob, q=n_bins, duplicates="drop").codes

    ece_val = 0.0
    for code in np.unique(codes[codes >= 0]):
        mask = codes == code
        conf = prob[mask].mean()
        acc = label[mask].mean()
        ece_val += np.abs(acc - conf) * mask.mean()
    return float(ece_val)

# ---------- 1. compute & print ----------
# One table row per UE metric, one ECE column per confidence variant.
print("\n=== ECE on held‑out 20 % test split ===")
# The header needs a label for every value printed in the rows below,
# including the `_conf_sig` column
# (presumably sigmoid scaling; confirm against the scaler code).
header = (
    f"{'UE':12s} | {'raw_minmax':>10} | {'quantile':>8} | "
    f"{'isotonic':>8} | {'sig':>8}"
)
print(header)
print("-" * len(header))

# Labels for the test rows are the same for every metric, so slice them once.
test_labels = labels.loc[test_mask].values

for m in ue_cols:
    # 1) raw min-max rescale to 0-1 over the whole column, inverted so that
    #    low uncertainty becomes high confidence; epsilon guards a zero range
    lo, hi = df[m].min(), df[m].max()
    raw_conf = 1.0 - (df.loc[test_mask, m] - lo) / (hi - lo + 1e-12)

    # 2) quantile confidence (f'{col}_conf_q')
    q_conf = df.loc[test_mask, f"{m}_conf_q"]

    # 3) isotonic confidence (f'{col}_conf_iso')
    iso_conf = df.loc[test_mask, f"{m}_conf_iso"]

    # 4) sig confidence (f'{col}_conf_sig')
    sig_conf = df.loc[test_mask, f"{m}_conf_sig"]

    ece_raw = ece(raw_conf.values, test_labels, n_bins)
    ece_q = ece(q_conf.values, test_labels, n_bins)
    ece_iso = ece(iso_conf.values, test_labels, n_bins)
    ece_sig = ece(sig_conf.values, test_labels, n_bins)

    print(f"{m:12s} | {ece_raw:10.3f} | {ece_q:8.3f} | {ece_iso:8.3f} | {ece_sig:8.3f}")
