# $T_{bg}$ estimate

In this notebook we compare the variability metrics computed on the reference models' answers with those computed on the models under test, in order to estimate the $T_{bg}$ of the latter.

In [None]:
import editdistance
import numpy as np
import json
import os
from datasets import load_dataset
from tqdm import tqdm
import time
import tiktoken
from scipy.stats import ks_2samp
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import pandas as pd
import math
from math import comb
import bisect

### Variability Metrics and K-S
In the following cells we define the variability metrics considered and we implement the Kolmogorov-Smirnov distance used to measure the distance between distributions.

In [None]:
# Maximum exact match fraction
def exact_match_fraction(answers_model):
    """Return, per question, the share of answers equal to the most frequent one.

    Parameters
    ----------
    answers_model : dict
        Maps each question to the list of answers collected for it. Every
        answer is converted with ``tuple(...)`` so that it can be counted.

    Returns
    -------
    list of float
        One value per question, in the iteration order of ``answers_model``:
        (occurrences of the most common answer) / (number of answers).
    """
    fractions = []
    for answers in answers_model.values():
        total = len(answers)
        # Tuples are hashable, which lets Counter group identical answers.
        tally = Counter(tuple(answer) for answer in answers)
        top_count = tally.most_common(1)[0][1]
        fractions.append(top_count / total)
    return fractions

In [None]:
# Levenshtein distance
def Levenshtein(answers_model, max_tokens):
    """Mean normalized pairwise edit distance between the answers of each question.

    Parameters
    ----------
    answers_model : dict
        Maps each question to the list of answers collected for it. Every
        answer is converted with ``tuple(...)`` before being compared.
    max_tokens : int or float
        Normalization constant: each raw edit distance is divided by it.

    Returns
    -------
    list of float
        One value per question, in the iteration order of ``answers_model``:
        the mean of ``editdistance.eval(a_i, a_j) / max_tokens`` over all
        unordered pairs ``i < j``. Questions with fewer than two answers have
        no pair to compare and get ``0.0`` (same convention as
        ``average_lcs_distance``) instead of the NaN that averaging an empty
        array would produce.
    """
    lev = []
    for question, answers in answers_model.items():
        n = len(answers)
        if n <= 1:
            lev.append(0.0)
            continue

        # Convert once, rather than on every comparison.
        sequences = [tuple(answer) for answer in answers]

        # Only the strict upper triangle is needed: the distance is symmetric
        # and the diagonal is zero, so the other entries never enter the mean.
        pair_distances = [
            editdistance.eval(sequences[i], sequences[j]) / max_tokens
            for i in range(n)
            for j in range(i + 1, n)
        ]
        lev.append(np.mean(pair_distances))
    return lev

In [None]:
def fast_lcs(a, b):
    """Return the length of the longest common subsequence of ``a`` and ``b``.

    Uses a patience-sorting-style algorithm that also works when the
    sequences contain repeated elements: every element of ``a`` is mapped to
    the positions where it occurs in ``b``, and the answer is the length of
    the longest strictly increasing run of such positions.
    """
    # Index of every occurrence in b, grouped by element.
    positions = {}
    for index, item in enumerate(b):
        positions.setdefault(item, []).append(index)

    # tails[k] is the smallest b-position ending an increasing run of length k + 1.
    tails = []
    for item in a:
        # Descending order prevents one element of `a` from being matched to
        # several positions of `b` within the same run.
        for j in reversed(positions.get(item, ())):
            slot = bisect.bisect_left(tails, j)
            if slot < len(tails):
                tails[slot] = j
            else:
                tails.append(j)
    return len(tails)


def _lcs_length_dp(a, b):
    """Length of the longest common subsequence via the standard DP recurrence."""
    previous = [0] * (len(b) + 1)
    for x in a:
        current = [0]
        for jj, y in enumerate(b):
            if x == y:
                current.append(previous[jj] + 1)
            else:
                current.append(max(current[jj], previous[jj + 1]))
        previous = current
    return previous[-1]


def average_lcs_distance(answers_model, use_fast_lcs=True):
    """
    Computes the average pairwise LCS-based distance between answers
    for each question, including identical-answer pairs.

    Parameters
    ----------
    answers_model : dict
        Maps each question to the list of answers collected for it. Every
        answer is converted with ``tuple(...)`` before being compared.
    use_fast_lcs : bool, default True
        If True the LCS length is computed with ``fast_lcs``; otherwise with
        the standard dynamic-programming recurrence.

    Returns
    -------
    list of float
        One value per question, in the iteration order of ``answers_model``.
        The distance between two distinct answers is
        ``1 - LCS(a, b) / min(len(a), len(b))``; identical answers contribute
        0. Questions with fewer than two answers get ``0.0``.

    Notes
    -----
    When one of two distinct answers is empty, the normalization above is
    0/0. Such a pair shares nothing, so its distance is defined as ``1.0``
    rather than raising ``ZeroDivisionError``.
    """
    distances = []

    for question, answers in answers_model.items():
        n = len(answers)
        if n <= 1:
            distances.append(0.0)
            continue

        # Deduplicate answers to reduce computations while preserving the
        # correct weighting of all original answer pairs.
        counter = Counter(tuple(a) for a in answers)
        unique_answers = list(counter.keys())
        freqs = list(counter.values())
        K = len(unique_answers)

        # Pairs of identical answers have distance zero, so only pairs of
        # distinct unique answers add to the total.
        total_weighted_distance = 0.0
        for i in range(K):
            a = unique_answers[i]
            for j in range(i + 1, K):
                b = unique_answers[j]

                shortest = min(len(a), len(b))
                if shortest == 0:
                    # One answer is empty and the other is not: maximal distance.
                    d = 1.0
                else:
                    L = fast_lcs(a, b) if use_fast_lcs else _lcs_length_dp(a, b)
                    d = 1 - L / shortest

                # freqs[i] * freqs[j] original answer pairs map to this pair.
                total_weighted_distance += d * (freqs[i] * freqs[j])

        # Normalize by the total number of answer pairs (identical ones included).
        distances.append(total_weighted_distance / comb(n, 2))

    return distances

In [None]:
# Kolmogorov-Smirnov distance
def ks_distance(list1, list2):
    """Return the two-sample Kolmogorov-Smirnov statistic between two samples.

    Only the statistic computed by ``scipy.stats.ks_2samp`` is returned; the
    p-value is discarded.
    """
    return ks_2samp(list1, list2).statistic

### Loading responses from reference models and target models for $T_{bg}$  estimation

In [None]:
# Reference answers, parsed from references.json (UTF-8). The cells below
# index this object as references[model][temperature][question].
with open("references.json", "r", encoding="utf-8") as f:
    references = json.loads(f.read())

In [None]:
# Map each model whose T_bg is being estimated to the JSON file that holds
# its answers. Add or edit entries here.
tested_dict = {
    "gpt-4.1-nano_Azure": "answers_gpt.json",
    "claude-sonnet-4_AWS": "answers_claudesonnet4.json",
    "grok-3-mini_AWS": "answers_grok.json",
}

# Load every answers file, keyed by the model name.
tested = {}
for key, answers_path in tested_dict.items():
    with open(answers_path, "r", encoding="utf-8") as f:
        tested[key] = json.load(f)

### Compute variability distributions
Below we compute the variability distributions both for the reference models and for the models whose $T_{bg}$ is being estimated.

In [None]:
# Names of the variability metrics; the cells below dispatch on these strings
# and also use them to build the result column names.
variabilities = [
    "max_exact_match",
    "Levenshtein_lists",
    "average_lcs_distance",
]

In [None]:
# Metric name -> callable computing that metric from a {question: answers} dict.
# The Levenshtein distance is normalized by 32.
reference_metric_fns = {
    "max_exact_match": exact_match_fraction,
    "Levenshtein_lists": lambda answers: Levenshtein(answers, 32),
    "average_lcs_distance": average_lcs_distance,
}

# ref_variabilities[reference][metric][temperature] -> list with one value per question.
ref_variabilities = {}
for key, answers_by_temp in references.items():
    print(key)
    ref_variabilities[key] = {}
    for var in variabilities:
        print(var)
        per_temperature = {}
        ref_variabilities[key][var] = per_temperature
        compute = reference_metric_fns.get(var)
        for temp in tqdm(answers_by_temp):
            # Temperatures are JSON keys (strings); store them as floats rounded to 2 decimals.
            temp_r = round(float(temp), 2)
            if compute is not None:
                per_temperature[temp_r] = compute(answers_by_temp[temp])

In [None]:
# test_variabilities[model][metric] -> list with one value per question.
test_variabilities = {}
for key, model_answers in tested.items():
    per_metric = {}
    test_variabilities[key] = per_metric
    for var in variabilities:
        if var == "average_lcs_distance":
            per_metric[var] = average_lcs_distance(model_answers)
        elif var == "Levenshtein_lists":
            # Same normalization constant (32) as used for the references.
            per_metric[var] = Levenshtein(model_answers, 32)
        elif var == "max_exact_match":
            per_metric[var] = exact_match_fraction(model_answers)
        

### Estimate of $T_{bg}$
Below, we compare the variability distributions computed for the models under study with the reference ones, thereby estimating $T_{bg}$.

In [None]:
# Consider only common questions, in case the datasets of the references and
# of the models under test differ.
# index_common[ref][mod] holds the positions, within the reference question
# list, of the questions that the tested model also answered.
index_common = {}
for ref in ref_variabilities.keys():
    index_common[ref] = {}
    # The question list is taken from the first temperature only.
    # NOTE(review): assumes every temperature of a reference has the same
    # questions in the same order - confirm.
    first_temp = list(references[ref].keys())[0]
    ref_questions = list(references[ref][first_temp].keys())
    for mod in test_variabilities.keys():
        # A set gives O(1) membership tests instead of scanning a list for
        # every reference question.
        test_questions = set(tested[mod].keys())
        index_common[ref][mod] = [i for i, x in enumerate(ref_questions) if x in test_questions]

In [None]:
# Compute the K-S distance between the variability distributions of the models
# under test and those of the reference models at each sampled temperature.
# Both samples are restricted to the questions shared by the tested model and
# the reference, so that 'num_prompt' is the number of prompts actually used
# on either side.
# distances[mod][ref] = {'num_prompt': int, <metric>: {temperature: K-S distance}}
distances = {}
for mod in test_variabilities.keys():
    distances[mod] = {}
    for ref in ref_variabilities.keys():
        common_ref_idx = index_common[ref][mod]

        # Positions, within the tested model's question list, of the questions
        # that also appear in the reference. The reference questions are read
        # from its first temperature, as is done when index_common is built.
        ref_question_set = set(next(iter(references[ref].values())))
        common_test_idx = [i for i, q in enumerate(tested[mod]) if q in ref_question_set]

        distances[mod][ref] = {}
        distances[mod][ref]['num_prompt'] = len(common_ref_idx)
        for var in variabilities:
            # The tested sample does not depend on the temperature: build it once.
            test_sample = [test_variabilities[mod][var][i] for i in common_test_idx]
            dists = {}
            for temp, ref_values in ref_variabilities[ref][var].items():
                ref_sample = [ref_values[i] for i in common_ref_idx]
                dists[temp] = ks_distance(test_sample, ref_sample)
            # As before, a metric with no sampled temperature gets no entry.
            if dists:
                distances[mod][ref][var] = dists

In [None]:
# For each (model under test, reference model, metric), find the temperature(s)
# that minimize the K-S distance computed previously.
results = {}
for test_mod, per_reference in distances.items():
    results[test_mod] = {}
    for ref_mod, metrics in per_reference.items():
        best_by_metric = {}
        results[test_mod][ref_mod] = best_by_metric
        for metric_name, metric_values in metrics.items():
            # Only metrics stored as {temperature: value} dicts are processed;
            # this skips the integer 'num_prompt' entry.
            if not isinstance(metric_values, dict):
                continue
            min_val = min(metric_values.values())
            best_by_metric[metric_name] = {
                "min_value": min_val,
                # Ties are kept: every temperature reaching the minimum is listed.
                "temperatures": [t for t, v in metric_values.items() if v == min_val],
                "num_prompts": metrics['num_prompt'],
            }


In [None]:
# Flatten `results` into one row per tested model.
flat_rows = []
for tested_name, per_reference in results.items():
    row = {"Tested": tested_name}
    num_prompts = None

    for ref_name, metrics in per_reference.items():
        # Columns are prefixed with the reference name up to its first "-".
        # NOTE(review): two references sharing that prefix would write to the
        # same columns and overwrite each other - confirm names are distinct.
        ref_prefix = ref_name.split("-")[0]
        for metric_name, info in metrics.items():
            temps = info["temperatures"]
            # Save the mean of the minimizing temperatures for each
            # (variability metric, reference model) pair.
            row[f"{ref_prefix}_{metric_name}_Tn"] = sum(temps) / len(temps) if temps else None

            # Keep the first prompt count encountered for this tested model.
            if num_prompts is None and "num_prompts" in info:
                num_prompts = info["num_prompts"]

    row["Num_prompts"] = num_prompts
    flat_rows.append(row)

df_flat = pd.DataFrame(flat_rows)

# Compute the mean temperature estimate for each variability metric,
# averaging over the reference models.
metric_cols = [c for c in df_flat.columns if "_Tn" in c]
for metric_name in variabilities:
    matching_cols = [c for c in metric_cols if metric_name in c]
    df_flat[f"Mean_{metric_name}"] = df_flat[matching_cols].mean(axis=1, skipna=True)

# Overall estimate: mean over all (metric, reference) temperature columns.
df_flat["Tbg estimate"] = df_flat[metric_cols].mean(axis=1, skipna=True)

# Reorder columns: summary columns first, then the remaining ones alphabetically.
first_cols = [
    "Tested",
    "Tbg estimate",
    "Mean_max_exact_match",
    "Mean_Levenshtein_lists",
    'Mean_average_lcs_distance',
    "Num_prompts",
]
remaining_cols = sorted(c for c in df_flat.columns if c not in first_cols)
df_flat = df_flat[first_cols + remaining_cols]

# Save all the data to a CSV file.
df_flat.to_csv("results_Tbg.csv")

The table below (with the data saved in the CSV above) reports, for each model, the final $T_{bg}$ estimate, the mean temperature estimate obtained from each individual variability metric, and the number of prompts used.


In [None]:
# Summary columns shown in the final table, rounded to 4 decimals.
cols_main = [
    "Tested", "Tbg estimate",
    "Mean_max_exact_match", "Mean_Levenshtein_lists", "Mean_average_lcs_distance",
    "Num_prompts",
]

df_flat.loc[:, cols_main].round(4)