# Information Extraction

Noah Meissner 01.09.2025

In [12]:
import pandas as pd
import numpy as np
from foodrec.utils.search.request_information_extraction import extract_information
from foodrec.config.structure.paths import DATASET_PATHS
from foodrec.config.structure.dataset_enum import ModelEnum
from tqdm import tqdm  # Fortschrittsbalken


## Get Queries

In [13]:
# Load the hand-annotated reference table from the dataset directory.
df_hand_annotation = pd.read_csv(DATASET_PATHS / "Hand_Annotation.csv")

In [14]:
# Persona table whose "query" column is fed to the extraction models.
# Hand_Annotation.csv is already loaded by the previous cell, so it is not read a second time here.
df = pd.read_csv(DATASET_PATHS / "zw_personas.csv")

In [15]:
# No sub-sampling is applied: df_sample refers to the same object as df (not a copy).
df_sample = df

In [16]:
# Independent copy of the "query" column, detached from df_sample.
df_filtered = df_sample['query'].copy()

In [17]:
# Plain Python list of the queries, in row order, for the extraction loop.
queries = list(df_filtered)


In [18]:
# Replace every missing value with 0 (in all columns), then display the frame.
# NOTE(review): fillna(0) is not restricted to the label columns, so a missing
# `query` text would also become 0 — confirm that no query is missing.
df_hand_annotation = df_hand_annotation.fillna(0)
df_hand_annotation

Unnamed: 0,query,time,ingredients_included,ingredients_avoid,cuisine,calories,carbohydrates,fat,protein
0,Can you suggest flat-shapes recipes that do no...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
1,Can you suggest libyan or british-columbian re...,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0
2,Can you suggest non-alcoholic recipes that do ...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
3,Can you suggest octopus recipes that do not co...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
4,Can you suggest peruvian recipes that do not c...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
95,What reynolds-wrap dishes don't have cornstarc...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
96,What somalian dishes can I make without chicke...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
97,What st-patricks-day dishes do not contain ing...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
98,What stocks dishes don't have dried navy beans?,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


## Information Extraction Pipeline

In [19]:
import pandas as pd

def pipeline(model: ModelEnum, queries):
    """Run the information extraction for every query with a single model.

    Args:
        model: Model to use; its ``name`` labels both the progress bar and the
            result column.
        queries: Iterable of query strings.

    Returns:
        DataFrame with one row per query and two columns: ``"query"`` and
        ``model.name`` (whatever ``extract_information`` returned for that query).
    """
    column = model.name
    records = [
        {"query": text, column: extract_information(text, model)}
        for text in tqdm(queries, desc=column)
    ]
    return pd.DataFrame(records)

In [20]:
models = [ModelEnum.LLAMA, ModelEnum.Gemini, ModelEnum.OpenAI]
dfs = [pipeline(model, queries) for model in models]

# Combine the per-model frames into one table keyed on the "query" column.
# The loop variable is deliberately not named `df`: that name holds the persona
# table loaded earlier in the notebook and must not be overwritten here.
# NOTE(review): if `queries` contains duplicates, merging on "query" multiplies
# those rows — confirm the queries are unique.
final_df = dfs[0]
for model_df in dfs[1:]:
    final_df = pd.merge(final_df, model_df, on="query", how="outer")

LLAMA:   0%|          | 0/100 [00:00<?, ?it/s]

LLAMA: 100%|██████████| 100/100 [02:34<00:00,  1.54s/it]
Gemini: 100%|██████████| 100/100 [01:56<00:00,  1.16s/it]
OpenAI: 100%|██████████| 100/100 [02:56<00:00,  1.76s/it]


## Create Annotation csv

In [21]:
# Persist the merged extractions. The reload cell that follows reads
# DATASET_PATHS / "Annotation.csv", so the file is written to that same
# location instead of the current working directory.
final_df.to_csv(DATASET_PATHS / "Annotation.csv", sep=";", index=False)

In [2]:
# Reload the stored, semicolon-separated extractions from the dataset directory.
final_df = pd.read_csv(DATASET_PATHS / "Annotation.csv", sep=";")

In [3]:
## TODO: add the human (hand) annotations

In [32]:
import ast

# After the CSV round-trip the model columns hold text; cells that are strings
# are parsed back into Python objects with ast.literal_eval, every other cell
# (e.g. a missing value) is kept unchanged.
model_list = ["LLAMA", "Gemini", "OpenAI"]
for col in model_list:
    final_df[col] = final_df[col].apply(
        lambda cell: cell if not isinstance(cell, str) else ast.literal_eval(cell)
    )

In [33]:
# Models whose extractions are flattened below.
model_list = ["LLAMA", "Gemini", "OpenAI"]

# Extraction fields that get one count column per model.
fields = ["time", "ingredients_included", "ingredients_avoid", "cuisine", "calories"]


def _entry_count(extraction, field):
    """Return ``len(extraction[field])``, or None when no length can be taken.

    None is returned when ``extraction`` is not a dict (e.g. a missing cell),
    when the field is absent or None, or when its value has no length. The
    previous inline ``len(d.get(field))`` raised TypeError in those last cases.
    """
    if not isinstance(extraction, dict):
        return None
    try:
        return len(extraction.get(field))
    except TypeError:
        return None


# One column per model/field combination, named "<model>_<field>".
for model in model_list:
    for field in fields:
        final_df[f"{model}_{field}"] = final_df[model].apply(_entry_count, args=(field,))

# Cuisine counts: numeric values up to 2 are kept; everything else (larger
# counts and missing values) is set to 0.
# NOTE(review): a count above 2 is replaced by 0 rather than capped at 2 —
# confirm this is intended (a string value would be counted by its characters).
for model in model_list:
    final_df[f"{model}_cuisine"] = final_df[f"{model}_cuisine"].apply(
        lambda d: d if (isinstance(d, (int, float)) and d <= 2) else 0
    )


In [24]:
# Same fill as in the earlier cell that displays df_hand_annotation; repeating
# it on the already-filled frame changes nothing.
df_hand_annotation = df_hand_annotation.fillna(0)


## Create Groundtruth

In [34]:
# Attach the hand labels as "hand_<field>" columns.
# Rows are matched on the query text rather than on the row index: final_df and
# df_hand_annotation come from different files, so equal row positions are not
# guaranteed to belong to the same query. Queries without a hand label get NaN.
hand_by_query = df_hand_annotation.drop_duplicates(subset="query").set_index("query")
for field in fields:
    final_df[f"hand_{field}"] = final_df["query"].map(hand_by_query[field])

In [35]:
# --- Helper: majority vote (2 out of 3). Without a majority -> NaN or median.
def majority_vote(values, tie_break=None):
    """Pick the value that at least two raters agree on.

    Args:
        values: Iterable of ratings (e.g. three values, one per rater).
            Missing values are ignored.
        tie_break: None or "median".
            - None: without a majority the result is NaN.
            - "median": without a majority the median of the present values
              is returned as a float.

    Returns:
        The most frequent value if it occurs at least twice; otherwise the
        tie-break result. NaN when fewer than two values are present.
    """
    present = [v for v in values if not pd.isna(v)]
    if len(present) >= 2:
        tally = pd.Series(present).value_counts()
        winner, votes = tally.index[0], tally.iloc[0]
        if votes >= 2:
            # clear majority
            return winner
        if tie_break == "median":
            # no majority: fall back to the median of what is there
            return float(np.median(present))
    return np.nan

# Fields for which differing ratings are resolved by the median instead of NaN
# (ordinal fields: time, calories) — adjust or leave empty.
median_tie_fields = {"time", "calories"}

# Raters that vote on the ground truth: every model except Gemini, plus the
# hand labels. A new list is built on purpose: the former `gt_list = model_list`
# bound both names to one list object, so append/remove also changed
# model_list, and running the cell a second time failed on remove("Gemini").
gt_list = [m for m in model_list if m != "Gemini"] + ["hand"]

# --- Create the ground-truth columns "gt_<field>"
for f in fields:
    cols = [f"{m}_{f}" for m in gt_list]
    tb = "median" if f in median_tie_fields else None
    final_df[f"gt_{f}"] = final_df[cols].apply(lambda r: majority_vote(r.values, tie_break=tb), axis=1)

# (Optional) Wenn du sicherstellen willst, dass cuisine strikt binär bleibt:
# final_df["gt_cuisine"] = final_df["gt_cuisine"].apply(lambda x: 1.0 if x == 1 else (0.0 if x == 0 else np.nan))


In [36]:
import numpy as np
import pandas as pd
from collections import Counter
from itertools import combinations
from statsmodels.stats.inter_rater import fleiss_kappa
from sklearn.metrics import cohen_kappa_score

def _field_matrix_from_length_cols(df: pd.DataFrame, field: str, model_list):
    """Collect the ratings of one field as an (n_items, n_raters) integer matrix.

    NOTE(review): this notebook defines a function with the same name again in
    a later cell; once that cell has run, the later definition is in effect.

    Args:
        df: Frame holding one ``{model}_{field}`` column per rater.
        field: Field name (column suffix).
        model_list: Rater names (column prefixes).

    Returns:
        ``(matrix, cats, cols)``: the integer ratings of all rows that are
        complete for every rater, the sorted distinct rating values, and the
        column names used. ``(None, None, None)`` when no complete row exists.
    """
    cols = [f"{m}_{field}" for m in model_list]
    # Coerce to numbers first: non-numeric cells become NaN and are dropped
    # together with the other incomplete rows.
    sub = df[cols].apply(pd.to_numeric, errors="coerce").dropna(axis=0, how="any")
    if sub.empty:
        return None, None, None
    # Round before casting; a bare astype(int) truncates toward zero (2.6 -> 2).
    sub = sub.round().astype(int)
    # Categories over all items and raters.
    cats = sorted(pd.unique(sub.values.ravel()))
    return sub.values, cats, cols

def fleiss_kappa_from_length_cols(df: pd.DataFrame, field: str, model_list):
    """Fleiss' kappa over the length categories in the flat ``{model}_{field}`` columns.

    NOTE(review): this notebook defines a function with the same name again in
    a later cell; once that cell has run, the later definition is in effect.

    Args:
        df: Frame holding one ``{model}_{field}`` column per rater.
        field: Field name (column suffix).
        model_list: Rater names (column prefixes).

    Returns:
        The kappa as a float, or NaN when no complete row exists, when fewer
        than two distinct categories occur, or when a row does not contain
        exactly one rating per rater.
    """
    M, cats, _ = _field_matrix_from_length_cols(df, field, model_list)
    # With a single category the expected agreement is 1 and kappa is 0/0,
    # so NaN is reported directly instead of dividing by zero.
    if M is None or len(cats) < 2:
        return np.nan
    # Frequency table: per item, the number of raters that chose each category.
    freq = np.asarray([[tally[c] for c in cats] for tally in map(Counter, M)], dtype=int)
    # Safety check: every row must contain exactly one rating per rater.
    if not np.all(freq.sum(axis=1) == len(model_list)):
        return np.nan
    val = fleiss_kappa(freq)
    try:
        return float(val)
    except (TypeError, ValueError):
        return np.nan

def pairwise_cohen_matrix_from_length_cols(df: pd.DataFrame, field: str, model_list, weights=None):
    """Pairwise Cohen's kappa matrix (n x n) for one field from ``{model}_{field}``.

    NOTE(review): this notebook defines a function with the same name again in
    a later cell; once that cell has run, the later definition is in effect.

    Args:
        df: Frame holding one ``{model}_{field}`` column per rater.
        field: Field name (column suffix).
        model_list: Rater names; used as row and column labels of the result.
        weights: Passed through to ``cohen_kappa_score``
            (None, "linear" or "quadratic").

    Returns:
        Symmetric DataFrame of kappas with 1.0 on the diagonal, or an all-NaN
        frame when the field has no complete row.
    """
    M, cats, cols = _field_matrix_from_length_cols(df, field, model_list)
    if M is None:
        return pd.DataFrame(np.nan, index=model_list, columns=model_list)
    n = len(model_list)
    # Fill a plain array and wrap it at the end; this avoids writing through
    # DataFrame.values, which is not guaranteed to be a writable view.
    grid = np.full((n, n), np.nan, dtype=float)
    np.fill_diagonal(grid, 1.0)
    for i, j in combinations(range(n), 2):
        # The field-wide label set keeps the weighting scale identical for all
        # pairs, even when a pair did not use every category.
        v = cohen_kappa_score(M[:, i], M[:, j], weights=weights, labels=cats)
        grid[i, j] = v
        grid[j, i] = v
    return pd.DataFrame(grid, index=model_list, columns=model_list)

def kappa_per_field_and_overall_from_lengths(
    df: pd.DataFrame,
    fields=("ingredients_included","ingredients_avoid","cuisine","calories","time"),
    model_list=("LLAMA","Gemini","OpenAI"),
):
    """Agreement statistics per field plus aggregates over all fields.

    NOTE(review): this notebook defines a function with the same name again in
    a later cell; once that cell has run, the later definition is in effect.

    For every field, Fleiss' kappa and the pairwise Cohen's kappa matrices
    (unweighted, linear, quadratic) are obtained from
    ``fleiss_kappa_from_length_cols`` and ``pairwise_cohen_matrix_from_length_cols``.

    Args:
        df: Frame holding one ``{model}_{field}`` column per rater and field.
        fields: Field names to evaluate.
        model_list: Rater names (column prefixes).

    Returns:
        ``(per_field, overall_macro, pairwise_overall)``:
        - per_field: field -> dict with "fleiss", the three
          "cohen_*_mean" off-diagonal means and the three "pairwise_*" matrices.
        - overall_macro: mean of each per-field summary over the fields where
          it is not NaN.
        - pairwise_overall: scheme -> DataFrame with, per rater pair, the mean
          kappa over the fields where that pair has a value; diagonal is 1.0.
    """
    schemes = (("unweighted", None), ("linear", "linear"), ("quadratic", "quadratic"))
    n = len(model_list)
    index_pairs = list(combinations(range(n), 2))

    def offdiag_mean(M):
        # Mean over the rater pairs above the diagonal, skipping NaN entries.
        vals = [M.iloc[i, j] for i, j in index_pairs if pd.notna(M.iloc[i, j])]
        return float(np.mean(vals)) if vals else np.nan

    # Sum and number of contributing fields are tracked separately for each
    # weighting scheme. A single shared counter would be incremented once per
    # scheme and make every mean three times too small.
    sums = {name: np.zeros((n, n), dtype=float) for name, _ in schemes}
    hits = {name: np.zeros((n, n), dtype=float) for name, _ in schemes}

    per_field = {}
    for f in fields:
        fl = fleiss_kappa_from_length_cols(df, f, model_list)
        mats = {
            name: pairwise_cohen_matrix_from_length_cols(df, f, model_list, weights=w)
            for name, w in schemes
        }

        per_field[f] = {
            "fleiss": fl,
            "cohen_unweighted_mean": offdiag_mean(mats["unweighted"]),
            "cohen_linear_mean": offdiag_mean(mats["linear"]),
            "cohen_quadratic_mean": offdiag_mean(mats["quadratic"]),
            "pairwise_unweighted": mats["unweighted"],
            "pairwise_linear": mats["linear"],
            "pairwise_quadratic": mats["quadratic"],
        }

        # Accumulate the off-diagonal values for the overall pairwise means.
        for name, M in mats.items():
            for i, j in index_pairs:
                v = M.iloc[i, j]
                if pd.notna(v):
                    sums[name][i, j] += v
                    sums[name][j, i] += v
                    hits[name][i, j] += 1
                    hits[name][j, i] += 1

    # Overall: macro average over fields (not pairwise).
    def avg_over_fields(key):
        vals = [per_field[f][key] for f in fields if pd.notna(per_field[f][key])]
        return float(np.mean(vals)) if vals else np.nan

    overall_macro = {
        "overall_fleiss": avg_over_fields("fleiss"),
        "overall_cohen_unweighted_mean": avg_over_fields("cohen_unweighted_mean"),
        "overall_cohen_linear_mean":     avg_over_fields("cohen_linear_mean"),
        "overall_cohen_quadratic_mean":  avg_over_fields("cohen_quadratic_mean"),
    }

    # Pairwise overalls: mean over fields, diagonal fixed to 1.
    pairwise_overall = {}
    for name, _ in schemes:
        with np.errstate(divide="ignore", invalid="ignore"):
            mean = np.where(hits[name] > 0, sums[name] / hits[name], np.nan)
        np.fill_diagonal(mean, 1.0)
        pairwise_overall[name] = pd.DataFrame(mean, index=model_list, columns=model_list)

    return per_field, overall_macro, pairwise_overall

# ===== Ausführung / Beispiel =====


In [37]:
import numpy as np
import pandas as pd
from collections import Counter
from itertools import combinations
from statsmodels.stats.inter_rater import fleiss_kappa
from sklearn.metrics import cohen_kappa_score

def _field_matrix_from_length_cols(df: pd.DataFrame, field: str, model_list):
    """Build the (n_items, n_raters) rating matrix of one field as dense ranks.

    Cells are converted to numbers (anything non-numeric counts as missing),
    rows with a missing rating for any rater are dropped, and the remaining
    values are rounded to integers. The distinct integers of the whole field
    are then replaced by their rank 0..K-1 in sorted order.

    Returns:
        ``(ranks, categories, cols)``: the rank matrix, ``[0, ..., K-1]`` and
        the column names used; ``(None, None, None)`` if no complete row is left.
    """
    cols = [f"{m}_{field}" for m in model_list]
    numeric = df[cols].apply(pd.to_numeric, errors="coerce")
    complete = numeric.dropna(axis=0, how="any")
    if complete.empty:
        return None, None, None

    # round to the nearest integer (instead of flooring), then cast
    ratings = complete.round().astype(int).values

    # np.unique returns the sorted distinct values and, for every cell, the
    # position of its value in that sorted list — i.e. the dense rank.
    distinct, inverse = np.unique(ratings, return_inverse=True)
    ranks = inverse.reshape(ratings.shape)

    return ranks, list(range(len(distinct))), cols

def fleiss_kappa_from_length_cols(df: pd.DataFrame, field: str, model_list):
    """Fleiss' kappa for one field, computed on the dense rank categories.

    Returns NaN when the field has no complete row, fewer than two categories,
    or a row whose category counts do not add up to the number of raters.
    """
    ranks, categories, _ = _field_matrix_from_length_cols(df, field, model_list)
    if ranks is None or len(categories) < 2:
        return np.nan

    # Per item: how many raters chose each category.
    freq = np.asarray(
        [[tally[c] for c in categories] for tally in map(Counter, ranks)],
        dtype=int,
    )

    # Safety check: each row must account for exactly one rating per rater.
    ratings_per_item = freq.sum(axis=1)
    if (ratings_per_item != len(model_list)).any():
        return np.nan

    kappa = fleiss_kappa(freq)  # method="fleiss" is the default
    try:
        return float(kappa)
    except Exception:
        return np.nan

def pairwise_cohen_matrix_from_length_cols(df: pd.DataFrame, field: str, model_list, weights=None):
    """Pairwise Cohen's kappa matrix (n x n) for one field.

    Works on the dense ranks of the field and always passes the field-wide
    label set to ``cohen_kappa_score``. The diagonal is left as NaN. An
    all-NaN frame is returned when the field has no complete row or fewer
    than two categories.
    """
    ranks, categories, _ = _field_matrix_from_length_cols(df, field, model_list)
    if ranks is None or len(categories) < 2:
        return pd.DataFrame(np.nan, index=model_list, columns=model_list)

    size = len(model_list)
    grid = np.full((size, size), np.nan, dtype=float)
    for first, second in combinations(range(size), 2):
        # important: fixed, field-wide label set for every pair
        kappa = cohen_kappa_score(
            ranks[:, first], ranks[:, second], weights=weights, labels=categories
        )
        grid[first, second] = kappa
        grid[second, first] = kappa

    # the diagonal intentionally stays NaN
    return pd.DataFrame(grid, index=model_list, columns=model_list)

def kappa_per_field_and_overall_from_lengths(
    df: pd.DataFrame,
    fields=("ingredients_included","ingredients_avoid","cuisine","calories","time"),
    model_list=("LLAMA","Gemini","OpenAI"),
):
    """Agreement statistics per field plus aggregates over all fields.

    For every field, Fleiss' kappa and the pairwise Cohen's kappa matrices
    (unweighted, linear, quadratic) are obtained from
    ``fleiss_kappa_from_length_cols`` and ``pairwise_cohen_matrix_from_length_cols``.

    Args:
        df: Frame holding one ``{model}_{field}`` column per rater and field.
        fields: Field names to evaluate.
        model_list: Rater names (column prefixes).

    Returns:
        ``(per_field, overall_macro, pairwise_overall)``:
        - per_field: field -> dict with "fleiss", the three
          "cohen_*_mean" off-diagonal means and the three "pairwise_*" matrices.
        - overall_macro: mean of each per-field summary over the fields where
          it is not NaN.
        - pairwise_overall: scheme -> DataFrame with, per rater pair, the mean
          kappa over the fields where that pair has a value; NaN where no
          field contributed (this includes the diagonal).
    """
    schemes = (("unweighted", None), ("linear", "linear"), ("quadratic", "quadratic"))
    n = len(model_list)
    index_pairs = list(combinations(range(n), 2))

    def offdiag_mean(M):
        # Mean over the rater pairs above the diagonal, skipping NaN entries.
        vals = [M.iloc[i, j] for i, j in index_pairs if pd.notna(M.iloc[i, j])]
        return float(np.mean(vals)) if vals else np.nan

    # Accumulators for the pairwise overalls. Each weighting scheme has its own
    # counter: one shared counter would be incremented three times per field
    # and divide every mean by three times the number of fields.
    sums = {name: np.zeros((n, n), dtype=float) for name, _ in schemes}
    hits = {name: np.zeros((n, n), dtype=float) for name, _ in schemes}

    per_field = {}
    for f in fields:
        fl = fleiss_kappa_from_length_cols(df, f, model_list)
        mats = {
            name: pairwise_cohen_matrix_from_length_cols(df, f, model_list, weights=w)
            for name, w in schemes
        }

        per_field[f] = {
            "fleiss": fl,
            "cohen_unweighted_mean": offdiag_mean(mats["unweighted"]),
            "cohen_linear_mean":     offdiag_mean(mats["linear"]),
            "cohen_quadratic_mean":  offdiag_mean(mats["quadratic"]),
            "pairwise_unweighted":   mats["unweighted"],
            "pairwise_linear":       mats["linear"],
            "pairwise_quadratic":    mats["quadratic"],
        }

        # Overall accumulation (off-diagonal entries only).
        for name, M in mats.items():
            for i, j in index_pairs:
                v = M.iloc[i, j]
                if pd.notna(v):
                    sums[name][i, j] += v
                    sums[name][j, i] += v
                    hits[name][i, j] += 1
                    hits[name][j, i] += 1

    def avg_over_fields(key):
        vals = [per_field[f][key] for f in fields if pd.notna(per_field[f][key])]
        return float(np.mean(vals)) if vals else np.nan

    overall_macro = {
        "overall_fleiss":                    avg_over_fields("fleiss"),
        "overall_cohen_unweighted_mean":     avg_over_fields("cohen_unweighted_mean"),
        "overall_cohen_linear_mean":         avg_over_fields("cohen_linear_mean"),
        "overall_cohen_quadratic_mean":      avg_over_fields("cohen_quadratic_mean"),
    }

    # Pairwise overalls: mean over fields; entries without any contribution
    # (including the diagonal) stay NaN.
    pairwise_overall = {}
    for name, _ in schemes:
        with np.errstate(divide="ignore", invalid="ignore"):
            mean = np.where(hits[name] > 0, sums[name] / hits[name], np.nan)
        pairwise_overall[name] = pd.DataFrame(mean, index=model_list, columns=model_list)

    return per_field, overall_macro, pairwise_overall


In [38]:
# Fields and raters evaluated in this report ("gt" and "hand" are included as raters).
fields = ("ingredients_included","ingredients_avoid","cuisine","calories")
models = ("LLAMA","Gemini","OpenAI","gt", "hand")

per_field, overall_macro, pairwise_overall = kappa_per_field_and_overall_from_lengths(final_df, fields, models)

# Short report: per-field values first, then the macro averages.
print("Kappa pro Feld:")
for f, rec in per_field.items():
    print(f"\n• {f}")
    if pd.notna(rec['fleiss']):
        print(f"  Fleiss’ κ:                 {rec['fleiss']:.3f}")
    else:
        print("  Fleiss’ κ: NaN")
    for name, key in [("Cohen (ungew.)", "cohen_unweighted_mean"),
                      ("Cohen (linear)", "cohen_linear_mean"),
                      ("Cohen (quadr.)", "cohen_quadratic_mean")]:
        v = rec[key]
        if pd.notna(v):
            print(f"  {name}:                {v:.3f}")
        else:
            print(f"  {name}: NaN")

print("\nOverall (Macro über Felder):")
for k, v in overall_macro.items():
    if pd.notna(v):
        print(f"  {k}: {v:.3f}")
    else:
        print(f"  {k}: NaN")

Kappa pro Feld:

• ingredients_included
  Fleiss’ κ:                 0.693
  Cohen (ungew.):                0.692
  Cohen (linear):                0.681
  Cohen (quadr.):                0.664

• ingredients_avoid
  Fleiss’ κ:                 0.637
  Cohen (ungew.):                0.644
  Cohen (linear):                0.624
  Cohen (quadr.):                0.605

• cuisine
  Fleiss’ κ:                 0.578
  Cohen (ungew.):                0.580
  Cohen (linear):                0.553
  Cohen (quadr.):                0.525

• calories
  Fleiss’ κ: NaN
  Cohen (ungew.): NaN
  Cohen (linear): NaN
  Cohen (quadr.): NaN

Overall (Macro über Felder):
  overall_fleiss: 0.636
  overall_cohen_unweighted_mean: 0.639
  overall_cohen_linear_mean: 0.620
  overall_cohen_quadratic_mean: 0.598


In [39]:
def _field_item_count(df, field, model_list):
    """Count the rows in which every rater column of `field` holds a numeric value."""
    cols = [f"{m}_{field}" for m in model_list]
    numeric = df[cols].apply(pd.to_numeric, errors="coerce")
    # a row counts only if none of its rater cells is missing after coercion
    complete_rows = numeric.notna().all(axis=1)
    return int(complete_rows.sum())

def pairwise_overall_item_weighted(df, fields, model_list, weights=None):
    """Aggregate pairwise Cohen's kappa over fields, weighted by items per field.

    For each field the rating matrix is taken from
    ``_field_matrix_from_length_cols``; fields without a complete row or with
    fewer than two categories are skipped. Each rater pair's kappa contributes
    with weight = number of rows of that field's matrix; NaN kappas do not
    contribute.

    Args:
        df: Frame holding one ``{model}_{field}`` column per rater and field.
        fields: Field names to aggregate over.
        model_list: Rater names; used as row and column labels of the result.
        weights: Passed through to ``cohen_kappa_score``
            (None, "linear" or "quadratic").

    Returns:
        Symmetric DataFrame (raters x raters) of weighted mean kappas. Entries
        without any contribution — including the diagonal — are NaN; the whole
        frame is NaN when no field is usable.
    """
    n = len(model_list)
    weighted_sum = np.zeros((n, n), dtype=float)
    weight_total = np.zeros((n, n), dtype=float)

    for f in fields:
        M, cats, _ = _field_matrix_from_length_cols(df, f, model_list)
        if M is None or len(cats) < 2:
            continue
        # The matrix already contains exactly the complete rows, so its row
        # count is the field's item weight (no second pass over df needed).
        n_items = M.shape[0]
        for i, j in combinations(range(n), 2):
            v = cohen_kappa_score(M[:, i], M[:, j], weights=weights, labels=cats)
            if pd.notna(v):
                weighted_sum[i, j] += v * n_items
                weighted_sum[j, i] += v * n_items
                weight_total[i, j] += n_items
                weight_total[j, i] += n_items

    # Element-wise weighted average; cells that never received a value stay NaN.
    with np.errstate(divide="ignore", invalid="ignore"):
        averaged = np.where(weight_total > 0, weighted_sum / weight_total, np.nan)
    return pd.DataFrame(averaged, index=model_list, columns=model_list, dtype=float)
# Item-weighted pairwise κ for the three weighting schemes; only the
# unweighted matrix is printed below.
pw_unw_item = pairwise_overall_item_weighted(final_df, fields, models, weights=None)
pw_lin_item = pairwise_overall_item_weighted(final_df, fields, models, weights="linear")
pw_quad_item = pairwise_overall_item_weighted(final_df, fields, models, weights="quadratic")

print("\nItem-gewichtetes paarweises κ (ungewichtet):")
print(pw_unw_item.round(3))



Item-gewichtetes paarweises κ (ungewichtet):
        LLAMA  Gemini  OpenAI     gt   hand
LLAMA     NaN   0.952   0.962  0.988  0.154
Gemini  0.952     NaN   0.961  0.951  0.149
OpenAI  0.962   0.961     NaN  0.974  0.141
gt      0.988   0.951   0.974    NaN  0.163
hand    0.154   0.149   0.141  0.163    NaN


In [40]:
# Per field: number of rows in which every rater column holds a numeric value.
N_per_field = {f: _field_item_count(final_df, f, models) for f in fields}
print("Items je Feld:", N_per_field)

# Unweighted pairwise κ per field, to spot the problematic fields:
for f in fields:
    M_unw = per_field[f]["pairwise_unweighted"]
    print(f"\n{f} – paarweise κ (ungewichtet):")
    print(M_unw.round(3))


Items je Feld: {'ingredients_included': 99, 'ingredients_avoid': 100, 'cuisine': 94, 'calories': 100}

ingredients_included – paarweise κ (ungewichtet):
        LLAMA  Gemini  OpenAI     gt   hand
LLAMA     NaN   0.981   0.981  0.981  0.252
Gemini  0.981     NaN   0.962  0.962  0.269
OpenAI  0.981   0.962     NaN  1.000  0.269
gt      0.981   0.962   1.000    NaN  0.269
hand    0.252   0.269   0.269  0.269    NaN

ingredients_avoid – paarweise κ (ungewichtet):
        LLAMA  Gemini  OpenAI    gt  hand
LLAMA     NaN    1.00    1.00  1.00  0.11
Gemini   1.00     NaN    1.00  1.00  0.11
OpenAI   1.00    1.00     NaN  1.00  0.11
gt       1.00    1.00    1.00   NaN  0.11
hand     0.11    0.11    0.11  0.11   NaN

cuisine – paarweise κ (ungewichtet):
        LLAMA  Gemini  OpenAI     gt   hand
LLAMA     NaN   0.871   0.903  0.984  0.098
Gemini  0.871     NaN   0.919  0.887  0.066
OpenAI  0.903   0.919     NaN  0.919  0.041
gt      0.984   0.887   0.919    NaN  0.109
hand    0.098   0.066   0