# 04_evaluation_and_analysis.ipynb

In [1]:
# %%
# Cell 1 — Imports and paths (load data)
import json
import math
import ntpath
import os
from datetime import datetime
from difflib import SequenceMatcher

import numpy as np
import pandas as pd

# Resolve project root (one level up from current working directory)
root_folder = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Answer key (ground-truth)
answer_invoices_path = os.path.join(root_folder, "data", "answerkey", "invoices.csv")
answer_lineitems_path = os.path.join(root_folder, "data", "answerkey", "lineitems.csv")

# LLM / extracted outputs (predicted)
llm_invoices_path = os.path.join(root_folder, "data", "extractions", "llm_invoices.csv")
llm_lineitems_path = os.path.join(root_folder, "data", "extractions", "llm_lineitems.csv")

print("Resolved paths:")
print(" Answer invoices:", answer_invoices_path)
print(" Answer lineitems:", answer_lineitems_path)
print(" LLM invoices:", llm_invoices_path)
print(" LLM lineitems:", llm_lineitems_path)

# Check files exist (fail early with the offending path instead of a pandas traceback)
for p in [answer_invoices_path, answer_lineitems_path, llm_invoices_path, llm_lineitems_path]:
    if not os.path.exists(p):
        raise FileNotFoundError(f"Required file not found: {p}")

# Load CSVs.
# Identifier columns are read as text: with pandas' default dtype inference an identifier
# can be silently rewritten (leading zeros dropped, or 375991 turned into 375991.0 when the
# column contains a blank), and the evaluation compares identifiers as strings.
# Keys naming a column that a given file does not contain are simply not applied.
ID_COLUMNS_AS_TEXT = {"invoice_id": str, "invoice_number": str}
ans_inv_df = pd.read_csv(answer_invoices_path, dtype=ID_COLUMNS_AS_TEXT)
ans_li_df = pd.read_csv(answer_lineitems_path, dtype=ID_COLUMNS_AS_TEXT)
sub_inv_df = pd.read_csv(llm_invoices_path, dtype=ID_COLUMNS_AS_TEXT)
sub_li_df = pd.read_csv(llm_lineitems_path, dtype=ID_COLUMNS_AS_TEXT)

print("\nLoaded shapes:")
print(" ans_inv:", ans_inv_df.shape)
print(" ans_li:", ans_li_df.shape)
print(" sub_inv (llm):", sub_inv_df.shape)
print(" sub_li (llm):", sub_li_df.shape)

Resolved paths:
 Answer invoices: c:\Stealth AI\Clean Reader\data\answerkey\invoices.csv
 Answer lineitems: c:\Stealth AI\Clean Reader\data\answerkey\lineitems.csv
 LLM invoices: c:\Stealth AI\Clean Reader\data\extractions\llm_invoices.csv
 LLM lineitems: c:\Stealth AI\Clean Reader\data\extractions\llm_lineitems.csv

Loaded shapes:
 ans_inv: (15, 6)
 ans_li: (60, 6)
 sub_inv (llm): (15, 7)
 sub_li (llm): (58, 7)


In [2]:
# Columns every invoice-level CSV (answer key and LLM output) must contain.
REQUIRED_INVOICE_COLUMNS = ["file_path", "invoice_id", "vendor", "date", "total", "invoice_number"]
# Columns every line-item CSV (answer key and LLM output) must contain.
REQUIRED_LINEITEM_COLUMNS = ["file_path", "invoice_id", "description", "quantity", "unit_price", "total"]

def validate_columns(df, required_cols, name):
    """Check that `df` exposes every column listed in `required_cols`.

    Args:
        df: object with a `.columns` collection (a DataFrame in this notebook).
        required_cols: iterable of column names that must be present.
        name: human-readable label of the dataset, used in messages.

    Raises:
        ValueError: if at least one required column is absent; the message lists them
            in the order they appear in `required_cols`.
    """
    available = df.columns
    absent = []
    for column in required_cols:
        if column not in available:
            absent.append(column)

    if len(absent) > 0:
        raise ValueError(f"{name} CSV missing required columns: {absent}")

    print(f"✅ {name}: required columns present.")

def filename_extension_key(p):
    """Return the normalized filename.extension (basename, lowercased).

    The directory part is removed with `ntpath.basename`, which treats both "/" and "\\"
    as separators on every operating system. `os.path.basename` only recognises "\\" on
    Windows, so a Windows-style path stored in a CSV would keep its directories when the
    notebook runs on Linux/macOS and the join key would not match.

    Args:
        p: a file path (any value; it is converted with `str`). Missing values
           (None / NaN) are accepted.

    Returns:
        The last path component, stripped of surrounding whitespace and lowercased,
        or "" when `p` is missing.
    """
    if pd.isna(p):
        return ""
    return ntpath.basename(str(p)).strip().lower()

def normalize_text(s):
    """Return `s` as lowercase text with whitespace runs collapsed to single spaces.

    Missing values (None / NaN) become "". Any other value is converted with `str`
    first, so a float such as 3.0 is rendered as "3.0".
    """
    if pd.isna(s):
        return ""
    # str.split() without arguments discards leading/trailing whitespace and splits on runs.
    words = str(s).split()
    collapsed = " ".join(words)
    return collapsed.lower()

def similar(a, b):
    """Return the difflib similarity ratio of sequences `a` and `b` (a float in [0, 1])."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

def parse_date_try(s):
    """Best-effort conversion of `s` to a `datetime.date`.

    The stripped text is tried against a fixed list of formats in order (day-first
    variants come before the month-first one); if none fits, pandas' parser is used
    with `dayfirst=True`.

    Returns:
        A `datetime.date`, or None when `s` is missing or cannot be parsed.
    """
    if pd.isna(s):
        return None

    text = str(s).strip()
    known_formats = ("%d-%m-%Y", "%d/%m/%Y", "%Y-%m-%d", "%m/%d/%Y", "%d %b %Y", "%d %B %Y")
    for pattern in known_formats:
        try:
            parsed = datetime.strptime(text, pattern)
        except Exception:
            continue
        return parsed.date()

    # None of the explicit formats matched: fall back to pandas' more lenient parser.
    try:
        fallback = pd.to_datetime(text, dayfirst=True, errors="coerce")
        if pd.isna(fallback):
            return None
        return fallback.date()
    except Exception:
        return None

def numeric_equal(a, b, tol=0.01):
    """Return True when `a` and `b` are numerically equal within `tol`.

    Rules:
      * both missing (None / NaN)  -> True
      * exactly one missing        -> False
      * otherwise                  -> |a - b| <= tol, allowing a few units of
        floating-point rounding so that amounts exactly `tol` apart (e.g. 0.30 vs 0.29
        with tol=0.01) are not rejected because decimals are inexact in binary.
      * values that cannot be converted with `float` (or any other error) -> False

    Args:
        a, b: numbers or numeric strings; None / NaN mean "missing".
        tol: maximum accepted absolute difference.
    """
    try:
        a_missing = a is None or pd.isna(a)
        b_missing = b is None or pd.isna(b)
        if a_missing and b_missing:
            return True
        if a_missing or b_missing:
            return False

        x, y, limit = float(a), float(b), float(tol)
        diff = abs(x - y)
        if diff <= limit:
            return True
        if not math.isfinite(diff):
            # inf / nan differences are never "within tolerance"
            return False
        # Rounding slack: a handful of ulps (2**-52 is the double-precision epsilon)
        # of the largest magnitude involved in the comparison.
        slack = 8 * (2.0 ** -52) * max(abs(x), abs(y), abs(limit))
        return diff <= limit + slack
    except Exception:
        return False

def compute_metrics(tp, fp, fn):
    """Derive precision, recall, F1 and "accuracy" from TP / FP / FN counts.

    Any ratio whose denominator is zero is reported as 0.0.
    Note: "accuracy" is computed as tp / (tp + fn), i.e. it always equals recall.

    Returns:
        dict with keys "precision", "recall", "f1", "accuracy" (in that order).
    """
    predicted = tp + fp   # everything the submission produced
    expected = tp + fn    # everything the answer key contains

    if predicted > 0:
        prec = tp / predicted
    else:
        prec = 0.0

    if expected > 0:
        rec = tp / expected
    else:
        rec = 0.0

    if (prec + rec) > 0:
        f1 = 2 * prec * rec / (prec + rec)
    else:
        f1 = 0.0

    # Same expression as recall (no true negatives are tracked).
    acc = rec
    return {"precision": prec, "recall": rec, "f1": f1, "accuracy": acc}

# Reached only after every helper definition in this cell has executed.
print("Helper functions ready.")


Helper functions ready.


In [3]:
# Validate column presence
validate_columns(ans_inv_df, REQUIRED_INVOICE_COLUMNS, "Answer Invoices")
validate_columns(ans_li_df, REQUIRED_LINEITEM_COLUMNS, "Answer Lineitems")
validate_columns(sub_inv_df, REQUIRED_INVOICE_COLUMNS, "LLM Invoices (submitted)")
validate_columns(sub_li_df, REQUIRED_LINEITEM_COLUMNS, "LLM Lineitems (submitted)")

# Create filename.extension keys (the join key between answer key and LLM output)
ans_inv_df["fname_ext"] = ans_inv_df["file_path"].apply(filename_extension_key)
sub_inv_df["fname_ext"] = sub_inv_df["file_path"].apply(filename_extension_key)

# Ensure unique invoice-level fname_ext: each file must describe exactly one invoice row
if ans_inv_df["fname_ext"].duplicated().any():
    dups = ans_inv_df[ans_inv_df["fname_ext"].duplicated(keep=False)][["file_path","fname_ext"]]
    raise ValueError(f"Duplicate filename.extension in answer invoices. Examples:\n{dups.head().to_string(index=False)}")
if sub_inv_df["fname_ext"].duplicated().any():
    dups = sub_inv_df[sub_inv_df["fname_ext"].duplicated(keep=False)][["file_path","fname_ext"]]
    raise ValueError(f"Duplicate filename.extension in llm invoices. Examples:\n{dups.head().to_string(index=False)}")

ans_paths = set(ans_inv_df["fname_ext"].astype(str).unique())
sub_paths = set(sub_inv_df["fname_ext"].astype(str).unique())

print(f"Unique invoice filenames — answer: {len(ans_paths)}, llm: {len(sub_paths)}")
if len(ans_paths) != len(sub_paths):
    print("⚠️ Count mismatch between answer invoices and llm invoices. Proceeding but results may show missing/mismatched invoices.")

# Equal counts do not imply the same files: compare the key sets themselves and
# name the files that appear on one side only.
missing_in_llm = sorted(ans_paths - sub_paths)
extra_in_llm = sorted(sub_paths - ans_paths)
if missing_in_llm:
    print(f"⚠️ {len(missing_in_llm)} answer invoice file(s) missing from llm invoices: {missing_in_llm}")
if extra_in_llm:
    print(f"⚠️ {len(extra_in_llm)} llm invoice file(s) not present in answer invoices: {extra_in_llm}")

# For lineitems create fname_ext and normalized description
ans_li_df["fname_ext"] = ans_li_df["file_path"].apply(filename_extension_key)
sub_li_df["fname_ext"] = sub_li_df["file_path"].apply(filename_extension_key)
ans_li_df["ndesc"] = ans_li_df["description"].apply(normalize_text)
sub_li_df["ndesc"] = sub_li_df["description"].apply(normalize_text)

# Check duplicates in answer lineitems for (fname_ext, ndesc)
dup_mask = ans_li_df.duplicated(subset=["fname_ext","ndesc"], keep=False)
if dup_mask.any():
    example_dups = ans_li_df[dup_mask].head()
    raise ValueError(f"Duplicate (fname_ext + description) pairs found in answer lineitems. Example:\n{example_dups[['file_path','description']].to_string(index=False)}")

print("Basic validation completed successfully.")

# Show small previews
print("\nAnswer invoices head:")
display(ans_inv_df.head())
print("\nLLM invoices head:")
display(sub_inv_df.head())
print("\nAnswer lineitems head:")
display(ans_li_df.head())
print("\nLLM lineitems head:")
display(sub_li_df.head())

✅ Answer Invoices: required columns present.
✅ Answer Lineitems: required columns present.
✅ LLM Invoices (submitted): required columns present.
✅ LLM Lineitems (submitted): required columns present.
Unique invoice filenames — answer: 15, llm: 15
Basic validation completed successfully.

Answer invoices head:


Unnamed: 0,file_path,invoice_id,vendor,date,total,invoice_number,fname_ext
0,Copy of ARPFIINVOEBTCHLASER _10_-page1.jpg,375991,Pacific Food Importers,03-06-2025,491.83,375991,copy of arpfiinvoebtchlaser _10_-page1.jpg
1,Copy of ARPFIINVOEBTCHLASER _10_-page1.pdf,375991,Pacific Food Importers,03-06-2025,491.83,375991,copy of arpfiinvoebtchlaser _10_-page1.pdf
2,Copy of ARPFIINVOEBTCHLASER _11_-page1.pdf,375991,Pacific Food Importers,03-06-2025,491.83,375991,copy of arpfiinvoebtchlaser _11_-page1.pdf
3,Copy of ARPFIINVOEBTCHLASER _11_-page2.pdf,376063,Pacific Food Importers,04-06-2025,100.35,376063,copy of arpfiinvoebtchlaser _11_-page2.pdf
4,Copy of ARPFIINVOEBTCHLASER _11_-page3.jpg,376173,Pacific Food Importers,05-06-2025,100.35,376173,copy of arpfiinvoebtchlaser _11_-page3.jpg



LLM invoices head:


Unnamed: 0,file_path,invoice_id,vendor,date,total,invoice_number,visual_used,fname_ext
0,Copy of ARPFIINVOEBTCHLASER _10_-page1.jpg,inv_00001,Pacific Food Importers,03-06-2025,491.83,375991,False,copy of arpfiinvoebtchlaser _10_-page1.jpg
1,Copy of ARPFIINVOEBTCHLASER _10_-page1.pdf,inv_00002,Pacific Food Importers,03-06-2025,488.97,375991,False,copy of arpfiinvoebtchlaser _10_-page1.pdf
2,Copy of ARPFIINVOEBTCHLASER _11_-page1.pdf,inv_00003,Pacific Food Importers,03-06-2025,488.97,375991,False,copy of arpfiinvoebtchlaser _11_-page1.pdf
3,Copy of ARPFIINVOEBTCHLASER _11_-page2.pdf,inv_00004,Pacific Food Importers,04-06-2025,,376063,True,copy of arpfiinvoebtchlaser _11_-page2.pdf
4,Copy of ARPFIINVOEBTCHLASER _11_-page3.jpg,inv_00005,Pacific Food Importers,05-06-2025,100.35,376173,False,copy of arpfiinvoebtchlaser _11_-page3.jpg



Answer lineitems head:


Unnamed: 0,file_path,invoice_id,description,quantity,unit_price,total,fname_ext,ndesc
0,Copy of ARPFIINVOEBTCHLASER _4_-page1.jpg,378093,FLOUR POWER,12,24.06,288.76,copy of arpfiinvoebtchlaser _4_-page1.jpg,flour power
1,Copy of ARPFIINVOEBTCHLASER _4_-page1.jpg,378093,CORNMEAL YELLOW FINE,1,25.1,25.1,copy of arpfiinvoebtchlaser _4_-page1.jpg,cornmeal yellow fine
2,Copy of ARPFIINVOEBTCHLASER _4_-page1.jpg,378093,GLOVE NITRILE MEDIUM,1,29.78,29.78,copy of arpfiinvoebtchlaser _4_-page1.jpg,glove nitrile medium
3,Copy of ARPFIINVOEBTCHLASER _4_-page1.jpg,378093,SESAME SEEDS WHITE+,1,80.25,80.25,copy of arpfiinvoebtchlaser _4_-page1.jpg,sesame seeds white+
4,Copy of ARPFIINVOEBTCHLASER _4_-page1.jpg,378093,POPPY SEED,1,96.0,96.0,copy of arpfiinvoebtchlaser _4_-page1.jpg,poppy seed



LLM lineitems head:


Unnamed: 0,file_path,invoice_id,description,quantity,unit_price,total,invoice_number,fname_ext,ndesc
0,Copy of ARPFIINVOEBTCHLASER _10_-page1.jpg,inv_00001,flour power,8.0,24.063,192.5,375991,copy of arpfiinvoebtchlaser _10_-page1.jpg,flour power
1,Copy of ARPFIINVOEBTCHLASER _10_-page1.jpg,inv_00001,flour a p unbleached,1.0,21.582,21.58,375991,copy of arpfiinvoebtchlaser _10_-page1.jpg,flour a p unbleached
2,Copy of ARPFIINVOEBTCHLASER _10_-page1.jpg,inv_00001,flour medium rye,1.0,42.8,42.8,375991,copy of arpfiinvoebtchlaser _10_-page1.jpg,flour medium rye
3,Copy of ARPFIINVOEBTCHLASER _10_-page1.jpg,inv_00001,cornmeal yellow fine,1.0,25.861,25.86,375991,copy of arpfiinvoebtchlaser _10_-page1.jpg,cornmeal yellow fine
4,Copy of ARPFIINVOEBTCHLASER _10_-page1.jpg,inv_00001,yeast fresh,1.0,54.62,54.62,375991,copy of arpfiinvoebtchlaser _10_-page1.jpg,yeast fresh


In [4]:
# Invoice-level evaluation (fields: invoice_id, vendor, date, total, invoice_number)
# Every answer-key invoice is looked up in the LLM output by its fname_ext key and each
# field is scored as TP / FP / FN; invoices are iterated from the answer key only.
numeric_tolerance = 0.01
use_fuzzy_vendor = False
vendor_threshold = 0.85  # only used if use_fuzzy_vendor True

# Prepare indices.
# Guarded so this cell can be re-run: once fname_ext has become the index it is no longer
# a column, and an unconditional set_index would raise KeyError on the second run.
for _frame in (ans_inv_df, sub_inv_df):
    if "fname_ext" in _frame.columns:
        _frame.set_index("fname_ext", inplace=True)
    elif _frame.index.name != "fname_ext":
        raise KeyError("fname_ext")
invoice_keys = sorted(list(ans_inv_df.index.astype(str)))

invoice_fields = ["invoice_id", "vendor", "date", "total", "invoice_number"]
counters = {f: {"tp":0, "fp":0, "fn":0} for f in invoice_fields}


def _tally(field, matched, has_answer, has_submission):
    """Score one field of one invoice in `counters`.

    A match is one TP. Otherwise a present answer value counts as a FN and a present
    submitted value counts as a FP (so a wrong value yields both).
    """
    if matched:
        counters[field]["tp"] += 1
        return
    if has_answer:
        counters[field]["fn"] += 1
    if has_submission:
        counters[field]["fp"] += 1


for key in invoice_keys:
    arow = ans_inv_df.loc[key]
    if key not in sub_inv_df.index:
        # missing submission -> all non-null answer fields count as FN
        for f in invoice_fields:
            if pd.notna(arow.get(f, None)) and str(arow.get(f)) != "":
                counters[f]["fn"] += 1
        continue

    srow = sub_inv_df.loc[key]

    # invoice_id (string compare normalized)
    # NOTE(review): in the data previewed by the validation cell the answer key stores the
    # printed invoice number here while the LLM file stores synthetic ids (inv_00001, ...),
    # so this field scores 0 and lowers the averages — confirm it should be evaluated.
    a = normalize_text(arow.get("invoice_id", ""))
    b = normalize_text(srow.get("invoice_id", ""))
    _tally("invoice_id", a == b and a != "", a != "", b != "")

    # vendor (optionally fuzzy)
    a = normalize_text(arow.get("vendor", ""))
    b = normalize_text(srow.get("vendor", ""))
    if use_fuzzy_vendor:
        # Require a non-empty answer, as the exact branch does: two empty strings have
        # similarity 1.0 and would otherwise be counted as a correct extraction.
        match = a != "" and similar(a, b) >= vendor_threshold
    else:
        match = (a == b and a != "")
    _tally("vendor", match, a != "", b != "")

    # date (compared as parsed dates, so formatting differences do not matter)
    a_date = parse_date_try(arow.get("date", None))
    b_date = parse_date_try(srow.get("date", None))
    _tally(
        "date",
        (a_date is not None) and (b_date is not None) and (a_date == b_date),
        a_date is not None,
        b_date is not None,
    )

    # total (numeric tolerance)
    _tally(
        "total",
        numeric_equal(arow.get("total", None), srow.get("total", None), numeric_tolerance),
        pd.notna(arow.get("total", None)),
        pd.notna(srow.get("total", None)),
    )

    # invoice_number (string compare normalized)
    a = normalize_text(arow.get("invoice_number", ""))
    b = normalize_text(srow.get("invoice_number", ""))
    _tally("invoice_number", a == b and a != "", a != "", b != "")

# Compute metrics
inv_metrics = {f: compute_metrics(c["tp"], c["fp"], c["fn"]) for f, c in counters.items()}
print("\nInvoice-level metrics:")
print(json.dumps(inv_metrics, indent=2))

# Unweighted (macro) averages over the evaluated fields
avg_prec = np.mean([m["precision"] for m in inv_metrics.values()])
avg_rec  = np.mean([m["recall"] for m in inv_metrics.values()])
avg_f1   = np.mean([m["f1"] for m in inv_metrics.values()])

print(f"\nInvoice-level averages: Precision={avg_prec:.3f}, Recall={avg_rec:.3f}, F1={avg_f1:.3f}")



Invoice-level metrics:
{
  "invoice_id": {
    "precision": 0.0,
    "recall": 0.0,
    "f1": 0.0,
    "accuracy": 0.0
  },
  "vendor": {
    "precision": 0.9333333333333333,
    "recall": 0.9333333333333333,
    "f1": 0.9333333333333333,
    "accuracy": 0.9333333333333333
  },
  "date": {
    "precision": 0.8666666666666667,
    "recall": 0.8666666666666667,
    "f1": 0.8666666666666667,
    "accuracy": 0.8666666666666667
  },
  "total": {
    "precision": 0.7857142857142857,
    "recall": 0.7333333333333333,
    "f1": 0.7586206896551724,
    "accuracy": 0.7333333333333333
  },
  "invoice_number": {
    "precision": 1.0,
    "recall": 1.0,
    "f1": 1.0,
    "accuracy": 1.0
  }
}

Invoice-level averages: Precision=0.717, Recall=0.707, F1=0.712


In [5]:
# Line-item level evaluation
# Within each invoice (fname_ext) a submitted item is paired with an answer item by
# normalized description: an exact match is taken first, otherwise the most similar
# still-unpaired answer description is used, provided it reaches desc_threshold.

desc_threshold = 0.85  # fuzzy matching threshold
line_fields = ["description", "quantity", "unit_price", "total"]
lf_counters = {field: dict(tp=0, fp=0, fn=0) for field in line_fields}

# Invoices are enumerated from the answer key (its index is the fname_ext key)
invoice_keys = sorted(ans_inv_df.index.astype(str))

for key in invoice_keys:
    answer_rows = ans_li_df.loc[ans_li_df["fname_ext"] == key]
    submitted_rows = sub_li_df.loc[sub_li_df["fname_ext"] == key]

    # normalized description -> queue of answer rows that have not been paired yet
    pending = {}
    for _, answer in answer_rows.iterrows():
        pending.setdefault(answer["ndesc"], []).append(answer)

    # pair submitted items one at a time, in file order (greedy)
    for _, submitted in submitted_rows.iterrows():
        wanted = submitted["ndesc"]

        if pending.get(wanted):
            # exact normalized match with an answer row still available
            chosen, chosen_score = wanted, 1.0
        else:
            # fuzzy search over descriptions that still have an unpaired row;
            # on ties the earliest description wins (strict ">")
            chosen, chosen_score = None, 0.0
            for candidate, queue in pending.items():
                if not queue:
                    continue
                candidate_score = similar(wanted, candidate)
                if candidate_score > chosen_score:
                    chosen, chosen_score = candidate, candidate_score

        if chosen is None or chosen_score < desc_threshold or not pending.get(chosen):
            # no acceptable partner -> every value present in the submitted item is a FP
            lf_counters["description"]["fp"] += 1
            for field in line_fields[1:]:
                if pd.notna(submitted.get(field, None)):
                    lf_counters[field]["fp"] += 1
            continue

        # consume the oldest unpaired answer row for that description
        answer = pending[chosen].pop(0)
        lf_counters["description"]["tp"] += 1

        # quantity, unit_price, total: equal within tolerance -> TP,
        # otherwise FN for a present answer value and FP for a present submitted value
        for field in line_fields[1:]:
            if numeric_equal(answer[field], submitted[field], numeric_tolerance):
                lf_counters[field]["tp"] += 1
                continue
            if pd.notna(answer[field]):
                lf_counters[field]["fn"] += 1
            if pd.notna(submitted[field]):
                lf_counters[field]["fp"] += 1

    # answer rows nobody paired with -> FN for every value they contain
    for queue in pending.values():
        for answer in queue:
            lf_counters["description"]["fn"] += 1
            for field in line_fields[1:]:
                if pd.notna(answer.get(field, None)):
                    lf_counters[field]["fn"] += 1

# Compute line-item metrics
li_metrics = {
    field: compute_metrics(tally["tp"], tally["fp"], tally["fn"])
    for field, tally in lf_counters.items()
}
print("\nLine-item metrics:")
print(json.dumps(li_metrics, indent=2))

# Unweighted averages over the four line-item fields
avg_prec_li = np.mean([scores["precision"] for scores in li_metrics.values()])
avg_rec_li  = np.mean([scores["recall"] for scores in li_metrics.values()])
avg_f1_li   = np.mean([scores["f1"] for scores in li_metrics.values()])
print(f"\nLine-item averages: Precision={avg_prec_li:.3f}, Recall={avg_rec_li:.3f}, F1={avg_f1_li:.3f}")


Line-item metrics:
{
  "description": {
    "precision": 0.9482758620689655,
    "recall": 0.9166666666666666,
    "f1": 0.9322033898305084,
    "accuracy": 0.9166666666666666
  },
  "quantity": {
    "precision": 0.8620689655172413,
    "recall": 0.8333333333333334,
    "f1": 0.847457627118644,
    "accuracy": 0.8333333333333334
  },
  "unit_price": {
    "precision": 0.8448275862068966,
    "recall": 0.8166666666666667,
    "f1": 0.8305084745762712,
    "accuracy": 0.8166666666666667
  },
  "total": {
    "precision": 0.9310344827586207,
    "recall": 0.9,
    "f1": 0.9152542372881356,
    "accuracy": 0.9
  }
}

Line-item averages: Precision=0.897, Recall=0.867, F1=0.881


In [6]:
# Collect per-field metrics and their averages for both levels into one JSON-serializable
# dict; the averages are cast to plain floats because they come from np.mean.
results = dict(
    invoice_metrics=inv_metrics,
    invoice_avg={
        label: float(value)
        for label, value in zip(("precision", "recall", "f1"), (avg_prec, avg_rec, avg_f1))
    },
    lineitem_metrics=li_metrics,
    lineitem_avg={
        label: float(value)
        for label, value in zip(("precision", "recall", "f1"), (avg_prec_li, avg_rec_li, avg_f1_li))
    },
)

print("\nFinal summary:")
print(json.dumps(results, indent=2))


Final summary:
{
  "invoice_metrics": {
    "invoice_id": {
      "precision": 0.0,
      "recall": 0.0,
      "f1": 0.0,
      "accuracy": 0.0
    },
    "vendor": {
      "precision": 0.9333333333333333,
      "recall": 0.9333333333333333,
      "f1": 0.9333333333333333,
      "accuracy": 0.9333333333333333
    },
    "date": {
      "precision": 0.8666666666666667,
      "recall": 0.8666666666666667,
      "f1": 0.8666666666666667,
      "accuracy": 0.8666666666666667
    },
    "total": {
      "precision": 0.7857142857142857,
      "recall": 0.7333333333333333,
      "f1": 0.7586206896551724,
      "accuracy": 0.7333333333333333
    },
    "invoice_number": {
      "precision": 1.0,
      "recall": 1.0,
      "f1": 1.0,
      "accuracy": 1.0
    }
  },
  "invoice_avg": {
    "precision": 0.7171428571428572,
    "recall": 0.7066666666666667,
    "f1": 0.7117241379310345
  },
  "lineitem_metrics": {
    "description": {
      "precision": 0.9482758620689655,
      "recall": 0.91666