# ChronoMiner Extraction Evaluation

This notebook evaluates structured data extraction quality across multiple LLM providers and models.

## Setup

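Ensure you have:
1. Corrected transcription files (`*.txt`) in `test_data/input/{category}/`
2. Ground truth JSON files in `test_data/ground_truth/{category}/`, each named `{stem}.json` after the matching transcription file
3. Model outputs in `test_data/output/{category}/{model_name}/`, using the same `{stem}.json` naming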

In [None]:
import json
import sys
from pathlib import Path
from datetime import datetime

import yaml

# Add parent directory for imports
sys.path.insert(0, str(Path.cwd().parent))

from metrics import (
    ExtractionMetrics,
    aggregate_metrics,
    compute_extraction_metrics,
    format_field_metrics_table,
    format_metrics_table,
)

print("Imports successful!")

## Load Configuration

In [None]:
# Read the evaluation settings; the file name is resolved against the
# current working directory.
config_path = Path("eval_config.yaml")
with config_path.open(encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Every dataset/report location in the config is joined onto the current
# working directory.
base_path = Path.cwd()
dataset_cfg = config["dataset"]
input_path = base_path / dataset_cfg["input_path"]
output_path = base_path / dataset_cfg["output_path"]
ground_truth_path = base_path / dataset_cfg["ground_truth_path"]

evaluation_cfg = config["evaluation"]
reports_path = base_path / evaluation_cfg["reports_path"]

# Comparison settings; each falls back to a default when the key is absent.
threshold = evaluation_cfg.get("string_similarity_threshold", 0.85)
case_sensitive = evaluation_cfg.get("case_sensitive", False)
normalize_ws = evaluation_cfg.get("normalize_whitespace", True)
schema_fields = evaluation_cfg.get("schema_fields", {})

categories = dataset_cfg["categories"]
models = config["models"]

category_labels = [c["name"] for c in categories]
model_labels = [m["name"] for m in models]
print(f"Categories: {category_labels}")
print(f"Models: {model_labels}")
print(f"Similarity threshold: {threshold}")

## Discover Available Files

In [None]:
def discover_files(category_name: str, model_name: str):
    """Collect the evaluable documents of one category for one model.

    Scans ``input_path / category_name`` for ``*.txt`` files in sorted
    order, ignoring files whose stem ends in ``_line_ranges`` or
    ``_context``. A document is kept only when a ground truth file
    ``{stem}.json`` exists under ``ground_truth_path / category_name``.

    Args:
        category_name: Name of the category sub-directory.
        model_name: Name of the model sub-directory below the category's
            output directory.

    Returns:
        A list of dicts with the keys ``name``, ``input``, ``output``,
        ``ground_truth`` and ``has_output``. ``has_output`` tells whether
        the expected model output file exists. The list is empty (after
        printing a warning) when the input directory does not exist.
    """
    source_dir = input_path / category_name
    if not source_dir.exists():
        print(f"Warning: Input directory does not exist: {source_dir}")
        return []

    prediction_dir = output_path / category_name / model_name
    truth_dir = ground_truth_path / category_name
    skipped_suffixes = ("_line_ranges", "_context")

    matches = []
    for transcription in sorted(source_dir.glob("*.txt")):
        name = transcription.stem
        if name.endswith(skipped_suffixes):
            continue

        truth_file = truth_dir / f"{name}.json"
        if not truth_file.exists():
            # Without ground truth there is nothing to score against.
            continue

        prediction_file = prediction_dir / f"{name}.json"
        matches.append(
            {
                "name": name,
                "input": transcription,
                "output": prediction_file,
                "ground_truth": truth_file,
                "has_output": prediction_file.exists(),
            }
        )

    return matches

# Preview the discovered files per category, checking outputs of the first
# configured model only.
preview_limit = 5
for cat in categories:
    cat_name = cat["name"]
    files = discover_files(cat_name, models[0]["name"])
    print(f"\n{cat_name}: {len(files)} files with ground truth")

    for entry in files[:preview_limit]:
        if entry["has_output"]:
            marker = "✓"
        else:
            marker = "✗"
        print(f"  {marker} {entry['name']}")

    hidden_count = len(files) - preview_limit
    if hidden_count > 0:
        print(f"  ... and {hidden_count} more")

## Run Evaluation

In [None]:
def load_json(path: Path) -> dict:
    """Load a JSON document from *path*, tolerating unreadable files.

    Args:
        path: Location of a UTF-8 encoded JSON file.

    Returns:
        The parsed JSON content, or an empty dict when the file cannot be
        opened, decoded or parsed. In that case the problem is printed
        instead of raised.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError) as e:
        # OSError: missing or unreadable file. ValueError: covers both
        # json.JSONDecodeError and UnicodeDecodeError. Anything else is a
        # programming error and should surface instead of being reported
        # as "no data".
        print(f"Error loading {path}: {e}")
        return {}

def evaluate_category(category: dict, model: dict) -> ExtractionMetrics:
    """Score one model on one category and aggregate the per-file metrics.

    Files reported by ``discover_files`` are skipped when the model output
    file is missing, or when either the output or the ground truth loads as
    an empty value. Skipped files contribute nothing to the aggregate.

    Args:
        category: Category config entry; its ``name`` and ``schema`` keys
            are read.
        model: Model config entry; its ``name`` key is read.

    Returns:
        The result of ``aggregate_metrics`` over all evaluated files, or a
        default-constructed ``ExtractionMetrics`` when no file was
        evaluated.
    """
    cat_name = category["name"]
    schema_name = category["schema"]
    model_name = model["name"]

    # An empty field list means "no restriction", passed on as None.
    selected_fields = schema_fields.get(schema_name, []) or None

    per_file = []
    for entry in discover_files(cat_name, model_name):
        if not entry["has_output"]:
            continue

        prediction = load_json(entry["output"])
        reference = load_json(entry["ground_truth"])
        if not (prediction and reference):
            continue

        per_file.append(
            compute_extraction_metrics(
                ground_truth=reference,
                hypothesis=prediction,
                fields_to_evaluate=selected_fields,
                threshold=threshold,
                case_sensitive=case_sensitive,
                normalize_ws=normalize_ws,
            )
        )

    return aggregate_metrics(per_file) if per_file else ExtractionMetrics()

# Evaluate every configured model on every configured category.
# Resulting shape: all_metrics[model name][category name] -> metrics object.
all_metrics = {}

for model in models:
    model_name = model["name"]
    per_category = {}
    all_metrics[model_name] = per_category

    for category in categories:
        cat_name = category["name"]
        metrics = evaluate_category(category, model)

        # Only categories with ground truth entries are recorded.
        if metrics.total_gt_entries > 0:
            per_category[cat_name] = metrics
            print(
                f"{model_name} / {cat_name}: "
                f"Entry F1={metrics.entry_f1:.2%}, "
                f"Micro F1={metrics.micro_f1:.2%}"
            )

print("\nEvaluation complete!")

## Results Summary Table

In [None]:
from IPython.display import Markdown, display

# Render the summary across all models and categories as Markdown in the
# cell output.
summary_table = format_metrics_table(all_metrics)
summary_markdown = Markdown(summary_table)
display(summary_markdown)

## Field-Level Breakdown

In [None]:
# One heading plus one per-field table for every evaluated
# model/category pair.
for model_name, cat_metrics in all_metrics.items():
    for cat_name, metrics in cat_metrics.items():
        section_heading = f"### {model_name} / {cat_name}"
        display(Markdown(section_heading))
        field_table = format_field_metrics_table(metrics)
        display(Markdown(field_table))
        print()

## Save Reports

In [None]:
import csv

# A single timestamp is used in the names of all report files of this run.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
reports_path.mkdir(parents=True, exist_ok=True)

# Save JSON
json_path = reports_path / f"eval_results_{timestamp}.json"

# Convert every metrics object to a plain dict so it can be serialised.
serialized_results = {}
for result_model, result_cats in all_metrics.items():
    serialized_results[result_model] = {
        result_cat: cat_result.to_dict()
        for result_cat, cat_result in result_cats.items()
    }

json_data = {
    "timestamp": timestamp,
    "config": {
        "threshold": threshold,
        "case_sensitive": case_sensitive,
        "normalize_whitespace": normalize_ws,
    },
    "results": serialized_results,
}

with json_path.open("w", encoding="utf-8") as json_file:
    json.dump(json_data, json_file, indent=2, ensure_ascii=False)
print(f"Saved: {json_path}")

# Save CSV: one row per model/category pair, scores as percentages rounded
# to two decimals.
csv_path = reports_path / f"eval_results_{timestamp}.csv"
score_columns = (
    "entry_precision",
    "entry_recall",
    "entry_f1",
    "micro_precision",
    "micro_recall",
    "micro_f1",
)

rows = []
for model_name, cat_metrics in all_metrics.items():
    for cat_name, m in cat_metrics.items():
        row = {"model": model_name, "category": cat_name}
        for column in score_columns:
            row[column] = round(getattr(m, column) * 100, 2)
        rows.append(row)

# No file is written when there are no results.
if rows:
    with csv_path.open("w", encoding="utf-8", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=list(rows[0]))
        writer.writeheader()
        writer.writerows(rows)
    print(f"Saved: {csv_path}")

# Save Markdown: a title, the run timestamp and the summary table.
md_path = reports_path / f"eval_results_{timestamp}.md"
md_content = (
    "# ChronoMiner Extraction Evaluation Results\n"
    "\n"
    f"**Generated:** {timestamp}\n"
    "\n"
    "## Summary\n"
    "\n"
    f"{format_metrics_table(all_metrics)}\n"
)
md_path.write_text(md_content, encoding="utf-8")
print(f"Saved: {md_path}")

## Visualization (Optional)

In [None]:
# Grouped bar chart of micro F1 per model (x axis) and category (bar colour).
# Relies on `all_metrics`, `reports_path` and `timestamp` from earlier cells.
try:
    import matplotlib.pyplot as plt
    import numpy as np

    # Prepare data for plotting
    model_names = list(all_metrics)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the bar
    # order and legend stay the same between runs (a set would not).
    cat_names = list(
        dict.fromkeys(cat for cats in all_metrics.values() for cat in cats)
    )

    if not cat_names:
        # Nothing was evaluated; the bar width below would divide by zero.
        print("No evaluation results to plot - skipping visualization")
    else:
        # Create grouped bar chart for F1 scores
        fig, ax = plt.subplots(figsize=(12, 6))

        x = np.arange(len(model_names))
        # Each model's group of bars spans 0.8 of the unit x spacing.
        width = 0.8 / len(cat_names)

        for i, cat in enumerate(cat_names):
            # A model without results for this category falls back to a
            # default-constructed ExtractionMetrics.
            f1_scores = [
                all_metrics[model].get(cat, ExtractionMetrics()).micro_f1 * 100
                for model in model_names
            ]
            # Centre the group of bars on the model's tick position.
            offset = (i - len(cat_names) / 2 + 0.5) * width
            ax.bar(x + offset, f1_scores, width, label=cat)

        ax.set_ylabel("Micro F1 Score (%)")
        ax.set_xlabel("Model")
        ax.set_title("Extraction Quality by Model and Category")
        ax.set_xticks(x)
        ax.set_xticklabels(model_names, rotation=45, ha="right")
        ax.legend(title="Category")
        ax.set_ylim(0, 100)
        ax.grid(axis="y", alpha=0.3)

        plt.tight_layout()

        # Save chart; make sure the target directory exists first.
        reports_path.mkdir(parents=True, exist_ok=True)
        chart_path = reports_path / f"eval_chart_{timestamp}.png"
        plt.savefig(chart_path, dpi=150)
        print(f"Saved: {chart_path}")

        plt.show()

except ImportError:
    print("matplotlib not available - skipping visualization")