# Audubon Bird Plates - Handoff + Contract Assertions

Use this notebook as the entrypoint once a dataset has been scaffolded (i.e., `plates_structured/`, schemas, checksums, ledger).

Dataset root resolution:
- Colab: mounts Drive and searches under `/content/drive/MyDrive/burning-world-series/`
- Local: searches upward from the current working directory for a folder whose name starts with `audubon-bird-plates` and which contains `data.json`
- Override: set `BWS_DATASET_ROOT` to an explicit path


## Handoff: load project context (read-only)

Purpose:
- Resolve dataset root
- Expose canonical paths
- Perform light sanity checks only

This cell must:
- Not write files
- Not validate schemas deeply
- Not compute checksums


In [None]:
from pathlib import Path
import json
import os

def in_colab() -> bool:
    """Report whether this code is running inside Google Colab.

    Detection works by attempting to import ``google.colab``. Any exception
    raised by that import is treated as "not Colab".

    Returns:
        True when ``google.colab`` imports successfully, False otherwise.
    """
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except Exception:
        return False
    else:
        return True

def find_dataset_root(start: Path) -> Path:
    """Locate the dataset root directory.

    Resolution order:

    1. The ``BWS_DATASET_ROOT`` environment variable, when set and non-empty.
       The path is user-expanded and must exist.
    2. ``start`` and then each of its ancestors, nearest first. For every such
       base, the base itself is tested before its children (children are
       visited in sorted order). Testing the base first guarantees that a
       dataset enclosing ``start`` wins over a sibling dataset whose name
       merely sorts earlier.

    A directory qualifies as a dataset root when its name starts with
    ``audubon-bird-plates`` and it contains ``data.json``.

    Args:
        start: Directory from which the search begins.

    Returns:
        The path of the first qualifying directory (or the override path).

    Raises:
        RuntimeError: If ``BWS_DATASET_ROOT`` points at a non-existent path,
            or if no qualifying directory is found.
    """
    override = os.environ.get("BWS_DATASET_ROOT")
    if override:
        p = Path(override).expanduser()
        if p.exists():
            return p
        raise RuntimeError(f"BWS_DATASET_ROOT does not exist: {p}")

    def is_dataset_dir(p: Path) -> bool:
        return (
            p.is_dir()
            and p.name.startswith("audubon-bird-plates")
            and (p / "data.json").exists()
        )

    for base in (start, *start.parents):
        if is_dataset_dir(base):
            return base
        try:
            children = sorted(base.iterdir())
        except OSError:
            # Unreadable or missing directory: keep walking upward.
            continue
        for child in children:
            if is_dataset_dir(child):
                return child

    raise RuntimeError("Could not find dataset root; set BWS_DATASET_ROOT")

# Resolve DATASET_ROOT. Outside Colab the search starts at the current working
# directory; on Colab, Google Drive is mounted first and the search starts at
# the shared project folder.
if not in_colab():
    DATASET_ROOT = find_dataset_root(Path.cwd())
else:
    from google.colab import drive

    drive.mount("/content/drive", force_remount=False)
    project_root = Path("/content/drive/MyDrive/burning-world-series")
    assert project_root.exists(), "Project root missing"
    DATASET_ROOT = find_dataset_root(project_root)

# Canonical locations inside the dataset. Paths are only computed here;
# nothing is created.
PLATES_ORIGINAL = DATASET_ROOT.joinpath("plates")
PLATES_STRUCTURED = DATASET_ROOT.joinpath("plates_structured")
SCHEMA_DIR = DATASET_ROOT.joinpath("schemas")
LEDGER_DIR = DATASET_ROOT.joinpath("ledger")
DATA_JSON = DATASET_ROOT.joinpath("data.json")
README_MD = DATASET_ROOT.joinpath("README.md")

# Light sanity checks: the scaffolded pieces must exist, checked in this order.
for _required_path, _failure_message in (
    (PLATES_STRUCTURED, "plates_structured missing (run audubon_bird_plates_setup.ipynb first)"),
    (SCHEMA_DIR, "schemas missing"),
    (LEDGER_DIR, "ledger missing"),
    (DATA_JSON, "data.json missing"),
):
    assert _required_path.exists(), _failure_message

# One directory per plate is expected under plates_structured/.
plate_dirs = list(filter(Path.is_dir, PLATES_STRUCTURED.iterdir()))
assert len(plate_dirs) == 435, "Unexpected plate count"

def get_plate_dir(plate_number: int) -> Path:
    """Return the structured directory path for a plate.

    The directory name is ``plate-`` followed by the plate number zero-padded
    to at least three characters (for example ``plate-007`` for 7). The path
    is only computed; its existence is not checked.
    """
    padded_number = str(plate_number).zfill(3)
    return PLATES_STRUCTURED.joinpath(f"plate-{padded_number}")

def load_plate_manifest(plate_number: int) -> dict:
    """Read and parse ``manifest.json`` from the given plate's directory.

    The file is decoded as UTF-8 and parsed as JSON. Errors from opening the
    file or parsing it propagate to the caller.
    """
    manifest_path = get_plate_dir(plate_number) / "manifest.json"
    with manifest_path.open(encoding="utf-8") as manifest_file:
        return json.load(manifest_file)

# Summary of the loaded project: resolved root and number of plate directories.
print(
    "Project loaded successfully.",
    f"Dataset root      : {DATASET_ROOT}",
    f"Structured plates : {len(plate_dirs)}",
    "Ready for runs, embeddings, or analysis.",
    sep="\n",
)


## Structure & naming contract assertion (read-only)

Purpose:
- Re-assert filesystem law
- Confirm naming conventions
- Abort early if drift has occurred

This cell only reads from disk and raises on any deviation.


In [None]:
import json
import re

# Banner for this cell's report.
print(
    "\n==============================",
    "STRUCTURE & NAMING ASSERTION",
    "==============================\n",
    sep="\n",
)

# [1] Every canonical root must exist on disk (checked in this order).
for _root, _root_error in (
    (DATASET_ROOT, "DATASET_ROOT missing"),
    (PLATES_STRUCTURED, "plates_structured missing"),
    (SCHEMA_DIR, "schemas missing"),
    (LEDGER_DIR, "ledger missing"),
):
    assert _root.exists(), _root_error

print("[1] Canonical roots present (OK)")

# [2] Exactly 435 plate directories, each named "plate-" plus three digits.
plate_dirs = [entry for entry in PLATES_STRUCTURED.iterdir() if entry.is_dir()]
plate_dirs.sort()
assert len(plate_dirs) == 435, "Plate count must be exactly 435"

plate_name_re = re.compile(r"^plate-\d{3}$")
bad_names = [
    dir_name
    for dir_name in (entry.name for entry in plate_dirs)
    if plate_name_re.match(dir_name) is None
]
assert not bad_names, f"Invalid plate directory names: {bad_names[:5]}"

print("[2] Plate directory naming OK (OK)")

# [3] Each plate directory must hold the required entries, exactly one file in
# source/ (when source/ exists), plus manifest.json and source.sha256.
# All problems are collected first so the failure message can show several.
required_subdirs = {"source", "runs", "viz", "cache"}
violations = []

for plate_dir in plate_dirs:
    # Required entries are matched by name only.
    present = {entry.name for entry in plate_dir.iterdir()}
    missing = required_subdirs - present
    if missing:
        violations.append(f"{plate_dir.name}: missing {missing}")

    source_dir = plate_dir / "source"
    if source_dir.exists():
        # Only regular files count toward the "exactly one" rule.
        files = list(filter(lambda entry: entry.is_file(), source_dir.iterdir()))
        if len(files) != 1:
            violations.append(f"{plate_dir.name}: source/ has {len(files)} files")

    violations.extend(
        f"{plate_dir.name}: missing {fname}"
        for fname in ("manifest.json", "source.sha256")
        if not (plate_dir / fname).exists()
    )

assert not violations, f"Plate structure violations:\n{violations[:5]}"

print("[3] Plate internal structure OK (OK)")

# [4] Each manifest must carry its own directory name as plate_id and point
# (relative to the plate directory) at a source image that exists.
for plate_dir in plate_dirs:
    with (plate_dir / "manifest.json").open(encoding="utf-8") as manifest_file:
        manifest = json.load(manifest_file)

    declared_id = manifest["plate_id"]
    assert declared_id == plate_dir.name, f"{plate_dir.name}: plate_id mismatch"

    src = plate_dir / manifest["source_image"]
    assert src.exists(), f"{plate_dir.name}: source_image missing"

print("[4] Manifest <-> filesystem consistency OK (OK)")

# [5] Run directories are optional; any that exist must match the run naming
# pattern and contain metrics.json. Non-directory entries in runs/ are ignored.
run_re = re.compile(r"^run-\d{8}-\d{6}-[a-f0-9]{8}$")

for plate_dir in plate_dirs:
    runs_dir = plate_dir / "runs"
    run_dirs = (
        [entry for entry in runs_dir.iterdir() if entry.is_dir()]
        if runs_dir.exists()
        else []
    )
    for run in run_dirs:
        assert run_re.match(run.name), f"{plate_dir.name}: invalid run dir {run.name}"
        assert (run / "metrics.json").exists(), f"{run}: missing metrics.json"

print("[5] Run directory naming (if any) OK (OK)")

# [6] The ledger directory must contain at least these files; extra files are
# allowed.
expected_ledgers = {
    "plates.parquet",
    "runs.parquet",
    "embeddings.parquet",
    "segments.parquet",
}

found_ledgers = {entry.name for entry in LEDGER_DIR.iterdir() if entry.is_file()}
assert expected_ledgers <= found_ledgers, (
    f"Ledger files missing: {expected_ledgers - found_ledgers}"
)

print("[6] Ledger scaffolding present (OK)")

# Closing banner, reached only when every assertion in this cell has passed.
print(
    "\n==============================",
    "CONTRACT STATUS: ASSERTED",
    "==============================",
    sep="\n",
)
print(
    "".join(
        (
            "\n",
            "Filesystem, naming, and structural invariants\n",
            "are intact.\n\n",
            "It is now safe to:\n",
            "- start new runs\n",
            "- generate embeddings\n",
            "- perform segmentation\n",
            "- write derived artifacts\n",
        )
    )
)


## Dependencies (Colab)


In [None]:
# Shell escape: installs Pillow (imported as PIL in this notebook), numpy,
# matplotlib and tqdm with pip. Intended for Colab, per the section heading.
!pip install pillow numpy matplotlib tqdm


## Cleanup: remove baseline image metadata outputs

Unlike the read-only cells in this notebook, this cell modifies the dataset: it deletes `input_image.json` (and nothing else) under each `plates_structured/<plate_id>/`.


In [None]:
from pathlib import Path

# Destructive cell: deletes input_image.json directly under each plate
# directory and counts how many were removed. Nothing else is touched.
plates_structured = DATASET_ROOT.joinpath("plates_structured")

removed = 0

for plate_dir in plates_structured.iterdir():
    target = plate_dir / "input_image.json"
    # Non-directory entries under plates_structured/ are skipped.
    if plate_dir.is_dir() and target.exists():
        target.unlink()
        removed += 1

print(
    "Cleanup complete.",
    f"Removed input_image.json files: {removed}",
    "Dataset restored to pre-baseline state.",
    sep="\n",
)


## Repo-wide exploratory (read-only, CPU-only)

Header-only inspection of source images; no writes and no full pixel loads.


In [None]:
import json
from collections import Counter
from PIL import Image

# Turn off Pillow's pixel-count limit so very large plates can be opened
# without a decompression-bomb warning or error.
Image.MAX_IMAGE_PIXELS = None

# Paths are re-derived from DATASET_ROOT so this cell only needs that name.
ROOT = DATASET_ROOT
PLATES_STRUCTURED = ROOT / "plates_structured"
LEDGER_DIR = ROOT / "ledger"
SCHEMA_DIR = ROOT / "schemas"

print(
    "\n==============================",
    "REPO EXPLORATORY REPORT",
    "==============================\n",
    sep="\n",
)

# [1] Names of the directories and files directly under the dataset root.
root_entries = list(ROOT.iterdir())
top_dirs = sorted(entry.name for entry in root_entries if entry.is_dir())
top_files = sorted(entry.name for entry in root_entries if entry.is_file())

print("[1] Top-level directories:")
for dir_name in top_dirs:
    print("  -", dir_name)

print("\n[1] Top-level files:")
for file_name in top_files:
    print("  -", file_name)

# [2] Inventory of plate directories: manifest presence, source/ contents and
# run counts. Nothing is asserted here; problems are only counted.
plate_dirs = sorted(
    p for p in PLATES_STRUCTURED.iterdir()
    if p.is_dir() and p.name.startswith("plate-")
)

print(f"\n[2] Plates found: {len(plate_dirs)}")

missing_manifest = []  # plate names without manifest.json
missing_source = []    # plate names whose source/ is absent or lacks exactly one file
run_counts = []        # run-directory count per plate, parallel to plate_dirs

for p in plate_dirs:
    if not (p / "manifest.json").exists():
        missing_manifest.append(p.name)

    src_dir = p / "source"
    # is_dir() instead of exists(): a stray regular file named "source" is
    # reported as a problem rather than crashing iterdir(). Only regular files
    # are counted, matching the "exactly one file in source/" rule used by the
    # structure contract check, so a subdirectory does not skew the result.
    files = [f for f in src_dir.iterdir() if f.is_file()] if src_dir.is_dir() else []
    if len(files) != 1:
        missing_source.append(p.name)

    runs_dir = p / "runs"
    run_counts.append(
        sum(1 for r in runs_dir.iterdir() if r.is_dir()) if runs_dir.is_dir() else 0
    )

print("    Missing manifest.json:", len(missing_manifest))
print("    Missing/invalid source:", len(missing_source))
print("    Plates with runs:", sum(1 for c in run_counts if c > 0))
print("    Total runs:", sum(run_counts))

# [3] Count how often each top-level key appears in the manifests of the
# first 50 plates. This stays best-effort: a manifest that cannot be read or
# parsed is skipped, but it is now recorded and reported instead of being
# dropped silently.
print("\n[3] Manifest field coverage (sampled)")

key_counter = Counter()
sampled = 0
unreadable_manifests = []  # (plate name, exception type name)

for p in plate_dirs[:50]:
    try:
        m = json.loads((p / "manifest.json").read_text(encoding="utf-8"))
        key_counter.update(m.keys())
    except Exception as exc:
        unreadable_manifests.append((p.name, type(exc).__name__))
        continue
    sampled += 1

for k, v in key_counter.most_common():
    print(f"  {k:20s} -> present in {v}/{sampled}")

if unreadable_manifests:
    print(f"  Skipped unreadable manifests: {len(unreadable_manifests)}")
    for plate_name, reason in unreadable_manifests[:5]:
        print(f"    - {plate_name}: {reason}")

# [4] Megapixel statistics taken from image headers. Only ``img.size`` is
# read, so pixel data is never decoded. This stays best-effort: a plate whose
# source image is missing or unreadable is skipped, but it is now recorded and
# reported so the statistics cannot silently cover fewer plates than expected.
print("\n[4] Image header stats (source images only)")

sizes = []
huge = []
unreadable_images = []  # (plate name, exception type name)

for p in plate_dirs:
    try:
        # Take the first regular file in sorted order: this is deterministic
        # and ignores subdirectories, unlike an arbitrary first iterdir() entry.
        candidates = sorted(f for f in (p / "source").iterdir() if f.is_file())
        src = candidates[0]
        with Image.open(src) as img:
            w, h = img.size
    except Exception as exc:
        unreadable_images.append((p.name, type(exc).__name__))
        continue
    mp = (w * h) / 1_000_000
    sizes.append(mp)
    if mp > 90:
        huge.append((p.name, round(mp, 2)))

if sizes:
    print(f"    Min megapixels: {round(min(sizes), 2)}")
    print(f"    Max megapixels: {round(max(sizes), 2)}")
    print(f"    Mean megapixels: {round(sum(sizes)/len(sizes), 2)}")
    print(f"    Images > 90 MP (PIL warning risk): {len(huge)}")

for name, mp in huge[:5]:
    print(f"      - {name}: {mp} MP")

if unreadable_images:
    print(f"    Skipped (missing/unreadable source image): {len(unreadable_images)}")
    for plate_name, reason in unreadable_images[:5]:
        print(f"      - {plate_name}: {reason}")

# [5] Ledger files with their sizes in KB (regular files only).
print("\n[5] Ledger files")
if LEDGER_DIR.exists():
    for p in sorted(LEDGER_DIR.iterdir()):
        if p.is_file():
            print(f"  - {p.name:20s} {round(p.stat().st_size/1024, 1)} KB")
else:
    print("  Ledger directory missing")

# [6] Schema directory listing. This section was previously also labelled
# [5], which duplicated the ledger section's number in the report.
print("\n[6] Schemas")
if SCHEMA_DIR.exists():
    for p in sorted(SCHEMA_DIR.iterdir()):
        print("  -", p.name)
else:
    print("  Schema directory missing")

# Final summary: one line per headline figure gathered by this cell.
print(
    "\n==============================",
    "SUMMARY",
    "==============================",
    sep="\n",
)

summary_parts = (
    f"\n",
    f"Plates                : {len(plate_dirs)}\n",
    f"Total runs            : {sum(run_counts)}\n",
    f"Images > 90 MP        : {len(huge)}\n",
    f"Manifests missing     : {len(missing_manifest)}\n",
    f"Source issues         : {len(missing_source)}\n",
    f"Schemas present       : {SCHEMA_DIR.exists()}\n",
    f"Ledgers present       : {LEDGER_DIR.exists()}\n",
)
print("".join(summary_parts))

print("Exploratory complete. No files written.")
