# Audubon Bird Plates - Dataset Setup

This notebook is an idempotent scaffolding + validation workflow for the Audubon bird plates dataset.

It performs:
- Dataset discovery (Colab Drive or local folder)
- Disk/JSON sanity checks (expects 435 plates)
- Creation of a structured per-plate directory layout (`plates_structured/plate-###/...`)
- Per-plate `manifest.json` generation
- JSON Schema authoring + validation
- Per-plate source image SHA-256 checksums (`source.sha256`)
- Run-system helpers (`runs/run-.../metrics.json`)
- Ledger scaffolding (empty Parquet files with schemas)
- Read-only system validation report

Dataset root resolution:
- Colab: mounts Drive and searches under `/content/drive/MyDrive/burning-world-series/`
- Local: starting at the current working directory and walking up through its parents, looks for a folder whose name starts with `audubon-bird-plates` and that contains both `plates/` and `data.json`
- Override: set `BWS_DATASET_ROOT` to an explicit path


In [None]:
!pip -q install jsonschema pyarrow


## 0) Locate dataset

This step resolves `DATASET_ROOT` (Colab or local) and sets canonical paths.


In [None]:
from pathlib import Path
import os

def in_colab() -> bool:
    """Report whether the `google.colab` package can be imported.

    Returns:
        True when the import succeeds, False when it raises any `Exception`.
    """
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except Exception:
        return False
    else:
        return True

def find_dataset_root(start: Path) -> Path:
    """Resolve the dataset root directory.

    Resolution order:
      1. The `BWS_DATASET_ROOT` environment variable, when set and non-empty.
      2. `start` itself, when it looks like a dataset directory.
      3. The first matching child (in sorted order) of `start`, then of each
         of its parents, nearest first.

    A directory "looks like a dataset" when its name starts with
    `audubon-bird-plates` and it contains both `plates` and `data.json`.

    Args:
        start: Directory where the search begins.

    Returns:
        The resolved dataset root.

    Raises:
        RuntimeError: If the override path does not exist, or if no candidate
            directory is found.
    """
    explicit = os.environ.get("BWS_DATASET_ROOT")
    if explicit:
        explicit_path = Path(explicit).expanduser()
        if not explicit_path.exists():
            raise RuntimeError(f"BWS_DATASET_ROOT does not exist: {explicit_path}")
        return explicit_path

    def is_dataset_dir(p: Path) -> bool:
        # Short-circuits in the same order as a chain of early returns would.
        return (
            p.is_dir()
            and p.name.startswith("audubon-bird-plates")
            and (p / "plates").exists()
            and (p / "data.json").exists()
        )

    if is_dataset_dir(start):
        return start

    for ancestor in (start, *start.parents):
        try:
            entries = list(ancestor.iterdir())
        except Exception:
            # Directories that cannot be listed are skipped, not fatal.
            continue
        for candidate in sorted(entries):
            if is_dataset_dir(candidate):
                return candidate

    raise RuntimeError("Could not find dataset root; set BWS_DATASET_ROOT")

# Resolve DATASET_ROOT: locally the search starts at the working directory;
# in Colab, Drive is mounted first and the search starts at the project folder.
if not in_colab():
    DATASET_ROOT = find_dataset_root(Path.cwd())
else:
    from google.colab import drive

    drive.mount("/content/drive", force_remount=False)
    project_root = Path("/content/drive/MyDrive/burning-world-series")
    assert project_root.exists(), f"Missing project root: {project_root}"
    DATASET_ROOT = find_dataset_root(project_root)

# Canonical paths derived from the dataset root.
PLATES_ROOT, DATA_JSON, README_MD = (
    DATASET_ROOT / leaf for leaf in ("plates", "data.json", "README.md")
)

assert PLATES_ROOT.exists(), f"Missing plates/: {PLATES_ROOT}"
assert DATA_JSON.exists(), f"Missing data.json: {DATA_JSON}"

print("Dataset root:", DATASET_ROOT)
print("Plates root :", PLATES_ROOT)


## 1) Index plates + validate metadata

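Build an in-memory index of every `plate-*.jpg` found (recursively) under `plates/`, check that `data.json` holds exactly 435 entries whose file names match their plate numbers, then confirm that every file it references exists on disk.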

In [None]:
import json
import re

# Index every plate image by bare file name; a later path with the same name
# replaces an earlier one.
plate_files = {}
for image_path in PLATES_ROOT.rglob("plate-*.jpg"):
    plate_files[image_path.name] = image_path

print(f"Indexed {len(plate_files)} plate images on disk.")
for file_name, image_path in list(plate_files.items())[:10]:
    print(" ", file_name, "->", image_path)

metadata = json.loads(DATA_JSON.read_text(encoding="utf-8"))

assert isinstance(metadata, list)
assert len(metadata) == 435, f"Expected 435 entries, got {len(metadata)}"

# File names must look like plate-<number>[-<slug>].jpg, and <number> must
# agree with the entry's own "plate" field.
PLATE_FILENAME_RE = re.compile(r"^plate-(\d+)(?:-[a-z0-9-]+)?\.jpg$")
bad_filenames = []
bad_plate_numbers = []
for record in metadata:
    record_file = record["fileName"]
    matched = PLATE_FILENAME_RE.match(record_file)
    if matched is None:
        bad_filenames.append(record_file)
    elif int(matched.group(1)) != int(record["plate"]):
        bad_plate_numbers.append((record["plate"], record_file))
assert not bad_filenames, f"Bad plate filenames (sample): {bad_filenames[:10]}"
assert not bad_plate_numbers, f"Plate/fileName mismatch (sample): {bad_plate_numbers[:10]}"

# Every file name referenced by the metadata must be present in the index.
missing = [
    record["fileName"]
    for record in metadata
    if record["fileName"] not in plate_files
]

print(f"Missing files referenced in JSON: {len(missing)}")
if missing:
    for absent in missing[:10]:
        print(" ", absent)
    raise RuntimeError("Metadata/disk mismatch")

print("All metadata filenames resolved on disk.")


## 2) Create `plates_structured/` + per-plate manifests

Creates a stable directory layout for each plate and writes a minimal `manifest.json` once.

- Safe to re-run: existing manifests are reused.
- Source images are copied once into `source/`.


In [None]:
import shutil
from datetime import datetime

# Root of the per-plate directory layout; exist_ok=True makes re-running this
# cell a no-op when the directory is already there.
STRUCTURED_ROOT = DATASET_ROOT / "plates_structured"
STRUCTURED_ROOT.mkdir(exist_ok=True)

def plate_id(n: int) -> str:
    """Return the canonical plate identifier for plate number `n`.

    The number is zero-padded to at least three digits, e.g. 7 -> "plate-007".
    """
    return f"plate-{n:03d}"

# Counters for the summary printed after the loop: manifests written for the
# first time vs. plates whose existing manifest was reused.
created = 0
kept = 0

# For every metadata entry: ensure the plate's directory skeleton exists,
# ensure the source image lives at source/<plate-id>.jpg, and ensure a
# manifest.json exists that points at that canonical image path.
for entry in metadata:
    plate_num = int(entry["plate"])
    pid = plate_id(plate_num)

    plate_dir  = STRUCTURED_ROOT / pid
    source_dir = plate_dir / "source"
    runs_dir   = plate_dir / "runs"
    viz_dir    = plate_dir / "viz"
    cache_dir  = plate_dir / "cache"

    # exist_ok=True keeps this safe on re-runs.
    for d in (source_dir, runs_dir, viz_dir, cache_dir):
        d.mkdir(parents=True, exist_ok=True)

    # src_path is None when the metadata file name is not in the disk index.
    src_path = plate_files.get(entry["fileName"])
    canonical_name = f"{pid}.jpg"
    dst_path = source_dir / canonical_name
    canonical_rel = f"source/{canonical_name}"

    manifest_path = plate_dir / "manifest.json"
    if manifest_path.exists():
        # Re-run path: keep the existing manifest, but migrate the source image
        # (and the manifest's pointer to it) to the canonical file name.
        manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
        src_rel = manifest.get("source_image")
        src_existing = plate_dir / src_rel if isinstance(src_rel, str) else None

        if not dst_path.exists():
            if src_existing and src_existing.exists() and src_existing.parent == source_dir:
                # The manifest points at a differently named file in source/:
                # rename it in place instead of copying again.
                src_existing.rename(dst_path)
            else:
                # The manifest pointer is unusable; fall back to whatever is in
                # source/ (exactly one file -> rename it, none -> copy afresh).
                files = [p for p in source_dir.iterdir() if p.is_file()]
                if len(files) == 1:
                    files[0].rename(dst_path)
                elif len(files) == 0:
                    if src_path is None:
                        raise RuntimeError(f"Missing source image on disk: {entry['fileName']}")
                    shutil.copy2(src_path, dst_path)
                else:
                    # Ambiguous: refuse to guess which file is the source.
                    raise RuntimeError(f"{pid}: source/ contains {len(files)} files")

        # Rewrite the manifest only when its source_image pointer changes.
        if manifest.get("source_image") != canonical_rel:
            manifest["source_image"] = canonical_rel
            manifest_path.write_text(
                json.dumps(manifest, indent=2, ensure_ascii=False),
                encoding="utf-8",
            )

        kept += 1
        continue

    # First-run path: copy the source image (if not already there) and write
    # a fresh manifest.
    if not dst_path.exists():
        if src_path is None:
            raise RuntimeError(f"Missing source image on disk: {entry['fileName']}")
        shutil.copy2(src_path, dst_path)

    manifest = {
        "plate_id": pid,
        "plate_number": plate_num,
        "title": entry["name"],
        "slug": entry["slug"],
        "source_image": canonical_rel,
        "download_url": entry.get("download"),
        "license": "Public Domain",
        # NOTE(review): datetime.utcnow() is deprecated as of Python 3.12;
        # consider a timezone-aware timestamp when this cell is next touched.
        "created_at": datetime.utcnow().isoformat() + "Z"
    }

    manifest_path.write_text(
        json.dumps(manifest, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    created += 1

print(f"Plates created : {created}")
print(f"Plates reused : {kept}")


## 3) Write JSON Schemas

Schemas are persisted under `schemas/` so tooling can validate manifests and run metadata.

In [None]:
from pathlib import Path

# Directory where the JSON Schema files are persisted.
SCHEMA_DIR = DATASET_ROOT / "schemas"
SCHEMA_DIR.mkdir(exist_ok=True)

# Schema for each plate's manifest.json. The five "required" keys must be
# present; "download_url", "license" and "created_at" are optional, and
# additionalProperties=False rejects any key not listed under "properties".
# NOTE(review): "format": "date-time" is presumably only enforced when the
# validator is built with a format checker -- confirm against jsonschema docs.
PLATE_MANIFEST_SCHEMA = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "$id": "https://burning-world-series/schemas/plate.manifest.schema.json",
    "title": "Audubon Plate Manifest",
    "type": "object",
    "required": [
        "plate_id",
        "plate_number",
        "title",
        "slug",
        "source_image",
    ],
    "properties": {
        "plate_id": {"type": "string", "pattern": "^plate-[0-9]{3}$"},
        "plate_number": {"type": "integer", "minimum": 1, "maximum": 435},
        "title": {"type": "string"},
        "slug": {"type": "string"},
        "source_image": {"type": "string"},
        "download_url": {"type": ["string", "null"]},
        "license": {"type": "string"},
        "created_at": {"type": "string", "format": "date-time"},
    },
    "additionalProperties": False,
}

# Schema for a run manifest: every key except "notes" is required, and
# unknown keys are rejected.
RUN_MANIFEST_SCHEMA = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "$id": "https://burning-world-series/schemas/run.manifest.schema.json",
    "title": "Plate Run Manifest",
    "type": "object",
    "required": ["run_id", "plate_id", "timestamp", "models", "outputs"],
    "properties": {
        "run_id": {"type": "string"},
        "plate_id": {"type": "string"},
        "timestamp": {"type": "string", "format": "date-time"},
        "models": {"type": "array", "items": {"type": "string"}},
        "outputs": {"type": "array", "items": {"type": "string"}},
        "notes": {"type": ["string", "null"]},
    },
    "additionalProperties": False,
}

def write_schema(path: Path, schema: dict):
    """Serialize `schema` as 2-space-indented JSON into `path` (UTF-8).

    Any existing file at `path` is overwritten.
    """
    with path.open("w", encoding="utf-8") as handle:
        json.dump(schema, handle, indent=2)

# Persist both schemas under SCHEMA_DIR, plate manifest first.
for schema_filename, schema_body in (
    ("plate.manifest.schema.json", PLATE_MANIFEST_SCHEMA),
    ("run.manifest.schema.json", RUN_MANIFEST_SCHEMA),
):
    write_schema(SCHEMA_DIR / schema_filename, schema_body)

print("Schemas written to:", SCHEMA_DIR)


## 4) Validate manifests + sources

Validates every plate's `manifest.json` against the schema, and confirms the referenced source image exists.

In [None]:
from jsonschema import Draft202012Validator

# Validate every plate manifest against the persisted plate schema and check
# that the source image each one references exists. All problems are collected
# first and reported together, so one bad manifest cannot hide the others.
SCHEMA_PATH = SCHEMA_DIR / "plate.manifest.schema.json"
plate_schema = json.loads(SCHEMA_PATH.read_text(encoding="utf-8"))
validator = Draft202012Validator(plate_schema)

errors = []
checked = 0

# sorted() keeps the reported error sample stable between runs.
for plate_dir in sorted((DATASET_ROOT / "plates_structured").iterdir()):
    if not plate_dir.is_dir():
        continue

    manifest_path = plate_dir / "manifest.json"
    if not manifest_path.exists():
        errors.append(f"{plate_dir.name}: missing manifest.json")
        continue

    try:
        manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        # Record the broken file and keep going instead of aborting the cell.
        errors.append(f"{plate_dir.name}: manifest.json is not valid JSON ({exc})")
        continue

    for err in validator.iter_errors(manifest):
        errors.append(f"{plate_dir.name}: {list(err.path)} -> {err.message}")

    # A manifest that is not an object, or whose source_image is missing or
    # not a string, has already been reported by the schema check above.
    # Only probe the filesystem when there is a usable path, so those
    # manifests do not raise KeyError/TypeError and discard the error list.
    src_rel = manifest.get("source_image") if isinstance(manifest, dict) else None
    if isinstance(src_rel, str):
        src = plate_dir / src_rel
        if not src.exists():
            errors.append(f"{plate_dir.name}: missing source image {src}")

    checked += 1

print(f"Validated {checked} plate manifests.")

if errors:
    print("\nVALIDATION ERRORS:")
    for e in errors[:20]:
        print(" ", e)
    raise RuntimeError(f"{len(errors)} validation errors found.")
else:
    print("All plate manifests are valid.")


## 5) Write / verify per-plate source checksums

Creates `source.sha256` in each plate directory for integrity checks.

- If `source.sha256` exists, it's verified.
- If missing, it's created.


In [None]:
import hashlib

def sha256(path: Path, chunk_size: int = 8192) -> str:
    """Return the hex SHA-256 digest of the file at `path`.

    The file is read in binary mode, `chunk_size` bytes at a time, so it is
    never loaded into memory as a whole.
    """
    hasher = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(chunk_size)
            if block == b"":
                break
            hasher.update(block)
    return hasher.hexdigest()

PLATES_STRUCTURED = DATASET_ROOT / "plates_structured"

checksums_written = 0
verified = 0

# For each plate directory that has a manifest: hash the source image, then
# either record the digest (first run) or compare it with the recorded one.
for plate_dir in PLATES_STRUCTURED.iterdir():
    manifest_path = plate_dir / "manifest.json"
    if not (plate_dir.is_dir() and manifest_path.exists()):
        continue

    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    src_path = plate_dir / manifest["source_image"]
    checksum_path = plate_dir / "source.sha256"
    digest = sha256(src_path)

    if not checksum_path.exists():
        checksum_path.write_text(digest)
        checksums_written += 1
        continue

    recorded = checksum_path.read_text().strip()
    if recorded != digest:
        # Stop at the first mismatch: the source image no longer matches
        # the digest recorded for it.
        raise RuntimeError(
            f"CHECKSUM MISMATCH: {plate_dir.name}\n"
            f" recorded: {recorded}\n"
            f" current : {digest}"
        )
    verified += 1

print(f"Checksums written : {checksums_written}")
print(f"Checksums verified: {verified}")


## 6) Run system helpers

Defines a tiny run system for append-only processing outputs under each plate's `runs/` directory.

In [None]:
import hashlib
from datetime import datetime, timezone

def generate_run_id(models: list, note: str = "") -> str:
    """Generate a sortable run ID.

    Format: ``run-YYYYMMDD-HHMMSS-<hash>``, where the timestamp is the current
    UTC time at one-second resolution and ``<hash>`` is the first 8 hex digits
    of the SHA-1 of the model names and the note joined with ``|``.

    Two calls with the same `models` and `note` inside the same second return
    the same ID.

    Args:
        models: Model names (strings) that take part in the run.
        note: Optional free-text note mixed into the hash.

    Returns:
        The run ID string.
    """
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
    payload = "|".join(models) + "|" + note
    # SHA-1 is used only as a short fingerprint here, not for security.
    short = hashlib.sha1(payload.encode()).hexdigest()[:8]
    return f"run-{stamp}-{short}"

def start_run(plate_dir: Path, models: list, note: str = "") -> Path:
    """Initialize a new run directory and its manifest.

    Creates ``<plate_dir>/runs/<run_id>/`` and writes ``metrics.json`` inside
    it, with an empty ``outputs`` list.

    Args:
        plate_dir: The plate's directory; its name is recorded as ``plate_id``.
        models: Model names recorded in the manifest and mixed into the run ID.
        note: Optional note; stored as ``null`` when empty.

    Returns:
        Path of the newly created run directory.

    Raises:
        RuntimeError: If a run directory with the generated ID already exists.
    """
    run_id = generate_run_id(models, note)
    run_dir = plate_dir / "runs" / run_id

    # Let mkdir itself detect an existing directory, so the existence check
    # and the creation are a single step rather than two separate ones.
    try:
        run_dir.mkdir(parents=True)
    except FileExistsError as exc:
        raise RuntimeError(f"Run already exists: {run_dir}") from exc

    manifest = {
        "run_id": run_id,
        "plate_id": plate_dir.name,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "models": models,
        "outputs": [],
        "notes": note or None,
    }

    (run_dir / "metrics.json").write_text(
        json.dumps(manifest, indent=2),
        encoding="utf-8",
    )

    return run_dir

def register_output(run_dir: Path, relative_path: str):
    """Register an output artifact in the run's ``metrics.json``.

    `relative_path` is appended to the manifest's ``outputs`` list unless it
    is already listed. The manifest file is rewritten either way.

    Args:
        run_dir: Run directory that contains ``metrics.json``.
        relative_path: Artifact path, relative to the run directory.
    """
    manifest_path = run_dir / "metrics.json"
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))

    # Keep the list duplicate-free so repeated registration is harmless.
    if relative_path not in manifest["outputs"]:
        manifest["outputs"].append(relative_path)

    manifest_path.write_text(
        json.dumps(manifest, indent=2),
        encoding="utf-8",
    )


## 7) Ledger initialization (derived, rebuildable)

Creates **empty** Parquet files with the correct schemas.

- Safe to re-run.
- Does **not** populate ledgers.


In [None]:
import pyarrow as pa
import pyarrow.parquet as pq

# Directory that holds the ledger Parquet files.
LEDGER_DIR = DATASET_ROOT / "ledger"
LEDGER_DIR.mkdir(exist_ok=True)

# One row per plate.
PLATES_SCHEMA = pa.schema([
    ("plate_id", pa.string()),
    ("plate_number", pa.int16()),
    ("title", pa.string()),
    ("source_checksum", pa.string()),
])

# One row per run; the timestamp is stored at millisecond precision in UTC.
RUNS_SCHEMA = pa.schema([
    ("run_id", pa.string()),
    ("plate_id", pa.string()),
    ("timestamp", pa.timestamp("ms", tz="UTC")),
    ("models", pa.list_(pa.string())),
    ("notes", pa.string()),
])

# Embedding vectors as variable-length float32 lists.
EMBEDDINGS_SCHEMA = pa.schema([
    ("run_id", pa.string()),
    ("plate_id", pa.string()),
    ("model", pa.string()),
    ("vector", pa.list_(pa.float32())),
])

# Segments keyed by run, plate and segment ID.
SEGMENTS_SCHEMA = pa.schema([
    ("run_id", pa.string()),
    ("plate_id", pa.string()),
    ("segment_id", pa.string()),
    ("area_ratio", pa.float32()),
])

# Ledger file name -> schema of the empty table written for it.
SCHEMAS = {
    "plates.parquet": PLATES_SCHEMA,
    "runs.parquet": RUNS_SCHEMA,
    "embeddings.parquet": EMBEDDINGS_SCHEMA,
    "segments.parquet": SEGMENTS_SCHEMA,
}

# Write a zero-row Parquet file for each ledger that does not exist yet;
# existing files are left untouched.
for name, schema in SCHEMAS.items():
    ledger_path = LEDGER_DIR / name
    if ledger_path.exists():
        print(f"Exists  {name}")
        continue

    empty_columns = [pa.array([], type=field.type) for field in schema]
    pq.write_table(pa.Table.from_arrays(empty_columns, schema=schema), ledger_path)
    print(f"Created {name}")

print("Ledger initialization complete.")


## 8) System validation report (read-only)

This section asserts core invariants without mutating state.

In [None]:
import json
import re
from jsonschema import Draft202012Validator

# Naming conventions checked by the report. PLATE_FILENAME_RE here is the
# canonical source image name inside plates_structured/.
PLATE_DIR_RE = re.compile(r"^plate-[0-9]{3}$")
PLATE_FILENAME_RE = re.compile(r"^plate-[0-9]{3}\.jpg$")
RUN_DIR_RE = re.compile(r"^run-[0-9]{8}-[0-9]{6}-[0-9a-f]{8}$")

print("\n==============================")
print("SYSTEM VALIDATION REPORT")
print("==============================\n")

# [1] Directory count and naming.
PLATES_STRUCTURED = DATASET_ROOT / "plates_structured"
plates = sorted(entry for entry in PLATES_STRUCTURED.iterdir() if entry.is_dir())

print(f"[1] Plates structured: {len(plates)}")
assert len(plates) == 435, "Plate count mismatch (expected 435)"

bad_names = [
    directory.name
    for directory in plates
    if PLATE_DIR_RE.match(directory.name) is None
]
assert not bad_names, f"Bad plate directory names: {bad_names[:5]}"
print("    OK Plate directory count and naming OK")

# Both schema files must have been persisted.
PLATE_SCHEMA_PATH = DATASET_ROOT / "schemas" / "plate.manifest.schema.json"
RUN_SCHEMA_PATH = DATASET_ROOT / "schemas" / "run.manifest.schema.json"

for schema_path in (PLATE_SCHEMA_PATH, RUN_SCHEMA_PATH):
    assert schema_path.exists(), f"Missing {schema_path.name}"
print("    OK Schemas present")

plate_schema = json.loads(PLATE_SCHEMA_PATH.read_text(encoding="utf-8"))
validator = Draft202012Validator(plate_schema)

# Accumulators filled by the per-plate checks.
missing = []
schema_errors = []
checksum_missing = 0

# Per-plate checks: manifest identity, canonical source path, schema validity,
# source/ contents, run directory naming and checksum presence. Problems are
# accumulated and asserted on after the loop.
for plate_dir in plates:
    manifest_path = plate_dir / "manifest.json"
    source_dir = plate_dir / "source"
    checksum_path = plate_dir / "source.sha256"

    if not manifest_path.exists():
        missing.append(f"{plate_dir.name}: missing manifest")
        continue

    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))

    # Values are pulled into locals before formatting: reusing double quotes
    # inside a double-quoted f-string is a SyntaxError before Python 3.12.
    manifest_plate_id = manifest.get("plate_id")
    if manifest_plate_id != plate_dir.name:
        missing.append(f"{plate_dir.name}: manifest plate_id mismatch ({manifest_plate_id})")

    expected_plate_number = int(plate_dir.name.split("-")[1])
    manifest_plate_number = manifest.get("plate_number")
    if manifest_plate_number != expected_plate_number:
        missing.append(f"{plate_dir.name}: manifest plate_number mismatch ({manifest_plate_number})")

    src_rel = manifest.get("source_image")
    if not isinstance(src_rel, str) or not src_rel.startswith("source/"):
        missing.append(f"{plate_dir.name}: bad source_image path ({src_rel})")
        src_name = None
    else:
        src_name = src_rel.split("/", 1)[1]
        if not PLATE_FILENAME_RE.match(src_name):
            missing.append(f"{plate_dir.name}: bad source filename ({src_name})")
        expected_src_name = f"plate-{expected_plate_number:03d}.jpg"
        if src_name != expected_src_name:
            missing.append(f"{plate_dir.name}: source filename mismatch ({src_name} != {expected_src_name})")

    for err in validator.iter_errors(manifest):
        schema_errors.append(f"{plate_dir.name}: {list(err.path)} -> {err.message}")

    # A missing or non-string source_image was already recorded above; only
    # probe the filesystem when there is a path to probe, so the report does
    # not die with KeyError/TypeError before it can be printed.
    if isinstance(src_rel, str):
        src = plate_dir / src_rel
        if not src.exists():
            missing.append(f"{plate_dir.name}: missing source image")

    images = list(source_dir.glob("*"))
    if len(images) != 1:
        missing.append(f"{plate_dir.name}: source/ contains {len(images)} files")
    elif src_name is not None and images[0].name != src_name:
        missing.append(f"{plate_dir.name}: source/ filename mismatch ({images[0].name} != {src_name})")

    runs_root = plate_dir / "runs"
    if runs_root.exists():
        for run_dir in runs_root.iterdir():
            # Hidden directories are ignored; every other directory must
            # follow the run-ID naming pattern.
            if run_dir.is_dir() and not run_dir.name.startswith(".") and not RUN_DIR_RE.match(run_dir.name):
                missing.append(f"{plate_dir.name}: bad run dir name {run_dir.name}")

    if not checksum_path.exists():
        checksum_missing += 1

assert not missing, f"Missing or malformed plate artifacts:\n{missing[:5]}"
assert not schema_errors, f"Schema errors detected:\n{schema_errors[:5]}"
assert checksum_missing == 0, f"{checksum_missing} plates missing checksums"
print("    OK Plate manifests, sources, and checksums OK")

# All four ledger files must exist under ledger/.
LEDGER_DIR = DATASET_ROOT / "ledger"
expected_ledgers = {"plates.parquet", "runs.parquet", "embeddings.parquet", "segments.parquet"}
found_ledgers = {item.name for item in LEDGER_DIR.iterdir() if item.is_file()}
missing_ledgers = expected_ledgers.difference(found_ledgers)
assert not missing_ledgers, f"Missing ledger files: {missing_ledgers}"
print("    OK Ledger scaffolding present")

# The run helpers must be defined; generating an ID touches nothing on disk.
for helper in (generate_run_id, start_run, register_output):
    assert callable(helper)

test_run_id = generate_run_id(["test-model"], "dry-run")
assert test_run_id.startswith("run-")
print("    OK Run system callable (dry)")

print("\n==============================")
print("STATUS: SYSTEM IS CONSISTENT")
print("==============================")
print(
    "\n"
    "- 435 plate directories present\n"
    "- All manifests schema-valid\n"
    "- All source images present and checksummed\n"
    "- Schemas persisted\n"
    "- Ledgers scaffolded\n"
    "- Run system ready but unused\n\n"
    "NEXT SAFE ACTION:\n"
    "- Start your FIRST REAL RUN (segmentation or embeddings)\n"
)
