# CPU Baseline (SageMaker-style: read-only input, write-only output)

This notebook is designed to run on local CPU while matching the SageMaker mental model:

- Treat `INPUT_ROOT` as **read-only**.
- Write all new artifacts under `OUTPUT_ROOT`.
- Use shard parameters so the same code can scale to SageMaker Processing.

## Input criteria (must already be true)

`INPUT_ROOT/` must contain:

- `plates_structured/` with exactly 435 plate directories named `plate-###`
- For each plate directory:
  - `manifest.json`
  - `source.sha256`
  - `source/plate-###.jpg`
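- `schemas/plate.manifest.schema.json`
- `schemas/run.manifest.schema.json`
- `data.json` (only checked when `INPUT_ROOT` is auto-discovered, i.e. when `BWS_INPUT_ROOT` is not set)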

## Midpoint artifacts (written once)

- `OUTPUT_ROOT/schemas/cpu.baseline.schema.json`
- `OUTPUT_ROOT/reports/<run_id>/report.json`

## Outputs (append-only, per plate)

For each plate in the shard:

- `OUTPUT_ROOT/plates_structured/<plate_id>/runs/<run_id>/metrics.json`
- `OUTPUT_ROOT/plates_structured/<plate_id>/runs/<run_id>/cpu_baseline.json`

## Constraints enforced

- No mutation of `INPUT_ROOT`.
- Append-only run outputs in `OUTPUT_ROOT`.
- `metrics.json` validates against `INPUT_ROOT/schemas/run.manifest.schema.json`.
- `cpu_baseline.json` validates against `OUTPUT_ROOT/schemas/cpu.baseline.schema.json`.

## CPU-only feature pack

- Container facts: bytes, extension, format, sha256, EXIF presence, ICC presence+hash, JPEG progressive/subsampling/quant hash when exposed
- Geometry: width, height, megapixels, aspect ratio, tiling lattice
- Pixel distribution: RGB histograms and luma histogram (256 bins)
- Summary stats: mean/std/min/max + clipped ratios per channel + luma
- Entropy per channel + luma
- Hashes: aHash, dHash, pHash
- Sharpness proxy: Laplacian variance on downsampled grayscale


In [None]:
import hashlib
import json
import math
import os
from datetime import datetime, timezone
from pathlib import Path

import numpy as np
from jsonschema import Draft202012Validator
from PIL import Image
from tqdm import tqdm

# Turn off Pillow's decompression-bomb pixel limit so very large scans can be opened.
Image.MAX_IMAGE_PIXELS = None

# Edge length (px) of the square tiles used when computing the tiling lattice.
TILE_SIZE = 512
# Bin count recorded next to every histogram in the feature pack.
HIST_BINS = 256
# Longest side (px) the grayscale image is shrunk to before the Laplacian sharpness proxy.
LAPLACE_MAX_DIM = 1024
# Defaults for phash(): a PHASH_BITS x PHASH_BITS block of DCT coefficients taken from a
# (PHASH_BITS * PHASH_FACTOR)-pixel square thumbnail.
PHASH_BITS = 8
PHASH_FACTOR = 4

def utc_iso() -> str:
    """Return the current UTC time as an ISO-8601 string with a trailing ``Z``."""
    stamp = datetime.now(timezone.utc).isoformat()
    # isoformat() spells the UTC offset as "+00:00"; swap it for the "Z" suffix.
    return stamp.replace("+00:00", "Z")

def sha256_file(path: Path, chunk_size: int = 1024 * 1024) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in ``chunk_size``-byte pieces so it never has to be held
    in memory as a whole.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # An empty read marks end of file.
        while block := stream.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()

def generate_run_id(models: list[str], note: str) -> str:
    """Build a run id of the form ``run-<UTC yyyymmdd-hhmmss>-<8 hex chars>``.

    The hex suffix is the first 8 characters of the SHA-1 of the model names
    and the note joined with ``|``; the timestamp is the current UTC time.
    """
    fingerprint = hashlib.sha1()
    fingerprint.update(("|".join(models) + "|" + note).encode("utf-8"))
    suffix = fingerprint.hexdigest()[:8]
    when = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
    return f"run-{when}-{suffix}"

def resize_fit(img: Image.Image, max_dim: int) -> Image.Image:
    """Shrink *img* so its longest side is at most *max_dim* pixels.

    Images that already fit are returned unchanged (same object). Otherwise
    both sides are scaled by the same factor, rounded, and clamped to at
    least 1 px, using bilinear resampling.
    """
    width, height = img.size
    longest = max(width, height)
    if longest <= max_dim:
        return img
    scale = max_dim / longest
    target = tuple(max(1, int(round(side * scale))) for side in (width, height))
    return img.resize(target, resample=Image.BILINEAR)

def laplacian_variance(gray_img: Image.Image) -> float:
    """Return the variance of a 4-neighbour Laplacian over a downsampled image.

    The image is first limited to ``LAPLACE_MAX_DIM`` on its longest side and
    edge-padded by one pixel so the response has the same shape as the input.
    """
    small = resize_fit(gray_img, LAPLACE_MAX_DIM)
    pixels = np.asarray(small, dtype=np.float32)
    padded = np.pad(pixels, ((1, 1), (1, 1)), mode="edge")

    # Shifted views of the padded array: the four neighbours and the centre.
    north = padded[:-2, 1:-1]
    south = padded[2:, 1:-1]
    west = padded[1:-1, :-2]
    east = padded[1:-1, 2:]
    center = padded[1:-1, 1:-1]

    response = north + south + west + east - 4.0 * center
    return float(np.var(response))

def hist_stats(hist: list[int]) -> dict:
    """Summarise a histogram whose bin index is the sample value.

    Args:
        hist: Per-bin counts; bin ``i`` holds the number of samples with value ``i``.
            Any number of bins is accepted (the original hard-coded 256).

    Returns:
        A dict with ``total``, ``min``, ``max``, ``mean``, ``std`` (population
        standard deviation), ``clipped_low_ratio`` (share of samples in the
        first bin) and ``clipped_high_ratio`` (share in the last bin). When the
        histogram holds no samples, ``total`` is 0 and every other field is ``None``.
    """
    total = int(sum(hist))
    if total <= 0:
        return {"total": 0, "min": None, "max": None, "mean": None, "std": None, "clipped_low_ratio": None, "clipped_high_ratio": None}

    # Derive the top bin from the histogram itself instead of assuming 256 bins.
    last_bin = len(hist) - 1
    mn = next((i for i, c in enumerate(hist) if c), 0)
    mx = next((last_bin - i for i, c in enumerate(reversed(hist)) if c), last_bin)

    mean = sum(i * c for i, c in enumerate(hist)) / total
    s2 = sum((i - mean) ** 2 * c for i, c in enumerate(hist)) / total
    return {
        "total": total,
        "min": int(mn),
        "max": int(mx),
        "mean": float(mean),
        "std": float(math.sqrt(s2)),
        "clipped_low_ratio": float(hist[0] / total),
        "clipped_high_ratio": float(hist[-1] / total),
    }

def entropy(hist: list[int]) -> float | None:
    total = float(sum(hist))
    if total <= 0:
        return None
    ent = 0.0
    for c in hist:
        if c:
            p = c / total
            ent -= p * math.log2(p)
    return float(ent)

def ahash(img: Image.Image, size: int = 8) -> str:
    """Return the average hash of *img* as a zero-padded hex string.

    The image is reduced to a ``size`` x ``size`` grayscale thumbnail; each bit
    is 1 where the pixel is strictly brighter than the thumbnail mean. Bits are
    packed row by row, most significant first.
    """
    gray = img.convert("L").resize((size, size), Image.BILINEAR)
    pixels = np.asarray(gray, dtype=np.float32)
    above_mean = (pixels > pixels.mean()).ravel()

    bit_string = "".join("1" if bit else "0" for bit in above_mean)
    value = int(bit_string or "0", 2)
    hex_digits = (size * size) // 4
    return f"{value:0{hex_digits}x}"

def dhash(img: Image.Image, size: int = 8) -> str:
    """Return the horizontal difference hash of *img* as a zero-padded hex string.

    The image is reduced to a ``size + 1`` wide, ``size`` tall grayscale
    thumbnail; each bit is 1 where a pixel is strictly brighter than its left
    neighbour. Bits are packed row by row, most significant first.
    """
    gray = img.convert("L").resize((size + 1, size), Image.BILINEAR)
    pixels = np.asarray(gray, dtype=np.float32)
    rising = (pixels[:, 1:] > pixels[:, :-1]).ravel()

    bit_string = "".join("1" if bit else "0" for bit in rising)
    value = int(bit_string or "0", 2)
    hex_digits = (size * size) // 4
    return f"{value:0{hex_digits}x}"

def phash(img: Image.Image, hash_bits: int = PHASH_BITS, highfreq_factor: int = PHASH_FACTOR) -> str:
    """Return a DCT-based perceptual hash of *img* as a zero-padded hex string.

    A square grayscale thumbnail of side ``hash_bits * highfreq_factor`` is
    transformed with a cosine basis along both axes. The top-left
    ``hash_bits`` x ``hash_bits`` coefficients are compared against their
    median (computed without the first coefficient when more than one exists);
    each bit is 1 where the coefficient is strictly greater.
    """
    side = hash_bits * highfreq_factor
    gray = img.convert("L").resize((side, side), Image.BILINEAR)
    pixels = np.asarray(gray, dtype=np.float32)

    # Cosine basis matrix: row = frequency index, column = sample position.
    positions = np.arange(side)
    freqs = positions.reshape(-1, 1)
    basis = np.cos(np.pi / side * (positions + 0.5) * freqs).astype(np.float32)

    # Apply the transform along the first axis, then along the second.
    along_rows = basis @ pixels
    coeffs = (basis @ along_rows.T).T

    low = coeffs[:hash_bits, :hash_bits].flatten()
    if low.size > 1:
        threshold = np.median(low[1:])
    else:
        threshold = np.median(low)

    bit_string = "".join("1" if bit else "0" for bit in (low > threshold))
    value = int(bit_string or "0", 2)
    hex_digits = (hash_bits * hash_bits) // 4
    return f"{value:0{hex_digits}x}"

def exif_present(img) -> bool:
    """Return True when ``img.getexif()`` yields at least one entry.

    Any failure while reading EXIF data is treated as "no EXIF".
    """
    try:
        tags = img.getexif()
        if tags is None:
            return False
        return len(tags) > 0
    except Exception:
        return False

def icc_hash(img):
    """Return ``(present, sha256_hex)`` for the ICC profile stored in ``img.info``.

    A missing or empty profile, or any error while reading or hashing it,
    yields ``(False, None)``. A profile held as ``str`` is UTF-8 encoded
    (undecodable characters ignored) before hashing.
    """
    absent = (False, None)
    try:
        profile = img.info.get("icc_profile", None)
        if not profile:
            return absent
        raw = profile.encode("utf-8", errors="ignore") if isinstance(profile, str) else profile
        digest = hashlib.sha256(raw).hexdigest()
    except Exception:
        return absent
    return True, digest

def jpeg_progressive_flag(img):
    """Return ``img.info["progressive"]`` as a bool, or ``None`` when the key is absent.

    NOTE(review): a missing key is reported as ``None`` rather than ``False``;
    presumably the decoder only sets it for progressive files - confirm.
    """
    flag = img.info.get("progressive", None)
    if flag is None:
        return None
    return bool(flag)

def jpeg_subsampling_flag(img):
    """Return ``img.info["subsampling"]`` as a string, or ``None`` when the key is absent."""
    setting = img.info.get("subsampling", None)
    if setting is None:
        return None
    return str(setting)

def quant_hash_from_pil(img):
    """Return a SHA-256 hex digest of the image's JPEG quantization tables.

    The ``quantization`` attribute is serialised as key-sorted JSON and hashed.
    Tables held as ``array.array`` (or any other buffer exposing ``tolist()``)
    are converted to plain lists first; previously such tables made
    ``json.dumps`` fail and the hash was silently dropped. Output for tables
    that are already lists is unchanged.

    Returns:
        The hex digest, or ``None`` when the attribute is missing, empty, or
        cannot be serialised.
    """
    q = getattr(img, "quantization", None)
    if not q:
        return None

    def _as_jsonable(obj):
        # Called by json.dumps only for values it cannot encode natively.
        if hasattr(obj, "tolist"):
            return obj.tolist()
        raise TypeError(f"unsupported quantization table type: {type(obj).__name__}")

    try:
        payload = json.dumps(q, sort_keys=True, default=_as_jsonable)
    except (TypeError, ValueError):
        # Best effort: an unserialisable table means "no hash", not a failure.
        return None
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()

def load_validator(path: Path) -> Draft202012Validator:
    """Load the JSON Schema at *path* and return a Draft 2020-12 validator for it.

    The schema itself is checked with ``Draft202012Validator.check_schema``
    before the validator is built.
    """
    with path.open(encoding="utf-8") as handle:
        schema = json.load(handle)
    Draft202012Validator.check_schema(schema)
    validator = Draft202012Validator(schema)
    return validator


In [None]:
def find_input_root(start: Path) -> Path:
    """Locate the dataset root directory.

    If the ``BWS_INPUT_ROOT`` environment variable is set, it is used as-is
    (expanded and resolved) and only its existence is checked. Otherwise the
    search walks from *start* up through its parents; at each level the
    directory itself and then its immediate children (in sorted order) are
    tested, and the first scaffolded directory wins.

    A directory counts as scaffolded when it contains ``plates_structured``,
    ``schemas/plate.manifest.schema.json``, ``schemas/run.manifest.schema.json``
    and ``data.json``.

    Args:
        start: Directory to begin the upward search from.

    Returns:
        The path of the dataset root.

    Raises:
        RuntimeError: If ``BWS_INPUT_ROOT`` points to a non-existent path, or
            no scaffolded directory is found.
    """
    override = os.environ.get("BWS_INPUT_ROOT")
    if override:
        p = Path(override).expanduser().resolve()
        if p.exists():
            return p
        raise RuntimeError(f"BWS_INPUT_ROOT does not exist: {p}")

    required = (
        Path("plates_structured"),
        Path("schemas") / "plate.manifest.schema.json",
        Path("schemas") / "run.manifest.schema.json",
        Path("data.json"),
    )

    def is_scaffolded(p: Path) -> bool:
        return p.is_dir() and all((p / rel).exists() for rel in required)

    for base in (start, *start.parents):
        if is_scaffolded(base):
            return base
        try:
            for child in sorted(base.iterdir()):
                if is_scaffolded(child):
                    return child
        except OSError:
            # Unreadable or vanished directory: skip this level, keep walking up.
            # Only filesystem errors are tolerated; programming errors surface.
            continue

    raise RuntimeError("Could not find scaffolded INPUT_ROOT; set BWS_INPUT_ROOT")

# Runtime configuration, all overridable through BWS_* environment variables.
INPUT_ROOT = find_input_root(Path.cwd())
# NOTE(review): the default output location sits inside INPUT_ROOT; set
# BWS_OUTPUT_ROOT when INPUT_ROOT must stay strictly read-only.
OUTPUT_ROOT = Path(os.environ.get("BWS_OUTPUT_ROOT", str(INPUT_ROOT / "_RUN_OUTPUT"))).expanduser().resolve()
SHARD_INDEX = int(os.environ.get("BWS_SHARD_INDEX", "0"))
SHARD_COUNT = int(os.environ.get("BWS_SHARD_COUNT", "1"))

# Fail fast on bad shard parameters: a zero count would otherwise surface later
# as ZeroDivisionError, and an out-of-range index would silently select no plates.
if SHARD_COUNT < 1:
    raise RuntimeError(f"BWS_SHARD_COUNT must be >= 1, got {SHARD_COUNT}")
if not 0 <= SHARD_INDEX < SHARD_COUNT:
    raise RuntimeError(f"BWS_SHARD_INDEX must be in [0, {SHARD_COUNT}), got {SHARD_INDEX}")

RUN_ID = os.environ.get("BWS_RUN_ID", None)
# Only the exact string "1" enables skipping of plates whose outputs already exist.
SKIP_IF_PRESENT = os.environ.get("BWS_SKIP_IF_PRESENT", "1") == "1"

print("INPUT_ROOT :", INPUT_ROOT)
print("OUTPUT_ROOT:", OUTPUT_ROOT)
print("SHARD      :", SHARD_INDEX, "/", SHARD_COUNT)
print("RUN_ID     :", RUN_ID)
print("SKIP       :", SKIP_IF_PRESENT)


In [None]:
# Input locations (read-only).
PLATES_ROOT = INPUT_ROOT / "plates_structured"
SCHEMAS_ROOT = INPUT_ROOT / "schemas"

plate_schema_path = SCHEMAS_ROOT / "plate.manifest.schema.json"
run_schema_path = SCHEMAS_ROOT / "run.manifest.schema.json"

# Both input schemas must be present before any plate is touched.
for _required_schema in (plate_schema_path, run_schema_path):
    if not _required_schema.exists():
        raise RuntimeError(f"Missing: {_required_schema}")

plate_validator = load_validator(plate_schema_path)
run_validator = load_validator(run_schema_path)

# Every "plate-*" directory, in name order; the dataset must hold exactly 435.
plates = sorted(
    entry
    for entry in PLATES_ROOT.iterdir()
    if entry.name.startswith("plate-") and entry.is_dir()
)
if len(plates) != 435:
    raise RuntimeError(f"Unexpected plate count: {len(plates)}")

# Round-robin sharding over the sorted list: this shard takes every
# SHARD_COUNT-th plate whose position modulo SHARD_COUNT equals SHARD_INDEX.
selected = [
    plate
    for position, plate in enumerate(plates)
    if position % SHARD_COUNT == SHARD_INDEX
]
print("plates_total   :", len(plates))
print("plates_selected:", len(selected))

# The CPU baseline schema is an output artifact, so it lives under OUTPUT_ROOT.
cpu_schema_dir = OUTPUT_ROOT / "schemas"
cpu_schema_dir.mkdir(parents=True, exist_ok=True)
cpu_schema_path = cpu_schema_dir / "cpu.baseline.schema.json"

# JSON Schema (draft 2020-12) for cpu_baseline.json.
# Container, geometry, tiling and decode sections are required; pixel_stats and
# hashes are optional and may be null (they stay null when decoding fails).
CPU_BASELINE_SCHEMA = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "$id": "https://burning-world-series/schemas/cpu.baseline.schema.json",
    "title": "CPU Baseline Feature Pack",
    "type": "object",
    "required": ["plate_id", "run_id", "timestamp", "source_image", "source_file", "geometry", "tiling", "decode"],
    "properties": {
        "plate_id": {"type": "string", "pattern": "^plate-[0-9]{3}$"},
        "run_id": {"type": "string"},
        "timestamp": {"type": "string", "format": "date-time"},
        "source_image": {"type": "string"},
        # Container-level facts about the source file.
        "source_file": {
            "type": "object",
            "required": ["bytes", "extension", "format", "sha256", "exif_present", "icc_present", "icc_hash", "jpeg_is_progressive", "jpeg_subsampling", "jpeg_quant_hash"],
            "properties": {
                "bytes": {"type": "integer", "minimum": 0},
                "extension": {"type": "string"},
                "format": {"type": ["string", "null"]},
                "sha256": {"type": "string"},
                "exif_present": {"type": "boolean"},
                "icc_present": {"type": "boolean"},
                "icc_hash": {"type": ["string", "null"]},
                "jpeg_is_progressive": {"type": ["boolean", "null"]},
                "jpeg_subsampling": {"type": ["string", "null"]},
                "jpeg_quant_hash": {"type": ["string", "null"]}
            },
            "additionalProperties": False
        },
        # Geometry fields are nullable so a record can still be written when the image cannot be opened.
        "geometry": {
            "type": "object",
            "required": ["width_px", "height_px", "megapixels", "aspect_ratio", "mode"],
            "properties": {
                "width_px": {"type": ["integer", "null"], "minimum": 1},
                "height_px": {"type": ["integer", "null"], "minimum": 1},
                "megapixels": {"type": ["number", "null"], "minimum": 0},
                "aspect_ratio": {"type": ["number", "null"], "minimum": 0},
                "mode": {"type": ["string", "null"]}
            },
            "additionalProperties": False
        },
        "tiling": {
            "type": "object",
            "required": ["tile_size_px", "tiles_x", "tiles_y", "total_tiles"],
            "properties": {
                "tile_size_px": {"type": "integer", "minimum": 1},
                "tiles_x": {"type": "integer", "minimum": 1},
                "tiles_y": {"type": "integer", "minimum": 1},
                "total_tiles": {"type": "integer", "minimum": 1}
            },
            "additionalProperties": False
        },
        # Decode outcome: ok flag plus an error string when decoding failed.
        "decode": {
            "type": "object",
            "required": ["ok", "error"],
            "properties": {"ok": {"type": "boolean"}, "error": {"type": ["string", "null"]}},
            "additionalProperties": False
        },
        # Histogram arrays are fixed at exactly 256 entries here (literal, not HIST_BINS).
        "pixel_stats": {
            "type": ["object", "null"],
            "properties": {
                "colorspace": {"type": "string"},
                "rgb_histograms": {
                    "type": "object",
                    "required": ["bins", "R", "G", "B"],
                    "properties": {
                        "bins": {"type": "integer"},
                        "R": {"type": "array", "items": {"type": "integer"}, "minItems": 256, "maxItems": 256},
                        "G": {"type": "array", "items": {"type": "integer"}, "minItems": 256, "maxItems": 256},
                        "B": {"type": "array", "items": {"type": "integer"}, "minItems": 256, "maxItems": 256}
                    },
                    "additionalProperties": False
                },
                "l_histogram": {
                    "type": "object",
                    "required": ["bins", "L"],
                    "properties": {
                        "bins": {"type": "integer"},
                        "L": {"type": "array", "items": {"type": "integer"}, "minItems": 256, "maxItems": 256}
                    },
                    "additionalProperties": False
                },
                # Per-channel and luma summary stats are left unconstrained beyond being objects.
                "rgb_stats": {"type": "object"},
                "luma_stats": {"type": "object"},
                "entropy": {
                    "type": "object",
                    "required": ["R", "G", "B", "L"],
                    "properties": {
                        "R": {"type": ["number", "null"]},
                        "G": {"type": ["number", "null"]},
                        "B": {"type": ["number", "null"]},
                        "L": {"type": ["number", "null"]}
                    },
                    "additionalProperties": False
                },
                "laplacian_var": {"type": ["number", "null"]}
            },
            "additionalProperties": False
        },
        "hashes": {
            "type": ["object", "null"],
            "properties": {"ahash": {"type": "string"}, "dhash": {"type": "string"}, "phash": {"type": "string"}},
            "additionalProperties": False
        }
    },
    "additionalProperties": False
}

# Write the schema only if it is not already on disk ("written once").
# NOTE(review): the validator below is built from the file on disk, so an
# existing file that differs from CPU_BASELINE_SCHEMA takes precedence over the
# literal above - confirm this is intended when the schema is edited.
if not cpu_schema_path.exists():
    cpu_schema_path.write_text(json.dumps(CPU_BASELINE_SCHEMA, indent=2), encoding="utf-8")

cpu_validator = load_validator(cpu_schema_path)
print("cpu.baseline.schema.json:", cpu_schema_path)


In [None]:
# Identity of this run: model tags plus a free-text note describing the feature pack.
MODELS = ["cpu-baseline-v1"]
NOTE = "cpu baseline: container geometry hist entropy hashes laplacian"
# An explicit BWS_RUN_ID wins; otherwise a time-stamped id is generated here.
RUN_ID_EFFECTIVE = RUN_ID if RUN_ID else generate_run_id(MODELS, NOTE)

# Run-level report; the counters are updated while the plates are processed.
report = dict(
    run_id=RUN_ID_EFFECTIVE,
    timestamp=utc_iso(),
    dataset_root=str(INPUT_ROOT),
    input_root=str(INPUT_ROOT),
    output_root=str(OUTPUT_ROOT),
    shard_index=SHARD_INDEX,
    shard_count=SHARD_COUNT,
    plates_total=len(plates),
    plates_selected=len(selected),
    plates_processed=0,
    plates_skipped=0,
    decode_failures=0,
    schema_failures=0,
    errors_sample=[],
)

def compute_cpu_baseline(plate_dir: Path) -> tuple[dict, dict]:
    """Build the run manifest and CPU baseline record for one plate directory.

    Reads ``manifest.json`` from *plate_dir*, validates it, locates the source
    image named by ``manifest["source_image"]`` and extracts container facts,
    geometry, tiling, histograms, summary stats, entropy, perceptual hashes and
    a sharpness proxy. Nothing is written to disk here.

    Any exception raised while opening or analysing the image is recorded in
    ``baseline["decode"]`` instead of being raised; fields assigned before the
    failure keep their values and the remaining ones keep their defaults.

    Args:
        plate_dir: Input plate directory containing ``manifest.json``.

    Returns:
        ``(run_manifest, baseline)`` - both already validated against their schemas.

    Raises:
        RuntimeError: If the plate manifest, run manifest or baseline record
            fails schema validation, or the source image file does not exist.
    """
    manifest = json.loads((plate_dir / "manifest.json").read_text(encoding="utf-8"))
    errs = list(plate_validator.iter_errors(manifest))
    if errs:
        # Only the first validation error is reported.
        raise RuntimeError(f"plate manifest schema error: {list(errs[0].path)} -> {errs[0].message}")

    # The source image path in the manifest is relative to the plate directory.
    src_path = plate_dir / manifest["source_image"]
    if not src_path.exists():
        raise RuntimeError("missing source image")

    # Per-plate run manifest (the caller writes it as metrics.json).
    run_manifest = {
        "run_id": RUN_ID_EFFECTIVE,
        "plate_id": plate_dir.name,
        "timestamp": utc_iso(),
        "models": MODELS,
        "outputs": ["cpu_baseline.json"],
        "notes": NOTE,
    }

    rerrs = list(run_validator.iter_errors(run_manifest))
    if rerrs:
        raise RuntimeError(f"run manifest schema error: {list(rerrs[0].path)} -> {rerrs[0].message}")

    # Start from a record that is valid even if the image cannot be opened:
    # file-level facts are filled in now, image-derived fields default to null.
    baseline = {
        "plate_id": manifest["plate_id"],
        "run_id": RUN_ID_EFFECTIVE,
        "timestamp": utc_iso(),
        "source_image": manifest["source_image"],
        "source_file": {
            "bytes": int(src_path.stat().st_size),
            "extension": src_path.suffix.lower().lstrip("."),
            "format": None,
            "sha256": sha256_file(src_path),
            "exif_present": False,
            "icc_present": False,
            "icc_hash": None,
            "jpeg_is_progressive": None,
            "jpeg_subsampling": None,
            "jpeg_quant_hash": None,
        },
        "geometry": {"width_px": None, "height_px": None, "megapixels": None, "aspect_ratio": None, "mode": None},
        "tiling": {"tile_size_px": TILE_SIZE, "tiles_x": 1, "tiles_y": 1, "total_tiles": 1},
        "decode": {"ok": False, "error": None},
        "pixel_stats": None,
        "hashes": None,
    }

    try:
        with Image.open(src_path) as img:
            # Container metadata.
            baseline["source_file"]["format"] = img.format
            baseline["source_file"]["exif_present"] = exif_present(img)
            ip, ih = icc_hash(img)
            baseline["source_file"]["icc_present"] = ip
            baseline["source_file"]["icc_hash"] = ih
            # JPEG-specific facts are only collected for JPEG files.
            if (img.format or "").upper() == "JPEG":
                baseline["source_file"]["jpeg_is_progressive"] = jpeg_progressive_flag(img)
                baseline["source_file"]["jpeg_subsampling"] = jpeg_subsampling_flag(img)
                baseline["source_file"]["jpeg_quant_hash"] = quant_hash_from_pil(img)

            w, h = img.size
            baseline["geometry"] = {
                "width_px": int(w),
                "height_px": int(h),
                "megapixels": float(round((w * h) / 1_000_000, 3)),
                "aspect_ratio": float(w / h),
                "mode": img.mode,
            }

            # Ceiling division: number of TILE_SIZE tiles needed to cover each axis.
            tiles_x = (w + TILE_SIZE - 1) // TILE_SIZE
            tiles_y = (h + TILE_SIZE - 1) // TILE_SIZE
            baseline["tiling"] = {
                "tile_size_px": TILE_SIZE,
                "tiles_x": int(tiles_x),
                "tiles_y": int(tiles_y),
                "total_tiles": int(tiles_x * tiles_y),
            }

            # The RGB histogram is split into three consecutive 256-bin channel slices.
            rgb = img.convert("RGB")
            raw = rgb.histogram()
            Rh, Gh, Bh = raw[0:256], raw[256:512], raw[512:768]
            # Luma is derived from the RGB conversion, not from the original mode.
            L = rgb.convert("L")
            Lh = L.histogram()

            baseline["pixel_stats"] = {
                "colorspace": "as-decoded",
                "rgb_histograms": {"bins": HIST_BINS, "R": Rh, "G": Gh, "B": Bh},
                "l_histogram": {"bins": HIST_BINS, "L": Lh},
                "rgb_stats": {"R": hist_stats(Rh), "G": hist_stats(Gh), "B": hist_stats(Bh)},
                "luma_stats": hist_stats(Lh),
                "entropy": {"R": entropy(Rh), "G": entropy(Gh), "B": entropy(Bh), "L": entropy(Lh)},
                "laplacian_var": laplacian_variance(L),
            }
            baseline["hashes"] = {"ahash": ahash(rgb), "dhash": dhash(rgb), "phash": phash(rgb)}
            # Marked ok only after every feature was computed.
            baseline["decode"] = {"ok": True, "error": None}

    except Exception as e:
        # Best effort: keep whatever was filled in and record a truncated error message.
        baseline["decode"] = {"ok": False, "error": f"{type(e).__name__}: {str(e)[:300]}"}

    # The record must validate whether or not decoding succeeded.
    berrs = list(cpu_validator.iter_errors(baseline))
    if berrs:
        raise RuntimeError(f"cpu baseline schema error: {list(berrs[0].path)} -> {berrs[0].message}")

    return run_manifest, baseline

# Process every plate of this shard, then write the run-level report.
for plate_dir in tqdm(selected, desc="plates"):
    out_plate_dir = OUTPUT_ROOT / "plates_structured" / plate_dir.name
    out_run_dir = out_plate_dir / "runs" / RUN_ID_EFFECTIVE
    out_run_dir.mkdir(parents=True, exist_ok=True)

    out_metrics = out_run_dir / "metrics.json"
    out_cpu = out_run_dir / "cpu_baseline.json"

    # A plate counts as done only when both outputs exist.
    if SKIP_IF_PRESENT and out_metrics.exists() and out_cpu.exists():
        report["plates_skipped"] += 1
        continue

    try:
        run_manifest, baseline = compute_cpu_baseline(plate_dir)
        out_metrics.write_text(json.dumps(run_manifest, indent=2), encoding="utf-8")
        out_cpu.write_text(json.dumps(baseline, indent=2), encoding="utf-8")
    except Exception as e:
        # Per-plate boundary: one bad plate must not stop the shard. This counter
        # also absorbs non-schema errors (e.g. a missing source image).
        report["schema_failures"] += 1
        if len(report["errors_sample"]) < 10:
            report["errors_sample"].append(f"{plate_dir.name}: {type(e).__name__}: {str(e)}")
    else:
        report["plates_processed"] += 1
        # A record is still written when the image could not be decoded; count
        # those plates so the report's decode_failures field reflects them.
        if not baseline["decode"]["ok"]:
            report["decode_failures"] += 1
            if len(report["errors_sample"]) < 10:
                decode_error = baseline["decode"]["error"]
                report["errors_sample"].append(f"{plate_dir.name}: decode failed: {decode_error}")

report_dir = OUTPUT_ROOT / "reports" / RUN_ID_EFFECTIVE
report_dir.mkdir(parents=True, exist_ok=True)
(report_dir / "report.json").write_text(json.dumps(report, indent=2), encoding="utf-8")
print(json.dumps(report, indent=2))
