# Data Loading, Metadata Audit, and Manifests

In [1]:
from pathlib import Path
import json
import pandas as pd

# ---- paths
DATA_ROOT = Path("/kaggle/input/physionet-ecg-image-digitization")
TRAIN_DIR = DATA_ROOT / "train"
TEST_DIR  = DATA_ROOT / "test"

OUT_ROOT = Path("/kaggle/working/ecgdig")
ART_DIR  = OUT_ROOT / "artifacts"
OUT_ROOT.mkdir(parents=True, exist_ok=True)
ART_DIR.mkdir(parents=True, exist_ok=True)

# ---- hard sanity (no branching; fail fast)
assert (DATA_ROOT / "train.csv").exists()
assert (DATA_ROOT / "test.csv").exists()
assert (DATA_ROOT / "sample_submission.parquet").exists()
assert TRAIN_DIR.exists()
assert TEST_DIR.exists()

# ---- load metadata
df_train_meta = pd.read_csv(DATA_ROOT / "train.csv")
df_test_meta  = pd.read_csv(DATA_ROOT / "test.csv")
df_sub_tpl    = pd.read_parquet(DATA_ROOT / "sample_submission.parquet")

# ---- schema checks
assert {"id", "fs", "sig_len"}.issubset(df_train_meta.columns)
assert {"id", "lead", "fs", "number_of_rows"}.issubset(df_test_meta.columns)
assert {"id", "value"}.issubset(df_sub_tpl.columns)

# ---- train manifest: (base_id, variant_tag, image_path, gt_path)
rows = []
for p in sorted(TRAIN_DIR.iterdir()):
    base_id = p.name
    gt_path = p / f"{base_id}.csv"
    assert gt_path.exists()
    for img_path in sorted(p.glob(f"{base_id}-*.png")):
        variant_tag = img_path.stem.split("-")[-1]
        rows.append((base_id, variant_tag, str(img_path), str(gt_path)))

df_train_manifest = pd.DataFrame(rows, columns=["base_id", "variant_tag", "image_path", "gt_path"])
assert len(df_train_manifest) > 0

# ---- test manifest: (base_id, image_path)
df_test_manifest = df_test_meta[["id"]].rename(columns={"id": "base_id"}).copy()
df_test_manifest["image_path"] = df_test_manifest["base_id"].map(lambda x: str(TEST_DIR / f"{x}.png"))

# ---- lightweight audit report
audit = {
    "data_root": str(DATA_ROOT),
    "n_train_meta": int(len(df_train_meta)),
    "n_test_meta": int(len(df_test_meta)),
    "n_train_variants": int(len(df_train_manifest)),
    "train_fs_unique": sorted(df_train_meta["fs"].unique().tolist()),
    "test_fs_unique": sorted(df_test_meta["fs"].unique().tolist()),
    "test_leads_unique": sorted(df_test_meta["lead"].unique().tolist()),
    "sub_template_rows": int(len(df_sub_tpl)),
    "train_manifest_example": df_train_manifest.head(3).to_dict(orient="records"),
}

# ---- save artifacts
df_train_meta.to_parquet(ART_DIR / "train_meta.parquet", index=False)
df_test_meta.to_parquet(ART_DIR / "test_meta.parquet", index=False)
df_sub_tpl.to_parquet(ART_DIR / "sample_submission.parquet", index=False)
df_train_manifest.to_parquet(ART_DIR / "train_manifest.parquet", index=False)
df_test_manifest.to_parquet(ART_DIR / "test_manifest.parquet", index=False)

(ART_DIR / "paths.json").write_text(json.dumps({
    "DATA_ROOT": str(DATA_ROOT),
    "TRAIN_DIR": str(TRAIN_DIR),
    "TEST_DIR": str(TEST_DIR),
    "OUT_ROOT": str(OUT_ROOT),
    "ART_DIR": str(ART_DIR),
}, indent=2))

(ART_DIR / "meta_audit.json").write_text(json.dumps(audit, indent=2))

print("OK | Saved:")
print(" -", ART_DIR / "train_meta.parquet")
print(" -", ART_DIR / "test_meta.parquet")
print(" -", ART_DIR / "train_manifest.parquet")
print(" -", ART_DIR / "test_manifest.parquet")
print(" -", ART_DIR / "sample_submission.parquet")
print(" -", ART_DIR / "meta_audit.json")

OK | Saved:
 - /kaggle/working/ecgdig/artifacts/train_meta.parquet
 - /kaggle/working/ecgdig/artifacts/test_meta.parquet
 - /kaggle/working/ecgdig/artifacts/train_manifest.parquet
 - /kaggle/working/ecgdig/artifacts/test_manifest.parquet
 - /kaggle/working/ecgdig/artifacts/sample_submission.parquet
 - /kaggle/working/ecgdig/artifacts/meta_audit.json


# Leakage-Safe Cross-Validation and Data Tables

In [2]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold

OUT_ROOT = Path("/kaggle/working/ecgdig")
ART_DIR  = OUT_ROOT / "artifacts"
assert ART_DIR.exists()

# ---- load stage-1 artifacts (fail fast)
df_train_meta     = pd.read_parquet(ART_DIR / "train_meta.parquet")
df_train_manifest = pd.read_parquet(ART_DIR / "train_manifest.parquet")
assert {"id","fs","sig_len"}.issubset(df_train_meta.columns)
assert {"base_id","variant_tag","image_path","gt_path"}.issubset(df_train_manifest.columns)

# ---- reconcile base_id universe (use intersection; no hard assumption)
base_from_manifest = pd.Index(df_train_manifest["base_id"].unique())
base_from_meta     = pd.Index(df_train_meta["id"].astype(str).unique())
base_keep          = pd.Index(sorted(set(base_from_manifest).intersection(set(base_from_meta))))
assert len(base_keep) > 0

# ---- base table (record-level, only for base_keep)
df_base = (df_train_meta[["id","fs","sig_len"]]
           .assign(id=lambda d: d["id"].astype(str))
           .rename(columns={"id":"base_id"})
           .drop_duplicates("base_id", keep="first"))

df_base = df_base[df_base["base_id"].isin(base_keep)].reset_index(drop=True)
assert df_base["base_id"].nunique() == len(df_base)

# ---- manifest restricted to base_keep (ensures fold merge cannot produce NaN)
df_train_manifest = df_train_manifest[df_train_manifest["base_id"].isin(base_keep)].reset_index(drop=True)
assert df_train_manifest["base_id"].nunique() == df_base["base_id"].nunique()

# ---- GroupKFold by base_id (all variants of a base_id share the same fold)
N_SPLITS = 5
assert df_base["base_id"].nunique() >= N_SPLITS

gkf  = GroupKFold(n_splits=N_SPLITS)
fold = np.full(len(df_base), -1, dtype=np.int16)

for k, (_, va_idx) in enumerate(gkf.split(df_base, groups=df_base["base_id"].values)):
    fold[va_idx] = k

df_base["fold"] = fold
assert (df_base["fold"] >= 0).all()

# ---- attach folds to manifest + build training tables
df_train_manifest_folds = df_train_manifest.merge(df_base[["base_id","fold"]], on="base_id", how="inner")
assert len(df_train_manifest_folds) == len(df_train_manifest)

df_train_img_table = df_train_manifest_folds.merge(df_base[["base_id","fs","sig_len"]], on="base_id", how="inner")
assert len(df_train_img_table) == len(df_train_manifest_folds)

# ---- mismatch report (for debugging; does not stop pipeline)
missing_in_meta = sorted(set(base_from_manifest) - set(base_from_meta))
missing_in_manifest = sorted(set(base_from_meta) - set(base_from_manifest))
mismatch_report = {
    "n_base_ids_manifest": int(len(base_from_manifest)),
    "n_base_ids_meta": int(len(base_from_meta)),
    "n_base_ids_used_intersection": int(len(base_keep)),
    "n_missing_in_train_csv": int(len(missing_in_meta)),
    "n_missing_in_train_folder": int(len(missing_in_manifest)),
    "missing_in_train_csv_head200": missing_in_meta[:200],
    "missing_in_train_folder_head200": missing_in_manifest[:200],
}

# ---- fold stats
fold_stats = {
    "n_splits": int(N_SPLITS),
    "n_base_ids_used": int(df_base["base_id"].nunique()),
    "n_train_variants_used": int(len(df_train_manifest_folds)),
    "base_ids_per_fold": df_base.groupby("fold")["base_id"].nunique().sort_index().astype(int).to_dict(),
    "variants_per_fold": df_train_manifest_folds.groupby("fold")["base_id"].size().sort_index().astype(int).to_dict(),
    "fs_unique": sorted(df_base["fs"].unique().tolist()),
    "sig_len_unique": sorted(df_base["sig_len"].unique().tolist()),
}

# ---- save artifacts
df_base.to_parquet(ART_DIR / "train_base_folds.parquet", index=False)
df_train_manifest_folds.to_parquet(ART_DIR / "train_manifest_folds.parquet", index=False)
df_train_img_table.to_parquet(ART_DIR / "train_img_table.parquet", index=False)
(ART_DIR / "fold_stats.json").write_text(json.dumps(fold_stats, indent=2))
(ART_DIR / "mismatch_report.json").write_text(json.dumps(mismatch_report, indent=2))

print("OK | Saved:")
print(" -", ART_DIR / "train_base_folds.parquet")
print(" -", ART_DIR / "train_manifest_folds.parquet")
print(" -", ART_DIR / "train_img_table.parquet")
print(" -", ART_DIR / "fold_stats.json")
print(" -", ART_DIR / "mismatch_report.json")
print("Used base_ids:", fold_stats["n_base_ids_used"], "| Used variants:", fold_stats["n_train_variants_used"])
print("Fold base_ids_per_fold:", fold_stats["base_ids_per_fold"])
print("Mismatch (manifest not in train.csv):", mismatch_report["n_missing_in_train_csv"])
print("Mismatch (train.csv not in manifest):", mismatch_report["n_missing_in_train_folder"])

OK | Saved:
 - /kaggle/working/ecgdig/artifacts/train_base_folds.parquet
 - /kaggle/working/ecgdig/artifacts/train_manifest_folds.parquet
 - /kaggle/working/ecgdig/artifacts/train_img_table.parquet
 - /kaggle/working/ecgdig/artifacts/fold_stats.json
 - /kaggle/working/ecgdig/artifacts/mismatch_report.json
Used base_ids: 977 | Used variants: 8793
Fold base_ids_per_fold: {0: 196, 1: 196, 2: 195, 3: 195, 4: 195}
Mismatch (manifest not in train.csv): 0
Mismatch (train.csv not in manifest): 0


# Page Geometry Normalization and Robust Lead Localization

In [3]:
from pathlib import Path
import os, json, shutil
import numpy as np
import pandas as pd
import cv2
from tqdm.auto import tqdm

# ============================================================
# STAGE 3 — REVISED FULL (Option A + Parquet dtype-safe)
# ============================================================

WORK_ROOT = Path("/kaggle/working/ecgdig")
TMP_ROOT  = Path("/kaggle/temp/ecgdig_cache")

ART_DIR   = WORK_ROOT / "artifacts"
MODEL_DIR = WORK_ROOT / "models"

CACHE_DIR = TMP_ROOT / "cache"
NORM_DIR  = CACHE_DIR / "norm"
CROP_DIR  = CACHE_DIR / "crops"
YOLO_DIR  = TMP_ROOT / "yolo_roi"
YOLO_RUNS = TMP_ROOT / "yolo_runs"

for d in [WORK_ROOT, ART_DIR, MODEL_DIR, TMP_ROOT, CACHE_DIR, NORM_DIR, CROP_DIR, YOLO_DIR, YOLO_RUNS]:
    d.mkdir(parents=True, exist_ok=True)

# ---- load artifacts
df_train = pd.read_parquet(ART_DIR / "train_img_table.parquet")
df_testm = pd.read_parquet(ART_DIR / "test_meta.parquet")
df_test  = pd.read_parquet(ART_DIR / "test_manifest.parquet")

assert {"base_id","variant_tag","image_path","fold"}.issubset(df_train.columns)
assert {"base_id","image_path"}.issubset(df_test.columns)
assert {"id","lead","fs","number_of_rows"}.issubset(df_testm.columns)

# ---- dtype normalize (CRITICAL for pyarrow)
df_train["base_id"] = df_train["base_id"].astype(str)
df_train["variant_tag"] = df_train["variant_tag"].astype(str)
df_train["image_path"] = df_train["image_path"].astype(str)

df_test["base_id"] = df_test["base_id"].astype(str)
df_test["image_path"] = df_test["image_path"].astype(str)

# ---- build unified image table (train variants + test images)
df_tr_img = df_train[["base_id","variant_tag","image_path","fold"]].copy()
df_tr_img["split"] = "train"
df_tr_img["uid"] = df_tr_img["base_id"] + "__" + df_tr_img["variant_tag"]

df_te_img = df_test[["base_id","image_path"]].copy()
df_te_img["variant_tag"] = "test"
df_te_img["fold"] = -1
df_te_img["split"] = "test"
df_te_img["uid"] = df_te_img["base_id"] + "__test"

df_img = pd.concat([df_tr_img, df_te_img], ignore_index=True)
df_img["norm_path"] = df_img["uid"].map(lambda u: str(NORM_DIR / f"{u}.png"))

# ---- geometry normalization helpers
def _read_bgr(path: str) -> np.ndarray:
    img = cv2.imread(path, cv2.IMREAD_COLOR)
    assert img is not None
    return img

def _resize_max_side(img: np.ndarray, max_side: int) -> np.ndarray:
    h, w = img.shape[:2]
    s = min(1.0, float(max_side) / float(max(h, w)))
    nw = int(round(w * s))
    nh = int(round(h * s))
    return cv2.resize(img, (nw, nh), interpolation=cv2.INTER_AREA)

def _estimate_deskew_angle(gray: np.ndarray) -> float:
    e = cv2.Canny(gray, 50, 150)
    lines = cv2.HoughLinesP(e, 1, np.pi/180.0, threshold=120, minLineLength=gray.shape[1]//3, maxLineGap=20)
    try:
        x1 = lines[:,0,0].astype(np.float32); y1 = lines[:,0,1].astype(np.float32)
        x2 = lines[:,0,2].astype(np.float32); y2 = lines[:,0,3].astype(np.float32)
        ang = np.degrees(np.arctan2((y2 - y1), (x2 - x1)))
        ang = ((ang + 45.0) % 90.0) - 45.0
        a = float(np.nanmedian(ang))
    except Exception:
        a = 0.0
    return float(np.nan_to_num(a, nan=0.0, posinf=0.0, neginf=0.0))

def _rotate(img: np.ndarray, angle_deg: float) -> np.ndarray:
    h, w = img.shape[:2]
    M = cv2.getRotationMatrix2D((w/2.0, h/2.0), angle_deg, 1.0)
    return cv2.warpAffine(img, M, (w, h), flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REPLICATE)

def _content_bbox(gray: np.ndarray) -> tuple:
    g = cv2.GaussianBlur(gray, (3,3), 0)
    _, thr = cv2.threshold(g, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    thr = cv2.morphologyEx(thr, cv2.MORPH_OPEN, np.ones((3,3), np.uint8), iterations=1)
    ys, xs = np.where(thr > 0)
    x1 = int(xs.min()); x2 = int(xs.max()) + 1
    y1 = int(ys.min()); y2 = int(ys.max()) + 1
    return x1, y1, x2, y2

def _normalize_one(src_path: str, max_side: int = 1600):
    bgr = _read_bgr(src_path)
    bgr = _resize_max_side(bgr, max_side=max_side)
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    ang = _estimate_deskew_angle(gray)
    rot = _rotate(bgr, -ang)
    gray2 = cv2.cvtColor(rot, cv2.COLOR_BGR2GRAY)
    try:
        x1,y1,x2,y2 = _content_bbox(gray2)
    except Exception:
        h,w = gray2.shape[:2]
        x1,y1,x2,y2 = 0,0,w,h
    pad = int(round(0.01 * min((x2-x1), (y2-y1))))
    x1 = max(0, x1 - pad); y1 = max(0, y1 - pad)
    x2 = min(rot.shape[1], x2 + pad); y2 = min(rot.shape[0], y2 + pad)
    crop = rot[y1:y2, x1:x2].copy()
    prm = {"angle_deg": float(ang),
           "content_bbox": f"{int(x1)},{int(y1)},{int(x2)},{int(y2)}",
           "max_side": int(max_side)}
    return crop, prm

# ---- template ROI boxes
LEADS_12 = ["I","aVR","V1","V4",
            "II","aVL","V2","V5",
            "III","aVF","V3","V6"]
LEAD_LONG = "II_LONG"
CLASSES = LEADS_12 + [LEAD_LONG]
CLASS2ID = {c:i for i,c in enumerate(CLASSES)}

def _template_boxes(h: int, w: int):
    pw = int(round(0.01 * w))
    ph = int(round(0.01 * h))
    y_main0 = 0
    y_main1 = int(round(0.75 * h))
    y_rhy0  = y_main1
    y_rhy1  = h

    boxes = []
    cell_h = (y_main1 - y_main0) / 3.0
    cell_w = w / 4.0
    for idx, lead in enumerate(LEADS_12):
        r = idx // 4
        c = idx % 4
        x1 = int(round(c * cell_w)) + pw
        x2 = int(round((c+1) * cell_w)) - pw
        y1 = int(round(y_main0 + r * cell_h)) + ph
        y2 = int(round(y_main0 + (r+1) * cell_h)) - ph
        boxes.append((lead, x1, y1, x2, y2))
    boxes.append((LEAD_LONG, pw, y_rhy0 + ph, w - pw, y_rhy1 - ph))
    return boxes

# ---- normalize all images (train variants + test) -> /kaggle/temp
norm_params = []
for uid, src, dst in tqdm(df_img[["uid","image_path","norm_path"]].itertuples(index=False), total=len(df_img)):
    img_norm, prm = _normalize_one(src, max_side=1600)
    cv2.imwrite(dst, img_norm)
    prm.update({"uid": str(uid), "norm_path": str(dst), "h": int(img_norm.shape[0]), "w": int(img_norm.shape[1])})
    norm_params.append(prm)

df_norm = pd.DataFrame(norm_params)
df_norm = df_norm.merge(df_img[["uid","base_id","variant_tag","fold","split"]], on="uid", how="left")

# ---- double-safe dtype for parquet
df_norm["base_id"] = df_norm["base_id"].astype(str)
df_norm["variant_tag"] = df_norm["variant_tag"].astype(str)
df_norm["split"] = df_norm["split"].astype(str)
df_norm["uid"] = df_norm["uid"].astype(str)
df_norm["norm_path"] = df_norm["norm_path"].astype(str)

df_norm.to_parquet(ART_DIR / "norm_manifest.parquet", index=False)

# ---- build template ROI boxes + save crops -> /kaggle/temp
roi_rows = []
for uid, base_id, split, norm_path, h, w in tqdm(df_norm[["uid","base_id","split","norm_path","h","w"]].itertuples(index=False), total=len(df_norm)):
    img = _read_bgr(norm_path)
    for lead, x1,y1,x2,y2 in _template_boxes(int(h), int(w)):
        crop = img[y1:y2, x1:x2]
        outp = CROP_DIR / f"{uid}__{lead}.png"
        cv2.imwrite(str(outp), crop)
        roi_rows.append((str(uid), str(base_id), str(split), str(lead), int(x1),int(y1),int(x2),int(y2), int(w),int(h), str(outp)))

df_roi_tpl = pd.DataFrame(roi_rows, columns=["uid","base_id","split","lead","x1","y1","x2","y2","img_w","img_h","crop_path"])
df_roi_tpl["base_id"] = df_roi_tpl["base_id"].astype(str)
df_roi_tpl["uid"] = df_roi_tpl["uid"].astype(str)
df_roi_tpl["lead"] = df_roi_tpl["lead"].astype(str)
df_roi_tpl["crop_path"] = df_roi_tpl["crop_path"].astype(str)

df_roi_tpl.to_parquet(ART_DIR / "roi_boxes_template.parquet", index=False)

# ---- export YOLO ROI dataset -> /kaggle/temp
for p in [
    YOLO_DIR / "images/train", YOLO_DIR / "images/val",
    YOLO_DIR / "labels/train", YOLO_DIR / "labels/val"
]:
    p.mkdir(parents=True, exist_ok=True)

df_yolo = df_norm[df_norm["split"].eq("train")].copy()
df_yolo["yolo_split"] = np.where(df_yolo["fold"].values == 0, "val", "train")

def _link_or_copy(src: str, dst: str):
    try:
        os.link(src, dst)
    except Exception:
        shutil.copy2(src, dst)

for uid, norm_path, ysplit, h, w in tqdm(df_yolo[["uid","norm_path","yolo_split","h","w"]].itertuples(index=False), total=len(df_yolo)):
    img_dst = YOLO_DIR / "images" / ysplit / f"{uid}.png"
    lab_dst = YOLO_DIR / "labels" / ysplit / f"{uid}.txt"
    _link_or_copy(str(norm_path), str(img_dst))

    boxes = _template_boxes(int(h), int(w))
    lines = []
    for lead, x1,y1,x2,y2 in boxes:
        cx = ((x1 + x2) / 2.0) / float(w)
        cy = ((y1 + y2) / 2.0) / float(h)
        bw = (x2 - x1) / float(w)
        bh = (y2 - y1) / float(h)
        lines.append(f"{CLASS2ID[lead]} {cx:.6f} {cy:.6f} {bw:.6f} {bh:.6f}")
    lab_dst.write_text("\n".join(lines))

yolo_yaml = YOLO_DIR / "roi.yaml"
yolo_yaml.write_text(
    "path: " + str(YOLO_DIR) + "\n"
    "train: images/train\n"
    "val: images/val\n"
    "names:\n" + "\n".join([f"  {i}: {name}" for i,name in enumerate(CLASSES)]) + "\n"
)

# ---- optional YOLOv8n train
yolo_ckpt = MODEL_DIR / "yolo_roi.pt"
train_log = {"trained": False}

try:
    from ultralytics import YOLO
    y = YOLO("yolov8n.pt")
    r = y.train(
        data=str(yolo_yaml),
        epochs=15,
        imgsz=960,
        batch=16,
        workers=2,
        verbose=False,
        project=str(YOLO_RUNS),
        name="roi"
    )
    best_pt = Path(r.save_dir) / "weights" / "best.pt"
    shutil.copy2(best_pt, yolo_ckpt)
    train_log = {"trained": True, "best_pt": str(best_pt), "saved_as": str(yolo_ckpt), "save_dir": str(r.save_dir)}
except Exception as e:
    train_log = {"trained": False, "error": repr(e), "note": "Ultralytics not available or training failed; template ROIs already saved."}

(Path(ART_DIR) / "yolo_train_log.json").write_text(json.dumps(train_log, indent=2))

# ---- optional test-time refine ROIs
try:
    from ultralytics import YOLO
    y = YOLO(str(yolo_ckpt))

    df_te = df_norm[df_norm["split"].eq("test")][["uid","base_id","norm_path","h","w"]].copy()
    refined_rows = []
    for uid, base_id, norm_path, h, w in tqdm(df_te.itertuples(index=False), total=len(df_te)):
        pred = y.predict(source=str(norm_path), imgsz=960, conf=0.10, iou=0.40, verbose=False)[0]
        b = pred.boxes
        cls = b.cls.cpu().numpy().astype(int)
        xyxy = b.xyxy.cpu().numpy().astype(np.float32)
        conf = b.conf.cpu().numpy().astype(np.float32)
        d = pd.DataFrame({"class_id": cls, "conf": conf,
                          "x1": xyxy[:,0], "y1": xyxy[:,1], "x2": xyxy[:,2], "y2": xyxy[:,3]})
        d = d.sort_values(["class_id","conf"]).drop_duplicates("class_id", keep="last")
        d["lead"] = d["class_id"].map(lambda i: CLASSES[int(i)])
        for lead, x1,y1,x2,y2 in d[["lead","x1","y1","x2","y2"]].itertuples(index=False):
            refined_rows.append((str(uid), str(lead), float(x1),float(y1),float(x2),float(y2)))

    df_ref = pd.DataFrame(refined_rows, columns=["uid","lead","x1","y1","x2","y2"])
    df_ref = df_ref.merge(df_te[["uid","h","w"]], on="uid", how="left")

    df_tpl = df_roi_tpl[df_roi_tpl["split"].eq("test")][["uid","lead","x1","y1","x2","y2","img_w","img_h"]].copy()
    df_tpl = df_tpl.rename(columns={"img_w":"w","img_h":"h"})

    df_m = df_tpl.merge(df_ref, on=["uid","lead"], how="left", suffixes=("_tpl","_pred"))
    df_m["x1"] = df_m["x1_pred"].combine_first(df_m["x1_tpl"])
    df_m["y1"] = df_m["y1_pred"].combine_first(df_m["y1_tpl"])
    df_m["x2"] = df_m["x2_pred"].combine_first(df_m["x2_tpl"])
    df_m["y2"] = df_m["y2_pred"].combine_first(df_m["y2_tpl"])
    df_m["img_w"] = df_m["w"]
    df_m["img_h"] = df_m["h"]

    df_m = df_m[["uid","lead","x1","y1","x2","y2","img_w","img_h"]].copy()
    df_m[["x1","y1","x2","y2"]] = df_m[["x1","y1","x2","y2"]].round().astype(int)

    for uid, lead, x1,y1,x2,y2 in tqdm(df_m[["uid","lead","x1","y1","x2","y2"]].itertuples(index=False), total=len(df_m)):
        img = _read_bgr(str(NORM_DIR / f"{uid}.png"))
        crop = img[int(y1):int(y2), int(x1):int(x2)]
        outp = CROP_DIR / f"{uid}__{lead}.png"
        cv2.imwrite(str(outp), crop)

    df_roi_test_ref = df_roi_tpl[df_roi_tpl["split"].eq("test")].drop(columns=["x1","y1","x2","y2"], errors="ignore")
    df_roi_test_ref = df_roi_test_ref.merge(df_m, on=["uid","lead"], how="left")
    df_roi_test_ref.to_parquet(ART_DIR / "roi_boxes_test_refined.parquet", index=False)

except Exception as e:
    (Path(ART_DIR) / "yolo_refine_log.json").write_text(json.dumps({"refined": False, "error": repr(e)}, indent=2))

print("OK | Saved artifacts (working):")
print(" -", ART_DIR / "norm_manifest.parquet")
print(" -", ART_DIR / "roi_boxes_template.parquet")
print(" -", ART_DIR / "yolo_train_log.json")
print(" -", yolo_ckpt)
print("Temp caches (not counted in commit output):")
print(" -", NORM_DIR)
print(" -", CROP_DIR)
print(" -", YOLO_DIR)
print(" -", YOLO_RUNS)


  0%|          | 0/8817 [00:00<?, ?it/s]

  0%|          | 0/9081 [00:00<?, ?it/s]

  0%|          | 0/8793 [00:00<?, ?it/s]

OK | Saved artifacts (working):
 - /kaggle/working/ecgdig/artifacts/norm_manifest.parquet
 - /kaggle/working/ecgdig/artifacts/roi_boxes_template.parquet
 - /kaggle/working/ecgdig/artifacts/yolo_train_log.json
 - /kaggle/working/ecgdig/models/yolo_roi.pt
Temp caches (not counted in commit output):
 - /kaggle/temp/ecgdig_cache/cache/norm
 - /kaggle/temp/ecgdig_cache/cache/crops
 - /kaggle/temp/ecgdig_cache/yolo_roi
 - /kaggle/temp/ecgdig_cache/yolo_runs


# Pseudo-Label Mask Generation and Segmentation Training

In [None]:
from pathlib import Path
import json, math, random
import numpy as np
import pandas as pd
import cv2
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import albumentations as A
from albumentations.pytorch import ToTensorV2

from transformers import SegformerForSemanticSegmentation

# ============================================================
# STAGE 4 — Pseudo-Label Mask Generation + SegFormer-B1 Training (REVISED FULL, SVD-safe)
# ============================================================

# Config
OUT_ROOT  = Path("/kaggle/working/ecgdig")
ART_DIR   = OUT_ROOT / "artifacts"
MODEL_DIR = OUT_ROOT / "models"
MASK_DBG_DIR = OUT_ROOT / "debug" / "pseudo_masks"
for d in [MODEL_DIR, MASK_DBG_DIR]:
    d.mkdir(parents=True, exist_ok=True)

FOLD = 0
IMG_SIZE = 512
BATCH = 8
EPOCHS = 6
LR = 2e-4
WD = 1e-2
NUM_WORKERS = 2
SEED = 42

random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

LEADS_12 = ["I","aVR","V1","V4","II","aVL","V2","V5","III","aVF","V3","V6"]
CLASSES  = LEADS_12 + ["II_LONG"]
LEAD_SEC = {**{k: 2.5 for k in LEADS_12}, "II_LONG": 10.0}
LEAD_COL = {**{k: k for k in LEADS_12}, "II_LONG": "II"}

GT_ROOT = Path("/kaggle/input/physionet-ecg-image-digitization") / "train"

# ---- load stage-3 artifacts (crops) + folds
df_roi = pd.read_parquet(ART_DIR / "roi_boxes_template.parquet")
df_folds = pd.read_parquet(ART_DIR / "train_base_folds.parquet")[["base_id","fold","fs","sig_len"]]

df_roi["base_id"] = df_roi["base_id"].astype(str)
df_roi["lead"] = df_roi["lead"].astype(str)
df_folds["base_id"] = df_folds["base_id"].astype(str)

df_roi = df_roi[df_roi["split"].eq("train")].merge(df_folds, on="base_id", how="inner")
df_roi = df_roi[df_roi["lead"].isin(CLASSES)].reset_index(drop=True)

df_tr = df_roi[df_roi["fold"].ne(FOLD)].reset_index(drop=True)
df_va = df_roi[df_roi["fold"].eq(FOLD)].reset_index(drop=True)

seg_table = {
    "FOLD": int(FOLD),
    "IMG_SIZE": int(IMG_SIZE),
    "BATCH": int(BATCH),
    "EPOCHS": int(EPOCHS),
    "LR": float(LR),
    "WD": float(WD),
    "N_TRAIN": int(len(df_tr)),
    "N_VALID": int(len(df_va)),
}
(Path(ART_DIR) / "seg_table_info.json").write_text(json.dumps(seg_table, indent=2))
df_tr.to_parquet(ART_DIR / f"seg_train_table_fold{FOLD}.parquet", index=False)
df_va.to_parquet(ART_DIR / f"seg_valid_table_fold{FOLD}.parquet", index=False)

# ---- augmentations
aug_tr = A.Compose([
    A.LongestMaxSize(max_size=IMG_SIZE),
    A.PadIfNeeded(IMG_SIZE, IMG_SIZE, border_mode=cv2.BORDER_REPLICATE),
    A.OneOf([
        A.GaussianBlur(blur_limit=(3, 7), p=1.0),
        A.MotionBlur(blur_limit=7, p=1.0),
    ], p=0.35),
    A.OneOf([
        A.RandomBrightnessContrast(0.2, 0.2, p=1.0),
        A.RandomGamma(gamma_limit=(70, 140), p=1.0),
    ], p=0.5),
    A.Perspective(scale=(0.02, 0.06), keep_size=True, p=0.25),
    A.ShiftScaleRotate(shift_limit=0.03, scale_limit=0.06, rotate_limit=3, border_mode=cv2.BORDER_REPLICATE, p=0.35),
    A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
    ToTensorV2(transpose_mask=True),
])

aug_va = A.Compose([
    A.LongestMaxSize(max_size=IMG_SIZE),
    A.PadIfNeeded(IMG_SIZE, IMG_SIZE, border_mode=cv2.BORDER_REPLICATE),
    A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
    ToTensorV2(transpose_mask=True),
])

# ---- stable linear fit (no SVD): y ≈ b - a*vn  with a>=0
def _fit_b_a(vn: np.ndarray, y: np.ndarray):
    # x2 = -vn, fit y = b + c*x2, enforce c>=0, then y = b - c*vn
    x2 = (-vn).astype(np.float32)
    y  = y.astype(np.float32)

    m = (np.isfinite(x2) & np.isfinite(y)).astype(np.float32)
    cnt = float(m.sum()) + 1e-6

    mx = float((x2 * m).sum() / cnt)
    my = float((y  * m).sum() / cnt)

    dx = (x2 - mx)
    dy = (y  - my)

    varx = float(((dx*dx) * m).sum() / cnt) + 1e-6
    cov  = float(((dx*dy) * m).sum() / cnt)

    c = float(np.abs(cov / varx))  # enforce positive without if/else
    b = float(my - c * mx)
    return b, c

# Dataset: pseudo-mask from GT aligned to ink (SVD-safe)
class ECGSegDS(Dataset):
    def __init__(self, df: pd.DataFrame, is_train: bool):
        self.df = df.reset_index(drop=True)
        self.tf = aug_tr if is_train else aug_va
        self.gt_cache = {}

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i: int):
        r = self.df.iloc[i]
        crop_path = str(r["crop_path"])
        base_id = str(r["base_id"])
        lead = str(r["lead"])
        fs = float(r["fs"])

        # cache GT per base_id
        if base_id not in self.gt_cache:
            gt_csv = GT_ROOT / base_id / f"{base_id}.csv"
            self.gt_cache[base_id] = pd.read_csv(gt_csv)
        sig_df = self.gt_cache[base_id]

        col = LEAD_COL[lead]
        v_full = sig_df[col].values.astype(np.float32)
        v_full = np.nan_to_num(v_full, nan=0.0, posinf=0.0, neginf=0.0)

        n = int(math.floor(fs * LEAD_SEC[lead]))
        n = int(min(n, v_full.shape[0]))
        v = v_full[:n]

        bgr = cv2.imread(crop_path, cv2.IMREAD_COLOR)
        assert bgr is not None
        gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY).astype(np.uint8)

        # ink map
        ink = (255 - gray).astype(np.float32)
        ink = cv2.GaussianBlur(ink, (3,3), 0)

        h0, w0 = ink.shape[:2]

        # resample GT to crop width
        x0 = np.linspace(0.0, 1.0, num=w0, dtype=np.float32)
        xv = np.linspace(0.0, 1.0, num=max(1, len(v)), dtype=np.float32)
        v_w = np.interp(x0, xv, v).astype(np.float32)
        v_w = np.nan_to_num(v_w, nan=0.0, posinf=0.0, neginf=0.0)

        med = float(np.median(v_w))
        p05, p95 = np.nanpercentile(v_w, [5, 95]).astype(np.float32)
        scale = float((p95 - p05) + 1e-6)
        vn = (v_w - med) / scale
        vn = np.nan_to_num(vn, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)

        # soft y from ink (more stable than argmax)
        yy = np.arange(h0, dtype=np.float32).reshape(-1, 1)
        w = ink + 1e-3
        y_soft = (w * yy).sum(axis=0) / (w.sum(axis=0) + 1e-6)
        y_soft = np.nan_to_num(y_soft, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)

        # fit affine
        b, a = _fit_b_a(vn, y_soft)

        y = (b - a * vn).astype(np.int32)
        y = np.clip(y, 0, h0 - 1)

        # render polyline mask
        m = np.zeros((h0, w0), np.uint8)
        pts = np.stack([np.arange(w0, dtype=np.int32), y], axis=1).reshape(-1,1,2)
        cv2.polylines(m, [pts], isClosed=False, color=255, thickness=2, lineType=cv2.LINE_AA)
        m = cv2.morphologyEx(m, cv2.MORPH_DILATE, np.ones((3,3), np.uint8), iterations=1)

        out = self.tf(image=bgr, mask=m)
        img_t = out["image"]
        m_t = out["mask"].float().unsqueeze(0) / 255.0
        return img_t, m_t, base_id, lead

ds_tr = ECGSegDS(df_tr, is_train=True)
ds_va = ECGSegDS(df_va, is_train=False)

dl_tr = DataLoader(ds_tr, batch_size=BATCH, shuffle=True, num_workers=NUM_WORKERS,
                   pin_memory=True, drop_last=True)
dl_va = DataLoader(ds_va, batch_size=BATCH, shuffle=False, num_workers=NUM_WORKERS,
                   pin_memory=True, drop_last=False)

# Model: SegFormer-B1 (binary trace)
MODEL_NAME = "nvidia/segformer-b1-finetuned-ade-512-512"
model = SegformerForSemanticSegmentation.from_pretrained(
    MODEL_NAME,
    num_labels=1,
    ignore_mismatched_sizes=True,
)
model.to(device)

opt = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WD)
bce = nn.BCEWithLogitsLoss()

scaler = torch.amp.GradScaler("cuda", enabled=(device.type == "cuda"))

def _run_epoch(dl, train: bool):
    model.train(mode=train)
    losses = []
    for img, m, _, _ in tqdm(dl, leave=False):
        img = img.to(device, non_blocking=True)
        m   = m.to(device, non_blocking=True)

        with torch.autocast(device_type=device.type, enabled=(device.type == "cuda")):
            out = model(pixel_values=img)
            logits = out.logits
            logits = F.interpolate(logits, size=m.shape[-2:], mode="bilinear", align_corners=False)
            loss = bce(logits, m)

        if train:
            opt.zero_grad(set_to_none=True)
            scaler.scale(loss).backward()
            scaler.step(opt)
            scaler.update()

        losses.append(float(loss.detach().cpu().item()))
    return float(np.mean(losses)) if len(losses) else float("nan")

best_va = 1e9
hist = []
for ep in range(EPOCHS):
    tr_loss = _run_epoch(dl_tr, train=True)
    va_loss = _run_epoch(dl_va, train=False)
    hist.append({"epoch": ep+1, "train_loss": tr_loss, "valid_loss": va_loss})
    best_va = min(best_va, va_loss)
    print(f"Epoch {ep+1}/{EPOCHS} | train_loss={tr_loss:.5f} | valid_loss={va_loss:.5f}")

# Save model
save_path = MODEL_DIR / f"segformer_b1_fold{FOLD}.pt"
torch.save(
    {
        "model_name": MODEL_NAME,
        "state_dict": model.state_dict(),
        "fold": int(FOLD),
        "img_size": int(IMG_SIZE),
        "classes": ["trace"],
        "leads": CLASSES,
        "train_cfg": {"batch": BATCH, "epochs": EPOCHS, "lr": LR, "wd": WD, "seed": SEED},
        "history": hist,
    },
    save_path
)
(Path(ART_DIR) / f"segformer_b1_fold{FOLD}_history.json").write_text(json.dumps(hist, indent=2))

# quick debug dump (first 12 samples from valid)
dbg_idx = np.arange(min(12, len(df_va)))
for j in dbg_idx:
    img_t, m_t, bid, lead = ds_va[int(j)]
    img = img_t.permute(1,2,0).cpu().numpy()
    img = (img * np.array([0.229,0.224,0.225]) + np.array([0.485,0.456,0.406]))
    img = np.clip(img*255.0, 0, 255).astype(np.uint8)
    msk = (m_t.squeeze(0).cpu().numpy()*255.0).astype(np.uint8)
    cv2.imwrite(str(MASK_DBG_DIR / f"va_{j:03d}__{bid}__{lead}__img.png"), cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
    cv2.imwrite(str(MASK_DBG_DIR / f"va_{j:03d}__{bid}__{lead}__mask.png"), msk)

print("OK | Saved:")
print(" -", ART_DIR / f"seg_train_table_fold{FOLD}.parquet")
print(" -", ART_DIR / f"seg_valid_table_fold{FOLD}.parquet")
print(" -", save_path)
print(" -", MASK_DBG_DIR)


2026-01-21 15:12:55.656009: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769008375.866964      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769008375.929732      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769008376.459837      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769008376.459886      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769008376.459889      55 computation_placer.cc:177] computation placer alr

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/54.9M [00:00<?, ?B/s]

Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/segformer-b1-finetuned-ade-512-512 and are newly initialized because the shapes did not match:
- decode_head.classifier.weight: found shape torch.Size([150, 256, 1, 1]) in the checkpoint and torch.Size([1, 256, 1, 1]) in the model instantiated
- decode_head.classifier.bias: found shape torch.Size([150]) in the checkpoint and torch.Size([1]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/11422 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/54.9M [00:00<?, ?B/s]

# Prob Map Inference, DP/Viterbi Vectorization, and Signal Extraction

In [None]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
import cv2
from tqdm.auto import tqdm

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import albumentations as A
from albumentations.pytorch import ToTensorV2
from transformers import SegformerForSemanticSegmentation

# Config
OUT_ROOT  = Path("/kaggle/working/ecgdig")
ART_DIR   = OUT_ROOT / "artifacts"
MODEL_DIR = OUT_ROOT / "models"
CACHE_DIR = OUT_ROOT / "cache"
PROB_DIR  = CACHE_DIR / "prob_u8"
TRACE_DIR = CACHE_DIR / "trace_px"
for d in [PROB_DIR, TRACE_DIR]:
    d.mkdir(parents=True, exist_ok=True)

FOLD = 0
IMG_SIZE = 512
BATCH = 24
NUM_WORKERS = 2

# DP params (fast + stable)
DP_H_MAX = 256
DP_W_MAX = 512
MAX_DY = 12
LAMBDA = 0.65
EPS = 1e-6

SAVE_PROB_U8 = 1  # keep as 1 for debugging/QA; u8 is compact

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Load ROI table (prefer refined test if exists)
df_roi = pd.read_parquet(ART_DIR / "roi_boxes_template.parquet")
try:
    df_ref = pd.read_parquet(ART_DIR / "roi_boxes_test_refined.parquet")
    df_roi = pd.concat([df_roi[df_roi["split"].ne("test")], df_ref], ignore_index=True)
except Exception:
    pass

df_roi = df_roi[df_roi["split"].eq("test")].reset_index(drop=True)
assert {"uid","base_id","lead","crop_path","split"}.issubset(df_roi.columns)
assert len(df_roi) > 0


# Inference transform
tf_inf = A.Compose([
    A.LongestMaxSize(max_size=IMG_SIZE),
    A.PadIfNeeded(IMG_SIZE, IMG_SIZE, border_mode=cv2.BORDER_REPLICATE),
    A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
    ToTensorV2(),
])

class CropDS(Dataset):
    def __init__(self, df: pd.DataFrame):
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i: int):
        r = self.df.iloc[i]
        p = r["crop_path"]
        bgr = cv2.imread(p, cv2.IMREAD_COLOR)
        assert bgr is not None
        h0, w0 = bgr.shape[:2]
        out = tf_inf(image=bgr)
        img = out["image"]
        return img, int(h0), int(w0), str(r["uid"]), str(r["base_id"]), str(r["lead"]), str(p)

dl = DataLoader(CropDS(df_roi), batch_size=BATCH, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)


# Load SegFormer-B1 checkpoint
ckpt_path = MODEL_DIR / f"segformer_b1_fold{FOLD}.pt"
assert ckpt_path.exists()
ckpt = torch.load(ckpt_path, map_location="cpu")
MODEL_NAME = ckpt["model_name"]

model = SegformerForSemanticSegmentation.from_pretrained(
    MODEL_NAME,
    num_labels=1,
    ignore_mismatched_sizes=True,
)
model.load_state_dict(ckpt["state_dict"], strict=False)
model.to(device).eval()


# DP/Viterbi seam (windowed L1 transitions; fast)
def viterbi_seam(prob_hw: np.ndarray, lam: float, max_dy: int) -> np.ndarray:
    H, W = prob_hw.shape
    cost = (-np.log(prob_hw + EPS)).astype(np.float32)
    dp = cost[:, 0].copy()
    back = np.empty((H, W), dtype=np.int16)
    back[:, 0] = np.arange(H, dtype=np.int16)

    y_idx = np.arange(H, dtype=np.int16)
    dp_pad = None
    shifts = np.arange(-max_dy, max_dy + 1, dtype=np.int16)

    for x in range(1, W):
        dp_pad = np.pad(dp, (max_dy, max_dy), mode="constant", constant_values=np.inf)
        best = np.full(H, np.inf, dtype=np.float32)
        best_prev = np.zeros(H, dtype=np.int16)

        for s in shifts:
            prev = dp_pad[(max_dy - int(s)):(max_dy - int(s) + H)]
            cand = prev + (lam * float(abs(int(s))))
            m = cand < best
            best = np.where(m, cand, best)
            best_prev = np.where(m, (y_idx - int(s)).astype(np.int16), best_prev)

        dp = cost[:, x] + best
        back[:, x] = best_prev

    y = int(np.argmin(dp))
    path = np.empty(W, dtype=np.int16)
    path[W - 1] = y
    for x in range(W - 1, 0, -1):
        y = int(back[y, x])
        path[x - 1] = y
    return path

# Inference + save prob_u8 + save trace_px
rows = []
with torch.no_grad():
    for imgs, h0s, w0s, uids, base_ids, leads, crop_paths in tqdm(dl, total=len(dl)):
        imgs = imgs.to(device, non_blocking=True)
        out = model(pixel_values=imgs)
        logits = out.logits
        logits = F.interpolate(logits, size=(IMG_SIZE, IMG_SIZE), mode="bilinear", align_corners=False)
        prob = torch.sigmoid(logits).squeeze(1).detach().cpu().numpy().astype(np.float32)  # [B,512,512]

        for k in range(prob.shape[0]):
            h0 = int(h0s[k]); w0 = int(w0s[k])
            uid = str(uids[k]); base_id = str(base_ids[k]); lead = str(leads[k]); crop_path = str(crop_paths[k])

            pr = prob[k]
            pr0 = cv2.resize(pr, (w0, h0), interpolation=cv2.INTER_LINEAR)

            # optional prob_u8 save (compact)
            prob_path = str(PROB_DIR / f"{uid}__{lead}.npz")
            pr_u8 = np.clip(np.round(pr0 * 255.0), 0, 255).astype(np.uint8)
            np.savez_compressed(prob_path, prob_u8=pr_u8, h=int(h0), w=int(w0))

            # DP on downsampled prob for speed
            Hd = int(max(2, min(DP_H_MAX, h0)))
            Wd = int(max(2, min(DP_W_MAX, w0)))
            pr_ds = cv2.resize(pr0, (Wd, Hd), interpolation=cv2.INTER_LINEAR).astype(np.float32)

            seam_ds = viterbi_seam(pr_ds, lam=LAMBDA, max_dy=MAX_DY).astype(np.float32)  # [Wd]
            x_ds = np.arange(Wd, dtype=np.float32)
            x0 = np.linspace(0.0, float(Wd - 1), num=w0, dtype=np.float32)
            seam_x = np.interp(x0, x_ds, seam_ds).astype(np.float32)

            scale_y = (float(h0 - 1) / float(max(1, Hd - 1)))
            y0 = np.clip(seam_x * scale_y, 0.0, float(h0 - 1)).astype(np.float32)

            trace_path = str(TRACE_DIR / f"{uid}__{lead}.npy")
            np.save(trace_path, y0)

            # confidence along seam (proxy)
            xi = np.clip(np.round(np.arange(w0)).astype(np.int32), 0, w0 - 1)
            yi = np.clip(np.round(y0).astype(np.int32), 0, h0 - 1)
            conf = float(np.mean(pr0[yi, xi]))

            rows.append((uid, base_id, lead, crop_path, prob_path, trace_path, int(h0), int(w0), int(Hd), int(Wd), conf))

df_out = pd.DataFrame(
    rows,
    columns=["uid","base_id","lead","crop_path","prob_u8_path","trace_px_path","h","w","dp_h","dp_w","mean_trace_conf"]
)

df_out.to_parquet(ART_DIR / f"test_trace_index_fold{FOLD}.parquet", index=False)
(Path(ART_DIR) / f"stage5_params_fold{FOLD}.json").write_text(json.dumps({
    "fold": int(FOLD),
    "img_size": int(IMG_SIZE),
    "batch": int(BATCH),
    "dp_h_max": int(DP_H_MAX),
    "dp_w_max": int(DP_W_MAX),
    "max_dy": int(MAX_DY),
    "lambda": float(LAMBDA),
    "save_prob_u8": int(SAVE_PROB_U8),
    "n_rows": int(len(df_out)),
}, indent=2))

print("OK | Saved:")
print(" -", ART_DIR / f"test_trace_index_fold{FOLD}.parquet")
print(" -", ART_DIR / f"stage5_params_fold{FOLD}.json")
print("Cache dirs:")
print(" -", PROB_DIR)
print(" -", TRACE_DIR)
print("Rows:", len(df_out), "| mean conf:", float(df_out["mean_trace_conf"].mean()))

# Prob Map Inference, DP/Viterbi Vectorization, and Signal Extraction

In [None]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
import cv2
from tqdm.auto import tqdm

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import albumentations as A
from albumentations.pytorch import ToTensorV2
from transformers import SegformerForSemanticSegmentation

# Config
OUT_ROOT  = Path("/kaggle/working/ecgdig")
ART_DIR   = OUT_ROOT / "artifacts"
MODEL_DIR = OUT_ROOT / "models"
CACHE_DIR = OUT_ROOT / "cache"
PROB_DIR  = CACHE_DIR / "prob_u8"
TRACE_DIR = CACHE_DIR / "trace_px"
for d in [PROB_DIR, TRACE_DIR]:
    d.mkdir(parents=True, exist_ok=True)

FOLD = 0
IMG_SIZE = 512
BATCH = 24
NUM_WORKERS = 2

# DP params (fast + stable)
DP_H_MAX = 256
DP_W_MAX = 512
MAX_DY = 12
LAMBDA = 0.65
EPS = 1e-6

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Load ROI table (train+test). Prefer refined test if available.
df_roi_all = pd.read_parquet(ART_DIR / "roi_boxes_template.parquet")
df_roi_all["uid"] = df_roi_all["uid"].astype(str)
df_roi_all["base_id"] = df_roi_all["base_id"].astype(str)
df_roi_all["lead"] = df_roi_all["lead"].astype(str)
df_roi_all["crop_path"] = df_roi_all["crop_path"].astype(str)

try:
    df_ref = pd.read_parquet(ART_DIR / "roi_boxes_test_refined.parquet")
    df_ref["uid"] = df_ref["uid"].astype(str)
    df_ref["base_id"] = df_ref["base_id"].astype(str)
    df_ref["lead"] = df_ref["lead"].astype(str)
    df_ref["crop_path"] = df_ref["crop_path"].astype(str)
    df_roi_all = pd.concat([df_roi_all[df_roi_all["split"].ne("test")], df_ref], ignore_index=True)
except Exception:
    pass

# Attach fold to train rows for OOF subset
df_folds = pd.read_parquet(ART_DIR / "train_base_folds.parquet")[["base_id","fold"]].copy()
df_folds["base_id"] = df_folds["base_id"].astype(str)
df_roi_all = df_roi_all.merge(df_folds, on="base_id", how="left")

# Build run set: TEST + OOF(valid fold only)
df_test = df_roi_all[df_roi_all["split"].eq("test")].copy()
df_oof  = df_roi_all[df_roi_all["split"].eq("train") & df_roi_all["fold"].eq(FOLD)].copy()

df_test["run_split"] = "test"
df_oof["run_split"]  = "oof"

df_run = pd.concat([df_test, df_oof], ignore_index=True).reset_index(drop=True)
assert len(df_run) > 0
assert {"uid","base_id","lead","crop_path","run_split"}.issubset(df_run.columns)


# Inference transform
tf_inf = A.Compose([
    A.LongestMaxSize(max_size=IMG_SIZE),
    A.PadIfNeeded(IMG_SIZE, IMG_SIZE, border_mode=cv2.BORDER_REPLICATE),
    A.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
    ToTensorV2(),
])

class CropDS(Dataset):
    def __init__(self, df: pd.DataFrame):
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i: int):
        r = self.df.iloc[i]
        p = r["crop_path"]
        bgr = cv2.imread(p, cv2.IMREAD_COLOR)
        assert bgr is not None
        h0, w0 = bgr.shape[:2]
        out = tf_inf(image=bgr)
        img = out["image"]
        return img, int(h0), int(w0), str(r["uid"]), str(r["base_id"]), str(r["lead"]), str(r["run_split"]), str(p)

dl = DataLoader(CropDS(df_run), batch_size=BATCH, shuffle=False,
                num_workers=NUM_WORKERS, pin_memory=True, drop_last=False)

# Load SegFormer-B1 checkpoint
ckpt_path = MODEL_DIR / f"segformer_b1_fold{FOLD}.pt"
assert ckpt_path.exists()
ckpt = torch.load(ckpt_path, map_location="cpu")
MODEL_NAME = ckpt["model_name"]

model = SegformerForSemanticSegmentation.from_pretrained(
    MODEL_NAME,
    num_labels=1,
    ignore_mismatched_sizes=True,
)
model.load_state_dict(ckpt["state_dict"], strict=False)
model.to(device).eval()

# DP/Viterbi seam (fast windowed transitions)
def viterbi_seam(prob_hw: np.ndarray, lam: float, max_dy: int) -> np.ndarray:
    H, W = prob_hw.shape
    cost = (-np.log(prob_hw + EPS)).astype(np.float32)
    dp = cost[:, 0].copy()
    back = np.empty((H, W), dtype=np.int16)
    back[:, 0] = np.arange(H, dtype=np.int16)

    y_idx = np.arange(H, dtype=np.int16)
    shifts = np.arange(-max_dy, max_dy + 1, dtype=np.int16)

    for x in range(1, W):
        dp_pad = np.pad(dp, (max_dy, max_dy), mode="constant", constant_values=np.inf)
        best = np.full(H, np.inf, dtype=np.float32)
        best_prev = np.zeros(H, dtype=np.int16)

        for s in shifts:
            prev = dp_pad[(max_dy - int(s)):(max_dy - int(s) + H)]
            cand = prev + (lam * float(abs(int(s))))
            m = cand < best
            best = np.where(m, cand, best)
            best_prev = np.where(m, (y_idx - int(s)).astype(np.int16), best_prev)

        dp = cost[:, x] + best
        back[:, x] = best_prev

    y = int(np.argmin(dp))
    path = np.empty(W, dtype=np.int16)
    path[W - 1] = y
    for x in range(W - 1, 0, -1):
        y = int(back[y, x])
        path[x - 1] = y
    return path

# Inference + save prob_u8 + save trace_px
rows = []
with torch.no_grad():
    for imgs, h0s, w0s, uids, base_ids, leads, run_splits, crop_paths in tqdm(dl, total=len(dl)):
        imgs = imgs.to(device, non_blocking=True)
        out = model(pixel_values=imgs)
        logits = out.logits
        logits = F.interpolate(logits, size=(IMG_SIZE, IMG_SIZE), mode="bilinear", align_corners=False)
        prob = torch.sigmoid(logits).squeeze(1).detach().cpu().numpy().astype(np.float32)  # [B,512,512]

        for k in range(prob.shape[0]):
            h0 = int(h0s[k]); w0 = int(w0s[k])
            uid = str(uids[k]); base_id = str(base_ids[k]); lead = str(leads[k])
            run_split = str(run_splits[k]); crop_path = str(crop_paths[k])

            pr = prob[k]
            pr0 = cv2.resize(pr, (w0, h0), interpolation=cv2.INTER_LINEAR)

            # prob_u8 save (compact + deterministic)
            prob_path = str(PROB_DIR / f"{run_split}__{uid}__{lead}.npz")
            pr_u8 = np.clip(np.round(pr0 * 255.0), 0, 255).astype(np.uint8)
            np.savez_compressed(prob_path, prob_u8=pr_u8, h=int(h0), w=int(w0))

            # DP on downsampled prob for speed
            Hd = int(max(2, min(DP_H_MAX, h0)))
            Wd = int(max(2, min(DP_W_MAX, w0)))
            pr_ds = cv2.resize(pr0, (Wd, Hd), interpolation=cv2.INTER_LINEAR).astype(np.float32)

            seam_ds = viterbi_seam(pr_ds, lam=LAMBDA, max_dy=MAX_DY).astype(np.float32)  # [Wd]
            x_ds = np.arange(Wd, dtype=np.float32)
            x0 = np.linspace(0.0, float(Wd - 1), num=w0, dtype=np.float32)
            seam_x = np.interp(x0, x_ds, seam_ds).astype(np.float32)

            scale_y = (float(h0 - 1) / float(max(1, Hd - 1)))
            y0 = np.clip(seam_x * scale_y, 0.0, float(h0 - 1)).astype(np.float32)

            trace_path = str(TRACE_DIR / f"{run_split}__{uid}__{lead}.npy")
            np.save(trace_path, y0)

            xi = np.arange(w0, dtype=np.int32)
            yi = np.clip(np.round(y0).astype(np.int32), 0, h0 - 1)
            conf = float(np.mean(pr0[yi, xi]))

            rows.append((run_split, uid, base_id, lead, crop_path, prob_path, trace_path,
                         int(h0), int(w0), int(Hd), int(Wd), conf))

df_out = pd.DataFrame(
    rows,
    columns=["run_split","uid","base_id","lead","crop_path","prob_u8_path","trace_px_path",
             "h","w","dp_h","dp_w","mean_trace_conf"]
)

df_test_out = df_out[df_out["run_split"].eq("test")].reset_index(drop=True)
df_oof_out  = df_out[df_out["run_split"].eq("oof")].reset_index(drop=True)

df_test_out.to_parquet(ART_DIR / f"test_trace_index_fold{FOLD}.parquet", index=False)
df_oof_out.to_parquet(ART_DIR / f"oof_trace_index_fold{FOLD}.parquet", index=False)

(Path(ART_DIR) / f"stage5_params_fold{FOLD}.json").write_text(json.dumps({
    "fold": int(FOLD),
    "img_size": int(IMG_SIZE),
    "batch": int(BATCH),
    "dp_h_max": int(DP_H_MAX),
    "dp_w_max": int(DP_W_MAX),
    "max_dy": int(MAX_DY),
    "lambda": float(LAMBDA),
    "n_test_rows": int(len(df_test_out)),
    "n_oof_rows": int(len(df_oof_out)),
}, indent=2))

print("OK | Saved:")
print(" -", ART_DIR / f"test_trace_index_fold{FOLD}.parquet")
print(" -", ART_DIR / f"oof_trace_index_fold{FOLD}.parquet")
print(" -", ART_DIR / f"stage5_params_fold{FOLD}.json")
print("Cache dirs:")
print(" -", PROB_DIR)3
print(" -", TRACE_DIR)
print("Rows | test:", len(df_test_out), "| oof:", len(df_oof_out))
print("Mean conf | test:", float(df_test_out["mean_trace_conf"].mean()) if len(df_test_out) else float("nan"))
print("Mean conf | oof :", float(df_oof_out["mean_trace_conf"].mean()) if len(df_oof_out) else float("nan"))

# Ensemble, Submission Build, and Submission QA

In [None]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
import cv2
from tqdm.auto import tqdm

OUT_ROOT  = Path("/kaggle/working/ecgdig")
ART_DIR   = OUT_ROOT / "artifacts"
CACHE_DIR = OUT_ROOT / "cache"
SUB_DIR   = OUT_ROOT / "subs"
SUB_DIR.mkdir(parents=True, exist_ok=True)

# ---- load template + parse id
df_sub = pd.read_parquet(ART_DIR / "sample_submission.parquet")[["id"]].copy()
tmp = df_sub["id"].astype(str).str.rsplit("_", n=2, expand=True)
df_sub["base_id"] = tmp[0].astype(str)
df_sub["row_id"]  = tmp[1].astype(np.int32)
df_sub["lead"]    = tmp[2].astype(str)
df_sub["key"]     = df_sub["base_id"] + "|" + df_sub["lead"]

# ---- load ROI table (for crop_path) and keep TEST only; prefer refined if exists
df_roi = pd.read_parquet(ART_DIR / "roi_boxes_template.parquet")
try:
    df_ref = pd.read_parquet(ART_DIR / "roi_boxes_test_refined.parquet")
    df_roi = pd.concat([df_roi[df_roi["split"].ne("test")], df_ref], ignore_index=True)
except Exception:
    pass
df_roi = df_roi[df_roi["split"].eq("test")][["uid","base_id","lead","crop_path"]].copy()
df_roi["base_id"] = df_roi["base_id"].astype(str)
df_roi["lead"] = df_roi["lead"].astype(str)
df_roi["crop_path"] = df_roi["crop_path"].astype(str)

# ---- load all available Stage-5 test indexes (fold ensemble)
idx_paths = sorted(ART_DIR.glob("test_trace_index_fold*.parquet"))
assert len(idx_paths) > 0

dfs = []
for p in idx_paths:
    d = pd.read_parquet(p)[["base_id","lead","crop_path","trace_px_path","mean_trace_conf"]].copy()
    d["base_id"] = d["base_id"].astype(str)
    d["lead"] = d["lead"].astype(str)
    d["crop_path"] = d["crop_path"].astype(str)
    d["trace_px_path"] = d["trace_px_path"].astype(str)
    dfs.append(d)
df_idx = pd.concat(dfs, ignore_index=True)

# ---- map II_LONG -> II (submission expects lead II); prioritize II_LONG over short II
df_idx["lead_sub"] = df_idx["lead"].where(df_idx["lead"].ne("II_LONG"), "II")
df_idx["prio"] = (df_idx["lead"].eq("II_LONG")).astype(np.int8)
df_idx = df_idx.sort_values(["base_id","lead_sub","prio","mean_trace_conf"]).drop_duplicates(
    ["base_id","lead_sub","trace_px_path"], keep="last"
)

# ---- build per (base_id, lead_sub) ensemble list + crop_path
g = df_idx.groupby(["base_id","lead_sub"], sort=False)
df_pack = g.agg(
    crop_path=("crop_path", "last"),
    trace_list=("trace_px_path", list),
).reset_index()
df_pack["key"] = df_pack["base_id"] + "|" + df_pack["lead_sub"]

# ---- ensure submission keys exist in pack
keys_needed = pd.Index(df_sub["key"].unique())
keys_have = pd.Index(df_pack["key"].unique())
assert keys_needed.isin(keys_have).all()

# ---- grid scale estimate (px/mm) per crop; fill failures with global median
def estimate_px_per_mm_y(crop_path: str) -> float:
    img = cv2.imread(crop_path, cv2.IMREAD_GRAYSCALE)
    assert img is not None
    img = cv2.GaussianBlur(img, (3,3), 0)
    ed = cv2.Canny(img, 50, 150)
    proj = ed.sum(axis=1).astype(np.float32)
    proj = proj - np.median(proj)
    spec = np.abs(np.fft.rfft(proj))
    k = np.arange(spec.shape[0], dtype=np.int32)
    spec[0] = 0.0
    N = float(proj.shape[0])
    # period = N/k, keep plausible band [3px .. 60px]
    period = N / np.maximum(k, 1)
    mask = (k > 0) & (period >= 3.0) & (period <= 60.0)
    spec_m = np.where(mask, spec, -1.0)
    k_best = int(np.argmax(spec_m))
    per = float(N / max(k_best, 1))
    per = float(np.where(k_best > 0, per, np.nan))
    # heuristic: if detected period looks like big-square (~5mm), convert to 1mm by /5
    per_small = per / (1.0 + 4.0 * float(per > 15.0))
    return float(per_small)

pxmm = []
for p in tqdm(df_pack["crop_path"].values, total=len(df_pack), desc="Estimating grid scale (px/mm)"):
    try:
        pxmm.append(estimate_px_per_mm_y(p))
    except Exception:
        pxmm.append(np.nan)
pxmm = np.array(pxmm, dtype=np.float32)
pxmm_med = float(np.nanmedian(pxmm))
pxmm_filled = np.where(np.isfinite(pxmm), pxmm, pxmm_med).astype(np.float32)
df_pack["px_per_mm_y"] = pxmm_filled

# ---- prepare fast lookup: key -> (trace_list, crop_path, px_per_mm_y)
pack = df_pack.set_index("key")[["trace_list","crop_path","px_per_mm_y"]]

# ---- fill submission values groupwise (no per-row python loop)
values = np.empty(len(df_sub), dtype=np.float32)
values.fill(np.nan)

def resample_to_len(v: np.ndarray, n: int) -> np.ndarray:
    x0 = np.linspace(0.0, 1.0, num=v.size, dtype=np.float32)
    x1 = np.linspace(0.0, 1.0, num=n, dtype=np.float32)
    return np.interp(x1, x0, v).astype(np.float32)

for key, idx in tqdm(df_sub.groupby("key", sort=False).indices.items(), total=df_sub["key"].nunique(), desc="Building values"):
    row_ids = df_sub["row_id"].values[idx].astype(np.int32)
    need_n = int(row_ids.max() + 1)

    tr_list = pack.at[key, "trace_list"]
    px_per_mm_y = float(pack.at[key, "px_per_mm_y"])

    ys = []
    for tp in tr_list:
        ys.append(np.load(tp).astype(np.float32))
    y = np.mean(np.stack(ys, axis=0), axis=0).astype(np.float32)

    baseline = float(np.median(y))
    mv_per_px = 1.0 / (10.0 * px_per_mm_y)  # 10 mm per 1 mV
    v = ((baseline - y) * mv_per_px).astype(np.float32)

    v_rs = resample_to_len(v, need_n)
    values[idx] = v_rs[row_ids]

df_out = df_sub[["id"]].copy()
df_out["value"] = values.astype(np.float32)

# ---- QA (hard checks)
assert np.isfinite(df_out["value"].values).all()
assert df_out["id"].is_unique
assert len(df_out) == len(df_sub)

qa = {
    "n_rows": int(len(df_out)),
    "value_min": float(np.min(df_out["value"].values)),
    "value_max": float(np.max(df_out["value"].values)),
    "value_mean": float(np.mean(df_out["value"].values)),
    "value_std": float(np.std(df_out["value"].values)),
    "px_per_mm_y_median_used": float(pxmm_med),
    "n_pxmm_filled": int(np.sum(~np.isfinite(pxmm))),
    "n_folds_ensembled": int(len(idx_paths)),
}

# ---- save submission
sub_pq = SUB_DIR / "submission.parquet"
sub_csv = SUB_DIR / "submission.csv"
df_out.to_parquet(sub_pq, index=False)
df_out.to_csv(sub_csv, index=False)

(Path(ART_DIR) / "stage7_qa.json").write_text(json.dumps(qa, indent=2))

print("OK | Saved:")
print(" -", sub_pq)
print(" -", sub_csv)
print(" -", ART_DIR / "stage7_qa.json")
print("QA:", qa)