<div style="display: flex; justify-content: space-between; align-items: flex-start;">
    <div style="text-align: left;">
        <p style="color:#FFD700; font-size: 15px; font-weight: bold; margin-bottom: 1px; text-align: left;">Published on October 31, 2025</p>
        <h4 style="color:#4B0082; font-weight: bold; text-align: left; margin-top: 6px;">Author: Jocelyn C. Dumlao</h4>
        <p style="font-size: 17px; line-height: 1.7; color: #333; text-align: center; margin-top: 20px;"></p>
        <a href="https://www.linkedin.com/in/jocelyn-dumlao-168921a8/" target="_blank" style="display: inline-block; background-color: #003f88; color: #fff; text-decoration: none; padding: 5px 10px; border-radius: 10px; margin: 15px;">LinkedIn</a>
        <a href="https://github.com/jcdumlao14" target="_blank" style="display: inline-block; background-color: transparent; color: #059c99; text-decoration: none; padding: 5px 10px; border-radius: 10px; margin: 15px; border: 2px solid #007bff;">GitHub</a>
        <a href="https://www.youtube.com/@CogniCraftedMinds" target="_blank" style="display: inline-block; background-color: #ff0054; color: #fff; text-decoration: none; padding: 5px 10px; border-radius: 10px; margin: 15px;">YouTube</a>
        <a href="https://www.kaggle.com/jocelyndumlao" target="_blank" style="display: inline-block; background-color: #3a86ff; color: #fff; text-decoration: none; padding: 5px 10px; border-radius: 10px; margin: 15px;">Kaggle</a>
    </div>
</div>

# <div style="color:white;display:inline-block;border-radius:5px;background-color:#cc8800 ;font-family:Nexa;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;font-size:85%;letter-spacing:0.5px;margin:0;border: 6px groove #ffdd11;"><b> </b>Import Libraries</p></div>

In [None]:
import os
from glob import glob
from collections import defaultdict
from typing import Tuple
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm
import scipy.optimize
import scipy.signal

# <div style="color:white;display:inline-block;border-radius:5px;background-color:#cc8800 ;font-family:Nexa;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;font-size:85%;letter-spacing:0.5px;margin:0;border: 6px groove #ffdd11;"><b> </b>Constants & Exceptions</p></div>

In [None]:
# Constants & Exceptions

# Names of the 12 ECG leads, in the order the scoring and extraction loops iterate them.
LEADS = ['I', 'II', 'III', 'aVR', 'aVL', 'aVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6']
# Largest time shift tolerated when aligning a prediction to its label.
# Callers multiply this by the sampling frequency to obtain a lag limit in samples.
MAX_TIME_SHIFT = 0.2  # seconds
# Upper cap applied to the SNR ratio in compute_snr; its negation is used as the
# floor / fallback value of the final dB score.
PERFECT_SCORE = 384


class ParticipantVisibleError(Exception):
    """Error raised by the scoring helpers when their inputs are invalid.

    NOTE(review): the name suggests the message is meant to be shown to
    competition participants -- confirm against the scoring harness.
    """

# <div style="color:white;display:inline-block;border-radius:5px;background-color:#cc8800 ;font-family:Nexa;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;font-size:85%;letter-spacing:0.5px;margin:0;border: 6px groove #ffdd11;"><b> </b>Utility Helpers</p></div>

In [None]:
# Utility helpers

def safe_read_image(path: str) -> np.ndarray:
    """Load the image stored at ``path`` with OpenCV.

    Raises:
        FileNotFoundError: when ``cv2.imread`` yields ``None`` for ``path``
            (the file is missing or could not be decoded).
    """
    loaded = cv2.imread(path)
    if loaded is not None:
        return loaded
    raise FileNotFoundError(f"Image not found or unreadable: {path}")


def is_color_image(ima: np.ndarray) -> bool:
    """Tell whether an image carries colour information.

    ``None`` and arrays with fewer than three dimensions are reported as not
    coloured. Otherwise the image counts as coloured when the standard
    deviation along axis 2, averaged over all pixels, is non-zero.
    """
    has_channel_axis = ima is not None and ima.ndim >= 3
    if not has_channel_axis:
        return False
    channel_spread = ima.std(axis=2)
    return channel_spread.mean() != 0

# <div style="color:white;display:inline-block;border-radius:5px;background-color:#cc8800 ;font-family:Nexa;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;font-size:85%;letter-spacing:0.5px;margin:0;border: 6px groove #ffdd11;"><b> </b>Scoring Utilities</p></div>

In [None]:
# Scoring utilities (SNR metric)

def compute_power(label: np.ndarray, prediction: np.ndarray) -> Tuple[float, float]:
    """Return ``(signal_power, noise_power)`` for a label/prediction pair.

    Signal power is the sum of squared label values; noise power is the sum of
    squared differences between label and prediction. Non-finite prediction
    samples are treated as 0 when forming the difference.

    Raises:
        ParticipantVisibleError: if either input is not 1-D, or if the
            prediction holds no finite value at all.
    """
    truth = np.asarray(label, dtype=np.float64)
    estimate = np.asarray(prediction, dtype=np.float64)

    if not (truth.ndim == 1 and estimate.ndim == 1):
        raise ParticipantVisibleError('Inputs must be 1-dimensional arrays.')

    usable = np.isfinite(estimate)
    if not usable.any():
        raise ParticipantVisibleError("The 'prediction' array contains no finite values (all NaN or inf).")

    # Non-finite samples count as a prediction of 0, so the whole label value
    # at those positions ends up in the noise term.
    residual = truth - np.where(usable, estimate, 0.0)
    return np.sum(truth ** 2), np.sum(residual ** 2)


def compute_snr(signal: float, noise: float) -> float:
    """Turn signal and noise power into a capped signal-to-noise ratio.

    Zero noise yields ``PERFECT_SCORE``; otherwise zero signal yields ``0.0``;
    in every other case the ratio ``signal / noise`` is returned, limited to
    at most ``PERFECT_SCORE``.
    """
    if noise == 0:
        return PERFECT_SCORE
    ratio = 0.0 if signal == 0 else min((signal / noise), PERFECT_SCORE)
    return ratio


def align_signals(label: np.ndarray, pred: np.ndarray, max_shift: float = float('inf')) -> np.ndarray:
    """
    Align ``pred`` to ``label`` in time and in vertical offset.

    The time shift is the lag (within ``max_shift``) that maximises the
    cross-correlation of the mean-centred signals. The vertical offset is the
    constant that minimises the squared error between ``label`` and the shifted
    prediction, found with a Brent scalar minimisation.

    Args:
        label: reference signal; every value must be finite.
        pred: predicted signal; must contain at least one finite value.
        max_shift: largest absolute lag allowed, in samples (not seconds). It is
            truncated with ``int()``; the default ``inf`` leaves all lags allowed.

    Returns:
        The shifted and offset-corrected prediction, padded with NaN where the
        shifted prediction does not cover the label.

    Raises:
        ParticipantVisibleError: if ``label`` has a non-finite value or ``pred``
            has no finite value.
    """
    label_arr = np.asarray(label, dtype=np.float64)
    pred_arr = np.asarray(pred, dtype=np.float64)

    if np.any(~np.isfinite(label_arr)):
        raise ParticipantVisibleError('values in label should all be finite')
    if np.sum(np.isfinite(pred_arr)) == 0:
        raise ParticipantVisibleError('prediction can not all be infinite')

    # Centre both signals so the correlation is not dominated by their offsets.
    # NOTE(review): np.mean(pred_arr) becomes NaN if pred holds any NaN, which
    # would turn the whole correlation into NaN -- confirm callers pass finite
    # predictions.
    label_mean = np.mean(label_arr)
    pred_mean = np.mean(pred_arr)

    label_centered = label_arr - label_mean
    pred_centered = pred_arr - pred_mean

    # Full cross-correlation plus the lag value belonging to each entry.
    correlation = scipy.signal.correlate(label_centered, pred_centered, mode='full')
    n_label = label_arr.size
    n_pred = pred_arr.size
    lags = scipy.signal.correlation_lags(n_label, n_pred, mode='full')

    # max_shift is interpreted in samples; a caller holding seconds must convert first.
    if np.isfinite(max_shift) and max_shift < np.inf:
        # Boolean mask over `lags` keeping only |lag| <= int(max_shift).
        valid_lags_mask = (lags >= -int(max_shift)) & (lags <= int(max_shift))
    else:
        valid_lags_mask = np.ones_like(lags, dtype=bool)

    # Exclude disallowed lags by forcing their correlation to -inf, then take the best lag.
    corr_valid = correlation.copy()
    corr_valid[~valid_lags_mask] = -np.inf
    best_idx = int(np.nanargmax(corr_valid))
    time_shift = lags[best_idx]

    # Build aligned_pred: a positive shift pads NaN at the start, a negative
    # shift drops leading prediction samples; NaN fills whatever the shifted
    # prediction does not cover at the end of the label.
    start_padding_len = max(time_shift, 0)
    pred_slice_start = max(-time_shift, 0)
    pred_slice_end = min(n_pred, n_label - time_shift)
    end_padding_len = max(n_label - n_pred - time_shift, 0)
    middle = pred_arr[pred_slice_start:pred_slice_end]
    aligned_pred = np.concatenate((np.full(start_padding_len, np.nan), middle, np.full(end_padding_len, np.nan)))

    # Squared error as a function of the vertical offset; NaN padding is ignored by nansum.
    def objective(v_shift):
        return np.nansum((label_arr - (aligned_pred - v_shift)) ** 2)

    # Only optimise when at least one position has both a label and a prediction value.
    if np.any(np.isfinite(label_arr) & np.isfinite(aligned_pred)):
        res = scipy.optimize.minimize_scalar(objective, method='Brent')
        v_shift = float(res.x)
        aligned_pred = aligned_pred - v_shift

    return aligned_pred

def _calculate_image_score(group: pd.DataFrame) -> float:
    """Compute the capped SNR of one image from its merged label/prediction rows.

    ``group`` must provide the columns 'fs', 'lead', 'value_true' and
    'value_pred'. For every lead in ``LEADS`` the prediction is aligned to the
    label, and signal and noise power are accumulated over all leads before
    the ratio is taken.

    Raises:
        ParticipantVisibleError: if 'fs' is not a single value within the group.
    """
    fs_values = group['fs'].unique()
    if len(fs_values) != 1:
        raise ParticipantVisibleError('Sampling frequency should be consistent across each ecg')
    # Allowed alignment lag in samples, shared by every lead of the image.
    max_lag = int(fs_values[0] * MAX_TIME_SHIFT)

    total_signal = 0
    total_noise = 0
    for lead_name in LEADS:
        lead_rows = group[group['lead'] == lead_name]
        truth = lead_rows['value_true'].values
        estimate = lead_rows['value_pred'].values

        shifted = align_signals(truth, estimate, max_lag)
        lead_signal, lead_noise = compute_power(truth, shifted)
        total_signal += lead_signal
        total_noise += lead_noise
    return compute_snr(total_signal, total_noise)


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Compute the competition score: the mean per-image SNR, expressed in dB.

    For every image, signal and noise power are summed over all leads and
    turned into a capped SNR; the SNRs are averaged over images and converted
    with ``10 * log10``. The result is never lower than ``-PERFECT_SCORE``.

    Neither input DataFrame is modified.

    Args:
        solution: ground truth. Needs the id column and a 'value' column.
        submission: predictions. Needs the id column and a 'value' column.
        row_id_column_name: name of the id column, typically 'id'. Ids are
            split on '_' into image id, integer row index and lead name.
    Returns:
        The final competition score, or ``-PERFECT_SCORE`` when no score can be
        computed (no matching ids, NaN mean, or a mean SNR of zero).
    Raises:
        ParticipantVisibleError: if a required column is missing or 'value'
            contains NaN or infinite entries.
    """
    for df in (solution, submission):
        if row_id_column_name not in df.columns:
            raise ParticipantVisibleError(f"'{row_id_column_name}' column not found in DataFrame.")
        if 'value' not in df.columns:
            raise ParticipantVisibleError("'value' column not found in DataFrame.")
        if df['value'].isna().any():
            raise ParticipantVisibleError('NaN exists in solution/submission')
        if not np.isfinite(df['value']).all():
            raise ParticipantVisibleError('Infinity exists in solution/submission')

    # rename() without inplace returns new frames, so the caller's DataFrames
    # keep their 'value' column and the function can be called repeatedly.
    truth = solution.rename(columns={'value': 'value_true'})
    predicted = submission[[row_id_column_name, 'value']].rename(columns={'value': 'value_pred'})

    merged_df = pd.merge(truth, predicted, on=row_id_column_name)
    if merged_df.empty:
        # No id is shared between solution and submission: nothing to score.
        return -PERFECT_SCORE

    id_parts = merged_df[row_id_column_name].str.split('_')
    merged_df['image_id'] = id_parts.str[0]
    merged_df['row_id'] = id_parts.str[1].astype('int64')
    merged_df['lead'] = id_parts.str[2]
    # A fixed sampling frequency of 500 is assumed for every image; supply it
    # from the data instead if it varies.
    merged_df['fs'] = 500

    merged_df = merged_df.sort_values(by=['image_id', 'row_id', 'lead'])
    image_scores = merged_df.groupby('image_id').apply(_calculate_image_score, include_groups=False)
    mean_snr = image_scores.mean()
    # log10 is undefined at 0 and would emit a warning; the clamp below would
    # map it to -PERFECT_SCORE anyway.
    if np.isnan(mean_snr) or mean_snr <= 0:
        return -PERFECT_SCORE
    return max(float(10 * np.log10(mean_snr)), -PERFECT_SCORE)



# <div style="color:white;display:inline-block;border-radius:5px;background-color:#cc8800 ;font-family:Nexa;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;font-size:85%;letter-spacing:0.5px;margin:0;border: 6px groove #ffdd11;"><b> </b>Mean Model / Data Loaders</p></div>

In [None]:
# Mean model / data loaders

def fit_mean_model(train_df, base_path, n_samples=20000):
    """
    Collect the resampled training series of every lead for the mean model.

    For each row of ``train_df`` the file ``<base_path>/<id>/<id>.csv`` is read.
    Every column is stripped of NaN, linearly resampled to ``n_samples`` points
    and appended to the list of its column name. Rows whose CSV is missing are
    skipped.

    Args:
        train_df: DataFrame with an 'id' column.
        base_path: directory holding one sub-directory per id.
        n_samples: common length every series is resampled to (default 20000).

    Returns:
        A dict mapping column name -> array of shape ``(n_series, n_samples)``.
    """
    mean_dict = defaultdict(list)
    for _, row in tqdm(train_df.iterrows(), total=len(train_df), desc="Calculating Mean ECG"):
        csv_path = os.path.join(base_path, str(row.id), f"{row.id}.csv")
        if not os.path.exists(csv_path):
            # Missing recordings are skipped silently.
            continue
        try:
            labels = pd.read_csv(csv_path)
        except FileNotFoundError:
            # The file vanished between the existence check and the read.
            print(f"Warning: CSV not found at {csv_path}. Skipping.")
            continue

        for lead in labels.columns:
            values = labels[lead].dropna().values.astype(float)
            if values.size == 0:
                continue
            # Stretch or squeeze the series onto a common grid so series of
            # different lengths can be stacked.
            target_positions = np.linspace(0, values.size - 1, n_samples)
            resampled = np.interp(target_positions, np.arange(values.size), values)
            mean_dict[lead].append(resampled)

    # Replace each list of series by one 2-D array (one row per series).
    for lead in list(mean_dict.keys()):
        mean_dict[lead] = np.stack(mean_dict[lead], axis=0)
    return mean_dict


def validate_mean_model(val_df, mean_dict, base_path):
    """
    Score the mean model on the rows of ``val_df``.

    For each row the file ``<base_path>/<id>/<id>.csv`` is read. For every
    column the mean training series is resampled to the label length (zeros if
    the column is unknown to ``mean_dict``), aligned to the label, and its
    signal and noise power are accumulated. One capped SNR is computed per
    row; the mean SNR is converted to dB, printed and returned.

    Args:
        val_df: DataFrame with an 'id' column and optionally an 'fs' column
            (500 is used when 'fs' is absent).
        mean_dict: mapping column name -> 2-D array of stacked series, as
            produced by ``fit_mean_model``.
        base_path: directory holding one sub-directory per id.

    Returns:
        The validation score in dB, or ``-PERFECT_SCORE`` if no row could be scored.
    """
    # The mean over the training stack does not depend on the validation row,
    # so it is computed once per lead and reused.
    lead_mean_cache = {}
    snr_list = []
    for _, row in tqdm(val_df.iterrows(), total=len(val_df), desc="Validating Mean Model"):
        csv_path = os.path.join(base_path, str(row.id), f"{row.id}.csv")
        if not os.path.exists(csv_path):
            continue
        try:
            labels = pd.read_csv(csv_path)
        except FileNotFoundError:
            print(f"Warning: CSV not found at {csv_path}. Skipping.")
            continue

        sum_signal = 0.0
        sum_noise = 0.0
        fs = int(getattr(row, 'fs', 500))
        max_lag = int(fs * MAX_TIME_SHIFT)
        for lead in labels.columns:
            label = labels[lead].dropna().values.astype(float)
            if label.size == 0:
                continue
            if lead not in mean_dict:
                # Unknown lead: fall back to an all-zero prediction.
                pred = np.zeros(label.size)
            else:
                if lead not in lead_mean_cache:
                    lead_mean_cache[lead] = mean_dict[lead].mean(axis=0)
                pred_mean = lead_mean_cache[lead]
                # Resample the mean series to the label length.
                pred = np.interp(np.linspace(0, 1, label.size), np.linspace(0, 1, pred_mean.size), pred_mean)
            aligned_pred = align_signals(label, pred, max_lag)
            p_signal, p_noise = compute_power(label, aligned_pred)
            sum_signal += p_signal
            sum_noise += p_noise
        snr_list.append(compute_snr(sum_signal, sum_noise))

    if not snr_list:
        return -PERFECT_SCORE
    mean_snr = np.mean(snr_list)
    # log10(0) would warn and yield -inf, which the clamp maps to
    # -PERFECT_SCORE anyway; short-circuit to the same value.
    if mean_snr <= 0:
        val_score = -PERFECT_SCORE
    else:
        val_score = max(float(10 * np.log10(mean_snr)), -PERFECT_SCORE)
    print(f"# Validation SNR for mean prediction: {mean_snr:.2f} val_score={val_score:.2f}")
    return val_score


# <div style="color:white;display:inline-block;border-radius:5px;background-color:#cc8800 ;font-family:Nexa;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;font-size:85%;letter-spacing:0.5px;margin:0;border: 6px groove #ffdd11;"><b> </b>Marker Finder</p></div>

In [None]:
# Marker finder - robust version

class MarkerFinder:
    """Locate the 17 reference markers of a scanned ECG sheet by template matching.

    Marker layout (index -> meaning), as established by ``lead_info``:
    indices ``5*i .. 5*i+4`` (i = 0..2) are the five column boundaries of line
    ``i``; indices 15 and 16 are the start and end of the fourth line. All
    coordinates handled by this class are ``(row, col)``.

    Thirteen markers have a template and are searched for; the line-end markers
    4, 9, 14 and 16 have none and are extrapolated from their neighbours.
    """

    def __init__(self, show_templates=False, base_path='/kaggle/input/physionet-ecg-image-digitization/train'):
        """Build the marker templates from example images below ``base_path``.

        Args:
            show_templates: if true, plot every extracted template.
            base_path: training directory containing the example images.

        Raises:
            FileNotFoundError: if none of the example images can be read.
        """
        candidate_paths = [
            os.path.join(base_path, '4292118763', '4292118763-0001.png'),
            os.path.join(base_path, '4289880010', '4289880010-0001.png'),
            os.path.join(base_path, '4284351157', '4284351157-0001.png'),
        ]
        imgs = []
        for p in candidate_paths:
            try:
                imgs.append(safe_read_image(p))
            except FileNotFoundError:
                # Individual examples may be missing as long as one loads.
                continue
        if not imgs:
            raise FileNotFoundError("No template images found in candidate paths. Please check dataset mount.")
        ima = np.maximum.reduce(imgs)  # element-wise max across available templates

        # Reference marker coordinates as (row, col). The second coordinate
        # reaches 118 + 492 * 4 = 2086, beyond the 1652-pixel height that
        # find_markers expects, and consecutive lines differ by 284 in the
        # first coordinate -- so the first coordinate is the row.
        absolute_points = np.zeros((17, 2), dtype=int)
        for i in range(3):
            absolute_points[5 * i] = np.array([707 + 284 * i, 118])
            for j in range(1, 5):
                absolute_points[5 * i + j] = np.array([707 + 284 * i, 118 + 492 * j])
        absolute_points[5 * 3] = np.array([1535, 118])
        absolute_points[5 * 3 + 1] = np.array([1535, 118 + 492 * 4])

        # Top-left corner (row, col) of each template. Markers in the last
        # column get no template; they are extrapolated in find_markers.
        last_col = 118 + 492 * 4
        template_positions = [None] * 17
        for i, (row, col) in enumerate(absolute_points):
            if col >= last_col:
                continue
            if i % 5 == 0:
                # Line-start markers use a different offset inside the template
                # (presumably they sit on a taller mark -- TODO confirm).
                template_positions[i] = (row - 87, col - 50)
            else:
                template_positions[i] = (row - 37, col - 13)

        # (height, width) of every template.
        template_sizes = np.array([(105, 60)] * 17)

        # Offset (row_off, col_off) of the marker inside its template.
        template_points = [
            np.array([absolute_points[i][0] - pos[0], absolute_points[i][1] - pos[1]])
            if pos is not None else None
            for i, pos in enumerate(template_positions)
        ]

        # Cut the templates out of the example image, clamped to its bounds.
        templates = [None] * 17
        for i, pos in enumerate(template_positions):
            if pos is None:
                continue
            row, col = pos
            height, width = template_sizes[i]
            r0 = max(0, row)
            c0 = max(0, col)
            r1 = min(ima.shape[0], row + height)
            c1 = min(ima.shape[1], col + width)
            if r1 > r0 and c1 > c0:
                templates[i] = ima[r0:r1, c0:c1].copy()

        self._absolute_points = absolute_points
        self._template_positions = template_positions
        self._template_sizes = template_sizes
        self._template_points = template_points
        self._templates = templates

        if show_templates:
            # Size the grid from the number of slots so that every index has an
            # axis (a fixed 4x4 grid is one short for 17 slots).
            n_cols = 4
            n_rows = -(-len(templates) // n_cols)
            fig, axs = plt.subplots(n_rows, n_cols, figsize=(8, 10))
            axes = np.asarray(axs).ravel()
            for ax in axes:
                ax.axis('off')
            for i, tpl in enumerate(templates):
                if tpl is not None:
                    # convert BGR->RGB for plotting
                    axes[i].imshow(cv2.cvtColor(tpl, cv2.COLOR_BGR2RGB))
                    axes[i].set_title(f"Tpl {i}")
            plt.tight_layout()
            plt.show()

    @staticmethod
    def lead_info(lead: str):
        """Return ``(line, begin_marker, end_marker)`` for a lead name.

        ``line`` is the index of the printed line the lead sits on (begin
        marker index // 5). Besides the 12 leads, 'II-subset' denotes the short
        lead-II segment on the second line.

        Raises:
            KeyError: for an unknown lead name.
        """
        mapping = {
            'I': (0, 1),
            'II-subset': (5, 6),
            'III': (10, 11),
            'aVR': (1, 2),
            'aVL': (6, 7),
            'aVF': (11, 12),
            'V1': (2, 3),
            'V2': (7, 8),
            'V3': (12, 13),
            'V4': (3, 4),
            'V5': (8, 9),
            'V6': (13, 14),
            'II': (15, 16),
        }
        begin, end = mapping[lead]
        return begin // 5, begin, end

    def find_markers(self, ima, warn=False, plot=False, title=''):
        """
        Find the 13 template-based markers and extrapolate the four line-end markers.

        Args:
            ima: image to search, with the same channel layout as the templates.
            warn: print diagnostics (unexpected height, failed or weak matches).
            plot: draw a box around every marker and show the image.
            title: title of the plot.

        Returns:
            List of 17 entries, each a ``(row, col)`` array or ``None`` when the
            marker could not be determined.

        Raises:
            ValueError: if ``ima`` is ``None``.
        """
        if ima is None:
            raise ValueError("Input image is None")
        if warn and ima.shape[0] != 1652:
            # Not fatal: other heights are processed, but the templates were
            # taken from 1652-pixel-high images.
            print(f"Warning: image height {ima.shape[0]} != 1652; marker templates may not match.")

        img_h, img_w = ima.shape[:2]
        markers = [None] * 17
        for j, tpl in enumerate(self._templates):
            tpl_point = self._template_points[j]
            tpl_pos = self._template_positions[j]
            if tpl is None or tpl_point is None or tpl_pos is None:
                continue
            row, col = tpl_pos
            height, width = self._template_sizes[j]
            # Search window around the expected template position.
            # NOTE(review): the asymmetric +250 margin is kept on the first
            # (row) coordinate, where the previous code had it -- confirm the
            # intended axis.
            search_r0 = max(0, row - 100)
            search_c0 = max(0, col - 100)
            search_r1 = min(img_h, row + height + 250)
            search_c1 = min(img_w, col + width + 100)
            search_range = ima[search_r0:search_r1, search_c0:search_c1]
            if search_range.size == 0:
                continue
            try:
                res = cv2.matchTemplate(search_range, tpl, cv2.TM_CCOEFF)
            except Exception as e:
                # Best effort: a marker that cannot be matched stays None.
                if warn:
                    print(f"matchTemplate failed for tpl {j}: {e}")
                continue
            _, max_val, _, max_loc = cv2.minMaxLoc(res)
            # max_loc is (x, y) inside the search window; convert to absolute (row, col).
            abs_row = search_r0 + max_loc[1] + int(tpl_point[0])
            abs_col = search_c0 + max_loc[0] + int(tpl_point[1])
            markers[j] = np.array((abs_row, abs_col))
            if warn and max_val < 3e7:
                print(j, (abs_row, abs_col), max_val)

        # Line ends of the first three lines: continue the step between the two
        # preceding markers.
        for i in range(3):
            idx2 = 5 * i + 2
            idx3 = 5 * i + 3
            idx4 = 5 * i + 4
            if markers[idx2] is not None and markers[idx3] is not None:
                m = markers[idx3] * 2 - markers[idx2]
                markers[idx4] = m.astype(int)

        # End of the fourth line: extrapolate from the ends of lines 2 and 3
        # (reference line spacing 284, distance to the fourth line 260).
        if markers[14] is not None and markers[9] is not None:
            markers[16] = ((markers[14] * (284 + 260) - markers[9] * 260) / 284).astype(int)

        if plot:
            disp = ima.copy()
            # ensure display image is color for drawing
            if disp.ndim == 2:
                disp = cv2.cvtColor(disp, cv2.COLOR_GRAY2BGR)
            for m in markers:
                if m is not None and m.size == 2:
                    # cv2 points are (x, y) = (col, row); clamp the box to the image.
                    tl = (max(0, int(m[1]) - 40), max(0, int(m[0]) - 40))
                    br = (min(disp.shape[1] - 1, int(m[1]) + 40), min(disp.shape[0] - 1, int(m[0]) + 40))
                    cv2.rectangle(disp, tl, br, (255, 0, 0), 2)
            plt.figure(figsize=(10, 6))
            # convert BGR->RGB
            try:
                plt.imshow(cv2.cvtColor(disp, cv2.COLOR_BGR2RGB))
            except Exception:
                plt.imshow(disp)
            plt.title(title)
            plt.axis('off')
            plt.show()

        return markers

    def demo(self, ima, warn=False, title=''):
        """Run ``find_markers`` with plotting enabled and return its markers."""
        markers = self.find_markers(ima, warn=warn, plot=True, title=title)
        return markers


# <div style="color:white;display:inline-block;border-radius:5px;background-color:#cc8800 ;font-family:Nexa;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;font-size:85%;letter-spacing:0.5px;margin:0;border: 6px groove #ffdd11;"><b> </b>Image</p></div>

In [None]:
# Image -> (scanned color)

def find_line_by_topdown_sweep(ima_bool: np.ndarray):
    """
    Find the topmost dark line of a binary image and erase it from the image.

    Args:
        ima_bool: 2-D image, True/non-zero = white, False/zero = black. It is
            modified in place: the found line (and everything above a cut-off
            row) is painted white, so a subsequent call finds the next line.

    Returns:
        ``(top, bottom)``: per column, the row of the topmost black pixel and
        the row of the first white pixel below it. A column without a black
        pixel gets ``top == 0``.
    """
    # astype() would return a copy and silently drop the in-place update, so
    # operate directly on the caller's buffer when it is already boolean and
    # write the result back otherwise.
    ima = ima_bool if ima_bool.dtype == np.bool_ else ima_bool.astype(bool)

    # Row of the first black pixel per column (argmax yields 0 if there is none).
    top = np.argmax(~ima, axis=0)
    # Paint black everything above `top`, so the next white pixel marks the
    # lower edge of the line.
    rows = np.arange(ima.shape[0]).reshape(-1, 1)
    ima &= rows >= top
    bottom = np.argmax(ima, axis=0)

    # Erase: paint white everything above the lower edge, but at least down to
    # 100 rows below the median top.
    bottomx = np.maximum(bottom, int(np.median(top)) + 100)
    mask2 = rows < bottomx
    ima |= mask2
    # Slightly expand the erased area into the neighbouring columns.
    ima[:, :-1] |= mask2[:, 1:]
    ima[:, 1:] |= mask2[:, :-1]

    if ima is not ima_bool:
        ima_bool[...] = ima
    return top, bottom


def get_lead_from_top_bottom(tops, bottoms, lead, number_of_rows, markers, mf: MarkerFinder):
    """
    Turn the traced line edges between two markers into one lead's time series.

    Args:
        tops, bottoms: per printed line, arrays giving the upper and lower edge
            row of the trace for every image column.
        lead: lead name understood by ``mf.lead_info``.
        number_of_rows: number of samples in the returned series.
        markers: list of 17 ``(row, col)`` markers (entries may be ``None``).
        mf: MarkerFinder supplying ``lead_info``.

    Returns:
        Array of ``number_of_rows`` values (pixel heights divided by 80). It is
        all zeros when a marker is missing or the column range is empty.
    """
    def _flat():
        return np.zeros(number_of_rows, dtype=float)

    line_idx, begin_idx, end_idx = mf.lead_info(lead)
    line_top, line_bottom = tops[line_idx], bottoms[line_idx]
    start_marker, stop_marker = markers[begin_idx], markers[end_idx]
    if start_marker is None or stop_marker is None:
        return _flat()

    # Column span of the lead, clamped to the traced width.
    first_col = int(max(0, start_marker[1]))
    last_col = int(min(len(line_top), stop_marker[1]))
    if last_col <= first_col:
        return _flat()

    span = slice(first_col, last_col)
    trace_rows = (line_top[span] + line_bottom[span]) / 2.0
    if trace_rows.size == 0:
        return _flat()

    # Baseline: straight line from the begin marker's row to the end marker's row.
    baseline_rows = np.linspace(start_marker[0], stop_marker[0], last_col - first_col)
    # Height above the baseline, scaled with 80 pixels per unit (1 mV).
    pred = (baseline_rows[:trace_rows.size] - trace_rows) / 80.0

    # Repair samples near the span edges that exceed 0.2 (pixels obscured by a marker).
    marker_at_start = ('aVR', 'aVL', 'aVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6')
    marker_at_end = ('I', 'II-subset', 'III', 'aVR', 'aVL', 'aVF', 'V1', 'V2', 'V3')
    line_start_leads = ('I', 'II-subset', 'III', 'II')
    if lead in marker_at_start and pred.size >= 5:
        head = pred[:4]
        head[head > 0.2] = pred[4]
    if lead in marker_at_end and pred.size >= 6:
        tail = pred[-5:]
        tail[tail > 0.2] = pred[-6]
    if lead in line_start_leads:
        if pred.size >= 2 and 0.9 < pred[0] < 1.1 and pred[1] < 0.5:
            pred[0] = pred[1]

    # Resample to the requested number of samples.
    source_grid = np.linspace(0, 1, pred.size)
    target_grid = np.linspace(0, 1, number_of_rows)
    resampled = np.interp(target_grid, source_grid, pred)

    # Values with magnitude above 0.9 are considered implausible and zeroed.
    return np.where(np.abs(resampled) <= 0.9, resampled, 0.0)


def convert_scanned_color(ima: np.ndarray, markers, n_timesteps: dict, verbose=False, mf: MarkerFinder = None):
    """
    Digitise a scanned colour ECG image into one series per lead.

    Args:
        ima: BGR image; must be more than 400 rows high.
        markers: the 17 markers returned by ``MarkerFinder.find_markers``.
        n_timesteps: mapping lead -> number of samples to produce.
        verbose: show the denoised black/white image.
        mf: MarkerFinder instance (required).

    Returns:
        Dict mapping each lead in ``LEADS`` to its predicted series.

    Raises:
        ValueError: if ``mf`` is missing or the image is not higher than the crop.
    """
    if mf is None:
        raise ValueError("MarkerFinder instance (mf) must be provided.")

    crop_top = 400
    image_height = ima.shape[0]
    if image_height <= crop_top:
        raise ValueError("Image too small to crop top. Got height {}".format(image_height))

    # Threshold channel 2 (red in BGR order) below the crop: True = white background.
    is_white = ima[crop_top:, :, 2] > 160

    # 3x3 majority filter: an interior pixel stays white when at least 7 of the
    # 9 pixels in its neighbourhood are white.
    counts = is_white.astype(np.uint8)
    windows = (slice(None, -2), slice(1, -1), slice(2, None))
    white_neighbours = sum(counts[r, c] for r in windows for c in windows)
    # The filter loses a one-pixel border; pad it back as white.
    denoised = np.pad(white_neighbours >= 7, ((1, 1), (1, 1)), mode='constant', constant_values=True)

    if verbose:
        plt.figure(figsize=(8, 3))
        plt.imshow(denoised, cmap='gray')
        plt.title('Denoised BW')
        plt.axis('off')
        plt.show()

    # Sweep four times from the top; find_line_by_topdown_sweep is expected to
    # erase each found line from working_image (per its docstring).
    tops, bottoms = [], []
    working_image = denoised.copy()
    for _ in range(4):
        line_top, line_bottom = find_line_by_topdown_sweep(working_image)
        tops.append(line_top + crop_top)
        bottoms.append(line_bottom + crop_top)

    # 'II-subset' (the short lead-II segment) gets the sample count of lead I,
    # falling back to that of lead II and finally to 5000.
    sample_counts = dict(n_timesteps)
    sample_counts['II-subset'] = sample_counts.get('I', sample_counts.get('II', 5000))
    preds = {
        lead: get_lead_from_top_bottom(tops, bottoms, lead, int(sample_counts.get(lead, 2000)), markers, mf)
        for lead in LEADS + ['II-subset']
    }

    # Average the short segment into the beginning of the full lead II.
    if 'II' in preds and 'II-subset' in preds:
        overlap = min(len(preds['II-subset']), len(preds['II']))
        preds['II'][:overlap] = (preds['II'][:overlap] + preds['II-subset'][:overlap]) / 2.0
        del preds['II-subset']

    # Enforce the Einthoven and augmented-limb-lead relationships.
    apply_einthoven(preds)
    return preds


def apply_einthoven(preds: dict):
    """Adjust limb-lead predictions in place so the lead identities hold.

    When 'I', 'II' and 'III' are all present, the residual ``I + III - II`` is
    split evenly between the three leads over their common length. When 'aVR',
    'aVL' and 'aVF' are all present, a third of their sum is subtracted from
    each. The arrays stored in ``preds`` are modified; nothing is returned.
    """
    limb = ('I', 'III', 'II')
    if all(name in preds for name in limb):
        n = min(len(preds[name]) for name in limb)
        lead_i, lead_iii, lead_ii = (preds[name] for name in limb)
        # A third of the residual goes to each lead (with opposite sign for II).
        shift = (lead_i[:n] + lead_iii[:n] - lead_ii[:n]) / 3.0
        lead_i[:n] = lead_i[:n] - shift
        lead_iii[:n] = lead_iii[:n] - shift
        lead_ii[:n] = lead_ii[:n] + shift

    augmented = ('aVR', 'aVL', 'aVF')
    if all(name in preds for name in augmented):
        m = min(len(preds[name]) for name in augmented)
        avr, avl, avf = (preds[name] for name in augmented)
        shift_aug = (avr[:m] + avl[:m] + avf[:m]) / 3.0
        avr[:m] = avr[:m] - shift_aug
        avl[:m] = avl[:m] - shift_aug
        avf[:m] = avf[:m] - shift_aug



# <div style="color:white;display:inline-block;border-radius:5px;background-color:#cc8800 ;font-family:Nexa;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;font-size:85%;letter-spacing:0.5px;margin:0;border: 6px groove #ffdd11;"><b> </b>Validation Helper</p></div>

In [None]:
# Validation helper (visual)

def validate_algorithm(train_df, image_types, convert_func, mf: MarkerFinder, base_path):
    """
    Visually validate a conversion function on the first usable training image.

    Rows of ``train_df`` are scanned in order. The first image whose type (the
    four digits before '.png' in the file name) is in ``image_types`` and that
    can be read is converted; its markers, the true vs. predicted lead I and
    the lead-I SNR are shown, and the function returns. Rows without a CSV,
    without images, or without a matching image are skipped.

    Args:
        train_df: DataFrame with an 'id' column and optionally an 'fs' column
            (500 is used when 'fs' is absent).
        image_types: collection of integer image types to accept.
        convert_func: callable ``(ima, markers, n_timesteps, verbose=, mf=)``
            returning a dict lead -> series.
        mf: MarkerFinder used to locate the markers.
        base_path: directory holding one sub-directory per id.
    """
    for _, row in train_df.iterrows():
        csv_path = os.path.join(base_path, str(row.id), f"{row.id}.csv")
        if not os.path.exists(csv_path):
            continue
        try:
            labels = pd.read_csv(csv_path)
        except FileNotFoundError:
            print(f"Warning: CSV not found at {csv_path}. Skipping.")
            continue
        png_paths = sorted(glob(os.path.join(base_path, str(row.id), f"{row.id}-*.png")))
        for path in png_paths:
            try:
                img_type = int(path[-8:-4])
            except ValueError:
                # File name does not end in four digits: unknown type.
                img_type = None
            if img_type not in image_types:
                continue
            try:
                ima = safe_read_image(path)
            except FileNotFoundError:
                continue
            markers = mf.find_markers(ima, plot=True, title=f"Markers: {row.id}")
            # Number of non-missing samples per lead; 5000 for leads absent from the CSV.
            n_timesteps = {lead: int((~labels[lead].isna()).sum()) for lead in LEADS if lead in labels.columns}
            for lead in LEADS:
                n_timesteps.setdefault(lead, 5000)
            preds = convert_func(ima, markers, n_timesteps, verbose=True, mf=mf)

            # plot one lead (I)
            lead_to_plot = 'I'
            plt.figure(figsize=(10, 3))
            if lead_to_plot in labels.columns:
                true_signal = labels[lead_to_plot].dropna().values
            else:
                true_signal = np.zeros(len(preds[lead_to_plot]))
            plt.plot(true_signal, label='True')
            plt.plot(preds[lead_to_plot], label='Predicted')
            plt.title(f'Lead {lead_to_plot} for {row.id}')
            plt.legend()
            plt.show()

            # compute simple SNR for lead I
            fs = int(getattr(row, 'fs', 500))
            aligned = align_signals(true_signal, preds[lead_to_plot], int(fs * MAX_TIME_SHIFT))
            p_s, p_n = compute_power(true_signal, aligned)
            snr = compute_snr(p_s, p_n)
            print(f"SNR for {row.id} lead {lead_to_plot}: {snr:.2f}")
            # One example is enough. Stop only after an image was actually
            # processed, so a first row without a matching image does not end
            # the search empty-handed.
            return





# <div style="color:white;display:inline-block;border-radius:5px;background-color:#cc8800 ;font-family:Nexa;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;font-size:85%;letter-spacing:0.5px;margin:0;border: 6px groove #ffdd11;"><b> </b>Submission</p></div>

In [None]:
# Submission 

def _predict_leads_from_image(record_id, fs, convert_func, mf, base_path):
    """Run the image-based converter for one test record.

    Args:
        record_id: Identifier used to locate ``<base_path>/<record_id>*.png``.
        fs: Sampling frequency used to size the per-lead outputs.
        convert_func: Callable invoked as
            ``convert_func(image, markers, n_timesteps, verbose=False, mf=mf)``.
        mf: Marker finder exposing ``find_markers(image)``.
        base_path: Directory that holds the test images.

    Returns:
        Whatever ``convert_func`` returns (the caller looks leads up with
        ``.get``), or ``None`` when no usable image exists so that the caller
        falls back to the mean model.
    """
    # glob() yields matches in arbitrary order; sort so that "the first image"
    # is the same file on every run.
    # NOTE(review): the pattern also matches ids that merely share this prefix
    # (e.g. id 12 matches 123.png) - confirm the test ids cannot collide.
    png_candidates = sorted(glob(os.path.join(base_path, f"{record_id}*.png")))
    if not png_candidates:
        return None

    try:
        ima = safe_read_image(png_candidates[0])
    except FileNotFoundError:
        return None

    # Only images exactly 1652 px high that pass is_color_image are converted.
    # NOTE(review): presumably the one layout convert_func supports - confirm.
    if ima.shape[0] != 1652 or not is_color_image(ima):
        return None

    markers = mf.find_markers(ima)
    # Lead II gets fs * 10 samples, every other lead a quarter of that.
    n_timesteps = {lead: fs * 10 if lead == 'II' else fs * 10 // 4 for lead in LEADS}
    return convert_func(ima, markers, n_timesteps, verbose=False, mf=mf)


def _fit_prediction_length(pred, expected_len, record_id, lead):
    """Return ``pred`` zero-padded or truncated to exactly ``expected_len`` samples."""
    pred_len = len(pred)
    if pred_len == expected_len:
        return pred

    print(f"ERROR: Prediction length mismatch for id={record_id}, lead={lead}.")
    print(f"  Expected length: {expected_len}, Actual length: {pred_len}")
    if pred_len < expected_len:
        # Too short: pad the tail with zeros.
        return np.concatenate([pred, np.zeros(expected_len - pred_len)])
    # Too long: keep the leading samples.
    return pred[:expected_len]


def create_submission(test_df, mean_dict, convert_func, mf: MarkerFinder, base_path):
    """Build the long-format submission table for every row of ``test_df``.

    Each row of ``test_df`` (one ``id``/``lead`` pair) is expanded into
    ``number_of_rows`` output rows with ids ``"<id>_<t>_<lead>"``. The image of
    an id is converted once and reused while consecutive rows share that id.
    When no image prediction is available the mean model is resampled to the
    requested length; when the lead is unknown to either source, zeros are used.

    Args:
        test_df: DataFrame whose rows provide ``id``, ``lead``,
            ``number_of_rows`` and optionally ``fs`` (defaults to 500).
        mean_dict: Mapping ``lead -> array-like``; the entry is averaged over
            axis 0 before resampling.
            NOTE(review): presumably a (records, samples) array per lead -
            confirm against the function that builds it.
        convert_func: Image-to-signal converter, see
            ``_predict_leads_from_image``.
        mf: Marker finder passed through to ``convert_func``.
        base_path: Directory that holds the test images.

    Returns:
        DataFrame with columns ``id`` (str) and ``value`` (float). NaN and
        infinite predictions are written as 0. An empty ``test_df`` yields an
        empty frame with both columns present.
    """
    ids = []
    values = []
    old_id = None
    preds = None
    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Creating submission"):
        expected_len = int(row.number_of_rows)

        if row.id != old_id:
            old_id = row.id
            fs = int(getattr(row, 'fs', 500))
            # An unreadable image now falls through to the mean model for this
            # row too; previously the row was silently dropped from the output.
            preds = _predict_leads_from_image(row.id, fs, convert_func, mf, base_path)

        if preds is None:
            pred_mean = mean_dict.get(row.lead)
            if pred_mean is None:
                pred = np.zeros(expected_len)
                print(f"Warning: Lead {row.lead} not found in mean_dict. Using zeros.")
            else:
                mean_series = pred_mean.mean(axis=0)
                # Resample the mean trace onto the requested number of samples.
                pred = np.interp(np.linspace(0, 1, expected_len),
                                 np.linspace(0, 1, mean_series.size),
                                 mean_series)
        else:
            pred = preds.get(row.lead)
            if pred is None:
                pred = np.zeros(expected_len)
                print(f"Warning: Lead {row.lead} not found in image predictions. Using zeros.")

        pred = np.asarray(pred, dtype=float)
        pred = _fit_prediction_length(pred, expected_len, row.id, row.lead)
        # The submission must contain finite numbers only.
        pred = np.nan_to_num(pred, nan=0.0, posinf=0.0, neginf=0.0)

        ids.extend(f"{row.id}_{t}_{row.lead}" for t in range(expected_len))
        values.extend(float(v) for v in pred)

    # Building from two columns keeps 'id' and 'value' present even when
    # test_df is empty.
    submission_df = pd.DataFrame({'id': ids, 'value': values})
    submission_df['value'] = submission_df['value'].astype(float)

    print(f"Length: {len(submission_df)}")
    return submission_df


In [None]:
# Example usage, guarded so it only runs when this file is executed as a script

if __name__ == '__main__':
    # End-to-end demo: fit and validate the mean model, try marker detection on
    # one training image, run one visual validation, then write submission.csv.
    # Every step is skipped when the file or object it needs is missing.
    
    TRAIN_CSV = '/kaggle/input/physionet-ecg-image-digitization/train.csv'
    TEST_CSV = '/kaggle/input/physionet-ecg-image-digitization/test.csv'
    TRAIN_BASE = '/kaggle/input/physionet-ecg-image-digitization/train'
    TEST_BASE = '/kaggle/input/physionet-ecg-image-digitization/test'

    # load small portions to demo (guard with exists)
    if not os.path.exists(TRAIN_CSV):
        # Without the training CSV nothing below runs, including the submission.
        print("Train CSV not found at", TRAIN_CSV)
    else:
        train = pd.read_csv(TRAIN_CSV)

        # build mean model on a small subset for speed if dataset present
        # (at most the first 200 training rows)
        subset = train.iloc[:min(200, len(train))]
        mean_dict = fit_mean_model(subset, base_path=TRAIN_BASE)
        # validate on next chunk
        # (rows 200-299, which do not overlap the rows used for fitting)
        if len(train) > 200:
            validate_mean_model(train.iloc[200:300], mean_dict, base_path=TRAIN_BASE)

        # Initialize MarkerFinder (will raise if templates missing)
        try:
            mf = MarkerFinder(show_templates=False, base_path=TRAIN_BASE)
        except FileNotFoundError as e:
            print("MarkerFinder initialization failed:", e)
            # mf = None disables every image-based step below.
            mf = None

        # Demo marker detection if we have an example image
        if mf is not None:
            # The "-0001" image of the first training record is the example.
            example_path = os.path.join(TRAIN_BASE, str(train.iloc[0].id), f"{train.iloc[0].id}-0001.png")
            if os.path.exists(example_path):
                try:
                    ex_img = safe_read_image(example_path)
                    mf.demo(ex_img, warn=False, title=f"Demo {train.iloc[0].id}")
                except FileNotFoundError:
                    # Best effort: the demo is optional, so a failed read is ignored.
                    pass

            # run a single validation example (visual)
            # using the first five training rows and image types 3 and 11
            validate_algorithm(train.iloc[:5], image_types=[3, 11], convert_func=convert_scanned_color, mf=mf, base_path=TRAIN_BASE)

        # If test CSV exists, create (small) submission
        if os.path.exists(TEST_CSV) and mf is not None:
            test = pd.read_csv(TEST_CSV)
            # Only the first 200 rows of test.csv (or fewer) are converted.
            small_test = test.iloc[:min(200, len(test))]

            # Create a dummy solution.csv for testing the `score` function
            # Replace with actual ground truth if available
            # NOTE(review): `solution` is not referenced again within this cell
            # and nothing here writes it to disk.
            solution = small_test[['id', 'fs']].copy()
            solution['value'] = 0  # Replace with ground truth values if available

            submission_df = create_submission(small_test, mean_dict, convert_scanned_color, mf, base_path=TEST_BASE)

            # Written to the current working directory.
            submission_df.to_csv('submission.csv', index=False)
            # NOTE(review): "first 200 rows" refers to rows of test.csv; the
            # submission itself has one row per predicted sample.
            print("submission.csv created (first 200 rows).")
        else:
            print("Test CSV not found or MarkerFinder unavailable; skipping submission creation.")


In [None]:
!head submission.csv