# # ECG Image Digitization — **ML-Hybrid Baseline** (Fast, Deterministic, Competition-Ready)
# 
# **Upgrades vs pure signal baseline**
# - Per-lead morphology via **PCA templates** (learned from beats)
# - **BPM regressor** (RandomForest) using rhythm/spectral features
# - Same deterministic synthesis (tiling + gentle filtering)
# - Optional **Einthoven blending** for limb leads
# - Optional tiny **CNN denoiser** (disabled by default to keep runtime short)
# 
# **I/O** follows the Kaggle PhysioNet ECG Image Digitization dataset structure.
# 
# If this helps, an upvote ⭐ on Kaggle helps others find it! 🙌

In [None]:
# --- Imports, Reproducibility, and Global Config ---
import os, random, math, warnings
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from scipy.signal import butter, sosfiltfilt, find_peaks
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Optional: lightweight CNN denoiser (disabled by default)
try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    TORCH_OK = True
except Exception:
    TORCH_OK = False

# Reproducibility: seed the stdlib and NumPy random generators.
# NOTE(review): assigning PYTHONHASHSEED at runtime is presumably too late to
# affect the already-running interpreter's string hashing — confirm whether it
# is only meant for child processes. Only `random` and NumPy are seeded here.
os.environ["PYTHONHASHSEED"] = "0"
random.seed(0)
np.random.seed(0)

# --- Paths (Kaggle dataset structure) ---
# TRAIN_* are consumed by build_per_lead_assets; TEST_CSV drives prediction.
# TEST_DIR is not referenced by the code visible in this file.
TRAIN_DIR = '/kaggle/input/physionet-ecg-image-digitization/train/'
TRAIN_CSV = '/kaggle/input/physionet-ecg-image-digitization/train.csv'
TEST_DIR  = '/kaggle/input/physionet-ecg-image-digitization/test/'
TEST_CSV  = '/kaggle/input/physionet-ecg-image-digitization/test.csv'

# --- Config ---
# Lead names; used as column names of the per-record training CSVs.
LEADS = ['I','II','III','aVR','aVL','aVF','V1','V2','V3','V4','V5','V6']
# Seconds kept before / after each detected R peak when cutting a beat window.
R_PRE_S, R_POST_S = 0.20, 0.40
# Every beat window is resampled to this many samples.
BEAT_LEN = 360
# Default band-pass corner frequencies (Hz) and Butterworth order for `bandpass`.
BP_LO_HZ, BP_HI_HZ, BP_ORDER = 5.0, 25.0, 2
# Heart rates (beats per minute) swept by `choose_bpm`.
BPM_CANDIDATES = [45, 55, 65, 75, 85, 95, 105]
# Output range that every predicted lead is rescaled into.
MIN_VAL, MAX_VAL = 0.0, 0.09
# Weight given to the lead derived from I/II when blending limb leads (0 disables).
EINTHOVEN_BLEND_W = 0.6
# Ensemble weights, in order: best-BPM tiling, training-median-BPM tiling,
# template stretched to the full output length.
ENSEMBLE_W = np.array([0.5, 0.3, 0.2], dtype=np.float32)
# Not referenced by the code visible in this file.
SAFE_MODE = True

# Feature/Model knobs
PCA_COMPONENTS = 6             # per-lead PCA size
RF_TREES = 200                 # BPM regressor trees
# NOTE(review): the value is True although the surrounding notes describe the
# denoiser as "disabled by default" and advise False for speed — confirm which
# default is intended.
ENABLE_CNN_DENOISER = True    # keep False for speed; set True to train tiny CNN
CNN_EPOCHS = 5                 # very small, to keep runtime reasonable

In [None]:
# --- Utilities ---

def zscore(x):
    """Return *x* as float32, shifted to zero mean and scaled to unit std.

    A non-finite or near-zero standard deviation is replaced by 1e-8 so the
    division never blows up on constant input.
    """
    arr = np.asarray(x, np.float32)
    center = np.mean(arr)
    spread = np.std(arr)
    degenerate = (not np.isfinite(spread)) or spread < 1e-8
    if degenerate:
        spread = 1e-8
    return (arr - center) / spread


def bandpass(x, fs, lo=BP_LO_HZ, hi=BP_HI_HZ, order=BP_ORDER):
    """Zero-phase Butterworth band-pass of *x*; always returns a float32 array.

    Args:
        x: 1-D signal (array-like).
        fs: sampling rate in Hz.
        lo, hi: pass-band edges in Hz.
        order: Butterworth order handed to ``scipy.signal.butter``.

    Returns:
        The filtered signal as float32. The input is returned unfiltered (but
        still as a float32 array) when it has fewer than 32 samples, when the
        normalised band is degenerate, or when the filter rejects the input.
    """
    arr = np.asarray(x)
    if len(arr) < 32:
        # Too short to filter; still hand back a consistent type.
        return arr.astype(np.float32)
    nyq = 0.5 * fs
    # Keep both normalised edges strictly inside (0, 1) as butter() requires.
    lo_n, hi_n = max(lo / nyq, 1e-3), min(hi / nyq, 0.99)
    if hi_n <= lo_n + 1e-3:
        return arr.astype(np.float32)
    sos = butter(order, [lo_n, hi_n], btype='band', output='sos')
    try:
        return sosfiltfilt(sos, arr).astype(np.float32)
    except ValueError:
        # sosfiltfilt raises ValueError when the signal is shorter than its
        # edge padding; fall back to the unfiltered signal (best effort).
        return arr.astype(np.float32)


def apply_lowpass(x, fs, cutoff=15.0, order=2):
    """Zero-phase Butterworth low-pass of *x*; always returns a float32 array.

    Args:
        x: 1-D signal (array-like).
        fs: sampling rate in Hz.
        cutoff: cut-off frequency in Hz (capped at 0.99 of Nyquist).
        order: Butterworth order handed to ``scipy.signal.butter``.

    Returns:
        The filtered signal as float32. The input is returned unfiltered (but
        still as a float32 array) when it has fewer than 32 samples, when the
        normalised cut-off is effectively zero, or when the filter rejects
        the input.
    """
    arr = np.asarray(x)
    if len(arr) < 32:
        # Too short to filter; still hand back a consistent type.
        return arr.astype(np.float32)
    nyq = 0.5 * fs
    wn = min(cutoff / nyq, 0.99)
    if wn <= 1e-3:
        return arr.astype(np.float32)
    sos = butter(order, wn, btype='low', output='sos')
    try:
        return sosfiltfilt(sos, arr).astype(np.float32)
    except ValueError:
        # sosfiltfilt raises ValueError when the signal is shorter than its
        # edge padding; fall back to the unfiltered signal (best effort).
        return arr.astype(np.float32)


def _nextpow2(n):
    return 1 << (int(math.ceil(math.log2(max(1, n)))))


def autocorr_peak_score(y, fs, min_rr_s=0.35, max_rr_s=1.8):
    """Score the periodicity of *y* from its normalised autocorrelation.

    The signal is z-scored, its autocorrelation is computed via a zero-padded
    FFT, and the largest value at lags between ``min_rr_s`` and ``max_rr_s``
    seconds is divided by the zero-lag value.

    Returns:
        A float clipped to [0, 1]; 0.0 for signals shorter than 16 samples or
        when the lag window is empty.
    """
    sig = zscore(y)
    n = len(sig)
    if n < 16:
        return 0.0

    # Pad to at least 2n-1 so the circular FFT product equals the linear
    # autocorrelation for the first n lags.
    nfft = _nextpow2(2 * n - 1)
    spectrum = np.fft.rfft(sig, n=nfft)
    acf = np.fft.irfft(spectrum * np.conj(spectrum), n=nfft)[:n]

    lag_lo = int(min_rr_s * fs)
    lag_hi = int(max_rr_s * fs)
    if lag_hi <= lag_lo:
        return 0.0

    window = acf[lag_lo:lag_hi]
    peak = np.max(window) if window.size else 0
    ratio = peak / (acf[0] + 1e-8)
    return float(np.clip(ratio, 0.0, 1.0))


def soft_minmax_scale(x, lo=MIN_VAL, hi=MAX_VAL):
    """Linearly map *x* so its (NaN-ignoring) min/max land on ``lo``/``hi``.

    Returns:
        A float32 array. Empty input yields a single mid-range value; input
        with non-finite or collapsed extremes yields an array of the same
        shape filled with the mid-range value.
    """
    arr = np.asarray(x, np.float32)
    midpoint = (lo + hi) / 2
    if arr.size == 0:
        return np.full(1, midpoint, np.float32)

    lowest = np.nanmin(arr)
    highest = np.nanmax(arr)
    usable = np.isfinite(lowest) and np.isfinite(highest) and highest > lowest
    if not usable:
        return np.full_like(arr, midpoint, np.float32)

    unit = (arr - lowest) / (highest - lowest)
    return np.clip(lo + unit * (hi - lo), lo, hi).astype(np.float32)


def scale_to_lead_range(x, lead_stat=None, lo=MIN_VAL, hi=MAX_VAL):
    """Optionally clip *x* to a lead's min/max, then min-max scale to [lo, hi].

    Args:
        x: signal to scale.
        lead_stat: optional mapping with 'min' / 'max' entries (missing keys
            default to -0.5 / 0.5); when None no clipping is applied.
        lo, hi: target output range.
    """
    if lead_stat is None:
        return soft_minmax_scale(x, lo, hi)
    floor = lead_stat.get('min', -0.5)
    ceiling = lead_stat.get('max', 0.5)
    return soft_minmax_scale(np.clip(x, floor, ceiling), lo, hi)


def resample_to_length(x, n):
    """Linearly interpolate *x* onto *n* evenly spaced points; returns float32."""
    # Both signals are placed on a common [0, 1] axis so only their lengths differ.
    target_grid = np.linspace(0, 1, n, dtype=np.float32)
    source_grid = np.linspace(0, 1, len(x), dtype=np.float32)
    resampled = np.interp(target_grid, source_grid, x)
    return resampled.astype(np.float32)


def derive_limb_leads_from_I_II(yI, yII):
    """Compute leads III, aVR, aVL and aVF as linear combinations of I and II.

    Returns:
        dict keyed by 'III', 'aVR', 'aVL', 'aVF'.
    """
    return {
        'III': yII - yI,
        'aVR': -(yI + yII) / 2.0,
        'aVL': yI - 0.5 * yII,
        'aVF': yII - 0.5 * yI,
    }


def soft_blend(a, b, w):
    """Return the convex combination ``(1 - w) * a + w * b``.

    ``w = 0`` yields *a*, ``w = 1`` yields *b*. The previous version added the
    bare weight instead of ``w * b``, so *b* never contributed.
    """
    return (1 - w) * a + w * b

In [None]:
# --- Step A: Scan training set to collect stats, beats, and BPM supervision ---

def build_per_lead_assets(train_csv, train_dir, leads=LEADS):
    """Scan the training records and collect per-lead statistics and beats.

    ``train_csv`` is read with pandas and must provide ``id`` and ``fs``
    columns. For each row the signal file ``<train_dir>/<id>/<id>.csv`` is
    loaded; records whose file is missing or unreadable are skipped silently.

    Args:
        train_csv: path of the training metadata CSV.
        train_dir: directory holding one sub-directory per record id.
        leads: lead names looked up as columns in each record CSV.

    Returns:
        A 6-tuple ``(lead_stats, lead_templates_raw, lead_beats,
        lead_bpm_samples, bpm_feats, bpm_targets)``:

        * lead_stats: per lead, dict with 'mean', 'std', 'min', 'max' of all
          pooled raw samples (fixed defaults when the lead was never seen).
        * lead_templates_raw: per lead, trimmed median beat of length
          BEAT_LEN (one sine cycle when no beat was extracted).
        * lead_beats: per lead, list of beat windows resampled to BEAT_LEN.
        * lead_bpm_samples: per lead, list of estimated heart rates.
        * bpm_feats: float32 array of rows [fs, dur_s, ac_sc, var, mad, ratio].
        * bpm_targets: float32 array of the matching heart-rate estimates.
    """
    meta = pd.read_csv(train_csv)
    lead_vals = {ld: [] for ld in leads}
    lead_beats = {ld: [] for ld in leads}
    lead_bpm_samples = {ld: [] for ld in leads}

    # For BPM regressor supervision (features/targets per record/lead)
    bpm_feats, bpm_targets = [], []

    for row in tqdm(meta.itertuples(index=False), total=len(meta), desc="Scan train"):
        rid = str(row.id)
        fs  = int(row.fs)
        csvp = os.path.join(train_dir, rid, f"{rid}.csv")
        if not os.path.exists(csvp):
            continue
        try:
            df = pd.read_csv(csvp)
        except Exception:
            # Best effort: an unreadable record is dropped without a message.
            continue

        for ld in leads:
            if ld not in df.columns:
                continue
            y = df[ld].dropna().to_numpy(np.float32)
            # Leads with fewer than 200 non-NaN samples are ignored entirely.
            if len(y) < 200:
                continue

            # stats pool (raw)
            lead_vals[ld].append(y)

            # R-peak detection on band-passed signal
            y_bp = bandpass(zscore(y), fs)
            # Prominence threshold scales with the IQR (std as fallback), with
            # a floor of 0.08; peaks must be at least 0.28 s apart.
            iqr = np.subtract(*np.percentile(y_bp, [75, 25]))
            scale = iqr if np.isfinite(iqr) and iqr > 0 else np.std(y_bp)
            prominence = max(0.25 * scale, 0.08)
            distance = int(max(0.28 * fs, 1))
            pks, _ = find_peaks(y_bp, distance=distance, prominence=prominence)
            # With fewer than two peaks the lead contributes to the amplitude
            # stats above but to neither the BPM nor the beat collections.
            if len(pks) < 2:
                continue

            # Peak-to-peak intervals in seconds, restricted to (0.3, 2.0).
            rr = np.diff(pks) / float(fs)
            rr = rr[(rr > 0.3) & (rr < 2.0)]
            if rr.size >= 1:
                # The regression target is this peak-detector estimate, clipped
                # to [40, 160] bpm; no externally labelled rate is read here.
                bpm = float(np.clip(60.0 / np.median(rr), 40.0, 160.0))
                lead_bpm_samples[ld].append(bpm)

                # --- BPM features per record/lead ---
                # Autocorr and spectral/simple stats
                y_norm = zscore(y)
                ac_sc = autocorr_peak_score(y_norm, fs)
                dur_s = len(y) / fs
                # NOTE(review): y_norm is already z-scored, so its variance is
                # presumably ~1 for every record — confirm this feature is useful.
                var = float(np.var(y_norm))
                # "mad" here is the mean absolute first difference.
                mad = float(np.mean(np.abs(np.diff(y_norm))))
                # spectral proxy: power in 0.5–15 Hz band vs 15–40 Hz
                def band_power(x, fs, lo, hi):
                    # Sum of squared rFFT magnitudes for lo <= f <= hi (plus 1e-8).
                    X = np.fft.rfft(x)
                    freqs = np.fft.rfftfreq(len(x), d=1.0/fs)
                    m = (freqs>=lo) & (freqs<=hi)
                    return float(np.sum(np.abs(X[m])**2) + 1e-8)
                p_lo = band_power(y_norm, fs, 0.5, 15.0)
                p_hi = band_power(y_norm, fs, 15.0, 40.0)
                ratio = p_lo / (p_hi + 1e-8)
                bpm_feats.append([fs, dur_s, ac_sc, var, mad, ratio])
                bpm_targets.append(bpm)

            # beat windows around R
            n_pre  = int(round(R_PRE_S * fs))
            n_post = int(round(R_POST_S * fs))
            for pk in pks:
                # Inclusive window [pk - n_pre, pk + n_post]; windows that would
                # run past either end of the signal are dropped.
                a, b = pk - n_pre, pk + n_post
                if a < 0 or b >= len(y):
                    continue
                seg = y[a:b+1].astype(np.float32)
                seg_rs = resample_to_length(seg, BEAT_LEN)
                lead_beats[ld].append(seg_rs)

    # Lead stats & trimmed median beat
    lead_stats, lead_templates_raw = {}, {}
    for ld in leads:
        if len(lead_vals[ld]) == 0:
            # Lead never seen in training: fall back to fixed placeholder stats.
            lead_stats[ld] = {'mean':0.0,'std':0.1,'min':-0.5,'max':0.5}
        else:
            vals = np.concatenate(lead_vals[ld]).astype(np.float32)
            lead_stats[ld] = {
                'mean': float(np.mean(vals)),
                'std':  float(np.std(vals) if vals.size>1 else 0.1),
                'min':  float(np.min(vals)),
                'max':  float(np.max(vals))
            }
        if len(lead_beats[ld])>0:
            arr = np.vstack(lead_beats[ld]).astype(np.float32)
            if arr.shape[0] > 5:
                # Trim: keep only beats whose distance to a provisional median
                # beat is below the 90th percentile, then take the median again.
                provisional = np.median(arr, axis=0)
                d = np.linalg.norm(arr - provisional, axis=1)
                keep = d < np.percentile(d, 90)
                arr = arr[keep]
            lead_templates_raw[ld] = np.median(arr, axis=0).astype(np.float32)
        else:
            # No beats extracted: use a single sine cycle as a stand-in template.
            t = np.linspace(0,1,BEAT_LEN,dtype=np.float32)
            lead_templates_raw[ld] = np.sin(2*np.pi*t).astype(np.float32)

    bpm_feats = np.array(bpm_feats, dtype=np.float32)
    bpm_targets = np.array(bpm_targets, dtype=np.float32)
    return lead_stats, lead_templates_raw, lead_beats, lead_bpm_samples, bpm_feats, bpm_targets

In [None]:
# Run the training-set scan and report how much BPM supervision it produced.
print("[1/5] Building assets from train‚Ä¶")
(
    lead_stats,
    lead_templates_raw,
    lead_beats,
    lead_bpms,
    bpm_feats,
    bpm_targets,
) = build_per_lead_assets(TRAIN_CSV, TRAIN_DIR, LEADS)
print("  BPM samples:", dict(zip(lead_bpms, map(len, lead_bpms.values()))))
print("  BPM reg feats:", bpm_feats.shape)

In [None]:
# --- Step B: Per-lead PCA templates ---
# For each lead with enough beats, fit a PCA on the beat matrix and use the
# decoded all-zero code as the template; otherwise fall back to the trimmed
# median beat from Step A. Templates are stored z-scored.
lead_pca = {}
lead_template = {}
min_beats_for_pca = max(20, PCA_COMPONENTS+5)
for ld in LEADS:
    lead_beat_list = lead_beats[ld]
    if len(lead_beat_list) < min_beats_for_pca:
        lead_pca[ld] = None
        lead_template[ld] = zscore(lead_templates_raw[ld])
        continue
    beat_matrix = np.vstack(lead_beat_list).astype(np.float32)
    fitted_pca = PCA(n_components=PCA_COMPONENTS, random_state=0)
    fitted_pca.fit(beat_matrix)
    lead_pca[ld] = fitted_pca
    zero_code = np.zeros(PCA_COMPONENTS, dtype=np.float32)
    lead_template[ld] = zscore(fitted_pca.inverse_transform(zero_code))
print("[2/5] PCA templates built.")

In [None]:
# --- Step C: BPM Regressor (RandomForest) ---
# `rf` stays None when there are fewer than 50 supervised samples; choose_bpm
# then falls back to the plain autocorrelation sweep.
rf = None
if len(bpm_targets) >= 50:
    rf = RandomForestRegressor(n_estimators=RF_TREES, random_state=0, n_jobs=-1)
    rf.fit(bpm_feats, bpm_targets)
    # The reported metrics are in-sample: they use the same rows as the fit.
    pred = rf.predict(bpm_feats)
    fit_r2 = r2_score(bpm_targets, pred)
    fit_mae = mean_absolute_error(bpm_targets, pred)
    print("[3/5] BPM RF: R2=%.3f, MAE=%.2f bpm" % (fit_r2, fit_mae))
else:
    print("[3/5] Not enough BPM supervision, skipping RF.")

# Simple amplitude scaler: per-lead linear model from (std, iqr) -> (clip range).
# It refines the per-lead min/max from training to be a bit more adaptive at test.
# Each lead maps to a (min_model, max_model) pair, or None when too few beats.
lead_scalers = {}
for ld in LEADS:
    lead_scalers[ld] = None
    if len(lead_beats[ld]) < 20:
        continue

    # Thin the beat matrix with a fixed stride of (number of beats // 200).
    beat_matrix = np.vstack(lead_beats[ld])
    stride = max(1, beat_matrix.shape[0] // 200)
    sampled = beat_matrix[::stride]

    Xs = np.array(
        [[np.std(seg), np.subtract(*np.percentile(seg, [75,25]))] for seg in sampled],
        np.float32,
    )
    ys_min = np.array([np.min(seg) for seg in sampled], np.float32)
    ys_max = np.array([np.max(seg) for seg in sampled], np.float32)

    if len(Xs) > 10:
        lr_min = LinearRegression().fit(Xs, ys_min)
        lr_max = LinearRegression().fit(Xs, ys_max)
        lead_scalers[ld] = (lr_min, lr_max)
print("[3.5/5] Per‚Äëlead amplitude scalers ready.")

In [None]:
# --- Step D: Optional tiny CNN denoiser ---
# The class is only defined when torch imported successfully; otherwise `nn`
# does not exist and an unconditional class statement would raise NameError,
# defeating the optional-import guard at the top of the file.
if TORCH_OK:
    class SmallDenoiser(nn.Module):
        """Three-layer 1-D convolutional network (kernel 9, 8 hidden channels).

        Padding of 4 keeps the sequence length unchanged, so the input and
        output both have shape (batch, 1, length).
        """

        def __init__(self):
            super().__init__()
            self.net = nn.Sequential(
                nn.Conv1d(1, 8, 9, padding=4), nn.ReLU(),
                nn.Conv1d(8, 8, 9, padding=4), nn.ReLU(),
                nn.Conv1d(8, 1, 9, padding=4)
            )

        def forward(self, x):
            return self.net(x)
else:
    SmallDenoiser = None

# `denoiser` stays None unless training below actually runs.
denoiser = None
if ENABLE_CNN_DENOISER and TORCH_OK:
    print("[4/5] Training tiny CNN denoiser (quick)‚Ä¶")
    # Seed torch as well: weight initialisation and the batch permutation
    # below would otherwise differ from run to run.
    torch.manual_seed(0)
    # Build a tiny train set: synthesized (noisy) -> true beat
    X_train, Y_train = [], []
    for ld in LEADS:
        beats = lead_beats[ld]
        if len(beats) < 40: continue
        tpl = lead_template[ld]
        # corrupt with mild noise to simulate synthesis artifacts
        for b in beats[:200]:
            x = tpl + 0.15*zscore(np.random.randn(len(tpl)).astype(np.float32))
            X_train.append(x.astype(np.float32))
            Y_train.append(zscore(b).astype(np.float32))
    if len(X_train) > 1000:
        # Add the channel axis expected by Conv1d: (N, 1, length).
        X = torch.tensor(np.stack(X_train)[:,None,:])
        Y = torch.tensor(np.stack(Y_train)[:,None,:])
        denoiser = SmallDenoiser()
        opt = optim.Adam(denoiser.parameters(), lr=1e-3)
        loss_fn = nn.MSELoss()
        denoiser.train()
        for ep in range(CNN_EPOCHS):
            perm = torch.randperm(X.size(0))
            for i in range(0, X.size(0), 64):
                idx = perm[i:i+64]
                xb, yb = X[idx], Y[idx]
                out = denoiser(xb)
                loss = loss_fn(out, yb)
                opt.zero_grad(); loss.backward(); opt.step()
            # The printed loss is that of the last mini-batch of the epoch.
            print(f"  epoch {ep+1}/{CNN_EPOCHS} ‚Äî loss {loss.item():.5f}")
        denoiser.eval()
    else:
        print("  Not enough data for CNN; skipping.")
elif ENABLE_CNN_DENOISER:
    # Requested but torch could not be imported: say so instead of claiming
    # the feature was switched off.
    print("[4/5] CNN denoiser requested but torch is unavailable; skipping.")
else:
    print("[4/5] CNN denoiser disabled (set ENABLE_CNN_DENOISER=True to try).")

In [None]:
# --- Step E: Synthesis helpers (with ML upgrades) ---

def tile_template(template_beat, fs, n_out, bpm, amp=1.0):
    """Repeat one template beat at the given heart rate to fill *n_out* samples.

    The beat is resampled to one beat period at ``fs`` (at least 4 samples),
    tiled, truncated to ``n_out``, z-scored and multiplied by ``amp``.
    """
    period_s = 60.0 / max(bpm, 1e-6)
    beat_samples = max(4, int(round(period_s * fs)))
    cycle = resample_to_length(template_beat, beat_samples)
    n_cycles = int(np.ceil(n_out / len(cycle)))
    tiled = np.tile(cycle, n_cycles)[:n_out]
    return zscore(tiled) * float(amp)


def choose_bpm(template_beat, fs, n_out):
    """Pick the heart rate whose tiled template has the best autocorrelation.

    Without a fitted regressor (module-level ``rf`` is None) only
    BPM_CANDIDATES are swept. With a regressor, features are computed on the
    best provisional tiling, the regressor's prediction (clipped to
    [40, 160] and rounded) is added to the candidate list, and the sweep is
    repeated over the extended list.

    Returns:
        ``(best_bpm, best_y)`` — the winning rate and its tiled signal.
    """

    def sweep(bpm_options):
        # First candidate with the strictly highest score wins.
        top_bpm, top_y, top_score = None, None, -1
        for cand in bpm_options:
            trial = tile_template(template_beat, fs, n_out, cand, amp=1.0)
            score = autocorr_peak_score(trial, fs)
            if score > top_score:
                top_bpm, top_y, top_score = cand, trial, score
        return top_bpm, top_y

    def band_power(sig, rate, f_lo, f_hi):
        # Sum of squared rFFT magnitudes for f_lo <= f <= f_hi (plus 1e-8).
        spectrum = np.fft.rfft(sig)
        freqs = np.fft.rfftfreq(len(sig), d=1.0/rate)
        in_band = (freqs >= f_lo) & (freqs <= f_hi)
        return float(np.sum(np.abs(spectrum[in_band])**2) + 1e-8)

    if rf is None:
        # fallback: pure autocorr sweep
        return sweep(BPM_CANDIDATES)

    # Provisional realisation from which the regressor features are computed.
    seed_bpm, _ = sweep(BPM_CANDIDATES)
    y0 = zscore(tile_template(template_beat, fs, n_out, seed_bpm, amp=1.0))

    # Same feature layout as used for training: [fs, dur, autocorr, var, mad, ratio].
    ac_sc = autocorr_peak_score(y0, fs)
    dur_s = n_out / fs
    var = float(np.var(y0))
    mad = float(np.mean(np.abs(np.diff(y0))))
    p_lo = band_power(y0, fs, 0.5, 15.0)
    p_hi = band_power(y0, fs, 15.0, 40.0)
    ratio = p_lo / (p_hi + 1e-8)
    feat = np.array([[fs, dur_s, ac_sc, var, mad, ratio]], dtype=np.float32)
    bpm_pred = float(np.clip(rf.predict(feat)[0], 40.0, 160.0))

    # Re-run the sweep with the predicted rate added to the fixed candidates.
    candidates = sorted(set(BPM_CANDIDATES + [int(round(bpm_pred))]))
    return sweep(candidates)

In [None]:
# --- Predict Test with Micro-Ensemble + Einthoven blending + optional denoiser ---
# For every test record: synthesise each requested lead from its template,
# optionally blend the limb leads with versions derived from I and II, then
# rescale each lead into [MIN_VAL, MAX_VAL]. Results go into `predictions`,
# keyed by (record id, lead).
print("[5/5] Predicting test‚Ä¶")

test = pd.read_csv(TEST_CSV)
# Group the test rows by record id so all leads of a record are handled together.
# NOTE(review): ids are cast to int here, whereas the training scan uses
# str(row.id) — confirm that test ids are always numeric.
records = {}
for r in test.itertuples(index=False):
    records.setdefault(int(r.id), []).append(r)

predictions = {}

for rid, items in tqdm(records.items(), desc="Records"):
    tmp_store = {}      # lead -> synthesised signal in z-score ("shape") space
    scales_store = {}   # lead -> (fs, number_of_rows) requested for that lead

    for r in items:
        lead = str(r.lead)
        fs   = int(r.fs)
        n    = int(r.number_of_rows)

        # PCA template if available, else median template
        # (an unknown lead name falls back to the lead II median template)
        tpl_beat = lead_template.get(lead, zscore(lead_templates_raw['II']))

        # BPM selection (RF-augmented)
        best_bpm, y_best = choose_bpm(tpl_beat, fs, n)

        # Prior BPM (median from training for this lead) branch; 75 bpm when
        # no training estimate exists for the lead.
        bpm_fixed = float(np.median(lead_bpms.get(lead, [75.0]))) if len(lead_bpms.get(lead, []))>0 else 75.0
        y_fixed   = tile_template(tpl_beat, fs, n, bpm_fixed, amp=1.0)

        # Plain mean template branch = PCA template stretched to n
        y_mean = resample_to_length(tpl_beat, n)

        # Light low-pass
        y_best  = apply_lowpass(y_best, fs, cutoff=15.0, order=2)
        y_fixed = apply_lowpass(y_fixed, fs, cutoff=15.0, order=2)
        y_mean  = apply_lowpass(y_mean, fs,  cutoff=15.0, order=2)

        # Normalize to shape space
        B, F, M = zscore(y_best), zscore(y_fixed), zscore(y_mean)

        # Micro-ensemble: weights are renormalised to sum to 1 before mixing.
        w = ENSEMBLE_W / (np.sum(ENSEMBLE_W) + 1e-8)
        y_syn = (w[0]*B + w[1]*F + w[2]*M).astype(np.float32)

        # Optional CNN denoiser to subtly correct shape
        # (runs only when torch was imported, the flag is on and a model was trained)
        if 'torch' in globals() and ENABLE_CNN_DENOISER and ('denoiser' in globals()) and (denoiser is not None):
            with torch.no_grad():
                # Add batch and channel axes: (1, 1, n).
                inp = torch.tensor(y_syn[None,None,:])
                y_syn = denoiser(inp).numpy().reshape(-1).astype(np.float32)

        tmp_store[lead] = y_syn
        scales_store[lead] = (fs, n)

    # Einthoven blending: only when both I and II were synthesised for this record.
    if EINTHOVEN_BLEND_W > 0.0 and ('I' in tmp_store) and ('II' in tmp_store):
        for dlead in ['III', 'aVR', 'aVL', 'aVF']:
            if dlead not in scales_store:
                continue
            # Bring I and II to the target lead's length before combining them.
            _, n_d = scales_store[dlead]
            yI_rs  = resample_to_length(tmp_store['I'],  n_d)
            yII_rs = resample_to_length(tmp_store['II'], n_d)
            derived_all = derive_limb_leads_from_I_II(yI_rs, yII_rs)
            ydrv = zscore(derived_all[dlead])
            # soft_blend receives (synthesised lead, derived lead, weight).
            tmp_store[dlead] = soft_blend(tmp_store.get(dlead, ydrv), ydrv, EINTHOVEN_BLEND_W)

    # Final scaling per lead with adaptive min/max (if learned), then map to [MIN,MAX]
    for lead, y in tmp_store.items():
        scaler = lead_scalers.get(lead)
        if scaler is not None:
            lr_min, lr_max = scaler
            # Predict a clip range from the signal's std and IQR.
            s = np.std(y); i = np.subtract(*np.percentile(y, [75,25]))
            est_min = float(lr_min.predict([[s,i]])[0])
            est_max = float(lr_max.predict([[s,i]])[0])
            # Order the two estimates in case the models cross over.
            ls = {'min': min(est_min, est_max), 'max': max(est_min, est_max)}
            y_scaled = scale_to_lead_range(y, ls, MIN_VAL, MAX_VAL)
        else:
            # No learned scaler: clip with the pooled training min/max instead.
            y_scaled = scale_to_lead_range(y, lead_stats.get(lead), MIN_VAL, MAX_VAL)
        predictions[(rid, lead)] = y_scaled.astype(np.float32)

# Write submission: one row per (record, sample index, lead), in test.csv order.
rows = []
for r in test.itertuples(index=False):
    rid, lead, n = int(r.id), str(r.lead), int(r.number_of_rows)
    y = predictions[(rid, lead)]
    # Guard against a length mismatch by resampling to the requested row count.
    if len(y) != n:
        y = resample_to_length(y, n)
    rows.extend((f"{rid}_{i}_{lead}", float(y[i])) for i in range(n))

sub = pd.DataFrame(rows, columns=['id','value'])

# Sanity checks before writing: schema, value range and id uniqueness.
assert set(sub.columns) == {'id','value'}
assert sub['value'].between(MIN_VAL-1e-8, MAX_VAL+1e-8).all()
assert sub['id'].nunique() == len(sub)

sub.to_csv('submission.csv', index=False)
print("‚úÖ submission.csv written successfully!")

# Peek
print(sub.head(10).to_string(index=False))