In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import torch

In [None]:
# Tree model setup: CatBoost + group split
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler



# Config

In [None]:
class Config:
    """Notebook-wide settings: data location, training budget, seed and compute device."""
    # Root folder of the competition data; train.csv, test.csv, the sample
    # submission and the train/ directory are all resolved relative to it.
    BASE_DIR = '/kaggle/input/physionet-ecg-image-digitization'
    # Number of boosting iterations handed to CatBoostRegressor.
    ITERATIONS = 5000
    # Seed shared by the train/validation split, CatBoost and `rng` below.
    SEED = 42
    # CUDA when torch can see a GPU, otherwise CPU; also decides the CatBoost task_type.
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Optional cap for quick experiments: only the first LIMIT_IDS record ids are used.
    LIMIT_IDS = None           # None (or 0) keeps every id
    SUBSAMPLE_EVERY = 2        # take every k-th sample within each series
    # FULL_SUBSAMPLE_EVERY = 1   # heavier set; adjust if memory is tight
# Seeded legacy NumPy generator.
# NOTE(review): `rng` is not referenced anywhere else in the visible notebook.
rng = np.random.RandomState(Config.SEED)

In [None]:
# Load the competition tables from Config.BASE_DIR.
# `train` supplies the record ids and sampling rates ('id', 'fs') used below;
# `test` is iterated row by row ('id', 'lead', 'number_of_rows') when predicting.
train = pd.read_csv(Config.BASE_DIR + '/train.csv')
test = pd.read_csv(Config.BASE_DIR + '/test.csv')
# Sample submission is shipped as parquet.
# NOTE(review): `submission` is not used again in the visible notebook.
submission = pd.read_parquet(Config.BASE_DIR + '/sample_submission.parquet')

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and numerical warnings.
warnings.filterwarnings('ignore')
# Folder with one sub-directory per training record; signals are read from
# <TRAIN_DIR>/<id>/<id>.csv.
TRAIN_DIR = Config.BASE_DIR + '/train/'
# The twelve ECG lead names used as CSV column names throughout the notebook.
LEADS = ['I','II','III','aVR','aVL','aVF','V1','V2','V3','V4','V5','V6']

# Metric 

In [None]:
from typing import Tuple

import numpy as np
import pandas as pd

import scipy.optimize
import scipy.signal


# Lead names scored by the metric (same twelve names as the earlier LEADS definition).
LEADS = ['I', 'II', 'III', 'aVR', 'aVL', 'aVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6']
# Largest time shift tried during alignment, in seconds; it is multiplied by the
# sampling frequency to obtain the maximum lag in samples.
MAX_TIME_SHIFT = 0.2
# Cap applied to the per-image signal/noise ratio; its negative is the floor of the final score.
PERFECT_SCORE = 384


class ParticipantVisibleError(Exception):
    """Error raised by the metric code when the scored inputs are malformed."""


def compute_power(label: np.ndarray, prediction: np.ndarray) -> Tuple[float, float]:
    """Return the signal power and the noise power of one prediction.

    Non-finite prediction samples (NaN / inf) are treated as 0 when the noise is
    computed. The caller's ``prediction`` array is left unmodified.

    Args:
        label: 1-D array of reference values.
        prediction: 1-D array of predicted values, same length as ``label``.

    Returns:
        ``(p_signal, p_noise)`` where ``p_signal`` is the sum of squared labels and
        ``p_noise`` the sum of squared differences ``label - prediction``.

    Raises:
        ParticipantVisibleError: if either input is not 1-D, or if ``prediction``
            holds no finite value at all.
    """
    if label.ndim != 1 or prediction.ndim != 1:
        raise ParticipantVisibleError('Inputs must be 1-dimensional arrays.')
    finite_mask = np.isfinite(prediction)
    if not np.any(finite_mask):
        raise ParticipantVisibleError("The 'prediction' array contains no finite values (all NaN or inf).")

    # Zero the non-finite samples on a copy instead of writing into the caller's array.
    cleaned = np.where(finite_mask, prediction, 0)
    noise = label - cleaned
    p_signal = np.sum(label**2)
    p_noise = np.sum(noise**2)
    return p_signal, p_noise


def compute_snr(signal: float, noise: float) -> float:
    """Turn accumulated signal and noise powers into a capped linear ratio.

    Returns PERFECT_SCORE when there is no noise, 0 when there is noise but no
    signal, and otherwise ``signal / noise`` clipped from above at PERFECT_SCORE.
    """
    if noise == 0:
        # Perfect reconstruction
        return PERFECT_SCORE
    if signal == 0:
        return 0
    return min((signal / noise), PERFECT_SCORE)


def align_signals(label: np.ndarray, pred: np.ndarray, max_shift: float = float('inf')) -> np.ndarray:
    """Align a prediction to its label in time and in vertical offset.

    The prediction is shifted by the lag (restricted to ``[-max_shift, max_shift]``
    samples) that maximises the cross-correlation of the mean-centred signals.
    Positions that receive no prediction sample after the shift are filled with
    NaN. A constant vertical offset minimising the squared error against the
    label is then subtracted.

    Args:
        label: reference signal; every value must be finite.
        pred: predicted signal; must contain at least one finite value.
        max_shift: largest absolute lag, in samples, that may be applied.

    Returns:
        A new float64 array holding the shifted, NaN-padded and offset-corrected
        prediction.

    Raises:
        ParticipantVisibleError: if ``label`` has a non-finite value or ``pred``
            has no finite value.
    """
    if np.any(~np.isfinite(label)):
        raise ParticipantVisibleError('values in label should all be finite')
    if np.sum(np.isfinite(pred)) == 0:
        raise ParticipantVisibleError('prediction can not all be infinite')

    # Initialize the reference and digitized signals.
    label_arr = np.asarray(label, dtype=np.float64)
    pred_arr = np.asarray(pred, dtype=np.float64)

    label_mean = np.mean(label_arr)
    pred_mean = np.mean(pred_arr)

    # Remove the means so the correlation peak reflects shape, not DC level.
    label_arr_centered = label_arr - label_mean
    pred_arr_centered = pred_arr - pred_mean

    # Compute the correlation between the reference and digitized signals and locate the maximum correlation.
    correlation = scipy.signal.correlate(label_arr_centered, pred_arr_centered, mode='full')

    n_label = np.size(label_arr)
    n_pred = np.size(pred_arr)

    # Lag (in samples) belonging to each entry of `correlation`; keep only lags within the allowed shift.
    lags = scipy.signal.correlation_lags(n_label, n_pred, mode='full')
    valid_lags_mask = (lags >= -max_shift) & (lags <= max_shift)

    max_correlation = np.nanmax(correlation[valid_lags_mask])
    # Equality is tested over all lags, but ties are resolved towards the smallest |lag|,
    # which always lies inside the allowed window because the maximum was taken there.
    all_max_indices = np.flatnonzero(correlation == max_correlation)
    best_idx = min(all_max_indices, key=lambda i: abs(lags[i]))
    time_shift = lags[best_idx]
    # Positive shift: prepend NaNs; negative shift: drop leading prediction samples.
    start_padding_len = max(time_shift, 0)
    pred_slice_start = max(-time_shift, 0)
    pred_slice_end = min(n_label - time_shift, n_pred)
    end_padding_len = max(n_label - n_pred - time_shift, 0)
    aligned_pred = np.concatenate((np.full(start_padding_len, np.nan), pred_arr[pred_slice_start:pred_slice_end], np.full(end_padding_len, np.nan)))

    # Squared error after removing a candidate vertical offset; NaN-padded positions are ignored.
    def objective_func(v_shift):
        return np.nansum((label_arr - (aligned_pred - v_shift)) ** 2)

    # Only fit the offset when at least one position has both a label and a prediction.
    if np.any(np.isfinite(label_arr) & np.isfinite(aligned_pred)):
        results = scipy.optimize.minimize_scalar(objective_func, method='Brent')
        vertical_shift = results.x
        aligned_pred -= vertical_shift
    return aligned_pred


def _calculate_image_score(group: pd.DataFrame) -> float:
    """Compute the capped signal/noise ratio of one image from its merged rows.

    ``group`` must provide the columns 'fs', 'lead', 'value_true' and
    'value_pred'. Signal and noise powers are accumulated over every lead in
    LEADS after aligning each prediction to its label, then combined once.

    Raises:
        ParticipantVisibleError: if the group mixes sampling frequencies, or if
            lead II does not hold ``fs * 10`` rows.
    """
    fs_values = group['fs'].unique()
    if len(fs_values) != 1:
        raise ParticipantVisibleError('Sampling frequency should be consistent across each ecg')
    sampling_frequency = fs_values[0]

    n_lead_ii_rows = len(group[group['lead'] == 'II'])
    if sampling_frequency != int(n_lead_ii_rows / 10):
        raise ParticipantVisibleError('The sequence_length should be sampling frequency * 10s')

    # Same maximum lag (in samples) for every lead of this image.
    max_lag = int(sampling_frequency * MAX_TIME_SHIFT)

    total_signal = 0
    total_noise = 0
    for lead in LEADS:
        lead_rows = group[group['lead'] == lead]
        label = lead_rows['value_true'].values
        aligned_pred = align_signals(label, lead_rows['value_pred'].values, max_lag)
        lead_signal, lead_noise = compute_power(label, aligned_pred)
        total_signal += lead_signal
        total_noise += lead_noise
    return compute_snr(total_signal, total_noise)


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Compute the mean Signal-to-Noise Ratio (SNR) across multiple ECG leads and images for the PhysioNet 2025 competition.
    For every image the signal and noise powers are summed over all leads and turned into one ratio;
    the ratios are averaged over all unique images and the result is reported in dB (10 * log10),
    floored at -PERFECT_SCORE.
    Args:
        solution: DataFrame with ground truth values. Expected columns: the row id column
            (ids shaped '<image>_<row>_<lead>'), 'fs' and 'value'.
        submission: DataFrame with predicted values. Expected columns: the row id column and 'value'.
        row_id_column_name: The name of the unique identifier column, typically 'id'.
    Returns:
        The final competition score.
    Raises:
        ParticipantVisibleError: if the id column is missing, or 'value' contains NaN or infinity,
            in either frame.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> row_id_column_name = "id"
    >>> solution = pd.DataFrame({'id': ['343_0_I', '343_1_I', '343_2_I', '343_0_III', '343_1_III','343_2_III','343_0_aVR', '343_1_aVR','343_2_aVR',\
    '343_0_aVL', '343_1_aVL', '343_2_aVL', '343_0_aVF', '343_1_aVF','343_2_aVF','343_0_V1', '343_1_V1', '343_2_V1','343_0_V2', '343_1_V2','343_2_V2',\
    '343_0_V3', '343_1_V3', '343_2_V3','343_0_V4', '343_1_V4', '343_2_V4', '343_0_V5', '343_1_V5','343_2_V5','343_0_V6', '343_1_V6','343_2_V6',\
    '343_0_II', '343_1_II','343_2_II', '343_3_II', '343_4_II', '343_5_II','343_6_II', '343_7_II','343_8_II','343_9_II','343_10_II','343_11_II'],\
    'fs': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\
    'value':[0.1,0.3,0.4,0.6,0.6,0.4,0.2,0.3,0.4,0.5,0.2,0.7,0.2,0.3,0.4,0.8,0.6,0.7, 0.2,0.3,-0.1,0.5,0.6,0.7,0.2,0.9,0.4,0.5,0.6,0.7,0.1,0.3,0.4,\
    0.6,0.6,0.4,0.2,0.3,0.4,0.5,0.2,0.7,0.2,0.3,0.4]})
    >>> submission = solution.copy()
    >>> round(score(solution, submission, row_id_column_name), 4)
    25.8433
    >>> submission.loc[0, 'value'] = 0.9 # Introduce some noise
    >>> round(score(solution, submission, row_id_column_name), 4)
    13.6291
    >>> submission.loc[4, 'value'] = 0.3 # Introduce some noise
    >>> round(score(solution, submission, row_id_column_name), 4)
    13.0576

    >>> solution = pd.DataFrame({'id': ['343_0_I', '343_1_I', '343_2_I', '343_0_III', '343_1_III','343_2_III','343_0_aVR', '343_1_aVR','343_2_aVR',\
    '343_0_aVL', '343_1_aVL', '343_2_aVL', '343_0_aVF', '343_1_aVF','343_2_aVF','343_0_V1', '343_1_V1', '343_2_V1','343_0_V2', '343_1_V2','343_2_V2',\
    '343_0_V3', '343_1_V3', '343_2_V3','343_0_V4', '343_1_V4', '343_2_V4', '343_0_V5', '343_1_V5','343_2_V5','343_0_V6', '343_1_V6','343_2_V6',\
    '343_0_II', '343_1_II','343_2_II', '343_3_II', '343_4_II', '343_5_II','343_6_II', '343_7_II','343_8_II','343_9_II','343_10_II','343_11_II'],\
    'fs': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\
    'value':[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]})
    >>> round(score(solution, submission, row_id_column_name), 4)
    -384
    >>> submission = solution.copy()
    >>> round(score(solution, submission, row_id_column_name), 4)
    25.8433

    >>> # test alignment
    >>> label = np.array([0, 1, 2, 1, 0])
    >>> pred = np.array([0, 1, 2, 1, 0])
    >>> aligned = align_signals(label, pred)
    >>> expected_array = np.array([0, 1, 2, 1, 0])
    >>> np.allclose(aligned, expected_array, equal_nan=True)
    True

    >>> # Test 2: Vertical shift (DC offset) should be removed
    >>> label = np.array([0, 1, 2, 1, 0])
    >>> pred = np.array([10, 11, 12, 11, 10])
    >>> aligned = align_signals(label, pred)
    >>> expected_array = np.array([0, 1, 2, 1, 0])
    >>> np.allclose(aligned, expected_array, equal_nan=True)
    True

    >>> # Test 3: Time shift should be corrected
    >>> label = np.array([0, 0, 1, 2, 1, 0., 0.])
    >>> pred = np.array([1, 2, 1, 0, 0, 0, 0])
    >>> aligned = align_signals(label, pred)
    >>> expected_array = np.array([np.nan, np.nan, 1, 2, 1, 0, 0])
    >>> np.allclose(aligned, expected_array, equal_nan=True)
    True

    >>> # Test 4: max_shift constraint prevents optimal alignment
    >>> label = np.array([0, 0, 0, 0, 1, 2, 1]) # Peak is far
    >>> pred = np.array([1, 2, 1, 0, 0, 0, 0])
    >>> aligned = align_signals(label, pred, max_shift=10)
    >>> expected_array = np.array([ np.nan, np.nan, np.nan, np.nan, 1, 2, 1])
    >>> np.allclose(aligned, expected_array, equal_nan=True)
    True

    """
    for df in [solution, submission]:
        if row_id_column_name not in df.columns:
            raise ParticipantVisibleError(f"'{row_id_column_name}' column not found in DataFrame.")
        if df['value'].isna().any():
            raise ParticipantVisibleError('NaN exists in solution/submission')
        if not np.isfinite(df['value']).all():
            raise ParticipantVisibleError('Infinity exists in solution/submission')

    # Keep only the id and the prediction; select the id column by the name the caller
    # passed in, so a column called something other than 'id' works as well.
    submission = submission[[row_id_column_name, 'value']]
    merged_df = pd.merge(solution, submission, on=row_id_column_name, suffixes=('_true', '_pred'))

    # Row ids look like '<image>_<row>_<lead>'; split once and reuse the pieces.
    id_parts = merged_df[row_id_column_name].str.split('_')
    merged_df['image_id'] = id_parts.str[0]
    merged_df['row_id'] = id_parts.str[1].astype('int64')
    merged_df['lead'] = id_parts.str[2]

    # Rows must be in sample order within each lead before alignment.
    merged_df = merged_df.sort_values(by=['image_id', 'row_id', 'lead'])
    image_scores = merged_df.groupby('image_id').apply(_calculate_image_score, include_groups=False)
    return max(float(10 * np.log10(image_scores.mean())), -PERFECT_SCORE)

# Prepare features

In [None]:
# Map id -> fs from train.csv.
# Keys are the record ids as strings and values the sampling rate cast to int, so
# lookups must use str(id). If an id occurs more than once, the last row wins.
FS_MAP = dict(zip(train['id'].astype(str), train['fs'].astype(int)))

In [None]:
def iter_windows(sig: np.ndarray, fs: int, lead: str, crop_mode: str = 'first',
                 step_sec_ii: float = 2.0, step_sec_other: float = 0.5):
    """Generate fixed-length, possibly overlapping windows over ``sig``.

    The window spans 10 s for lead 'II' and 2.5 s for any other lead. A signal
    no longer than one window is handed to ``crop_to_expected_length`` and
    yielded once. Otherwise windows start every ``step_sec_ii`` /
    ``step_sec_other`` seconds (at least one sample, at most one window), and a
    final window anchored at the end of the signal is added when the regular
    grid does not already finish there.
    """
    is_lead_ii = lead == 'II'
    win_len = int(round(fs * (10.0 if is_lead_ii else 2.5)))
    n_samples = len(sig)

    if n_samples <= win_len:
        yield crop_to_expected_length(sig, fs, lead, mode=crop_mode)
        return

    hop = int(round(fs * (step_sec_ii if is_lead_ii else step_sec_other)))
    hop = max(1, min(hop, win_len))

    last_start = n_samples - win_len
    start = 0
    while start <= last_start:
        yield sig[start:start + win_len]
        start += hop

    # ensure the tail is covered
    if last_start % hop:
        yield sig[-win_len:]

In [None]:
def infer_fs_from_df(df: pd.DataFrame) -> int:
    """Guess the sampling rate from the row count of lead 'II'.

    Returns ``int(len(df['II']) / 10)`` when the column exists and has at least
    10 rows (the division by 10 treats the column as a 10-second recording),
    otherwise the fallback value 500.
    """
    if 'II' not in df.columns:
        return 500
    n_samples = len(df['II'])
    if n_samples < 10:
        return 500
    return int(n_samples / 10)

def crop_to_expected_length(sig: np.ndarray, fs: int, lead: str, mode: str = 'first') -> np.ndarray:
    """Pad or crop ``sig`` to the nominal length of its lead.

    The target length is ``round(fs * 10)`` samples for lead 'II' and
    ``round(fs * 2.5)`` for every other lead.

    Args:
        sig: 1-D signal.
        fs: sampling frequency in Hz.
        lead: lead name; only 'II' is treated specially.
        mode: 'center' crops around the middle; any other value keeps the first
            ``target_len`` samples.

    Returns:
        ``sig`` itself when it already has the target length or is empty; a
        right-padded copy (last value repeated) when it is shorter; a slice of
        ``sig`` when it is longer.
    """
    target_len = int(round(fs * (10.0 if lead == 'II' else 2.5)))
    n_samples = len(sig)

    # An empty signal has no edge value to repeat (np.pad(mode='edge') raises on it),
    # so hand it back unchanged and let the caller's minimum-length check drop it.
    if n_samples == target_len or n_samples == 0:
        return sig

    if n_samples < target_len:
        # pad short signals (rare) to keep shapes consistent
        return np.pad(sig, (0, target_len - n_samples), mode='edge')

    # Too long: crop from the middle or from the start.
    start = (n_samples - target_len) // 2 if mode == 'center' else 0
    return sig[start:start + target_len]

# ...existing code...
def build_training_rows(train_df, train_dir, lead_templates, global_stats,
                        subsample_every=1, crop_mode='first',
                        use_windows=True, step_sec_ii=2.0, step_sec_other=0.5,
                        min_len=5):
    """Assemble the per-sample feature matrix and target vector for training.

    For every record in ``train_df`` the signal file
    ``<train_dir>/<id>/<id>.csv`` is read; records whose file is missing or
    unreadable are skipped. Each lead column that has a template or global
    statistics is cut into segments, turned into positional features with
    ``make_series_features`` and subsampled.

    Args:
        train_df: frame with an 'id' column.
        train_dir: root folder of the per-record CSV files.
        lead_templates: mapping lead -> template array.
        global_stats: mapping lead -> dict of summary statistics.
        subsample_every: keep every k-th sample of each segment.
        crop_mode: forwarded to the cropping / windowing helpers.
        use_windows: when True use ``iter_windows``, otherwise a single segment
            from ``crop_to_expected_length``.
        step_sec_ii, step_sec_other: window hop in seconds for lead II / other leads.
        min_len: segments shorter than this are dropped.

    Returns:
        ``(X, y)``: feature DataFrame and float32 target Series with matching
        row order; an empty DataFrame and an empty float32 Series when nothing
        was collected.
    """
    # Leads we can featurise: anything with global statistics or a template.
    # Computed once instead of once per column of every file.
    known_leads = set(global_stats.keys()) | set(lead_templates.keys())

    # With templates and statistics fixed, the features depend only on
    # (n_rows, fs, lead), so the subsampled frame is built once per combination
    # and reused for every record that shares it.
    feature_cache = {}

    rows, y_all = [], []
    for _, meta in train_df.iterrows():
        csv_path = os.path.join(train_dir, str(meta['id']), f"{meta['id']}.csv")
        if not os.path.exists(csv_path):
            continue
        try:
            df = pd.read_csv(csv_path)
        except Exception:
            # Best effort: an unreadable file only costs us that one record.
            continue
        # Prefer the sampling rate from train.csv; fall back to the length of lead II.
        fs = int(FS_MAP.get(str(meta['id']), infer_fs_from_df(df)))
        for lead in [c for c in df.columns if c in known_leads]:
            sig = df[lead].dropna().values.astype(np.float32)

            # Produce training segments
            segments = (iter_windows(sig, fs, lead, crop_mode, step_sec_ii, step_sec_other)
                        if use_windows else [crop_to_expected_length(sig, fs, lead, mode=crop_mode)])

            for seg in segments:
                n_rows = len(seg)
                if n_rows < min_len:
                    continue
                take = np.arange(0, n_rows, subsample_every, dtype=int)
                cache_key = (n_rows, fs, lead)
                feat = feature_cache.get(cache_key)
                if feat is None:
                    feat = (make_series_features(n_rows, fs, lead, lead_templates, global_stats)
                            .iloc[take].reset_index(drop=True))
                    feature_cache[cache_key] = feat
                rows.append(feat)
                y_all.append(pd.Series(seg[take], name='y').reset_index(drop=True))

    if not rows:
        return pd.DataFrame(), pd.Series(dtype=np.float32)
    X = pd.concat(rows, axis=0, ignore_index=True)
    y = pd.concat(y_all, axis=0, ignore_index=True).astype(np.float32)
    return X, y
# ...existing code...

In [None]:
import os
import numpy as np
import pandas as pd
from scipy.signal import butter, filtfilt

# --- Utilities: lead templates, global statistics, filtering ---

def build_lead_templates(train_df, train_dir, leads, template_len=500):
    """Build one average waveform ("template") per lead from the training signals.

    Each usable signal (at least 50 non-NaN samples) is z-normalised, linearly
    resampled to ``template_len`` points and averaged with the other signals of
    the same lead. A lead without any usable signal gets one period of a sine
    wave instead.

    Every CSV file is read once and shared by all leads.

    Args:
        train_df: frame with an 'id' column.
        train_dir: root folder; signals live in ``<train_dir>/<id>/<id>.csv``.
        leads: lead names to build templates for.
        template_len: number of points of every template.

    Returns:
        dict mapping lead -> 1-D array of length ``template_len``.
    """
    grid = np.linspace(0, 1, template_len)
    resampled = {lead: [] for lead in leads}

    for _, row in train_df.iterrows():
        csv_path = os.path.join(train_dir, str(row['id']), f"{row['id']}.csv")
        if not os.path.exists(csv_path):
            continue
        try:
            df = pd.read_csv(csv_path)
        except Exception:
            # Best effort: an unreadable file contributes to no template.
            continue
        for lead, signals in resampled.items():
            if lead not in df.columns:
                continue
            try:
                s = df[lead].dropna().values.astype(np.float32)
                if len(s) < 50:
                    continue
                # Epsilon keeps flat signals from dividing by zero.
                s_norm = (s - s.mean()) / (s.std() + 1e-8)
                signals.append(np.interp(grid, np.linspace(0, 1, len(s_norm)), s_norm))
            except Exception:
                # A column that cannot be converted only drops this lead for this file.
                continue

    templates = {}
    for lead, signals in resampled.items():
        if signals:
            templates[lead] = np.mean(signals, axis=0)
        else:
            # Fallback shape when no training signal is available for the lead.
            templates[lead] = np.sin(2 * np.pi * grid)
    return templates


In [None]:
def compute_global_stats(train_df, train_dir):
    """Compute summary statistics of every numeric signal column over the training files.

    Args:
        train_df: frame with an 'id' column.
        train_dir: root folder; signals live in ``<train_dir>/<id>/<id>.csv``.

    Returns:
        dict mapping column (lead) name -> dict with the float entries 'mean',
        'std', 'median', 'min' and 'max', computed in float32 over all non-NaN
        values of that column across the readable files. 'std' is 0.1 when only
        a single value exists. Columns without any value are absent.
    """
    # Collect per-file arrays and concatenate once per lead instead of growing
    # Python lists of floats.
    chunks = {}
    for _, row in train_df.iterrows():
        csv_path = os.path.join(train_dir, str(row['id']), f"{row['id']}.csv")
        if not os.path.exists(csv_path):
            continue
        try:
            df = pd.read_csv(csv_path)
        except Exception:
            # Best effort: an unreadable file contributes no statistics.
            continue
        for lead in df.columns:
            column = df[lead]
            # Non-numeric columns cannot be summarised and would break the float32 conversion.
            if not pd.api.types.is_numeric_dtype(column):
                continue
            vals = column.dropna().values
            if len(vals) == 0:
                continue
            chunks.setdefault(lead, []).append(vals)

    global_stats = {}
    for lead, parts in chunks.items():
        v = np.concatenate(parts).astype(np.float32)
        global_stats[lead] = dict(
            mean=float(np.mean(v)),
            std=float(np.std(v) if len(v) > 1 else 0.1),
            median=float(np.median(v)),
            min=float(np.min(v)),
            max=float(np.max(v)),
        )
    return global_stats

def lowpass_15hz(x, fs):
    """Zero-phase 2nd-order Butterworth low-pass with a 15 Hz cut-off.

    Inputs shorter than 10 samples are returned untouched. The normalised
    cut-off ``15 / (fs / 2)`` is clipped to 0.99 so that low sampling rates
    still give a valid filter design.
    """
    if len(x) >= 10:
        nyquist = 0.5 * fs
        cutoff = min(15.0 / nyquist, 0.99)
        b, a = butter(2, cutoff, btype='low')
        return filtfilt(b, a, x)
    return x

# --- Feature extraction per (id, lead) series ---

def rolling_mean(x, w):
    """Centred moving average of ``x`` over ``w`` samples (zero-padded at the edges).

    Args:
        x: 1-D sequence of numbers.
        w: window width in samples.

    Returns:
        ``x`` itself when ``w <= 1`` or ``x`` is empty; otherwise an array with
        exactly ``len(x)`` entries, also when the window is wider than the series.
    """
    if w <= 1:
        return x
    n = len(x)
    if n == 0:
        # np.convolve rejects empty input; there is nothing to smooth anyway.
        return x
    k = np.ones(w, dtype=np.float32) / w
    if w <= n:
        return np.convolve(x, k, mode='same')
    # mode='same' returns max(len(x), w) samples, which would lengthen the series
    # when the window is wider than it. Take the centred len(x) samples of the
    # full convolution instead (same centring as mode='same').
    full = np.convolve(x, k, mode='full')
    start = (w - 1) // 2
    return full[start:start + n]

def make_series_features(n_rows, fs, lead, lead_templates, global_stats, template_len=500):
    """Build the per-sample feature table for one (record, lead) series.

    The features only describe the position of a sample inside the series, the
    lead's template resampled to ``n_rows`` points, and lead-level statistics;
    the signal itself is never an input.

    Args:
        n_rows: number of samples in the series.
        fs: sampling frequency in Hz.
        lead: lead name; 'II' is treated as a 10 s series, everything else as 2.5 s.
        lead_templates: mapping lead -> template array; the first template is
            used when ``lead`` has none.
        global_stats: mapping lead -> dict with 'mean', 'std', 'median', 'min',
            'max'; small defaults are used when ``lead`` is missing.
        template_len: accepted for interface compatibility; not used by the body.

    Returns:
        DataFrame with ``n_rows`` rows: positional, phase, template, statistic
        and distance columns, the object column 'lead_cat', and one
        'sin_k'/'cos_k' pair per base frequency below fs / 2.
    """
    # Time axis
    lead_is_ii = lead == 'II'
    duration = 10.0 if lead_is_ii else 2.5
    idx = np.arange(n_rows, dtype=np.int32)
    last_index = max(n_rows - 1, 1)
    i_norm = idx / last_index
    t_sec = i_norm * duration

    # Sin/Cos bases, restricted to frequencies below Nyquist
    nyquist = fs / 2.0
    base_freqs = [f for f in (0.5, 1, 2, 3, 5, 7, 10) if f < nyquist]
    if base_freqs:
        sin_cols = [np.sin(2 * np.pi * f * t_sec) for f in base_freqs]
        cos_cols = [np.cos(2 * np.pi * f * t_sec) for f in base_freqs]
    else:
        # No usable frequency: keep a single all-zero pair so the columns still exist.
        sin_cols = [np.zeros(n_rows)]
        cos_cols = [np.zeros(n_rows)]

    # Beat phase windows, assuming a fixed 0.8 s beat period
    heart_period = 0.8
    phase = np.mod(t_sec, heart_period) / heart_period

    def phase_window(lo, hi):
        return ((phase >= lo) & (phase <= hi)).astype(np.float32)

    p_mask = phase_window(0.00, 0.12)
    qrs_mask = phase_window(0.20, 0.28)
    t_mask = phase_window(0.40, 0.60)

    # Template resampled to the series length, plus its derivatives
    if lead in lead_templates:
        tpl_base = lead_templates[lead]
    else:
        tpl_base = next(iter(lead_templates.values()))
    tpl = np.interp(np.linspace(0, 1, n_rows), np.linspace(0, 1, len(tpl_base)), tpl_base)
    tpl_d1 = np.gradient(tpl)
    tpl_d2 = np.gradient(tpl_d1)

    # Local means of the template and deviations from them
    m5, m11, m31 = (rolling_mean(tpl, w) for w in (5, 11, 31))

    # Band-limited template
    tpl_lp = lowpass_15hz(tpl, fs)
    tpl_lp_d1 = np.abs(np.gradient(tpl_lp))

    # Lead statistics, broadcast to every row
    stats = global_stats.get(lead, dict(mean=0.0, std=0.1, median=0.0, min=-0.1, max=0.1))
    amp_range = stats['max'] - stats['min']
    coeff_var = stats['std'] / (abs(stats['mean']) + 1e-8)

    def constant(value, dtype=np.float32):
        return np.full(n_rows, value, dtype=dtype)

    dist_end = n_rows - 1 - idx

    feat = {
        'i': idx,
        'i_norm': i_norm,
        't_sec': t_sec,
        't_sec2': t_sec**2,
        't_sec3': t_sec**3,
        'is_lead_II': 1.0 if lead_is_ii else 0.0,
        'fs': constant(fs),
        'n_rows': constant(n_rows, np.int32),
        'duration': constant(duration),
        'phase': phase,
        'p_mask': p_mask,
        'qrs_mask': qrs_mask,
        't_mask': t_mask,
        'tpl': tpl,
        'tpl_d1': tpl_d1,
        'tpl_d2': tpl_d2,
        'tpl_m5': m5,
        'tpl_m11': m11,
        'tpl_m31': m31,
        'tpl_dev5': tpl - m5,
        'tpl_dev11': tpl - m11,
        'tpl_dev31': tpl - m31,
        'tpl_lp': tpl_lp,
        'tpl_lp_d1_abs': tpl_lp_d1,
        'lead_mean': constant(stats['mean']),
        'lead_std': constant(stats['std']),
        'lead_median': constant(stats['median']),
        'lead_min': constant(stats['min']),
        'lead_max': constant(stats['max']),
        'lead_amp_range': constant(amp_range),
        'lead_coeff_var': constant(coeff_var),
        'dist_start': idx,
        'dist_end': dist_end,
        'dist_start_norm': idx / last_index,
        'dist_end_norm': dist_end / last_index,
        'lead_cat': constant(lead, object),  # treat as categorical in CatBoost/LGBM
    }

    # Named sin/cos bases, interleaved per frequency
    for k, (sin_col, cos_col) in enumerate(zip(sin_cols, cos_cols)):
        feat[f'sin_{k}'] = sin_col
        feat[f'cos_{k}'] = cos_col

    return pd.DataFrame(feat)




# Train a tree model (CatBoost) with 80/20 split and evaluate with the competition score


In [None]:
# Group-aware 80/20 split on ECG ids
# Unique record ids, optionally truncated to the first Config.LIMIT_IDS for quick runs.
all_ids = train['id'].unique().tolist()
limit_ids = all_ids[:Config.LIMIT_IDS] if Config.LIMIT_IDS else all_ids

# Every id is its own group, so each record lands entirely in train or in validation.
# Only the indices matter here, hence the dummy zeros as X.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=Config.SEED)
train_idx, val_idx = next(gss.split(np.zeros(len(limit_ids)), groups=limit_ids))
train_ids = [limit_ids[i] for i in train_idx]
val_ids = [limit_ids[i] for i in val_idx]
print('Train ids:', len(train_ids), 'Val ids:', len(val_ids))
# Build training matrices (limited subset)


In [None]:

# Templates and global statistics are estimated from the training ids only, so the
# validation records do not influence the features.
lead_templates = build_lead_templates(train[train['id'].isin(train_ids)], TRAIN_DIR, LEADS, template_len=500)
global_stats   = compute_global_stats(train[train['id'].isin(train_ids)], TRAIN_DIR)


In [None]:

# Feature matrices / targets for both folds; both reuse the training-fold templates and
# statistics and keep every Config.SUBSAMPLE_EVERY-th sample of each series.
X_train, y_train = build_training_rows(train[train['id'].isin(train_ids)], TRAIN_DIR, lead_templates, global_stats, subsample_every=Config.SUBSAMPLE_EVERY)
X_val, y_val     = build_training_rows(train[train['id'].isin(val_ids)], TRAIN_DIR, lead_templates, global_stats, subsample_every=Config.SUBSAMPLE_EVERY)
print('X_train:', X_train.shape, 'y_train:', y_train.shape)
print('X_val:', X_val.shape, 'y_val:', y_val.shape)


In [None]:
# Sanity check: total number of NaN cells in the feature matrices (expected to be 0).
print('Train NaN:', X_train.isna().sum().sum(), 'Val NaN:', X_val.isna().sum().sum())
# Same check for the targets.
print('y_train NaN:', y_train.isna().sum(), 'y_val NaN:', y_val.isna().sum())

In [None]:
# One-hot encode categorical columns and align columns
# NOTE(review): this cell overwrites X_train / X_val in place, so re-running it on its
# own encodes and scales already-scaled data; re-run the matrix-building cell first.
cat_cols = [c for c in X_train.columns if X_train[c].dtype == 'object']
X_train = pd.get_dummies(X_train, columns=cat_cols, dtype=np.float32)
X_val   = pd.get_dummies(X_val,   columns=cat_cols, dtype=np.float32)

# Align columns: the training layout is the reference; dummy columns missing from the
# validation fold are added as zeros and columns unknown to training are dropped.
DUMMY_COLUMNS = X_train.columns
X_val = X_val.reindex(columns=DUMMY_COLUMNS, fill_value=0)

# Scale features: the scaler is fitted on the training fold only and reused for
# validation and for prediction.
SCALER = StandardScaler()
X_train = pd.DataFrame(SCALER.fit_transform(X_train), columns=DUMMY_COLUMNS).astype(np.float32)
X_val   = pd.DataFrame(SCALER.transform(X_val),   columns=DUMMY_COLUMNS).astype(np.float32)

print('After dummies+scaling -> X_train:', X_train.shape, 'X_val:', X_val.shape)

# Identify categorical feature names for reference (now encoded)
print('Encoded categorical columns:', [c for c in DUMMY_COLUMNS if 'lead_cat_' in c])
X_train = X_train.reset_index(drop=True)
X_val   = X_val.reset_index(drop=True)


In [None]:

# CatBoost regressor trained on the per-sample features; runs on GPU when
# Config.DEVICE is not the CPU device.
# NOTE(review): od_type / od_wait configure the overfitting detector, which presumably
# needs an eval_set to act on; with the eval_set variant of fit() commented out below,
# all Config.ITERATIONS trees are likely always built - confirm against the CatBoost docs.
model = CatBoostRegressor(
    loss_function='RMSE',
    learning_rate=0.01,
    depth=6,
    l2_leaf_reg=3.0,
    iterations=Config.ITERATIONS,
    random_seed=Config.SEED,
    od_type='Iter',
    od_wait=50,
    verbose=200,
    task_type="GPU" if Config.DEVICE != torch.device("cpu") else "CPU",
    
)
# model.fit(X_train, y_train,eval_set=(X_val, y_val))
# Fit on the training fold only; the validation fold is used solely for scoring later.
model.fit(X_train, y_train)
# Validate with competition score on the 20% holdout

def infer_fs_from_df(df: pd.DataFrame) -> int:
    """Guess the sampling rate as a tenth of the row count of lead 'II'.

    Falls back to 500 when the frame has no 'II' column or fewer than 10 rows.
    NOTE(review): this redefines (and shadows) the identically named helper
    from the feature-preparation cell; both behave the same.
    """
    usable = 'II' in df.columns and len(df['II']) >= 10
    return int(len(df['II']) / 10) if usable else 500



In [None]:

def build_solution_and_submission_for_ids(id_list, model):
    """Build metric-ready ground-truth and prediction frames for a list of record ids.

    For every id the signal file ``<TRAIN_DIR>/<id>/<id>.csv`` is read (missing
    or unreadable files are skipped). For each lead in LEADS the non-NaN samples
    form the ground truth, and the model is evaluated on features preprocessed
    with the training artifacts (DUMMY_COLUMNS, SCALER). Row ids follow the
    '<id>_<row>_<lead>' scheme expected by ``score``.

    NOTE(review): unlike the test-set prediction cell, no 15 Hz low-pass is
    applied to the predictions here, so the validation score describes the
    unfiltered model output.

    Args:
        id_list: iterable of record ids.
        model: fitted estimator exposing ``predict``.

    Returns:
        ``(solution, submission)``: DataFrames with the columns
        ['id', 'fs', 'value'] and ['id', 'value'] respectively; both are empty
        (but keep those columns) when nothing could be collected.
    """
    solution_parts = []
    submission_parts = []
    for image_id in id_list:
        csv_path = os.path.join(TRAIN_DIR, str(image_id), f"{image_id}.csv")
        if not os.path.exists(csv_path):
            continue
        try:
            df_ecg = pd.read_csv(csv_path)
        except Exception:
            # Best effort: an unreadable file only drops that record.
            continue
        fs = infer_fs_from_df(df_ecg)
        for lead in LEADS:
            if lead not in df_ecg.columns:
                continue
            y_true = df_ecg[lead].dropna().values.astype(np.float32)  # Drop NaN values
            n_rows = len(y_true)
            if n_rows < 2:
                # make_series_features differentiates the template with np.gradient,
                # which needs at least two samples.
                continue
            feats = make_series_features(n_rows, fs, lead, lead_templates, global_stats)

            # Preprocess: one-hot encode object columns and scale using training artifacts
            obj_cols = [c for c in feats.columns if feats[c].dtype == 'object']
            feats_proc = pd.get_dummies(feats, columns=obj_cols, dtype=np.float32)
            feats_proc = feats_proc.reindex(columns=DUMMY_COLUMNS, fill_value=0)
            feats_scaled = pd.DataFrame(SCALER.transform(feats_proc), columns=DUMMY_COLUMNS).astype(np.float32)

            # Report (but do not stop on) NaN features
            na = feats_scaled.isna().sum()
            if na.sum() > 0:
                print(f"Warning: NaN in features for {image_id} lead {lead}:")
                print(na[na > 0])

            y_pred = model.predict(feats_scaled)
            if len(y_pred) != n_rows:
                print(f"Warning: prediction length mismatch for {image_id} lead {lead}: expected {n_rows}, got {len(y_pred)}")
                continue
            if np.any(np.isnan(y_pred)):
                print(f"Warning: NaN in predictions for {image_id} lead {lead}")

            # One frame per series instead of one dict per sample.
            row_ids = [f"{image_id}_{i}_{lead}" for i in range(n_rows)]
            solution_parts.append(pd.DataFrame({
                'id': row_ids,
                'fs': fs,
                'value': y_true.astype(np.float64),
            }))
            submission_parts.append(pd.DataFrame({
                'id': row_ids,
                'value': np.asarray(y_pred, dtype=np.float64),
            }))

    if not solution_parts:
        return pd.DataFrame(columns=['id', 'fs', 'value']), pd.DataFrame(columns=['id', 'value'])
    return (pd.concat(solution_parts, ignore_index=True),
            pd.concat(submission_parts, ignore_index=True))

# Score the hold-out records with the competition metric: build the ground-truth and
# prediction frames for the validation ids, then evaluate them.
sol_val, sub_val = build_solution_and_submission_for_ids(val_ids, model)
print('Validation rows:', sol_val.shape, sub_val.shape)
val_score = score(sol_val, sub_val, row_id_column_name='id')
print('Validation competition score:', val_score)

## Make submission

In [None]:
# Predict on test set and write submission.csv
# NOTE(review): the sample submission is read from a parquet file earlier in the
# notebook; confirm that the competition accepts a CSV submission.
print('Predicting on test set and writing submission.csv ...')

# One pass over the test rows: predict each (id, lead) series and immediately build
# its block of submission rows, instead of a second loop creating one dict per sample.
predictions = {}
submission_data = []
for _, row in test.iterrows():
    base_id = row['id']
    lead    = row['lead']
    n_rows  = int(row['number_of_rows'])
    fs      = int(row.get('fs', 500))

    feats = make_series_features(n_rows, fs, lead, lead_templates, global_stats)

    # Preprocess test features with the artifacts fitted on the training split
    # (DUMMY_COLUMNS for the column layout, SCALER for standardisation)
    obj_cols = [c for c in feats.columns if feats[c].dtype == 'object']
    feats_proc = pd.get_dummies(feats, columns=obj_cols, dtype=np.float32)
    feats_proc = feats_proc.reindex(columns=DUMMY_COLUMNS, fill_value=0)
    feats_scaled = pd.DataFrame(SCALER.transform(feats_proc), columns=DUMMY_COLUMNS).astype(np.float32)

    y_pred = model.predict(feats_scaled)
    # Optional post-filtering for smoother morphology (15 Hz low-pass)
    y_pred = lowpass_15hz(y_pred, fs) if len(y_pred) >= 10 else y_pred
    sig = y_pred.astype(np.float32)
    predictions[(base_id, lead)] = sig

    submission_data.append(pd.DataFrame({
        'id': [f"{base_id}_{i}_{lead}" for i in range(n_rows)],
        'value': sig.astype(np.float64),
    }))

# pd.concat refuses an empty list, so fall back to an empty frame with the right columns.
submission_df = (pd.concat(submission_data, ignore_index=True) if submission_data
                 else pd.DataFrame(columns=['id', 'value']))
submission_df.to_csv('submission.csv', index=False)
submission_df.head()

In [None]:
# Sanity check: (number of predicted samples, number of columns) of the written submission.
submission_df.shape