# WARNING: Do not run this notebook expecting a high score. It is an educational exercise that deliberately uses bad assumptions. Check other posts for good scores.

# Simplified ECG Digitization

Small Challenge: remove the bad assumptions and fix the model! For fun.

## Import libraries

Loading basic packages for image processing and data handling

## huge thanks to seowoohyeon's public code release! (https://www.kaggle.com/code/seowoohyeon/physionet-adjust)

In [None]:
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

## Configuration

Setting up constants for ECG leads and image processing

In [None]:
# ECG lead names: I, II, III, aVR, aVL, aVF, followed by V1 through V6
LEADS = ['I', 'II', 'III', 'aVR', 'aVL', 'aVF'] + [f'V{n}' for n in range(1, 7)]

# Image parameters (deliberately wrong values, chosen to hurt performance)
# Number of pixel rows dropped from the top of every image (wrong crop value)
CROP_TOP = 300
# Divisor that turns a pixel offset into millivolts (wrong conversion factor).
# NOTE: despite the name, the value is used as "pixels per mV" because the
# pixel offset is divided by it.
MV_PER_PIXEL = 100

## Simple marker detection

Markers are placed at fixed, hard-coded positions; no template matching or search is actually performed

In [None]:
class SimpleMarkerFinder:
    """Report a fixed set of marker positions without inspecting the image.

    This is the deliberately naive stand-in for a real marker search: the
    positions are constants, so every image gets the same answer.
    """

    # Only the first 15 entries of ``markers`` (the 3 x 5 grid) are ever
    # reported by find_markers(); the remaining entries come back as None.
    _NUM_GRID_MARKERS = 15

    def __init__(self):
        # No reference image is loaded: the positions below are constants and
        # do not depend on any image content, so reading one would only cost
        # I/O and tie construction to a file that may not exist.

        # Fixed 3 x 5 grid stored as (y, x) tuples. The outer step moves x and
        # the inner step moves y, so entries 5*i .. 5*i+4 all share one x.
        self.markers = [
            (100 + 500 * j, 700 + 280 * i)
            for i in range(3)
            for j in range(5)
        ]

        # Two extra positions appended after the grid (indices 15 and 16).
        self.markers.append((100, 1500))
        self.markers.append((2100, 1500))

    def find_markers(self, img):
        """Return one entry per stored marker, ignoring *img* entirely.

        Args:
            img: accepted for interface compatibility; never read.

        Returns:
            A list as long as ``self.markers``. Entries for the grid markers
            are ``np.array([y, x])``; every later entry is ``None``.
        """
        return [
            np.array(position) if index < self._NUM_GRID_MARKERS else None
            for index, position in enumerate(self.markers)
        ]

## Naive signal extraction

Extract the ECG trace by taking the darkest pixel in each column, without smoothing or corrections

In [None]:
def get_trace_simple(img, row_idx, start_x, end_x, crop_top=None):
    """Trace the darkest pixel of every column inside one band of the image.

    The image is cropped at the top, reduced to a single channel, and split
    into four equal-height horizontal bands. Within the selected band and
    column range, the row offset of the minimum-intensity pixel is taken for
    each column. No smoothing or outlier handling is applied.

    Args:
        img: 2-D (grayscale) or 3-D (multi-channel) image array. For 3-D
            input only channel 0 is used (the blue channel if the image was
            loaded with cv2.imread — deliberately naive).
        row_idx: index of the band to read; 0 is the top band of the cropped
            image.
        start_x: first column of the slice (inclusive).
        end_x: last column of the slice (exclusive).
        crop_top: number of rows to discard from the top of the image.
            ``None`` (the default) uses the module-level ``CROP_TOP``.

    Returns:
        A 1-D array with one entry per column: the row offset, relative to
        the top of the band, of the first darkest pixel. An empty float array
        is returned when the column slice is empty.

    Raises:
        ValueError: if the band has at least one column but zero rows
            (for example when the image is not taller than the crop).
    """
    if crop_top is None:
        crop_top = CROP_TOP

    cropped = img[crop_top:, :]

    # Crude grayscale conversion: keep channel 0 and ignore the others.
    gray = cropped[:, :, 0] if cropped.ndim == 3 else cropped

    # Naive layout assumption: four bands of identical height.
    band_height = gray.shape[0] // 4
    band = gray[row_idx * band_height:(row_idx + 1) * band_height, start_x:end_x]

    if band.shape[1] == 0:
        # Nothing to trace; callers treat an empty result as "no signal".
        return np.array([])

    # One vectorised reduction replaces a Python-level loop over columns.
    return np.argmin(band, axis=0)

## Bad baseline correction

Remove baseline using wrong assumption of linear drift

In [None]:
def remove_baseline_wrong(signal, start_y, end_y, mv_per_pixel=None, clip_range=(-0.5, 0.5)):
    """Subtract a straight-line baseline from a pixel trace and scale to mV.

    The baseline is assumed to run linearly from ``start_y`` to ``end_y``
    across the trace (usually a wrong assumption). The trace is subtracted
    from the baseline, so smaller row values end up as larger voltages.

    Args:
        signal: 1-D sequence of row positions, one per column.
        start_y: baseline value at the first sample.
        end_y: baseline value at the last sample.
        mv_per_pixel: divisor applied to the pixel offset. ``None`` (the
            default) uses the module-level ``MV_PER_PIXEL``. Because the
            offset is divided by it, the value acts as "pixels per mV".
        clip_range: ``(low, high)`` limits applied to the result, or ``None``
            to skip clipping. The default reproduces the original hard-coded
            +/-0.5 clip, which discards larger deflections.

    Returns:
        Float array of the same length as ``signal``.

    Raises:
        ValueError: if the divisor is zero.
    """
    if mv_per_pixel is None:
        mv_per_pixel = MV_PER_PIXEL
    if mv_per_pixel == 0:
        raise ValueError("mv_per_pixel must be non-zero")

    # Linear baseline between the two endpoint heights.
    baseline = np.linspace(start_y, end_y, len(signal))

    signal_mv = (baseline - signal) / mv_per_pixel

    if clip_range is not None:
        low, high = clip_range
        signal_mv = np.clip(signal_mv, low, high)

    return signal_mv

## Process single lead

Extract one ECG lead without sophisticated processing

In [None]:
# Hard-coded layout (wrong for many images):
# lead name -> (band index, start marker index, end marker index)
_LEAD_LAYOUT = {
    'I': (0, 0, 1), 'II': (0, 5, 6), 'III': (0, 10, 11),
    'aVR': (1, 1, 2), 'aVL': (1, 6, 7), 'aVF': (1, 11, 12),
    'V1': (2, 2, 3), 'V2': (2, 7, 8), 'V3': (2, 12, 13),
    'V4': (3, 3, 4), 'V5': (3, 8, 9), 'V6': (3, 13, 14),
}

# (y, x) positions substituted when a marker is missing or None.
_DEFAULT_START_MARKER = (500, 700)
_DEFAULT_END_MARKER = (500, 1000)


def _marker_or_default(markers, index, default):
    """Return ``markers[index]`` if it exists and is not None, else *default*."""
    if index < len(markers) and markers[index] is not None:
        return markers[index]
    return default


def extract_lead_simple(img, lead_name, markers, num_samples):
    """Extract one lead from the image and resample it to a fixed length.

    The lead's band and its start/end markers come from a hard-coded table.
    The trace between the two marker x-coordinates is read with
    ``get_trace_simple``, converted with ``remove_baseline_wrong`` using the
    marker y-coordinates as baseline endpoints, and linearly interpolated to
    ``num_samples`` points. No alignment or filtering is applied.

    Args:
        img: image array passed through to ``get_trace_simple``.
        lead_name: lead to extract, e.g. ``'II'``.
        markers: sequence of ``(y, x)`` positions; entries may be ``None``
            and the sequence may be shorter than the indices in the table.
        num_samples: length of the returned signal.

    Returns:
        float32 array of length ``num_samples``. All zeros when the lead name
        is unknown or when the extracted trace is empty (for example when
        both markers have the same x-coordinate).
    """
    layout = _LEAD_LAYOUT.get(lead_name)
    if layout is None:
        # float32 here as well, so every return path has the same dtype.
        return np.zeros(num_samples, dtype=np.float32)

    band, start_idx, end_idx = layout
    start_marker = _marker_or_default(markers, start_idx, _DEFAULT_START_MARKER)
    end_marker = _marker_or_default(markers, end_idx, _DEFAULT_END_MARKER)

    # Marker layout is (y, x): x bounds the column slice, y anchors the baseline.
    signal = get_trace_simple(img, band, int(start_marker[1]), int(end_marker[1]))
    signal_mv = remove_baseline_wrong(signal, start_marker[0], end_marker[0])

    if len(signal_mv) == 0:
        return np.zeros(num_samples, dtype=np.float32)

    # Plain linear resampling onto the requested number of samples.
    x_old = np.linspace(0, 1, len(signal_mv))
    x_new = np.linspace(0, 1, num_samples)
    return np.interp(x_new, x_old, signal_mv).astype(np.float32)

## Build mean model

Calculate average signal per lead from training data

In [None]:
def create_mean_predictions(
    train_df,
    train_dir='/kaggle/input/physionet-ecg-image-digitization/train',
    target_len=10000,
):
    """Build a per-lead mean signal from the training label files.

    For every id in ``train_df`` the file ``{train_dir}/{id}/{id}.csv`` is
    read. Each column of that file is treated as one lead: NaNs are dropped,
    the remaining values are linearly resampled to ``target_len`` points, and
    the result is collected. Finally all collected signals of a lead are
    averaged point by point.

    Args:
        train_df: DataFrame with an ``id`` column naming the records.
        train_dir: directory that contains one sub-directory per record.
        target_len: common length every signal is resampled to before
            averaging.

    Returns:
        The ``defaultdict`` mapping each column name seen in the label files
        to a 1-D array of length ``target_len``.
    """
    means = defaultdict(list)
    skipped = 0

    print("Building mean model...")
    # Iterate the id column directly: DataFrame.iterrows() coerces each row to
    # a single dtype, which can turn an integer id into a float and corrupt
    # the file path built from it.
    for record_id in tqdm(train_df['id'], total=len(train_df)):
        labels_file = f'{train_dir}/{record_id}/{record_id}.csv'
        try:
            labels = pd.read_csv(labels_file)
            for lead in labels.columns:
                vals = labels[lead].dropna().values
                if len(vals) > 0:
                    # Resample to the common fixed length
                    resampled = np.interp(
                        np.linspace(0, len(vals) - 1, target_len),
                        np.arange(len(vals)),
                        vals
                    )
                    means[lead].append(resampled)
        except Exception:
            # Best effort: an unreadable or malformed file must not abort the
            # whole build. Unlike a bare ``except`` this lets KeyboardInterrupt
            # and SystemExit through, and the failure is counted, not hidden.
            skipped += 1

    if skipped:
        print(f"Skipped {skipped} of {len(train_df)} label files that could not be processed")

    # Average all signals collected for each lead
    for lead in means:
        means[lead] = np.mean(np.stack(means[lead]), axis=0)

    return means

## Process test images

Extract predictions for test set with fallback to mean model

In [None]:
def _mean_fallback(mean_model, lead_name, num_rows):
    """Resample the stored mean signal of *lead_name* to *num_rows* samples.

    A lead missing from ``mean_model`` falls back to an all-zero signal.
    """
    mean_signal = mean_model.get(lead_name, np.zeros(10000))
    return np.interp(
        np.linspace(0, 1, num_rows),
        np.linspace(0, 1, len(mean_signal)),
        mean_signal
    )


def process_test_image(img_path, lead_name, num_rows, mean_model, marker_finder):
    """Predict one lead for one test image, falling back to the mean model.

    Args:
        img_path: path of the image to read with ``cv2.imread``.
        lead_name: lead to extract.
        num_rows: number of samples the prediction must contain.
        mean_model: mapping from lead name to a mean signal array, used
            whenever extraction is not possible.
        marker_finder: object exposing ``find_markers(img)``.

    Returns:
        1-D array of length ``num_rows``. The extracted signal is returned
        only when extraction succeeds and the result is neither all zeros nor
        contains NaN; otherwise the resampled mean signal is returned.
    """
    img = cv2.imread(img_path)

    if img is None:
        # The image could not be read.
        return _mean_fallback(mean_model, lead_name, num_rows)

    try:
        markers = marker_finder.find_markers(img)
        prediction = extract_lead_simple(img, lead_name, markers, num_rows)
        # All zeros or any NaN marks a failed extraction.
        usable = not (np.all(prediction == 0) or np.any(np.isnan(prediction)))
    except Exception:
        # Any extraction error is treated as a failure. ``Exception`` rather
        # than a bare ``except`` so Ctrl-C and SystemExit still propagate.
        usable = False

    if not usable:
        return _mean_fallback(mean_model, lead_name, num_rows)

    return prediction

## Main execution

Load data, process images, create submission

In [None]:
# Load data
print("Loading data...")
train_data = pd.read_csv('/kaggle/input/physionet-ecg-image-digitization/train.csv')
test_data = pd.read_csv('/kaggle/input/physionet-ecg-image-digitization/test.csv')

# Create mean model (fallback used whenever extraction fails)
mean_model = create_mean_predictions(train_data)

# Initialize marker finder
marker_finder = SimpleMarkerFinder()

# Process test images
print("Processing test images...")
submission_rows = []
current_image_id = None
current_predictions = {}

# itertuples() keeps each column's dtype (iterrows() coerces a row to one
# common dtype), so ids and row counts stay as they were read.
for test_row in tqdm(test_data.itertuples(index=False), total=len(test_data)):
    # Start a fresh cache whenever the image changes
    if test_row.id != current_image_id:
        img_path = f"/kaggle/input/physionet-ecg-image-digitization/test/{test_row.id}.png"
        current_image_id = test_row.id
        current_predictions = {}

    num_rows = int(test_row.number_of_rows)

    # The prediction length depends on the requested row count, so the cache
    # key includes it; keying by lead alone could hand back a cached signal of
    # the wrong length.
    cache_key = (test_row.lead, num_rows)
    if cache_key not in current_predictions:
        current_predictions[cache_key] = process_test_image(
            img_path,
            test_row.lead,
            num_rows,
            mean_model,
            marker_finder
        )

    prediction = current_predictions[cache_key]

    # One submission row per time step of this lead
    submission_rows.extend(
        {
            'id': f"{test_row.id}_{timestep}_{test_row.lead}",
            'value': float(prediction[timestep]),
        }
        for timestep in range(num_rows)
    )

# Save submission
print("Saving submission...")
# Explicit columns keep the header intact even when there are no rows.
submission_df = pd.DataFrame(submission_rows, columns=['id', 'value'])
submission_df.to_csv('submission.csv', index=False)
print("Done!")

## Spoilers

Changes that hurt performance:
- Removed sophisticated signal alignment algorithms
- No occlusion detection or correction
- No slope-based interpolation for flat segments
- No Einthoven's law enforcement
- Wrong crop parameters and scaling factors
- Fixed marker positions instead of template matching
- Simple linear baseline assumption
- No denoising or smoothing
- Only using blue channel for grayscale
- No proper top/bottom trace extraction
- Clipping values loses important information
- Heavy reliance on fallback mean model