In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, fname) for fname in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
from pathlib import Path
from tqdm import tqdm
import warnings
import os

# Silence every Python warning for the remainder of the session.
warnings.filterwarnings('ignore')

rule = "=" * 60
print(rule)
print("üöÄ PHYSIONET ECG BASELINE MODEL")
print(rule)

# Sanity check: is the competition dataset mounted where we expect it?
print("\nüìÅ Checking available input files...")
input_path = Path('/kaggle/input/physionet-ecg-image-digitization')

if not input_path.exists():
    print("‚ùå Competition data not found! Make sure you're in the competition notebook.")
else:
    print(f"‚úÖ Competition data found at: {input_path}")
    print(f"\nüìÇ Main files:")
    # Only the first 10 entries (in sorted order) are listed.
    for item in sorted(input_path.iterdir())[:10]:
        if not item.is_file():
            # Directories are shown by name only, with a trailing slash.
            print(f"  üìÅ {item.name}/")
            continue
        size_mb = item.stat().st_size / (1024 * 1024)
        print(f"  üìÑ {item.name:<30} ({size_mb:.2f} MB)")

print("\n" + rule)


In [None]:
print("\nüìä LOADING COMPETITION DATA")
print("="*60)

DATA_PATH = Path('/kaggle/input/physionet-ecg-image-digitization')

# The two metadata tables are CSV files; the sample submission is a parquet file.
train_df, test_df = (
    pd.read_csv(DATA_PATH / csv_name) for csv_name in ('train.csv', 'test.csv')
)
sample_sub = pd.read_parquet(DATA_PATH / 'sample_submission.parquet')

# Test samples are counted by distinct 'id', not by row count.
print(f"‚úÖ Train samples: {len(train_df):,}")
print(f"‚úÖ Test samples: {test_df['id'].nunique():,}")
print(f"‚úÖ Predictions needed: {len(sample_sub):,}")

# Show the first rows of each metadata table under its own heading.
for heading, frame in (
    (f"\nüîç Train data preview:", train_df),
    (f"\nüîç Test data preview:", test_df),
):
    print(heading)
    print(frame.head())

In [None]:
print("\nüîß DEFINING PREPROCESSING FUNCTIONS")
print("="*60)

def load_and_preprocess_image(image_path):
    """
    Read an image from disk and return it as a single-channel grayscale array.

    Parameters:
    - image_path: location of the image file (passed through str() before reading)

    Returns the grayscale image, or None when the file could not be read.
    """
    bgr = cv2.imread(str(image_path))
    # A failed read is reported as None rather than as an exception.
    return None if bgr is None else cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)

def remove_grid(image):
    """
    Build a binary mask intended to keep the trace and drop the grid.

    The image is inverted, smoothed with a 3x3 Gaussian kernel and then cut
    with a fixed binary threshold (threshold 30, output value 255).

    Parameters:
    - image: grayscale image (the pipeline passes the output of
      load_and_preprocess_image)

    Returns the thresholded image.
    """
    # After inversion, dark pixels of the input are the bright ones.
    smoothed = cv2.GaussianBlur(cv2.bitwise_not(image), (3, 3), 0)

    # cv2.threshold returns a (value, image) pair; only the image is needed.
    return cv2.threshold(smoothed, 30, 255, cv2.THRESH_BINARY)[1]

print("‚úÖ Preprocessing functions ready!")

In [None]:
print("\nüîß DEFINING SIGNAL EXTRACTION FUNCTIONS")
print("="*60)

def extract_lead_signal(image, lead_position, num_samples):
    """
    Extract signal for one lead from a specific region of the image.

    For each output sample, one pixel column of the region is inspected and
    the row of its first brightest pixel is converted to a value on a
    normalized scale: the top row of the region maps to +2.0, the middle to
    0.0 and the bottom edge approaches -2.0.

    Parameters:
    - image: preprocessed 2-D image in which trace pixels are brighter than
      the background (the pipeline passes the output of remove_grid)
    - lead_position: (y_start, y_end, x_start, x_end)
    - num_samples: number of time points to extract

    Returns:
    - 1-D float array of length num_samples. All zeros when the region is
      empty; individual samples are 0.0 where the inspected column contains
      no non-zero pixel.
    """
    num_samples = max(int(num_samples), 0)
    y_start, y_end, x_start, x_end = lead_position

    # Extract lead region
    lead_region = np.asarray(image)[y_start:y_end, x_start:x_end]

    if lead_region.size == 0 or num_samples == 0:
        return np.zeros(num_samples)

    height, width = lead_region.shape[0], lead_region.shape[1]

    # Map every output sample to a source column. Integer arithmetic keeps
    # each index inside [0, width - 1], so no clamping is required.
    columns = np.arange(num_samples) * width // num_samples
    sampled = lead_region[:, columns]

    # Row of the first (topmost) brightest pixel in every sampled column.
    peak_rows = sampled.argmax(axis=0)

    # A column with no lit pixel has no trace; argmax would report row 0 for
    # it, which would otherwise show up as a spurious full-scale spike.
    has_trace = sampled.max(axis=0) > 0

    # Image rows grow downwards, so a trace drawn higher in the image
    # (smaller row index) must yield the larger value.
    voltage = (0.5 - peak_rows / height) * 4

    return np.where(has_trace, voltage, 0.0)

def estimate_lead_positions(image_height, image_width):
    """
    Approximate the bounding box of each of the 12 leads on a fixed grid.

    The image is divided into 5 equal horizontal bands and 3 equal vertical
    columns. Only the top 4 bands are assigned to leads; the fifth band is
    left unassigned:

        band 0: I,   II,  III
        band 1: aVR, aVL, aVF
        band 2: V1,  V2,  V3
        band 3: V4,  V5,  V6

    Parameters:
    - image_height: image height in pixels
    - image_width: image width in pixels

    Returns a dict mapping lead name -> (y_start, y_end, x_start, x_end).
    """
    band_h = image_height // 5
    cell_w = image_width // 3

    grid = (
        ('I', 'II', 'III'),
        ('aVR', 'aVL', 'aVF'),
        ('V1', 'V2', 'V3'),
        ('V4', 'V5', 'V6'),
    )

    # Row-major traversal keeps the same key order as the grid above.
    return {
        name: (r * band_h, (r + 1) * band_h, c * cell_w, (c + 1) * cell_w)
        for r, names in enumerate(grid)
        for c, name in enumerate(names)
    }

print("‚úÖ Signal extraction functions ready!")

In [None]:
print("\nüîß DEFINING MAIN EXTRACTION PIPELINE")
print("="*60)

def extract_ecg_from_image(image_path, fs, lead_durations):
    """
    Run the whole image -> signals pipeline for a single ECG image.

    Parameters:
    - image_path: path of the image to digitize
    - fs: sampling frequency; each lead receives int(fs * duration) samples
    - lead_durations: dict of lead name -> duration; a lead missing from it
      falls back to 2.5

    Returns a dict of lead name -> 1-D signal array, or None when the image
    could not be loaded.
    """
    gray = load_and_preprocess_image(image_path)
    if gray is None:
        return None

    binary = remove_grid(gray)
    height, width = gray.shape[0], gray.shape[1]

    # NOTE(review): a lead with a longer duration is still read from the
    # same-sized grid cell as the others, so its trace is simply sampled more
    # densely - confirm that this is intended.
    return {
        lead: extract_lead_signal(binary, box, int(fs * lead_durations.get(lead, 2.5)))
        for lead, box in estimate_lead_positions(height, width).items()
    }

# Duration assigned to each lead; the pipeline turns it into a sample count
# via int(fs * duration). Lead II gets 10.0, the other eleven leads get 2.5.
LEAD_DURATIONS = {
    **dict.fromkeys(
        ['I', 'III', 'aVR', 'aVL', 'aVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6'],
        2.5,
    ),
    'II': 10.0,
}

print("‚úÖ Main pipeline ready!")

In [None]:
print("\nüß™ TESTING ON ONE TRAINING EXAMPLE")
print("="*60)

# The first row of the training table serves as the smoke-test record.
first_row = train_df.iloc[0]
sample_id = str(first_row['id'])
sample_fs = first_row['fs']

sample_image = DATA_PATH / 'train' / sample_id / f"{sample_id}-0001.png"

print(f"üìå Testing on sample: {sample_id}")
print(f"üìä Sampling frequency: {sample_fs} Hz")

# Run the full pipeline on that one image.
extracted = extract_ecg_from_image(sample_image, sample_fs, LEAD_DURATIONS)

if not extracted:
    print("‚ùå Extraction failed!")
else:
    print(f"‚úÖ Extraction successful!")
    print(f"üìä Extracted leads: {list(extracted.keys())}")

    # Reference signals for the same record.
    gt_df = pd.read_csv(DATA_PATH / 'train' / sample_id / f"{sample_id}.csv")

    fig, axes = plt.subplots(2, 1, figsize=(15, 8))

    # One panel per source: (data, line colour, legend label, panel title).
    panels = [
        (gt_df['II'][:1000], 'blue', 'Ground Truth',
         'Ground Truth - Lead II (first 1000 samples)'),
        (extracted['II'][:1000], 'red', 'Baseline Extraction',
         'Baseline Extraction - Lead II (first 1000 samples)'),
    ]
    for ax, (series, color, label, title) in zip(axes, panels):
        ax.plot(series, linewidth=0.8, color=color, label=label)
        ax.set_title(title, fontsize=12, fontweight='bold')
        ax.set_ylabel('Amplitude (mV)')
        ax.grid(True, alpha=0.3)
        ax.legend()

    # Only the bottom panel carries an x-axis label.
    axes[1].set_xlabel('Sample Index')

    plt.tight_layout()
    plt.savefig('baseline_comparison.png', dpi=100, bbox_inches='tight')
    plt.show()

    print("\nüí° This is a ROUGH baseline - signals won't match perfectly yet!")


In [None]:
 
print("\nüì§ PROCESSING TEST SET")
print("="*60)

# Unique test IDs, in order of first appearance
test_ids = test_df['id'].unique()

# IMPORTANT: this is a trial run over at most the first 20 images.
# Set NUM_TEST_SAMPLES = len(test_ids) to process ALL images for final submission.
# The value is clamped so the count reported here (and by later cells) never
# exceeds the number of test images that actually exist.
NUM_TEST_SAMPLES = min(20, len(test_ids))

print(f"üß™ Processing {NUM_TEST_SAMPLES} test images (out of {len(test_ids)} total)")
print(f"‚ö†Ô∏è This is a TEST RUN - change NUM_TEST_SAMPLES to process all images")

# Sampling frequency per image, taken from the first metadata row of each id.
# Built once here instead of re-filtering test_df inside the loop.
fs_by_id = test_df.drop_duplicates(subset='id').set_index('id')['fs']

all_predictions = []

# Process test images
for test_id in tqdm(test_ids[:NUM_TEST_SAMPLES], desc="Extracting ECGs"):
    fs = fs_by_id.loc[test_id]

    # Path to test image
    test_image_path = DATA_PATH / 'test' / f"{test_id}.png"

    # Extract signals
    extracted = extract_ecg_from_image(test_image_path, fs, LEAD_DURATIONS)

    if extracted is None:
        # Image could not be loaded: fall back to an all-zero signal of the
        # expected length for every lead.
        extracted = {
            lead: np.zeros(int(fs * duration))
            for lead, duration in LEAD_DURATIONS.items()
        }

    # One record per sample, keyed as "<test id>_<sample index>_<lead>".
    all_predictions.extend(
        {'id': f"{test_id}_{row_id}_{lead}", 'value': float(value)}
        for lead, signal in extracted.items()
        for row_id, value in enumerate(signal)
    )

print(f"\n‚úÖ Generated {len(all_predictions):,} predictions")


In [None]:
print("\nüíæ CREATING SUBMISSION FILE")
print("="*60)

# One row per prediction record collected while processing the test set.
submission_df = pd.DataFrame(all_predictions)

print(f"üìä Submission shape: {submission_df.shape}")
print(f"üìã Sample submission shape: {sample_sub.shape}")

# Show both ends of the table under their own headings.
for heading, preview in (
    (f"\nüîç First 10 predictions:", submission_df.head(10)),
    (f"\nüîç Last 10 predictions:", submission_df.tail(10)),
):
    print(heading)
    print(preview)

# Both files are written relative to the current working directory; the paths
# in the messages below are hard-coded text, not derived from where the files
# actually land.
submission_df.to_csv('submission.csv', index=False)
print(f"\n‚úÖ Saved to: /kaggle/working/submission.csv")

# Same table again, this time as parquet.
submission_df.to_parquet('submission.parquet', index=False)
print(f"‚úÖ Also saved as: /kaggle/working/submission.parquet")

In [None]:
print("\nüîç SUBMISSION VALIDATION")
print("="*60)

# Column check: both expected columns must be present.
required_columns = ['id', 'value']
has_required = set(required_columns).issubset(submission_df.columns)
print(f"‚úÖ Has required columns: {has_required}")

# Missing-value check on the prediction column.
has_nan = submission_df['value'].isna().any()
print(f"‚úÖ No NaN values: {not has_nan}")

# Summary statistics, to eyeball whether the value range looks plausible.
print(f"\nüìä Value statistics:")
print(submission_df['value'].describe())

# Print the first three ids so their format can be inspected.
sample_ids = submission_df['id'].iloc[:3].tolist()
print(f"\nüîç Sample IDs format:")
for sid in sample_ids:
    print(f"  {sid}")

print(f"\n‚ö†Ô∏è NOTE: This is a PARTIAL submission ({NUM_TEST_SAMPLES} images)")
print(f"   For full submission, set NUM_TEST_SAMPLES = len(test_ids) in Cell 7")

In [None]:
# Closing summary banner.
# The only computed values are NUM_TEST_SAMPLES and len(all_predictions), both
# produced by the test-set processing cell. Every other line, including the
# "ready to submit" and "Format: Valid" statements, is fixed text and is not
# derived from the validation results.
print("\n" + "="*60)
print("üéâ BASELINE MODEL COMPLETE!")
print("="*60)

print(f"""
‚úÖ What you accomplished:
  1. Built image preprocessing pipeline
  2. Created signal extraction algorithm
  3. Processed {NUM_TEST_SAMPLES} test images
  4. Generated submission file

üìä Current status:
  - Predictions: {len(all_predictions):,}
  - File: submission.csv (ready to submit!)
  - Format: Valid ‚úì

üéØ Next steps to improve:

  IMMEDIATE (to submit):
  1. Change NUM_TEST_SAMPLES to len(test_ids) in Cell 7
  2. Run Cell 7-9 again (will take ~30-60 min)
  3. Submit to competition
  4. Get baseline score!

  IMPROVEMENTS (after baseline):
  1. Better lead detection (find actual boundaries)
  2. Calibrate voltage scale using training data
  3. Handle different image types better
  4. Try deep learning (CNN/U-Net)
  5. Add signal smoothing/filtering
  6. Detect and remove artifacts

üí° Tips:
  - Don't worry if first score is low
  - Learn from leaderboard position
  - Iterate and improve!
  - Check discussion forum for ideas

üöÄ Ready to submit your baseline? 
   Update Cell 7 and run for full predictions!
""")

print("="*60)