# PhysioNet ECG Digitization - Initial EDA

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image
import warnings

# Notebook-wide defaults: silence warnings, apply seaborn's whitegrid theme
# and make figures 15x6 unless a cell asks for something else.
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (15, 6)})

# Competition data locations inside the Kaggle environment
DATA_PATH = Path('/kaggle/input/physionet-ecg-image-digitization')
TRAIN_PATH, TEST_PATH = (DATA_PATH / split for split in ('train', 'test'))

print("Setup complete!")

## 2. Load Metadata

In [None]:
# Read the two metadata tables and the sample submission from DATA_PATH
train_df = pd.read_csv(DATA_PATH.joinpath('train.csv'))
test_df = pd.read_csv(DATA_PATH.joinpath('test.csv'))
sample_submission = pd.read_parquet(DATA_PATH.joinpath('sample_submission.parquet'))

# Report the row count of each table
for label, frame in (
    ("Training samples", train_df),
    ("Test samples", test_df),
    ("Submission rows", sample_submission),
):
    print(f"{label}: {len(frame)}")

## 3. Training Data Exploration

In [None]:
# First rows, column dtypes and describe() summary of the training metadata
print("Training Data Info:")
print(train_df.head(10))
for heading, view in (
    ("\nData Types:", train_df.dtypes),
    ("\nBasic Statistics:", train_df.describe()),
):
    print(heading)
    print(view)

In [None]:
# Count the null entries in every column of the training metadata
null_counts = train_df.isnull().sum()
print("Missing values in training data:")
print(null_counts)

In [None]:
# Tabulate how often each sampling frequency and each signal length occurs
fs_counts = train_df['fs'].value_counts().sort_index()
sig_len_counts = train_df['sig_len'].value_counts().sort_index()

print("Sampling Frequency Distribution:")
print(fs_counts)

# One bar chart per tabulation, drawn side by side in a single figure
plt.figure(figsize=(12, 5))
panels = (
    (fs_counts, 'Sampling Frequency Distribution', 'Sampling Frequency (Hz)'),
    (sig_len_counts, 'Signal Length Distribution', 'Signal Length (samples)'),
)
for position, (counts, title, xlabel) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    counts.plot(kind='bar')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Count')
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Check the relationship sig_len == 10 seconds * fs for every training row.
# Both helper columns are stored on train_df itself.
train_df['expected_sig_len'] = train_df['fs'] * 10
train_df['sig_len_match'] = train_df['sig_len'].eq(train_df['expected_sig_len'])

length_matches = train_df['sig_len_match']
print(f"All signal lengths match 10 seconds * fs: {length_matches.all()}")
print(f"Percentage matching: {length_matches.mean() * 100:.2f}%")

## 4. Test Data Exploration

In [None]:
# First rows, column dtypes and describe() summary of the test metadata
test_preview = test_df.head(10)
test_dtypes = test_df.dtypes
test_summary = test_df.describe()

print("Test Data Info:")
print(test_preview)
print("\nData Types:")
print(test_dtypes)
print("\nBasic Statistics:")
print(test_summary)

In [None]:
# Count how many test rows reference each lead, then chart the counts
lead_counts = test_df['lead'].value_counts()
print("Lead Distribution in Test Data:")
print(lead_counts)

plt.figure(figsize=(10, 5))
lead_counts.plot(kind='bar')
plt.title('ECG Lead Distribution in Test Set')
plt.xlabel('Lead')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Frequency of each number_of_rows value in the test metadata
row_count_distribution = test_df['number_of_rows'].value_counts().sort_index()
print("\nNumber of rows distribution:")
print(row_count_distribution)

# describe() of number_of_rows within each lead, to see whether it varies by lead
rows_by_lead = test_df.groupby('lead')['number_of_rows'].describe()
print("\nNumber of rows by lead:")
print(rows_by_lead)

## 5. ECG Image Exploration

In [None]:
# Use the first training record as the worked example for the image cells
sample_id = str(train_df['id'].iloc[0])
print(f"Exploring images for sample ID: {sample_id}")

# Segment code (the suffix in "<id>-<segment>.png") -> description of that
# rendition, based on the data description
image_types = {
    '0001': 'Original color ECG image',
    '0003': 'Printed in color, scanned in color',
    '0004': 'Printed in color, scanned in B&W',
    '0005': 'Mobile photos of color printed images',
    '0006': 'Mobile photos of ECGs on laptop screen',
    '0009': 'Mobile photos of stained/soaked printed ECGs',
    '0010': 'Mobile photos with extensive damage',
    '0011': 'Scans of printed ECG with mold (color)',
    '0012': 'Scans of printed ECG with mold (B&W)',
}

# List every PNG in the sample's folder whose name starts with "<id>-"
sample_dir = TRAIN_PATH.joinpath(sample_id)
existing_images = [*sample_dir.glob(f"{sample_id}-*.png")]
print(f"\nFound {len(existing_images)} images for this sample")
for found_path in sorted(existing_images):
    print(f"  - {found_path.name}")

In [None]:
# Visualize different image types for the same ECG.
# The grid has 3 columns and as many rows as image_types needs, so adding an
# entry to image_types cannot index past the last axes (9 entries -> 3x3).
n_cols = 3
n_rows = -(-len(image_types) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 4 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, (segment, description) in zip(axes, image_types.items()):
    img_path = sample_dir / f"{sample_id}-{segment}.png"
    if img_path.exists():
        # The context manager closes the image file deterministically, even if
        # drawing fails; imshow converts the image to an array when called, so
        # the plot does not need the file afterwards.
        with Image.open(img_path) as img:
            ax.imshow(img)
            # Print image dimensions
            print(f"{segment}: {img.size} (W x H), Mode: {img.mode}")
        ax.set_title(f"{segment}: {description}", fontsize=9)
    else:
        # Keep the panel, but label it so a missing rendition is visible
        ax.text(0.5, 0.5, 'Image not found', ha='center', va='center')
        ax.set_title(f"{segment}: Not available")
    ax.axis('off')

# Blank out grid cells left over when the count is not a multiple of n_cols
for ax in axes[len(image_types):]:
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Visualize the image of the first test record, if it exists on disk
test_id = str(test_df['id'].iloc[0])
print(f"Sample test image ID: {test_id}")

test_img_path = TEST_PATH / f"{test_id}.png"
if test_img_path.exists():
    # The context manager closes the image file deterministically, even if
    # drawing fails; imshow converts the image to an array when called.
    with Image.open(test_img_path) as test_img:
        print(f"Test image dimensions: {test_img.size} (W x H), Mode: {test_img.mode}")

        plt.figure(figsize=(16, 8))
        plt.imshow(test_img)
    plt.title(f"Test Image: {test_id}")
    plt.axis('off')
    plt.tight_layout()
    plt.show()
else:
    print("Test image not found")

## 6. Time Series Data Exploration

In [None]:
# Read the example record's "<id>.csv" from its training folder
sample_csv_path = sample_dir.joinpath(f"{sample_id}.csv")
ecg_data = pd.read_csv(sample_csv_path)

# Shape, column names, first rows and describe() summary of the table
lead_columns = ecg_data.columns.tolist()
print("ECG Time Series Data Shape:", ecg_data.shape)
print("\nColumns (ECG Leads):", lead_columns)
print("\nFirst few rows:")
print(ecg_data.head())
print("\nBasic Statistics:")
print(ecg_data.describe())

In [None]:
# Compare the CSV's columns with the 12 expected lead names (order ignored)
expected_leads = [
    'I', 'II', 'III',
    'aVR', 'aVL', 'aVF',
    'V1', 'V2', 'V3', 'V4', 'V5', 'V6',
]
actual_leads = ecg_data.columns.tolist()
leads_agree = set(expected_leads) == set(actual_leads)

print(f"Expected leads: {expected_leads}")
print(f"Actual leads: {actual_leads}")
print(f"All leads present: {leads_agree}")

In [None]:
# Draw each of the 12 leads in its own row, indexed by sample number
fig, axes = plt.subplots(12, 1, figsize=(16, 20))
n_samples = len(ecg_data)

for ax, lead in zip(axes, expected_leads):
    signal = ecg_data[lead]
    ax.plot(signal, linewidth=0.8)
    ax.set_ylabel(f'{lead} (mV)', fontsize=10)
    ax.grid(True, alpha=0.3)
    ax.set_xlim(0, n_samples)

    # Annotate the panel with the lead's mean and standard deviation
    mean_val, std_val = signal.mean(), signal.std()
    ax.set_title(
        f'{lead} - Mean: {mean_val:.3f} mV, Std: {std_val:.3f} mV',
        fontsize=9,
        loc='right',
    )

# Only the bottom panel carries the shared x-axis label
axes[-1].set_xlabel('Sample Index', fontsize=10)
fig.suptitle(f'12-Lead ECG Time Series for Sample {sample_id}', fontsize=14, y=0.995)
plt.tight_layout()
plt.show()

In [None]:
# Look up the sampling frequency of the example record. Ids are compared as
# strings (sample_id is a str), so the lookup does not require numeric ids.
sample_fs = train_df.loc[train_df['id'].astype(str) == sample_id, 'fs'].iloc[0]
n_samples = len(ecg_data)
# Timestamp of every sample in seconds: sample index / fs
time_axis = np.arange(n_samples) / sample_fs

print(f"Sampling frequency: {sample_fs} Hz")
print(f"Number of samples: {n_samples}")
# The duration is n_samples / fs; time_axis[-1] is the timestamp of the last
# sample and therefore one sample period shorter than the recording.
print(f"Duration: {n_samples / sample_fs:.2f} seconds")

# Plot with time axis: 4x3 grid, one panel per lead
fig, axes = plt.subplots(4, 3, figsize=(18, 12))
axes = axes.flatten()

for ax, lead in zip(axes, expected_leads):
    ax.plot(time_axis, ecg_data[lead], linewidth=0.8, color='darkblue')
    ax.set_title(f'{lead}', fontsize=11, fontweight='bold')
    ax.set_ylabel('mV', fontsize=9)
    ax.grid(True, alpha=0.3)
    ax.set_xlim(0, 10)

# Only the bottom row of panels gets an x-axis label
for ax in axes[-3:]:
    ax.set_xlabel('Time (seconds)', fontsize=9)

fig.suptitle(f'12-Lead ECG with Time Axis (fs={sample_fs} Hz)', fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Build one row of amplitude statistics per lead for the example record
stat_rows = []
for lead in expected_leads:
    signal = ecg_data[lead]
    lowest, highest = signal.min(), signal.max()
    stat_rows.append({
        'Lead': lead,
        'Min': lowest,
        'Max': highest,
        'Mean': signal.mean(),
        'Std': signal.std(),
        'Range': highest - lowest,
    })
amplitude_stats = pd.DataFrame(
    stat_rows, columns=['Lead', 'Min', 'Max', 'Mean', 'Std', 'Range']
)

print("Amplitude Statistics Across All Leads:")
print(amplitude_stats.to_string(index=False))

# Bar charts of the per-lead range and standard deviation, side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_panels = (
    ('Range', 'Signal Range (Max - Min) by Lead', 'Range (mV)'),
    ('Std', 'Signal Standard Deviation by Lead', 'Std (mV)'),
)
for ax, (column, title, ylabel) in zip(axes, bar_panels):
    ax.bar(amplitude_stats['Lead'], amplitude_stats[column])
    ax.set_title(title)
    ax.set_xlabel('Lead')
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 7. Submission Format Analysis

In [None]:
# Shape and column names of the sample submission, then its first and last 20 rows
submission_columns = sample_submission.columns.tolist()
print("Sample Submission Shape:", sample_submission.shape)
print("\nColumns:", submission_columns)
for heading, rows in (
    ("\nFirst few rows:", sample_submission.head(20)),
    ("\nLast few rows:", sample_submission.tail(20)),
):
    print(heading)
    print(rows)

In [None]:
# Parse the composite ID, read here as "<base_id>_<row_id>_<lead>".
# The column is split once rather than once per component, and from the right
# with at most two splits, so a base id that itself contains "_" stays intact.
id_parts = sample_submission['id'].str.rsplit('_', n=2, expand=True)
sample_submission['base_id_parsed'] = id_parts[0]
sample_submission['row_id_parsed'] = id_parts[1].astype(int)
sample_submission['lead_parsed'] = id_parts[2]

print("Parsed ID components:")
print(sample_submission[['id', 'base_id_parsed', 'row_id_parsed', 'lead_parsed']].head(20))

In [None]:
# Compare the ids referenced by the submission with those listed in test.csv
n_submission_ids = sample_submission['base_id_parsed'].nunique()
n_test_ids = test_df['id'].nunique()
print("Unique base_ids in submission:", n_submission_ids)
print("Unique IDs in test.csv:", n_test_ids)

# Every test id (as a string) should appear among the parsed base ids
submission_base_ids = set(sample_submission['base_id_parsed'])
test_ids = set(test_df['id'].astype(str))
print(f"\nAll test IDs in submission: {test_ids <= submission_base_ids}")

In [None]:
# Row count plus smallest and largest row id for every lead in the submission
rows_per_lead = (
    sample_submission
    .groupby('lead_parsed')['row_id_parsed']
    .agg(['count', 'min', 'max'])
)
print("Rows per lead in submission:")
print(rows_per_lead)

# Compare Lead II with Lead I using (largest row id + 1) as the row count,
# to check whether Lead II has more rows (10 seconds vs 2.5 seconds)
is_lead_ii = sample_submission['lead_parsed'] == 'II'
is_lead_i = sample_submission['lead_parsed'] == 'I'
lead_ii_rows = sample_submission.loc[is_lead_ii, 'row_id_parsed'].max() + 1
other_lead_rows = sample_submission.loc[is_lead_i, 'row_id_parsed'].max() + 1

print(f"\nLead II rows: {lead_ii_rows}")
print(f"Other lead rows (example Lead I): {other_lead_rows}")
print(f"Ratio: {lead_ii_rows / other_lead_rows:.2f} (should be ~4 for 10s vs 2.5s)")

In [None]:
# Pull the submission rows that belong to the first test record
sample_test_id = str(test_df['id'].iloc[0])
belongs_to_sample = sample_submission['base_id_parsed'] == sample_test_id
sample_rows = sample_submission.loc[belongs_to_sample]

# Total row count, per-lead row counts and a preview of those rows
rows_by_lead_size = sample_rows.groupby('lead_parsed').size()
print(f"Submission rows for test ID {sample_test_id}:")
print(f"Total rows: {len(sample_rows)}")
print("\nRows by lead:")
print(rows_by_lead_size)
print("\nSample rows:")
print(sample_rows.head(30))

## 8. Image-to-Signal Comparison

In [None]:
# Compare the original image with the ground-truth signal in one figure:
# the image takes the first grid row, each of the 12 leads one row below it.
fig = plt.figure(figsize=(18, 12))

# Show original image
gs = fig.add_gridspec(13, 1, hspace=0.4)
ax_img = fig.add_subplot(gs[0, :])
# The context manager closes the image file deterministically, even if drawing
# fails; imshow converts the image to an array when called.
with Image.open(sample_dir / f"{sample_id}-0001.png") as img:
    ax_img.imshow(img)
ax_img.set_title(f'Original ECG Image (ID: {sample_id})', fontsize=12, fontweight='bold')
ax_img.axis('off')

# Show time series for all 12 leads
last_idx = len(expected_leads) - 1
for idx, lead in enumerate(expected_leads):
    ax = fig.add_subplot(gs[idx + 1, :])
    ax.plot(time_axis, ecg_data[lead], linewidth=0.8, color='red')
    ax.set_ylabel(lead, fontsize=9, rotation=0, labelpad=20)
    ax.set_xlim(0, 10)
    ax.grid(True, alpha=0.2)
    ax.tick_params(labelsize=8)
    if idx < last_idx:
        # Only the bottom panel keeps its tick labels and x-axis label
        ax.set_xticklabels([])
    else:
        ax.set_xlabel('Time (seconds)', fontsize=9)

fig.suptitle('Image vs Ground Truth Time Series Comparison', fontsize=14, y=0.995)
plt.tight_layout()
plt.show()

In [None]:
# Summary statistics for quick reference.
# Values the notebook has already computed are interpolated; the remaining
# lines are fixed text, so they are plain strings rather than f-strings.
banner = "=" * 80
print(banner)
print("DATASET SUMMARY")
print(banner)
print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"Sampling frequencies: {sorted(train_df['fs'].unique())} Hz")
print(f"ECG leads: {expected_leads}")
# Derived from the image_types mapping so the count follows that table
print(f"Image types per training sample: {len(image_types)}")
print("Image types per test sample: 1")
print("Signal duration (training): 10 seconds")
print("Signal duration (test - Lead II): 10 seconds")
print("Signal duration (test - other leads): 2.5 seconds")
print(f"Total submission rows: {len(sample_submission)}")
print(banner)