In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# List every file available under the Kaggle input mount, one full path per line.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import cv2
from PIL import Image
import warnings
# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Visualization settings: matplotlib style sheet, seaborn palette, and the
# default figure/font sizes (applied together in a single rcParams update).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (15, 8),
    'font.size': 10,
})

# Kaggle dataset locations
DATA_PATH = Path('/kaggle/input/physionet-ecg-image-digitization')
TRAIN_PATH, TEST_PATH = DATA_PATH / 'train', DATA_PATH / 'test'

# Report whether each expected location is actually present.
print("üöÄ PhysioNet ECG Digitization - EDA")
print("=" * 60)
print(f"üìÅ Data path exists: {DATA_PATH.exists()}")
print(f"üìÅ Train path exists: {TRAIN_PATH.exists()}")
print(f"üìÅ Test path exists: {TEST_PATH.exists()}")

In [None]:
rule = "=" * 60
print("\n" + rule)
print("üìä SECTION 1: DATA OVERVIEW")
print(rule)

# Load the metadata tables and the sample submission
train_meta = pd.read_csv(DATA_PATH / 'train.csv')
test_meta = pd.read_csv(DATA_PATH / 'test.csv')
sample_submission = pd.read_parquet(DATA_PATH / 'sample_submission.parquet')

# Print the same summary (shape, column names, first rows) for each table.
for header, table in (
    ("\nüìã Train Metadata:", train_meta),
    ("\nüìã Test Metadata:", test_meta),
    ("\nüìã Sample Submission:", sample_submission),
):
    print(header)
    print(f"  - Shape: {table.shape}")
    print(f"  - Columns: {list(table.columns)}")
    print(f"\n{table.head()}")

# Distribution analysis: value counts ordered by value, computed once and
# reused for both the printed table and the bar chart.
fs_counts = train_meta['fs'].value_counts().sort_index()
sig_len_counts = train_meta['sig_len'].value_counts().sort_index()

print("\nüìä Sampling Frequencies Distribution (Train):")
print(fs_counts)

print("\nüìä Signal Lengths Distribution (Train):")
print(sig_len_counts)

# Plot both distributions side by side
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

panels = (
    (fs_counts, 'skyblue', 'Sampling Frequencies in Train Set', 'Sampling Frequency (Hz)'),
    (sig_len_counts, 'coral', 'Signal Lengths in Train Set', 'Signal Length (samples)'),
)
for ax, (counts, color, title, xlabel) in zip(axes, panels):
    counts.plot(kind='bar', ax=ax, color=color)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
print("\n" + "="*60)
print("üñºÔ∏è  SECTION 2: IMAGE ANALYSIS - ALL VARIANTS")
print("="*60)

# Image variant codes (file-name suffix) and their descriptions, per the
# dataset documentation
IMAGE_VARIANTS = {
    '0001': 'Original digital (clean)',
    '0003': 'Printed + scanned (color)',
    '0004': 'Printed + scanned (B&W)',
    '0005': 'Mobile photo of print',
    '0006': 'Mobile photo of screen',
    '0009': 'Stained/soaked print',
    '0010': 'Extensively damaged',
    '0011': 'Scanned with mold (color)',
    '0012': 'Scanned with mold (B&W)'
}

# Use the first training ECG as the worked example
sample_id = train_meta['id'].iloc[0]
print(f"\nüîç Analyzing sample ECG: {sample_id}")

# Check which variants exist on disk for this sample
sample_dir = TRAIN_PATH / str(sample_id)
if sample_dir.exists():
    available_images = sorted(sample_dir.glob(f"{sample_id}-*.png"))
    n_images = len(available_images)
    print(f"  - Found {n_images} image variants")

    # plt.subplots() rejects a zero-row grid, so only build the figure when
    # there is at least one image to show.
    if n_images:
        n_cols = 3
        n_rows = (n_images + n_cols - 1) // n_cols  # ceiling division

        # squeeze=False always returns a 2-D array of Axes, so flattening is
        # valid for any image count (including a single image, where the grid
        # is still 1 x 3).
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6*n_rows), squeeze=False)
        axes = axes.flatten()

        for idx, img_path in enumerate(available_images):
            # Copy the pixels into an array and close the file handle right away.
            with Image.open(img_path) as img:
                img_array = np.array(img)

            # The variant code is the last '-'-separated token of the file stem
            variant_code = img_path.stem.split('-')[-1]
            variant_desc = IMAGE_VARIANTS.get(variant_code, 'Unknown')

            axes[idx].imshow(img_array)
            axes[idx].set_title(f'{variant_code}: {variant_desc}\nSize: {img_array.shape}',
                                fontsize=11, fontweight='bold')
            axes[idx].axis('off')

            print(f"  - {variant_code}: {variant_desc}")
            print(f"    Shape: {img_array.shape}, Dtype: {img_array.dtype}")

        # Hide the unused trailing panels of the grid
        for idx in range(n_images, len(axes)):
            axes[idx].axis('off')

        plt.tight_layout()
        plt.show()
else:
    print(f"  ‚ö†Ô∏è  Sample directory not found: {sample_dir}")

In [None]:
print("\n" + "="*60)
print("üìà SECTION 3: ECG SIGNAL ANALYSIS")
print("="*60)

# Load the ground-truth signal CSV of the same sample
signal_path = sample_dir / f"{sample_id}.csv"
if signal_path.exists():
    signal_df = pd.read_csv(signal_path)

    print(f"\nüìä Signal DataFrame:")
    print(f"  - Shape: {signal_df.shape}")
    print(f"  - Columns (12 leads): {list(signal_df.columns)}")
    print(f"\n{signal_df.head()}")

    # Per-lead descriptive statistics (one row per lead after the transpose)
    print("\nüìä Signal Statistics by Lead:")
    stats_df = signal_df.describe().T
    print(stats_df)

    # Plot every lead (one CSV column each) in its own row
    leads = signal_df.columns
    n_leads = len(leads)

    # plt.subplots() needs at least one row and the x-limit needs at least one
    # sample, so skip the figure for an empty table.
    if n_leads and len(signal_df):
        # The sampling frequency is the same for every lead: look it up once.
        sample_fs = train_meta.loc[train_meta['id'] == sample_id, 'fs'].values[0]

        # squeeze=False keeps `axes` indexable even when there is a single lead.
        fig, axes = plt.subplots(n_leads, 1, figsize=(16, 2*n_leads), squeeze=False)
        axes = axes.ravel()

        for idx, lead in enumerate(leads):
            signal = signal_df[lead].values
            time = np.arange(len(signal)) / sample_fs  # sample index -> seconds

            axes[idx].plot(time, signal, linewidth=0.8, color='darkblue')
            axes[idx].set_ylabel(f'{lead}\n(mV)', fontweight='bold')
            axes[idx].grid(True, alpha=0.3)
            axes[idx].set_xlim(0, time[-1])

            # Statistics in the title. NaN-aware reductions are used so a lead
            # containing missing samples still reports numbers (matching the
            # NaN-skipping describe() table above); results are identical when
            # the lead has no NaN.
            axes[idx].set_title(
                f'Mean: {np.nanmean(signal):.3f} mV | Std: {np.nanstd(signal):.3f} mV | Range: [{np.nanmin(signal):.3f}, {np.nanmax(signal):.3f}]',
                fontsize=9
            )

            if idx == n_leads - 1:
                axes[idx].set_xlabel('Time (seconds)', fontweight='bold')

        plt.suptitle(f'ECG Signal - All 12 Leads (ID: {sample_id})',
                     fontsize=14, fontweight='bold', y=1.001)
        plt.tight_layout()
        plt.show()

    # Mathematical relationships between leads
    print("\nüî¨ Mathematical Relationships Between Leads:")
    print("  (Einthoven's Triangle and Goldberger's equations)")

    missing_leads = [name for name in ('I', 'II', 'III', 'aVR') if name not in signal_df.columns]
    if missing_leads:
        # Without these columns the checks below cannot be computed.
        print(f"  - Skipped: missing lead columns {missing_leads}")
    else:
        # Lead III should be approximately II - I
        lead_III_calculated = signal_df['II'] - signal_df['I']
        lead_III_actual = signal_df['III']
        diff_III = np.abs(lead_III_calculated - lead_III_actual).mean()
        print(f"  - Lead III vs (II - I): Mean absolute difference = {diff_III:.6f} mV")

        # aVR should be approximately -(I + II)/2
        aVR_calculated = -(signal_df['I'] + signal_df['II']) / 2
        aVR_actual = signal_df['aVR']
        diff_aVR = np.abs(aVR_calculated - aVR_actual).mean()
        print(f"  - Lead aVR vs -(I + II)/2: Mean absolute difference = {diff_aVR:.6f} mV")

        print("\nüí° Insight: Estas relaciones pueden usarse para validaci√≥n y correcci√≥n!")

else:
    print(f"  ‚ö†Ô∏è  Signal file not found: {signal_path}")

In [None]:
print("\n" + "="*60)
print("üîó SECTION 4: IMAGE-SIGNAL RELATIONSHIP")
print("="*60)

# Pair the clean rendering (-0001) with the ground-truth signal for comparison
clean_img_path = sample_dir / f"{sample_id}-0001.png"
if clean_img_path.exists() and signal_path.exists():

    clean_img = np.array(Image.open(clean_img_path))

    print(f"\nüñºÔ∏è  Clean Image Properties:")
    print(f"  - Shape: {clean_img.shape}")
    print(f"  - Dtype: {clean_img.dtype}")
    print(f"  - Value range: [{clean_img.min()}, {clean_img.max()}]")

    # Layout: full image across the top row, zoomed crop and signal below
    fig = plt.figure(figsize=(18, 10))

    # Top: the complete image
    ax1 = fig.add_subplot(2, 1, 1)
    ax1.imshow(clean_img)
    ax1.set_title('Complete ECG Image (Clean Digital Version)',
                  fontsize=14, fontweight='bold')
    ax1.axis('off')

    # Bottom-left: crop of rows 30-50% and columns 10-30% to show grid detail
    img_h, img_w = clean_img.shape[:2]
    row_span = slice(int(img_h*0.3), int(img_h*0.5))
    col_span = slice(int(img_w*0.1), int(img_w*0.3))
    crop = clean_img[row_span, col_span]
    ax2 = fig.add_subplot(2, 2, 3)
    ax2.imshow(crop)
    ax2.set_title('Zoomed Region: Grid Detail', fontsize=12, fontweight='bold')
    ax2.axis('off')

    # Bottom-right: the matching signal (Lead II, at most the first 1000 samples)
    sample_fs = train_meta.loc[train_meta['id'] == sample_id, 'fs'].values[0]
    signal_sample = signal_df['II'].values[:1000]
    time_sample = np.arange(len(signal_sample)) / sample_fs
    ax3 = fig.add_subplot(2, 2, 4)
    ax3.plot(time_sample, signal_sample, linewidth=1.5, color='darkred')
    ax3.set_title('Corresponding Signal (Lead II, first 1000 samples)',
                  fontsize=12, fontweight='bold')
    ax3.set_xlabel('Time (s)')
    ax3.set_ylabel('Amplitude (mV)')
    ax3.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("\nüí° Key Observations:")
    for observation in (
        "  - ECG images contain a grid (typically 1mm squares)",
        "  - Standard calibration: 25 mm/s horizontal, 10 mm/mV vertical",
        "  - Need to detect grid and calibration marks",
        "  - Lead II is longer (10s) vs other leads (2.5s)",
    ):
        print(observation)

In [None]:
print("\n" + "="*60)
print("üéØ SECTION 5: CHALLENGE IDENTIFICATION")
print("="*60)

print("\nüî¥ Expected Challenges:")
challenges = [
    "1. Geometric distortions: rotation, perspective, scaling",
    "2. Noise and artifacts: stains, mold, damage, reflections",
    "3. Variable layouts: different manufacturers, lead positions",
    "4. Grid detection: crucial for calibration",
    "5. Lead identification: which signal is which in the image",
    "6. Different durations: Lead II (10s) vs others (2.5s)",
    "7. Overlapping signals: some layouts have overlaid leads",
    "8. Quality variation: pristine to barely readable",
    "9. Color vs B&W: loss of information in B&W scans",
    "10. Photo artifacts: moir√©, blur, uneven lighting"
]

for challenge in challenges:
    print(f"  {challenge}")

# Compare the clean rendering against two heavily degraded variants
print("\nüî¨ Comparing Clean vs Damaged Images:")

clean_variant = sample_dir / f"{sample_id}-0001.png"
damaged_variants = [
    sample_dir / f"{sample_id}-0009.png",  # Stained
    sample_dir / f"{sample_id}-0010.png",  # Damaged
]

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# One panel per variant: clean first, then the damaged ones in list order.
for ax, variant_path in zip(axes, [clean_variant, *damaged_variants]):
    if variant_path.exists():
        # Copy the pixels into an array and close the file handle right away.
        with Image.open(variant_path) as pil_img:
            img = np.array(pil_img)
        ax.imshow(img)

        if variant_path == clean_variant:
            title = '0001: Clean Digital\n(Baseline)'
        else:
            variant_code = variant_path.stem.split('-')[-1]
            title = f'{variant_code}: {IMAGE_VARIANTS.get(variant_code, "")}\n(Challenge!)'
        ax.set_title(title, fontsize=12, fontweight='bold')
    else:
        # Label the gap instead of leaving an unexplained empty plot.
        ax.set_title(f'{variant_path.name}\n(not found)', fontsize=12)

    # Hide ticks/frame on every panel, including those with no image.
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
rule = "=" * 60
print("\n" + rule)
print("üìù SECTION 6: KEY TAKEAWAYS")
print(rule)

# Summary template; the {n_train}, {n_test} and {fs_range} placeholders are
# filled from the loaded metadata below.
takeaways = """
‚úÖ DATA STRUCTURE:
  - Train: {n_train} ECGs with multiple image variants each
  - Test: {n_test} images (one per ECG)
  - 12 leads per ECG (11 short + 1 long)
  
‚úÖ TECHNICAL SPECS:
  - Sampling frequencies: {fs_range}
  - Signal duration: 2.5s (most leads) and 10s (Lead II)
  - Standard grid: 1mm squares
  - Standard scale: 25 mm/s, 10 mm/mV
  
‚úÖ MAIN CHALLENGES:
  1. Robust preprocessing (rotation, perspective correction)
  2. Grid detection and calibration
  3. Lead segmentation (identify which is which)
  4. Handle extreme artifacts (stains, mold, damage)
  5. Maintain mathematical relationships between leads
  
‚úÖ STRATEGY INSIGHTS:
  - Use clean variants (-0001) for initial model development
  - Gradually train on degraded variants
  - Implement physics-based constraints (lead relationships)
  - Consider ensemble: CV + Deep Learning
  
üéØ NEXT STEPS (HITO 1):
  1. Build basic preprocessing pipeline (CV)
  2. Implement simple signal extraction (baseline)
  3. Create local evaluation metric (SNR)
  4. Make first submission
"""

# Row counts of the metadata tables and the min-max span of the 'fs' column
fs_min, fs_max = train_meta['fs'].min(), train_meta['fs'].max()
summary_values = {
    'n_train': len(train_meta),
    'n_test': len(test_meta),
    'fs_range': f"{fs_min}-{fs_max} Hz",
}
print(takeaways.format(**summary_values))

print("\n" + rule)
print("‚ú® EDA COMPLETE - Ready for modeling!")
print(rule)