# **Exploratory Data Analysis (EDA)**

In [None]:
import numpy as np
import pandas as pd

import warnings
import numpy as np

# Suppress every RuntimeWarning for the rest of the session. The filter is not
# restricted to a particular message or module, so all of them are hidden.
warnings.filterwarnings("ignore", category=RuntimeWarning)



In [None]:

# Load the train and test metadata tables from the Kaggle input directory.
train_meta = pd.read_csv("/kaggle/input/physionet-ecg-image-digitization/train.csv")
test_meta = pd.read_csv("/kaggle/input/physionet-ecg-image-digitization/test.csv")

<div style="background:#f9fbfe; border:1px solid #dbe4f0; border-radius:10px; padding:15px 20px; box-shadow:0 2px 6px rgba(0,0,0,0.05); max-width:600px; margin:auto; font-family:'Segoe UI', sans-serif; font-size:14px; line-height:1.5;">

<h3 style="color:#2c3e50; border-bottom:2px solid #3498db; padding-bottom:4px; margin-top:0; font-size:1.3em;">
🩺 Train Metadata
</h3>

<p>
The <code style="background:#eef1f5; padding:1px 4px; border-radius:4px; color:#d6336c;">train.csv</code> file provides essential metadata for each ECG record.
</p>

<div style="background:#f2f7ff; border-left:4px solid #3498db; padding:10px 15px; border-radius:6px; margin-top:10px;">
  <p style="font-weight:600; margin-bottom:6px;">📘 Columns</p>
  <ul style="list-style:none; padding-left:0; margin:0;">
    <li><strong style="color:#2980b9;">id</strong> → Unique ECG identifier (and folder name)</li>
    <li><strong style="color:#2980b9;">fs</strong> → Sampling frequency (samples per second)</li>
    <li><strong style="color:#2980b9;">sig_len</strong> → Total number of samples per lead (<code style="background:#eef1f5; padding:1px 3px; border-radius:4px; color:#d6336c;">fs × 10 seconds</code>)</li>
  </ul>
</div>

<div style="margin-top:15px; background:#ffffff; border:1px dashed #cfd8dc; padding:10px 15px; border-radius:6px;">
  <p style="margin:0;">
    Each <code style="background:#eef1f5; padding:1px 4px; border-radius:4px; color:#d6336c;">id</code> corresponds to a folder in 
    <code style="background:#eef1f5; padding:1px 4px; border-radius:4px; color:#d6336c;">/train/</code> that contains:
  </p>

  <ul style="margin-top:6px; margin-bottom:0;">
    <li>üìÑ One <code style="background:#eef1f5; padding:1px 3px; border-radius:4px; color:#d6336c;">.csv</code> file ‚Äî ground truth ECG signal (12 leads)</li>
    <li>üñºÔ∏è Multiple <code style="background:#eef1f5; padding:1px 3px; border-radius:4px; color:#d6336c;">.png</code> images ‚Äî visual renderings of the same ECG</li>
  </ul>

  <p style="margin-top:8px; font-size:13px; color:#555;">
    💡 Acts as the link between digital ECG data and its image representations.
  </p>
</div>

</div>


In [None]:
# Preview the first rows of the training metadata.
train_meta.head()

In [None]:
# (rows, columns) of the training metadata table.
train_meta.shape

<div style="background:#f9fbfe; border:1px solid #dbe4f0; border-radius:10px; padding:15px 20px; box-shadow:0 2px 6px rgba(0,0,0,0.05); max-width:600px; margin:auto; font-family:'Segoe UI', sans-serif; font-size:14px; line-height:1.5;">

<h3 style="color:#2c3e50; border-bottom:2px solid #3498db; padding-bottom:4px; margin-top:0; font-size:1.3em;">
🧩 Exploring One Sample from the Training Data
</h3>

<p>
Each folder inside <code style="background:#eef1f5; padding:1px 4px; border-radius:4px; color:#d6336c;">/train/</code> represents one patient’s ECG record.
</p>

<div style="background:#f2f7ff; border-left:4px solid #3498db; padding:10px 15px; border-radius:6px; margin-top:10px;">
  <p style="margin:0;">
    <strong style="color:#2980b9;">Example:</strong> 
    <code style="background:#eef1f5; padding:1px 4px; border-radius:4px; color:#d6336c;">/train/7663343/</code>
  </p>

  <ul style="list-style:none; padding-left:0; margin-top:8px; margin-bottom:0;">
    <li><code style="background:#eef1f5; padding:1px 3px; border-radius:4px; color:#d6336c;">7663343.csv</code> → True digital ECG signal (12 leads)</li>
    <li><code style="background:#eef1f5; padding:1px 3px; border-radius:4px; color:#d6336c;">7663343-0001.png … 7663343-0012.png</code> → Visual ECG versions</li>
  </ul>

  <ul style="margin:8px 0 0 15px; font-size:13px;">
    
  </ul>
</div>

<div style="margin-top:15px; background:#ffffff; border:1px dashed #cfd8dc; padding:10px 15px; border-radius:6px;">
  <p style="margin:0;">
    We visualized:
  </p>
  <ol style="margin-top:6px; margin-bottom:0;">
    <li>The ECG image (<code style="background:#eef1f5; padding:1px 3px; border-radius:4px; color:#d6336c;">7663343-0001.png</code>)</li>
    <li>The corresponding ground-truth waveform from <code style="background:#eef1f5; padding:1px 3px; border-radius:4px; color:#d6336c;">7663343.csv</code></li>
  </ol>

  <p style="margin-top:8px; font-size:13px; color:#555;">
    💡 This comparison helps understand how analog-like ECG images relate to their digital signals — the core objective of the competition.
  </p>
</div>

</div>


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import cv2

# Load only one sample: pick a record id, derive its folder under /train/,
# and read the ground-truth signal CSV that is named after the id.
sample_id = "7663343"
sample_path = f"/kaggle/input/physionet-ecg-image-digitization/train/{sample_id}"
signal_path = os.path.join(sample_path, f"{sample_id}.csv")
df_signal = pd.read_csv(signal_path)

In [None]:
# List all .png files of the sample in a stable (sorted) order.
image_files = sorted(f for f in os.listdir(sample_path) if f.endswith(".png"))

# Lay the images out on a grid with a fixed number of columns.
n_images = len(image_files)
cols = 3
# Ceiling division; keep at least one row so an empty folder does not
# request a zero-height figure.
rows = max(1, (n_images + cols - 1) // cols)

plt.figure(figsize=(15, 5 * rows))

for i, file in enumerate(image_files, 1):
    img_path = os.path.join(sample_path, file)
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports an unreadable/undecodable file by returning None
        # instead of raising; skip it and leave its grid cell empty.
        print(f"Skipping unreadable image: {img_path}")
        continue
    # OpenCV decodes to BGR channel order; matplotlib expects RGB.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    plt.subplot(rows, cols, i)
    plt.imshow(img)
    plt.title(file, fontsize=10)
    plt.axis("off")

plt.suptitle(f"All ECG Images for {sample_id}", fontsize=16)
plt.tight_layout()
plt.show()


In [None]:
# Draw every lead of the sample on one shared axis so they can be compared
# directly against each other.
fig, ax = plt.subplots(figsize=(14, 6))
for col in df_signal.columns:
    ax.plot(df_signal[col], label=col, alpha=0.7)

ax.set_title(f"Overlay of All 12 ECG Leads for ID {sample_id}")
ax.set_xlabel("Sample Index (Time)")
ax.set_ylabel("mV (Voltage)")
ax.legend(ncol=3)
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Plot every lead in its own panel on a two-column grid. The number of rows is
# derived from the number of columns in the signal table, so the cell keeps
# working if the CSV does not contain exactly 12 leads.
lead_names = list(df_signal.columns)
n_cols = 2
n_rows = max(1, (len(lead_names) + n_cols - 1) // n_cols)  # ceiling division

# 2.5 inches of height per row (15 inches for the usual 6 rows).
plt.figure(figsize=(10, 2.5 * n_rows))
for i, col in enumerate(lead_names):
    plt.subplot(n_rows, n_cols, i + 1)
    plt.plot(df_signal[col], linewidth=0.8)
    plt.title(col)
    # The x-axis is the row index of the signal table, i.e. the sample index.
    plt.xlabel("Sample Index (Time)")
    plt.ylabel("mV")
plt.tight_layout()
plt.show()


In [None]:
# (rows, columns) of the sample's signal table.
df_signal.shape

In [None]:
# Preview the first rows of the sample's signal table.
df_signal.head()

In [None]:
# Count missing values per column of the signal table.
df_signal.isnull().sum()

In [None]:
# Summary statistics for each column of the signal table.
df_signal.describe()

In [None]:
import os
import pandas as pd

# Paths
train_dir = "/kaggle/input/physionet-ecg-image-digitization/train"
train_meta_path = "/kaggle/input/physionet-ecg-image-digitization/train.csv"

# Load metadata
train_meta = pd.read_csv(train_meta_path)

# Build a mapping table with one row per (record, image) pair that links each
# rendered image to the metadata and ground-truth CSV of its record.
link_columns = ["id", "fs", "sig_len", "image_file", "image_path", "csv_path"]
records = []

# itertuples avoids building a Series per row and keeps column dtypes intact.
for row in train_meta.itertuples(index=False):
    sample_id = str(row.id)
    sample_path = os.path.join(train_dir, sample_id)
    csv_path = os.path.join(sample_path, f"{sample_id}.csv")

    # os.listdir returns entries in arbitrary order; sort so the table (and
    # anything indexing into it by position) is reproducible across runs.
    image_files = sorted(f for f in os.listdir(sample_path) if f.endswith(".png"))

    for img_file in image_files:
        img_path = os.path.join(sample_path, img_file)
        records.append({
            "id": sample_id,
            "fs": row.fs,
            "sig_len": row.sig_len,
            "image_file": img_file,
            "image_path": img_path,
            "csv_path": csv_path,
        })

# Convert to DataFrame; explicit columns keep the schema even when no image
# was found, so downstream column lookups do not fail on an empty table.
df_links = pd.DataFrame(records, columns=link_columns)



In [None]:
# (rows, columns) of the image-to-record mapping table.
df_links.shape

In [None]:
# Preview the first rows of the mapping table.
df_links.head()

In [None]:
# Count missing values per column of the mapping table.
df_links.isnull().sum()

In [None]:
import matplotlib.pyplot as plt
# Number of image files linked to each record id, shown as a histogram.
image_count = df_links.groupby("id")["image_file"].count()

hist_ax = image_count.hist(bins=15, figsize=(6, 3))
hist_ax.set_title("Distribution of image versions per ECG record")
hist_ax.set_xlabel("Number of images")
hist_ax.set_ylabel("Count")
plt.show()


In [None]:
def preprocess_image(img_path, img_size=(512, 512)):
    """Load an image as a resized, normalized single-channel array.

    Args:
        img_path: Path of the image file to read.
        img_size: Target size handed to ``cv2.resize`` as ``(width, height)``.

    Returns:
        The grayscale image divided by 255 (values in [0, 1]) with a trailing
        channel axis of length 1 appended.

    Raises:
        FileNotFoundError: If ``img_path`` does not point to an existing file.
        ValueError: If the file exists but OpenCV cannot decode it.
    """
    # Check up front: cv2.imread does not raise on a bad path, it returns None,
    # which would otherwise surface later as an opaque cv2.resize error.
    if not os.path.isfile(img_path):
        raise FileNotFoundError(f"Image file not found: {img_path}")

    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise ValueError(f"Could not decode image: {img_path}")

    img = cv2.resize(img, img_size)
    img = img / 255.0  # normalize [0, 1]
    return np.expand_dims(img, axis=-1)


In [None]:
def preprocess_signal(csv_path, lead="II", target_len=5000):
    """Load one lead from a signal CSV as a fixed-length, standardized vector.

    The lead is cut to ``target_len`` samples, or zero-padded at the end when
    it is shorter, and then z-score standardized (zero mean, unit variance).
    The output is therefore NOT bounded to a fixed range such as [-1, 1].

    Missing samples (NaN) are ignored when computing the mean and standard
    deviation and remain NaN in the output, so a few missing values no longer
    turn the whole vector into NaN.

    Args:
        csv_path: Path of the CSV file holding the signal, one column per lead.
        lead: Name of the column to extract.
        target_len: Number of samples in the returned vector.

    Returns:
        1-D float array of length ``target_len``.

    Raises:
        KeyError: If ``lead`` is not a column of the CSV.
    """
    df = pd.read_csv(csv_path)
    sig = df[lead].values.astype(np.float32)

    # Force the requested length: truncate, then right-pad with zeros if needed.
    sig = sig[:target_len]
    if len(sig) < target_len:
        sig = np.pad(sig, (0, target_len - len(sig)))

    # Z-score standardization; the epsilon guards against a constant signal.
    sig = (sig - np.nanmean(sig)) / (np.nanstd(sig) + 1e-8)
    return sig


In [None]:
def load_sample(row, img_size=(512, 512), lead="II"):
    """Build one (image, signal) pair from a row of the mapping table.

    Args:
        row: Object exposing ``image_path`` and ``csv_path`` attributes.
        img_size: Size forwarded to ``preprocess_image``.
        lead: Lead name forwarded to ``preprocess_signal``.

    Returns:
        Tuple of the preprocessed image and the preprocessed signal.
    """
    image = preprocess_image(row.image_path, img_size)
    signal = preprocess_signal(row.csv_path, lead)
    return image, signal


In [None]:
# Run the preprocessing on the first row of the mapping table and show the
# resulting image next to the resulting signal.
sample_row = df_links.iloc[0]
X, y = load_sample(sample_row)

fig, (ax_img, ax_sig) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: drop the channel axis so the image can be drawn as 2-D.
ax_img.imshow(X.squeeze(), cmap='gray')
ax_img.set_title("Processed ECG Image")

# Right panel: the processed signal.
ax_sig.plot(y, color='red')
ax_sig.set_title("Processed Signal (Lead II)")

fig.tight_layout()
plt.show()


<div style="border: 2px solid #FFA500; border-radius: 10px; padding: 10px; background-color: #FFF5E6; text-align: center; font-family: Arial, sans-serif; width: 80%; max-width: 600px; margin: auto;">
  <h3 style="color: #FFA500;">👍 <strong>Enjoyed this guide?</strong></h3>
  <p style="color: #333333;">If you found this guide helpful, please consider giving it an upvote! Your support helps us continue to create valuable content and improve our resources.</p>
  <p style="font-size: 16px; color: #FF8C00;">Thank you! 😊</p>
</div>