In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Packages

In [None]:
%time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Load Data

In [None]:
%%time
# Read the two CSV tables in one pass, then the sample submission (parquet).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/physionet-ecg-image-digitization/train.csv',
        '/kaggle/input/physionet-ecg-image-digitization/test.csv',
    )
)
submission = pd.read_parquet(
    "/kaggle/input/physionet-ecg-image-digitization/sample_submission.parquet"
)

## Load a single training record

In [None]:
# Position (in `train`) of the record to inspect; show its id.
idx = 0
print(train['id'][idx])

# The signal CSV path is built as <TRAIN_DIR><id>/<id>.csv.
TRAIN_DIR = '/kaggle/input/physionet-ecg-image-digitization/train/'
name = str(train['id'][idx])
df_with_id0 = f"{TRAIN_DIR}{name}/{name}.csv"  # note: this is a path, not a frame

# Load the record and preview its first rows.
df = pd.read_csv(df_with_id0)
df.head()

# EDA

In [None]:
%%time
# Metadata row(s) of the record examined in this section.
train_metadata = train.loc[train['id'].eq(7663343)]

# Duration in seconds follows from the sample count and the sampling rate.
fs, sig_len = (train_metadata[column].values[0] for column in ('fs', 'sig_len'))
duration = sig_len / fs

print(
    f"Signal duration: {duration} seconds",
    f"Sampling frequency: {fs} Hz",
    f"Number of samples: {sig_len}",
    sep="\n",
)

# Compare with what we see on the images
def analyze_ecg_image(image_path):
    """Read an ECG image, report its file name and array shape, and return it.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The array produced by ``plt.imread`` for ``image_path``.
    """
    pixels = plt.imread(image_path)
    file_name = os.path.basename(image_path)
    print(f"\nAnalysis of {file_name}:")
    print(f"Image size: {pixels.shape}")

    # Placeholder: grid / time-marker analysis could be added here.
    return pixels

# Analyze the first image of the record. os.path.join is used instead of string
# concatenation so the path is valid whether or not TRAIN_DIR ends with a
# path separator.
analyze_ecg_image(os.path.join(TRAIN_DIR, '7663343', '7663343-0001.png'))

## Functions

In [None]:
%%time
# Variant code (the part after '-' in the file name) -> image type label.
_IMAGE_TYPE_BY_CODE = {
    '0001': 'original_color',
    '0003': 'printed_scanned_color',
    '0004': 'printed_scanned_bw',
    '0005': 'mobile_photo_color',
    '0006': 'mobile_photo_screen',
    '0009': 'stained_soaked',
    '0010': 'extensive_damage',
    '0011': 'mold_color',
    '0012': 'mold_bw'
}

# Variant codes that are treated as images with artifacts.
_ARTIFACT_CODES = frozenset(['0009', '0010', '0011', '0012'])


def _image_variant_id(filename):
    """Extract the variant code from a name shaped like '<record>-<code>.png'.

    Only the final path component is inspected, so '-' characters in
    directory names do not affect the result.

    Args:
        filename: File name or path of an image.

    Returns:
        The text between the first '-' and the following '.' (or '-') of the
        base name, or None when the base name contains no '-'.
    """
    parts = os.path.basename(filename).split('-')
    if len(parts) < 2:
        return None
    return parts[1].split('.')[0]


def get_image_type(filename):
    """Determine image type based on filename.

    Args:
        filename: File name or path such as '7663343-0001.png'.

    Returns:
        The label mapped to the file's variant code, or 'unknown' when the
        code is not in the mapping or the name has no '-' separated code.
    """
    return _IMAGE_TYPE_BY_CODE.get(_image_variant_id(filename), 'unknown')


def has_artifacts(filename):
    """Determine if the image has artifacts.

    Args:
        filename: File name or path such as '7663343-0010.png'.

    Returns:
        True when the file's variant code is one of the artifact codes,
        False otherwise (including names without a '-' separated code).
    """
    return _image_variant_id(filename) in _ARTIFACT_CODES

## ECG Signal

In [None]:
# Compare the original signal with different image versions of one record.
record_id = '7663343'
# os.path.join keeps the paths valid whether or not TRAIN_DIR ends with a
# path separator.
record_dir = os.path.join(TRAIN_DIR, record_id)

# Read the signal from the record's own CSV so the trace always belongs to the
# same record as the images and metadata, regardless of what the global `df`
# currently holds.
record_signal = pd.read_csv(os.path.join(record_dir, record_id + '.csv'))

fig, axes = plt.subplots(3, 3, figsize=(18, 12))

# Plot original signal (sample index / sampling rate -> seconds)
time_s = np.arange(len(record_signal['II'])) / train_metadata['fs'].values[0]
axes[0,0].plot(time_s, record_signal['II'], 'b-', linewidth=0.8)
axes[0,0].set_title('Original ECG Signal (Lead II)')
axes[0,0].set_xlabel('Time (s)')
axes[0,0].set_ylabel('mV')
axes[0,0].grid(True)

# Display different image versions. The listing is sorted because os.listdir
# returns entries in arbitrary order, which would shuffle the panels between runs.
image_files = sorted(f for f in os.listdir(record_dir) if f.endswith('.png'))

# Every panel except the first one (used by the signal plot), in row-major order.
image_axes = axes.ravel()[1:]
for ax, img_file in zip(image_axes, image_files):
    ax.imshow(plt.imread(os.path.join(record_dir, img_file)))
    ax.set_title(f'{get_image_type(img_file)}\n{img_file}')
    ax.axis('off')

# Hide panels that stay empty when the record has fewer images than panels.
for ax in image_axes[len(image_files):]:
    ax.axis('off')

plt.tight_layout()
plt.show()

# Prediction and Submission

In [None]:
%%time
import numpy as np
import pandas as pd
import os
from scipy.signal import butter, filtfilt
from tqdm import tqdm

# ============================================================
# Paths
# ============================================================
BASE_DIR = "/kaggle/input/physionet-ecg-image-digitization"
# The empty last component makes os.path.join end the result with a path
# separator, so code that builds paths as `TRAIN_DIR + name` keeps working
# after this cell has run. os.path.join(TRAIN_DIR, ...) is unaffected.
TRAIN_DIR = os.path.join(BASE_DIR, "train", "")
TEST_PATH = os.path.join(BASE_DIR, "test.csv")
SAMPLE_SUB = os.path.join(BASE_DIR, "sample_submission.parquet")

# ============================================================
# Load data
# ============================================================
# Reloaded here so this cell can run on its own.
train = pd.read_csv(os.path.join(BASE_DIR, "train.csv"))
test = pd.read_csv(TEST_PATH)
submission = pd.read_parquet(SAMPLE_SUB)

print(f"✅ Train shape: {train.shape}")
print(f"✅ Test shape: {test.shape}")

# ============================================================
# Create averaged ECG templates per lead
# ============================================================
leads = ['I','II','III','aVR','aVL','aVF','V1','V2','V3','V4','V5','V6']
template_len = 1000
lead_templates = {}


def _normalized_template_signal(values, length):
    """Z-score a 1-D signal and linearly resample it to `length` points."""
    normalized = (values - values.mean()) / (values.std() + 1e-8)
    return np.interp(np.linspace(0, 1, length),
                     np.linspace(0, 1, len(normalized)), normalized)


# Ids are taken from the column itself rather than from iterrows() rows:
# iterrows() casts each row to a single dtype, which can turn an integer id
# into a float ("123.0") and make every path lookup below miss.
sampled_ids = train.sample(n=min(300, len(train)), random_state=42)["id"]

# Each CSV is read once and contributes to every lead it contains, instead of
# being re-read once per lead. Per-lead signal order is the sample order.
signals_by_lead = {lead: [] for lead in leads}
n_failed = 0
for record_id in sampled_ids:
    csv_path = os.path.join(TRAIN_DIR, str(record_id), f"{record_id}.csv")
    if not os.path.exists(csv_path):
        continue
    try:
        # A dedicated name is used so the notebook-level `df` is not overwritten.
        record_df = pd.read_csv(csv_path)
    except Exception:
        # Best effort: an unreadable file is skipped, but counted.
        n_failed += 1
        continue

    for lead in leads:
        if lead not in record_df.columns:
            continue
        try:
            s = record_df[lead].dropna().values.astype(np.float32)
            if len(s) < 100:
                continue
            signals_by_lead[lead].append(_normalized_template_signal(s, template_len))
        except Exception:
            # Best effort: a lead that cannot be converted is skipped, but counted.
            n_failed += 1

for lead in leads:
    signals = signals_by_lead[lead]
    if signals:
        lead_templates[lead] = np.mean(signals, axis=0)
    else:
        # No usable signal for this lead: fall back to one sine period.
        t = np.linspace(0, 1, template_len)
        lead_templates[lead] = np.sin(2 * np.pi * t)

if n_failed:
    print(f"Skipped {n_failed} unreadable file(s)/lead(s) while building templates")
print(f"✅ Created templates for {len(lead_templates)} leads")

# ============================================================
# Low-pass filter helper
# ============================================================
def lowpass_filter(signal, fs=500, cutoff=15.0):
    """Apply a 2nd-order Butterworth low-pass filter forward and backward.

    Args:
        signal: Array-like samples; filtering runs along the last axis.
        fs: Sampling frequency, in the same unit as `cutoff`.
        cutoff: Cut-off frequency. The normalized cut-off (cutoff / Nyquist)
            is capped at 0.99.

    Returns:
        The filtered samples, same length as `signal`. An empty input is
        returned as an empty float array.
    """
    x = np.asarray(signal)
    if x.size == 0:
        # Nothing to filter; filtfilt cannot process an empty axis.
        return x.astype(float)

    nyq = 0.5 * fs
    normal_cutoff = min(cutoff / nyq, 0.99)
    b, a = butter(2, normal_cutoff, btype='low')

    # filtfilt pads each end with 3 * max(len(a), len(b)) samples by default and
    # raises ValueError when the input is not longer than that. Shrinking the
    # padding for short inputs keeps them filterable; longer inputs get the
    # default padding and therefore the same result as before.
    default_padlen = 3 * max(len(a), len(b))
    padlen = min(default_padlen, x.shape[-1] - 1)
    return filtfilt(b, a, x, padlen=padlen)

# ============================================================
# Predict test signals using templates + jitter ensemble
# ============================================================
predictions = {}
# Output range the normalized prediction is mapped onto.
# NOTE(review): 0.0-0.07 looks empirically chosen - confirm against the metric.
min_val, max_val = 0.0, 0.07
default_fs = 500
jitter_factors = (0.98, 1.0, 1.02)


def _predict_from_template(template, n_rows, fs, low, high):
    """Build one prediction of length `n_rows` from a lead template.

    The template is circularly shifted once per jitter factor, resampled to
    `n_rows` points, low-pass filtered, and the variants are averaged. The
    average is then min-max scaled into [low, high]; a (near-)constant average
    becomes a flat line at the midpoint of that range.
    """
    target_grid = np.linspace(0, 1, n_rows)
    template_grid = np.linspace(0, 1, len(template))

    ensemble = []
    for jitter in jitter_factors:
        # Shift in template samples; half of the jitter deviation from 1.0.
        shift = int(len(template) * ((jitter - 1) / 2))
        s_jitter = np.interp(target_grid, template_grid, np.roll(template, shift))
        ensemble.append(lowpass_filter(s_jitter, fs))
    signal = np.mean(ensemble, axis=0)

    s_min, s_max = signal.min(), signal.max()
    if s_max - s_min < 1e-8:
        return np.full(n_rows, (low + high) / 2)
    signal = (signal - s_min) / (s_max - s_min)
    return low + signal * (high - low)


for row in tqdm(test.itertuples(index=False), total=len(test)):
    base_id, lead, n_rows = row.id, row.lead, int(row.number_of_rows)

    # Use the default rate when the column is absent, empty, or not positive.
    fs = getattr(row, "fs", default_fs)
    if pd.isna(fs) or fs <= 0:
        fs = default_fs

    if n_rows <= 0:
        # Nothing to predict for this row; keep an empty entry for the key.
        predictions[(base_id, lead)] = np.empty(0, dtype=np.float32)
        continue

    # Leads without their own template reuse the lead II template.
    template = lead_templates.get(lead, lead_templates["II"])
    signal = _predict_from_template(template, n_rows, fs, min_val, max_val)
    predictions[(base_id, lead)] = signal.astype(np.float32)

# ============================================================
# Build submission
# ============================================================
# The two output columns are assembled directly (one id string and one float
# per sample) instead of creating one dict per sample.
submission_ids = []
value_chunks = []
for base_id, lead, n_rows in zip(test["id"], test["lead"], test["number_of_rows"]):
    n_rows = int(n_rows)
    signal = predictions[(base_id, lead)]
    if len(signal) != n_rows:
        raise ValueError(
            f"Prediction for ({base_id}, {lead}) has {len(signal)} samples, "
            f"expected {n_rows}"
        )
    # Row ids follow the pattern "<id>_<sample index>_<lead>".
    submission_ids.extend(f"{base_id}_{i}_{lead}" for i in range(n_rows))
    value_chunks.append(np.asarray(signal, dtype=np.float64))

if value_chunks:
    submission_values = np.concatenate(value_chunks)
else:
    # np.concatenate rejects an empty list; an empty test set yields no rows.
    submission_values = np.empty(0, dtype=np.float64)

submission_df = pd.DataFrame({"id": submission_ids, "value": submission_values})
submission_df.to_csv("/kaggle/working/submission.csv", index=False)

print("✅ Saved submission:", submission_df.shape)
submission_df.head(20)