In [None]:
import os
import numpy as np
import pandas as pd
from scipy.signal import butter, filtfilt
from concurrent.futures import ThreadPoolExecutor, as_completed

# Dataset location and signal-shaping constants
TRAIN_DIRECTORY = '/kaggle/input/physionet-ecg-image-digitization/train'

# Lead names; used as CSV column names and as per-lead dictionary keys
ECG_LEADS = [
    'I', 'II', 'III',
    'aVR', 'aVL', 'aVF',
    'V1', 'V2', 'V3', 'V4', 'V5', 'V6',
]

# Number of samples every training signal is resampled to
TEMPLATE_LENGTH = 500

# Bounds of the value range that generated signals are rescaled into
MIN_SIGNAL_VALUE = 0.0
MAX_SIGNAL_VALUE = 0.09

# Function to process a single recording and extract signals for all leads
def extract_signals_from_file(record_id):
    """Load one training recording and return a fixed-length signal per lead.

    The recording is read from
    ``<TRAIN_DIRECTORY>/<record_id>/<record_id>.csv``. For every lead in
    ``ECG_LEADS`` that is present as a column, missing and non-finite samples
    are dropped, the signal is z-score normalized, and the result is linearly
    resampled to ``TEMPLATE_LENGTH`` points. Leads with fewer than 50 usable
    samples are skipped.

    Args:
        record_id: Identifier of the recording; used for both the directory
            name and the CSV file name.

    Returns:
        A dict mapping lead name to a 1-D array of length
        ``TEMPLATE_LENGTH``, or ``None`` when the file is missing, cannot be
        read or parsed, or yields no usable lead.
    """
    file_path = os.path.join(TRAIN_DIRECTORY, str(record_id), f"{record_id}.csv")
    try:
        data_frame = pd.read_csv(file_path)
        lead_signals = {}
        for lead_name in ECG_LEADS:
            if lead_name not in data_frame.columns:
                continue
            raw_signal = data_frame[lead_name].dropna().values.astype(np.float32)
            # dropna() keeps +/-inf, and the float32 cast can overflow to inf;
            # a single non-finite sample would turn the normalized signal
            # (and any average built from it) into NaN.
            raw_signal = raw_signal[np.isfinite(raw_signal)]
            if len(raw_signal) < 50:
                continue
            # The small epsilon keeps constant signals from dividing by zero.
            normalized_signal = (raw_signal - raw_signal.mean()) / (raw_signal.std() + 1e-8)
            lead_signals[lead_name] = np.interp(
                np.linspace(0, 1, TEMPLATE_LENGTH),
                np.linspace(0, 1, len(normalized_signal)),
                normalized_signal,
            )
    except (OSError, ValueError):
        # OSError: missing or unreadable file. ValueError: pandas parse errors
        # (ParserError / EmptyDataError) and non-numeric lead columns.
        # Anything else is a genuine bug and is allowed to propagate.
        return None
    return lead_signals if lead_signals else None

# Load training metadata
train_metadata = pd.read_csv('/kaggle/input/physionet-ecg-image-digitization/train.csv')

# Collect signals for each lead using parallel processing
lead_signal_collections = {lead: [] for lead in ECG_LEADS}

# Read the ids straight from the column. DataFrame.iterrows() turns every row
# into a single-dtype Series, which can convert integer ids to floats (so the
# path would be built from "123.0") and make every file lookup miss.
record_ids = train_metadata['id'].tolist()

with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    # executor.map yields results in submission order, so signals are
    # accumulated in the same order on every run and the averaged templates
    # computed from them are reproducible.
    for result in executor.map(extract_signals_from_file, record_ids):
        if result:
            for lead_name, signal_data in result.items():
                lead_signal_collections[lead_name].append(signal_data)

# Build one mean template per lead; a lead with no collected signals falls
# back to a single sine cycle of TEMPLATE_LENGTH samples
average_templates = {}
for lead_name in ECG_LEADS:
    lead_stack = lead_signal_collections[lead_name]
    if not lead_stack:
        phase = np.linspace(0, 1, TEMPLATE_LENGTH)
        average_templates[lead_name] = np.sin(2 * np.pi * phase)
        continue
    # Element-wise mean across all collected signals for this lead
    average_templates[lead_name] = np.mean(lead_stack, axis=0)

# Load test metadata (one row per recording/lead pair to predict)
test_metadata = pd.read_csv('/kaggle/input/physionet-ecg-image-digitization/test.csv')

# Sampling rate assumed when the 'fs' column is absent or holds an unusable value
default_sampling_rate = 500

# The low-pass coefficients depend only on the sampling rate, so each filter
# is designed once and reused for every row that shares that rate.
lowpass_filters = {}

# Generate predictions for test data
test_predictions = {}
# itertuples() keeps each column's dtype (iterrows() does not), so
# number_of_rows stays an integer.
for test_row in test_metadata.itertuples(index=False):
    recording_id = test_row.id
    lead_name = test_row.lead
    row_count = int(test_row.number_of_rows)
    sampling_rate = getattr(test_row, 'fs', default_sampling_rate)
    if pd.isna(sampling_rate) or sampling_rate <= 0:
        # A missing or non-positive rate would give butter() an invalid cutoff.
        sampling_rate = default_sampling_rate

    if row_count == 0:
        # Nothing to predict; min()/max() below would fail on an empty array.
        test_predictions[(recording_id, lead_name)] = np.empty(0, dtype=np.float32)
        continue

    # Leads without a template of their own reuse the lead II template.
    base_template = average_templates.get(lead_name, average_templates['II'])

    # Stretch or shrink the template to the requested number of samples.
    if len(base_template) != row_count:
        generated_signal = np.interp(
            np.linspace(0, 1, row_count),
            np.linspace(0, 1, len(base_template)),
            base_template
        )
    else:
        generated_signal = base_template.copy()

    # Very short signals are left unfiltered: filtfilt needs an input longer
    # than its edge padding.
    if len(generated_signal) > 10:
        if sampling_rate not in lowpass_filters:
            nyquist_freq = 0.5 * sampling_rate
            # 15 Hz cutoff, normalized to Nyquist and capped below 1.0
            cutoff_freq = min(15.0 / nyquist_freq, 0.99)
            lowpass_filters[sampling_rate] = butter(2, cutoff_freq, btype='low')
        filter_b, filter_a = lowpass_filters[sampling_rate]
        generated_signal = filtfilt(filter_b, filter_a, generated_signal)

    sig_min, sig_max = generated_signal.min(), generated_signal.max()

    if sig_max - sig_min < 1e-8:
        # Flat signal: emit the midpoint of the output range.
        generated_signal = np.full(row_count, (MIN_SIGNAL_VALUE + MAX_SIGNAL_VALUE) / 2)
    else:
        # Min-max rescale into [MIN_SIGNAL_VALUE, MAX_SIGNAL_VALUE].
        generated_signal = (generated_signal - sig_min) / (sig_max - sig_min)
        generated_signal = MIN_SIGNAL_VALUE + generated_signal * (MAX_SIGNAL_VALUE - MIN_SIGNAL_VALUE)

    test_predictions[(recording_id, lead_name)] = generated_signal.astype(np.float32)

# Prepare submission data
# Ids and values are gathered as two flat columns rather than one dict per
# sample, which avoids allocating a Python dict and float for every output row.
submission_ids = []
submission_values = []
for test_row in test_metadata.itertuples(index=False):
    recording_id = test_row.id
    lead_name = test_row.lead
    row_count = int(test_row.number_of_rows)
    signal_data = test_predictions[(recording_id, lead_name)]
    if len(signal_data) < row_count:
        raise IndexError(
            f"prediction for ({recording_id}, {lead_name}) has "
            f"{len(signal_data)} samples, expected {row_count}"
        )

    # Row id format: <recording id>_<sample index>_<lead name>
    submission_ids.extend(
        f"{recording_id}_{index}_{lead_name}" for index in range(row_count)
    )
    # Widen to float64 so the written values match what float(sample) gives.
    submission_values.append(np.asarray(signal_data[:row_count], dtype=np.float64))

# Create and save submission DataFrame
submission_df = pd.DataFrame({
    'id': submission_ids,
    # np.concatenate rejects an empty list, so handle an empty test set explicitly
    'value': (
        np.concatenate(submission_values)
        if submission_values
        else np.empty(0, dtype=np.float64)
    ),
})
submission_df.to_csv('submission.csv', index=False)