#***Remove duplicates from ECG Files and create a filtered file:***


---




In [None]:
import os
from collections import defaultdict

folder = "/sise/nadav-group/nadavrap-group/ECGs/ECG files"

# Map (ID, LEAD) → list of filenames that share that pair
id_lead_to_files = defaultdict(list)

# Step 1: Build mapping from (ID, LEAD) to list of filenames.
# The listing is sorted so that group contents and the printed examples are
# reproducible between runs (os.listdir returns entries in arbitrary order).
for filename in sorted(os.listdir(folder)):
    if not filename.endswith(".xml"):
        continue
    # Strip only the trailing extension; str.replace would also remove any
    # ".xml" that happens to appear in the middle of the name.
    stem = os.path.splitext(filename)[0]
    parts = stem.split("_")
    # Names with fewer than four "_"-separated fields are ignored.
    if len(parts) >= 4:
        id_ = parts[0]
        lead = parts[1]
        key = (id_, lead)
        id_lead_to_files[key].append(filename)

# Step 2: Keep only the (ID, LEAD) groups that hold more than one file,
# each group stored as a tuple of its filenames.
duplicate_tuples = [
    tuple(files) for files in id_lead_to_files.values() if len(files) > 1
]

# Output results
print("Number of (ID, LEAD) groups with duplicates:", len(duplicate_tuples))
print("Example duplicate tuples:")
for t in duplicate_tuples[:5]:  # show first 5
    print(t)

In [None]:
import os
import shutil

source_dir = "/sise/nadav-group/nadavrap-group/ECGs/ECG files"
dest_dir = "/sise/nadav-group/nadavrap-group/ECGs/ECG files filtered"

# Make sure the target folder is present before anything is copied into it
os.makedirs(dest_dir, exist_ok=True)

# Every filename inside a duplicate group whose name ends with "3_0.xml"
# is left out of the copy.
excluded_files = {
    fname
    for tpl in duplicate_tuples
    for fname in tpl
    if fname.endswith("3_0.xml")
}

# Copy each remaining .xml file (metadata included, via copy2) and count them
copied_count = 0

for filename in os.listdir(source_dir):
    if not filename.endswith(".xml") or filename in excluded_files:
        continue
    shutil.copy2(
        os.path.join(source_dir, filename),
        os.path.join(dest_dir, filename),
    )
    copied_count += 1

print("✅ Copy complete.")
print(f"Total files copied: {copied_count}")
print(f"Total files excluded (3_0 in duplicate tuples): {len(excluded_files)}")

#***Preprocess - Daubechies wavelet 4 + Filtering Invalid Signals:***

---



In [None]:
%pip install PyWavelets

In [None]:
import os
import numpy as np
import xml.etree.ElementTree as ET
import pywt
from scipy.ndimage import uniform_filter1d

# Paths
# Destination for the denoised per-lead .npy arrays; created up front if missing
output_dir = "/sise/nadav-group/nadavrap-group/ECGs/Final Project/Preprocessing/Filtered_Files_updated"
os.makedirs(output_dir, exist_ok=True)

# Folder containing the XML files that will be preprocessed
input_dir = "/sise/nadav-group/nadavrap-group/ECGs/ECG files filtered"

# Load ECG signals from XML
def load_signals_from_xml(file_path):
    """Read every ``<WaveformData>`` element of an XML file into a float array.

    Each element's text is parsed as a comma-separated list of numbers and
    stored under the value of its ``lead`` attribute.

    Args:
        file_path: Path of the XML file to parse.

    Returns:
        dict mapping lead name -> 1-D ``numpy`` array of floats. Elements
        that contain no samples at all are omitted from the dict.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
        KeyError: if a ``<WaveformData>`` element has no ``lead`` attribute.
        ValueError: if a non-blank token cannot be converted to float.
    """
    tree = ET.parse(file_path)
    root = tree.getroot()
    signals = {}
    for waveform in root.findall(".//WaveformData"):
        lead_name = waveform.attrib['lead']
        # ``text`` is None for an empty element; treat it like an empty string
        # instead of failing on ``None.strip()``.
        raw_text = waveform.text or ""
        # Drop blank tokens so a trailing comma or stray whitespace/newlines
        # between values do not raise ``float('')``.
        samples = [float(tok) for tok in raw_text.split(',') if tok.strip()]
        if not samples:
            # Nothing to denoise for this lead; skip it rather than hand an
            # empty array to downstream processing.
            continue
        signals[lead_name] = np.array(samples)
    return signals

# Remove DC using moving average
def remove_moving_average_dc(signal, window_size):
    """Subtract a centered moving average from ``signal``.

    Args:
        signal: 1-D array-like of samples.
        window_size: Length of the averaging window, in samples.

    Returns:
        ``numpy`` array of the same length: ``signal`` minus its moving
        average (edges handled with ``mode='reflect'``).
    """
    samples = np.asarray(signal)
    # uniform_filter1d writes its result in the input's dtype, so an integer
    # (or bool) signal would get a truncated average. Promote those to float;
    # floating-point input is passed through untouched.
    if not np.issubdtype(samples.dtype, np.inexact):
        samples = samples.astype(float)
    moving_avg = uniform_filter1d(samples, size=window_size, mode='reflect')
    return samples - moving_avg

# DWT-based denoising with DC removal
def preprocess_ecg_dwt(signal, wavelet='db4', level=4, fs=500, window_size_sec=0.5):
    """Denoise one signal: remove a moving-average baseline, then soft-threshold its DWT.

    Args:
        signal: 1-D sequence of samples.
        wavelet: Wavelet name passed to ``pywt.wavedec`` / ``pywt.waverec``.
        level: Decomposition level passed to ``pywt.wavedec``.
        fs: Samples per second; used only to turn ``window_size_sec`` into a
            window length in samples.
        window_size_sec: Moving-average window length, in seconds.

    Returns:
        The reconstructed signal, cut to ``len(signal)`` samples.
    """
    n_samples = len(signal)

    # Baseline removal over a window of int(window_size_sec * fs) samples
    detrended = remove_moving_average_dc(signal, int(window_size_sec * fs))

    coeffs = pywt.wavedec(detrended, wavelet, level=level)

    # Threshold = (median |last coefficient band| / 0.6745) * sqrt(2 * ln(n))
    noise_scale = np.median(np.abs(coeffs[-1])) / 0.6745
    threshold = noise_scale * np.sqrt(2 * np.log(n_samples))

    # The first band is kept as is; every following band is soft-thresholded
    shrunk = [coeffs[0]]
    for band in coeffs[1:]:
        shrunk.append(pywt.threshold(band, threshold, mode='soft'))

    # Slice back to the input length, since the reconstruction is not
    # guaranteed to have exactly n_samples entries
    return pywt.waverec(shrunk, wavelet)[:n_samples]

# Main loop over all XML files
# Parameters forwarded to preprocess_ecg_dwt for every lead
fs = 500
window_size_sec = 0.5

# Signals that still contain NaN/Inf after denoising are stored here for debugging
invalid_dir = "/sise/nadav-group/nadavrap-group/ECGs/Final Project/Preprocessing/Invalid_Signals"
os.makedirs(invalid_dir, exist_ok=True)

count = 0  # number of (file, lead) signals rejected as invalid
for filename in os.listdir(input_dir):
    # Anything that is not an .xml file is ignored
    if not filename.endswith(".xml"):
        continue

    file_path = os.path.join(input_dir, filename)
    print(f"Processing: {file_path}")

    raw_signals = load_signals_from_xml(file_path)
    base_name = os.path.splitext(filename)[0]

    for lead, signal in raw_signals.items():
        denoised_signal = preprocess_ecg_dwt(signal, fs=fs, window_size_sec=window_size_sec)

        has_bad_values = np.isnan(denoised_signal).any() or np.isinf(denoised_signal).any()
        if not has_bad_values:
            # Clean result: write it to the main output folder as .npy
            output_file = os.path.join(output_dir, f"{base_name}_lead_{lead}_denoised.npy")
            np.save(output_file, denoised_signal)
            continue

        # NaN/Inf present: report it, keep a copy aside, and leave it out of
        # the main output folder
        print(f"❌ Invalid values found in {filename}, lead: {lead}")
        invalid_filename = f"{base_name}_lead_{lead}_INVALID.npy"
        invalid_path = os.path.join(invalid_dir, invalid_filename)
        np.save(invalid_path, denoised_signal)
        count += 1
print("count of invalid signals:", count)

print("All files processed and saved.")