# Apnea Audio Data Preparation Pipeline

This notebook provides a complete and robust pipeline for preparing audio data for apnea event detection. It performs the following steps for each patient specified in the configuration:

1.  **Downloads** EDF (audio) and RML (event annotation) files from provided URLs.
2.  **Validates** that all of a patient's audio files have the same sample rate to prevent data corruption.
3.  **Extracts** the start and end timestamps of apnea events (types `ObstructiveApnea`, `CentralApnea`, `MixedApnea`) from the RML file.
4.  **Concatenates** the 'Mic' channel from all of a patient's EDF files into a single, continuous WAV file.
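5.  **Analyzes** the WAV file second-by-second, extracting 23 audio features for each second: 10 time- and frequency-domain descriptors (energy, zero-crossing rate, RMS, spectral centroid, bandwidth, roll-off, flatness, skewness, kurtosis, spectral entropy) plus the means of 13 MFCCs.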
6.  **Labels** each second as `1` (apnea) if it overlaps any extracted apnea event, or `0` (normal) otherwise.
7.  **Appends** the features and labels for each patient into a single master CSV file, ready for model training.

In [None]:
# === 1. IMPORTS ===
# --- Standard Libraries ---
import os
import csv
import xml.etree.ElementTree as ET

# --- Data and Signal Processing ---
import numpy as np
import scipy.stats
import librosa
import soundfile as sf
import mne

# --- File Downloading ---
import requests

In [None]:
# === 2. CONFIGURATION ===
# --- Edit this section for your patients ---

# Patients to process: one dictionary per patient, holding the URL of the RML
# annotation file and the URLs of that patient's EDF recordings. The EDF parts
# are concatenated in the order listed here.
# To add a patient, copy the commented template at the end of the list.
patient_groups = [
    {"rml_url": "RML_LINK_1", "edf_urls": ["EDF_LINK_1A", "EDF_LINK_1B"]},
    {"rml_url": "RML_LINK_2", "edf_urls": ["EDF_LINK_2A", "EDF_LINK_2B", "EDF_LINK_2C"]},
    # {"rml_url": "RML_LINK_3", "edf_urls": ["EDF_LINK_3A"]},
]

# Root folder for everything the pipeline downloads or generates
# (Colab's content directory).
base_dir = "/content/apnea_data"

### Helper Functions
These cells define all the functions used in the pipeline. There is no need to edit them.

In [None]:
# === 3. FUNCTION DEFINITIONS ===

def download_file(url, out_path):
    """Download ``url`` to ``out_path`` using requests, skipping files that already exist.

    The response body is streamed into a temporary ``<out_path>.part`` file
    that is renamed to ``out_path`` only after it has been written completely.
    An interrupted download therefore never leaves a truncated file at
    ``out_path`` that a later run would mistake for a finished one.

    Args:
        url: Address of the file to fetch.
        out_path: Destination path on disk.

    Returns:
        True if ``out_path`` already existed or was downloaded successfully,
        False if the request or the write to disk failed.
    """
    if os.path.exists(out_path):
        print(f"File already exists: {out_path}")
        return True

    print(f"Downloading {url}...")
    tmp_path = f"{out_path}.part"
    try:
        # The context manager releases the streamed connection even on failure.
        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()  # Raise an exception for bad status codes
            with open(tmp_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Publish the file under its final name only once it is complete.
        os.replace(tmp_path, out_path)
    except (requests.exceptions.RequestException, OSError) as e:
        print(f"❌ Failed to download {url}. Error: {e}")
        # Remove the partial download so it cannot be picked up later.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        return False

    print(f"Successfully downloaded to {out_path}")
    return True

def extract_apnea_events(xml_file_path):
    """Parse an RML file and return the start and end times of its apnea events.

    Only ``Event`` elements in the Respironics PatientStudy namespace whose
    ``Family`` is ``Respiratory`` and whose ``Type`` is ``ObstructiveApnea``,
    ``CentralApnea`` or ``MixedApnea`` are kept. Events whose ``Start`` or
    ``Duration`` attribute is missing or not numeric are skipped with a
    warning instead of aborting the whole file.

    Args:
        xml_file_path: Path to the RML (XML) annotation file.

    Returns:
        A list of ``{'start': float, 'end': float}`` dicts sorted by start
        time, where ``end`` is ``Start + Duration``.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    namespace = {'ns': 'http://www.respironics.com/PatientStudy.xsd'}
    apnea_types = {'ObstructiveApnea', 'CentralApnea', 'MixedApnea'}

    root = ET.parse(xml_file_path).getroot()
    apnea_events = []
    skipped = 0
    for event in root.findall('.//ns:Event', namespace):
        if event.get('Family') != 'Respiratory' or event.get('Type') not in apnea_types:
            continue
        try:
            start_time = float(event.get('Start'))
            duration = float(event.get('Duration'))
        except (TypeError, ValueError):
            # TypeError: the attribute is missing (float(None));
            # ValueError: the attribute is present but not a number.
            skipped += 1
            continue
        apnea_events.append({'start': start_time, 'end': start_time + duration})

    apnea_events.sort(key=lambda x: x['start'])
    if skipped:
        print(f"⚠️ WARNING: Skipped {skipped} apnea event(s) with a missing or invalid Start/Duration in {os.path.basename(xml_file_path)}")
    print(f"Extracted {len(apnea_events)} apnea events from {os.path.basename(xml_file_path)}")
    return apnea_events

def is_apnea(frame_start, frame_end, events):
    """Return 1 if the frame intersects any apnea event, otherwise 0.

    A frame and an event intersect when the frame ends after the event starts
    and starts before the event ends; merely touching at an endpoint does not
    count as an overlap.

    Args:
        frame_start: Start time of the frame.
        frame_end: End time of the frame.
        events: Iterable of dicts with 'start' and 'end' keys.

    Returns:
        1 (apnea) or 0 (normal).
    """
    overlaps_any = any(
        event['start'] < frame_end and event['end'] > frame_start
        for event in events
    )
    return 1 if overlaps_any else 0

def extract_features(frame, sr):
    """Compute 23 audio features that summarise one audio frame.

    Every librosa feature is computed over short analysis windows inside the
    frame and then averaged, so each value describes the whole frame rather
    than only its first analysis window.

    Args:
        frame: 1-D array of audio samples.
        sr: Sample rate of ``frame``.

    Returns:
        A list of 23 values in this order: energy (mean absolute amplitude),
        zcr, centroid, rms, bandwidth, rolloff, flatness, skew, kurt, entropy,
        followed by the means of 13 MFCCs. A frame that contains only zeros
        yields 23 zeros.
    """
    if np.all(frame == 0):
        return [0.0] * 23  # Return zeros for a silent frame

    # Basic features
    energy = np.mean(np.abs(frame))
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=frame))
    rms = np.mean(librosa.feature.rms(y=frame))

    # Spectral features, averaged over all analysis windows of the frame so
    # they are aggregated the same way as zcr, rms and the MFCCs.
    centroid = np.mean(librosa.feature.spectral_centroid(y=frame, sr=sr))
    bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=frame, sr=sr))
    rolloff = np.mean(librosa.feature.spectral_rolloff(y=frame, sr=sr))
    flatness = np.mean(librosa.feature.spectral_flatness(y=frame))

    # Advanced features
    mfccs = librosa.feature.mfcc(y=frame, sr=sr, n_mfcc=13)
    mfccs_mean = mfccs.mean(axis=1)
    skew = scipy.stats.skew(frame)
    kurt = scipy.stats.kurtosis(frame)

    # Spectral entropy of the normalised power spectrum of the whole frame;
    # the small constants guard against division by zero and log2(0).
    power_spec = np.abs(np.fft.rfft(frame)) ** 2
    ps_norm = power_spec / (np.sum(power_spec) + 1e-10)
    entropy = -np.sum(ps_norm * np.log2(ps_norm + 1e-10))

    # Aggregate all features into a single list (order matches the CSV header)
    features = [energy, zcr, centroid, rms, bandwidth, rolloff, flatness, skew, kurt, entropy] + list(mfccs_mean)
    return features

In [None]:
# === 4. MASTER PROCESSING FUNCTION ===

def process_patient(patient_id, patient_group, base_dir, master_csv_path):
    """Run the full data preparation pipeline for a single patient.

    Steps:
      1. Download the patient's RML file and EDF parts into
         ``<base_dir>/<patient_id>``.
      2. Check that the EDF parts contain a 'Mic' channel and share one sample
         rate, then concatenate the 'Mic' channel into a single PCM_16 WAV.
      3. Read the WAV back one second at a time, compute the features and the
         apnea label of every second, and append the rows to
         ``master_csv_path``.

    Frame times are counted from the start of the concatenated audio and
    compared directly with the RML event times, so the audio has to stay
    contiguous from the first EDF part onwards. If a part cannot be
    downloaded, cannot be read, or has no 'Mic' channel, only the parts before
    it are used: dropping a part from the middle would shift every later
    second against the annotations and produce wrong labels.

    Args:
        patient_id: Identifier used for the patient's folder, file names and
            the ``patient_id`` CSV column.
        patient_group: Dict with an ``"rml_url"`` string and an ``"edf_urls"``
            list, in recording order.
        base_dir: Directory under which the patient folder is created.
        master_csv_path: CSV file the feature rows are appended to; the header
            is written when the file is missing or empty.

    Returns:
        None. Problems are reported with ``print`` and the patient is skipped.

    Note:
        Rows are always appended. Running this again for a patient whose rows
        are already in the master CSV adds them a second time.
    """
    print(f"\n{'='*20} Processing {patient_id} {'='*20}")

    # Create a dedicated directory for the patient's files
    patient_dir = os.path.join(base_dir, patient_id)
    os.makedirs(patient_dir, exist_ok=True)

    # --- Step 1: Download all files for the patient ---
    print("\n--- Step 1: Downloading Files ---")
    rml_path = os.path.join(patient_dir, f"{patient_id}.rml")
    if not download_file(patient_group["rml_url"], rml_path):
        print(f"Skipping {patient_id} due to RML download failure.")
        return

    edf_paths = []
    for i, edf_url in enumerate(patient_group["edf_urls"], 1):
        edf_path = os.path.join(patient_dir, f"{patient_id}_part_{i}.edf")
        if not download_file(edf_url, edf_path):
            # Stop at the first gap so the remaining audio stays aligned with the RML timestamps.
            print(f"⚠️ WARNING: EDF part {i} could not be downloaded. Only the parts before it will be used.")
            break
        edf_paths.append(edf_path)

    # --- Step 2: Concatenate EDF 'Mic' channels into a single WAV file ---
    print("\n--- Step 2: Concatenating Audio Files ---")
    output_wav = os.path.join(patient_dir, f"{patient_id}_full_mic.wav")
    target_sfreq = None

    # CRITICAL: First, validate that all EDFs have a 'Mic' channel and the same sample rate
    valid_edf_paths = []
    for edf_path in edf_paths:
        edf_name = os.path.basename(edf_path)
        try:
            raw = mne.io.read_raw_edf(edf_path, preload=False, verbose=False)
        except Exception as e:
            print(f"❌ ERROR: Could not read {edf_name}. Error: {e}")
            print("Only the parts before it will be used.")
            break
        if "Mic" not in raw.ch_names:
            print(f"⚠️ WARNING: 'Mic' channel not found in {edf_name}. Only the parts before it will be used.")
            break
        current_sfreq = int(raw.info["sfreq"])
        if target_sfreq is None:
            target_sfreq = current_sfreq
        elif target_sfreq != current_sfreq:
            print(f"❌ CRITICAL ERROR: Sample rate mismatch in {edf_name}. Expected {target_sfreq}, found {current_sfreq}.")
            print(f"Skipping {patient_id}.")
            return
        valid_edf_paths.append(edf_path)

    if target_sfreq is None:
        print(f"❌ No valid EDF file with a 'Mic' channel found for {patient_id}. Skipping.")
        return

    # If validation passes, concatenate the valid files
    # NOTE(review): soundfile clips float input outside [-1, 1] when writing
    # PCM_16 — confirm the amplitude range of the 'Mic' channel as returned by MNE.
    with sf.SoundFile(output_wav, 'w', samplerate=target_sfreq, channels=1, subtype='PCM_16') as out_f:
        for edf_path in valid_edf_paths:
            # preload=False together with picks reads only the 'Mic' channel
            # instead of loading every channel of the recording into memory.
            raw = mne.io.read_raw_edf(edf_path, preload=False, verbose=False)
            # Ensure data is float32 for consistency with librosa
            mic_data = raw.get_data(picks=["Mic"])[0].astype(np.float32)
            out_f.write(mic_data)
            print(f"Appended {len(mic_data)} samples from {os.path.basename(edf_path)}")
    print(f"✅ Successfully created concatenated WAV: {os.path.basename(output_wav)}")

    # --- Step 3: Extract Features and Labels, Append to Master CSV ---
    print("\n--- Step 3: Extracting Features and Labels ---")
    try:
        apnea_events = extract_apnea_events(rml_path)
    except Exception as e:
        # A broken annotation file should skip this patient, not stop the whole run.
        print(f"❌ ERROR: Could not parse {os.path.basename(rml_path)}. Error: {e}")
        print(f"Skipping {patient_id}.")
        return

    header = ['patient_id', 'frame_start', 'frame_end', 'energy', 'zcr', 'centroid', 'rms', 'bandwidth', 'rolloff', 'flatness', 'skew', 'kurt', 'entropy'] + [f'mfcc_{i+1}' for i in range(13)] + ['label']
    write_header = not os.path.exists(master_csv_path) or os.stat(master_csv_path).st_size == 0

    try:
        frames_written = 0
        with sf.SoundFile(output_wav, 'r') as f_in:
            sr = f_in.samplerate
            frame_sec = 1
            frame_len = int(sr * frame_sec)
            # Only whole frames are used; a trailing partial second is dropped.
            n_frames = len(f_in) // frame_len

            with open(master_csv_path, 'a', newline='') as f_out:
                writer = csv.writer(f_out)
                if write_header:
                    writer.writerow(header)

                for i in range(n_frames):
                    frame = f_in.read(frames=frame_len, dtype='float32')
                    if len(frame) < frame_len:
                        break

                    frame_start_sec = i * frame_sec
                    frame_end_sec = frame_start_sec + frame_sec

                    # THE CORE LOGIC: Calculate features on raw audio, then determine label
                    # This prevents data leakage.
                    features = extract_features(frame, sr)
                    label = is_apnea(frame_start_sec, frame_end_sec, apnea_events)

                    writer.writerow([patient_id, frame_start_sec, frame_end_sec] + features + [label])
                    frames_written += 1
        # Report the rows actually written, which can be fewer than n_frames after a short read.
        print(f"✅ Appended {frames_written} frames from {patient_id} to master CSV.")
    except Exception as e:
        print(f"❌ ERROR during feature extraction for {patient_id}. Error: {e}")

In [None]:
# === 5. MAIN EXECUTION LOOP ===
# This cell runs the entire pipeline for all patients defined in the configuration.

# Make sure the output folder exists before anything is written into it.
os.makedirs(base_dir, exist_ok=True)

# Single CSV that collects the feature rows of every patient.
master_csv_file = os.path.join(base_dir, "master_apnea_dataset.csv")

# Patients are numbered from 1 in the order they appear in the configuration.
for patient_number, patient_group in enumerate(patient_groups, start=1):
    process_patient(f"patient_{patient_number}", patient_group, base_dir, master_csv_file)

print(f"\n{'='*20} PIPELINE COMPLETE {'='*20}")
print(f"Master dataset saved to: {master_csv_file}")