# End-to-End Sleep Apnea Data Preparation for Deep Learning (Colab)

This notebook implements a storage-efficient pipeline to process a large number of patients for training an end-to-end sleep apnea detection model. The key goals are:

1.  **Scalability:** Process up to 100 patients from a list of download links.
2.  **Storage Efficiency:** Implement a **download-process-delete** workflow to avoid exceeding Google Colab's disk storage limits.
3.  **Deep Learning Focus:** Convert raw audio into **mel-spectrograms**, which are ideal image-like inputs for Convolutional Neural Networks (CNNs).
4.  **Standardization:** Downsample all audio to **16 kHz**, a sample rate commonly used for audio-based deep learning tasks.
5.  **Final Output:** Save the entire processed dataset into two compressed NumPy (`.npz`) files (`spectrograms.npz` and `labels.npz`) in your Google Drive for easy loading in a separate model training notebook.

In [None]:
# Cell 1: Setup & Imports
# Install necessary libraries that are not pre-installed on Colab
!pip install mne

# --- Standard & Third-Party Libraries ---
import os
import re
import shutil
import time
import requests
import numpy as np
import pandas as pd
import librosa
import mne
import xml.etree.ElementTree as ET
from tqdm.notebook import tqdm

# --- Google Drive Integration ---
from google.colab import drive

print("✅ All libraries installed and imported successfully!")

In [None]:
# Cell 2: Configuration - IMPORTANT: EDIT THIS CELL!
# Defines every tunable constant used by the later cells, mounts Google Drive,
# and makes sure the output folder exists.
print("--- Configuring the data preparation pipeline... ---")

# --- Google Drive Path ---
# This is where your final .npz files will be saved.
# The notebook will create this folder if it doesn't exist.
DRIVE_OUTPUT_PATH = "/content/drive/MyDrive/ApneaSpectrograms" # <-- EDIT THIS!

# --- Data Source ---
# Path to the text file containing all download URLs.
# You will need to upload this file to your Colab session.
LINK_FILE = 'download_links.txt' # <-- UPLOAD THIS FILE TO COLAB

# --- Processing Parameters ---
NUM_PATIENTS_TO_PROCESS = 100 # Number of patients to download and process
TARGET_SAMPLE_RATE = 16000  # 16kHz is the standard for audio deep learning
CLIP_DURATION_SEC = 30.0    # Duration of each audio clip for spectrogram generation
# NOTE(review): with 30 s clips a 15 s overlap and a 15 s step are the same
# thing; if either value changes, confirm how the slicing code interprets this.
CLIP_OVERLAP_SEC = 15.0     # Create a new clip every 15 seconds (50% overlap)
APNEA_THRESHOLD = 0.1       # Label clip as apnea if >10% of it contains an apnea event

# --- Spectrogram Parameters ---
N_MELS = 64              # Vertical resolution of the spectrogram (number of Mel bands)
HOP_LENGTH = 512         # Controls the horizontal resolution (step size)

# --- Setup ---
# Mount Google Drive. A failed mount must stop the notebook: continuing would
# let os.makedirs() below create DRIVE_OUTPUT_PATH on the local (non-persistent)
# disk, and the final dataset would silently be written there instead of Drive.
try:
    drive.mount('/content/drive')
except Exception as e:
    print(f"ERROR: Could not mount Google Drive. {e}")
    raise
print(f"Google Drive mounted. Output will be saved to: {DRIVE_OUTPUT_PATH}")

# Create the output directory on Google Drive
os.makedirs(DRIVE_OUTPUT_PATH, exist_ok=True)

print("✅ Configuration set.")

In [None]:
# Cell 3: Helper Functions
# Progress banner so the notebook output shows when this cell starts running.
print("--- Defining helper functions for download, parsing, and processing... ---")

def group_links_by_patient(filepath):
    """Group RML and EDF links from a text file by patient ID.

    This is still a placeholder: the real body is meant to be copied from the
    download notebook. It raises instead of silently returning ``None`` so the
    main loop fails with a clear message rather than an ``AttributeError`` on
    ``.items()``.

    Args:
        filepath: Path to the text file containing the download URLs.

    Returns:
        Once implemented, the main loop expects a mapping of patient ID to a
        mapping that has ``'rml'`` and ``'edf'`` entries.

    Raises:
        NotImplementedError: Always, until the implementation is pasted in.
    """
    # ... (This function will be identical to the one in your download notebook) ...
    raise NotImplementedError(
        "group_links_by_patient() is a placeholder; paste the implementation "
        "from your download notebook before running the pipeline."
    )

def download_file_with_retry(url, local_path, max_retries=3):
    """Download a file with a simple retry mechanism.

    This is still a placeholder: the real body is meant to be copied from the
    download notebook. It raises instead of silently returning ``None`` so a
    caller cannot mistake "nothing happened" for a successful download.

    Args:
        url: Address of the file to fetch.
        local_path: Destination path for the downloaded file.
        max_retries: Maximum number of attempts (default 3).

    Raises:
        NotImplementedError: Always, until the implementation is pasted in.
    """
    # ... (This function will be identical to the one in your download notebook) ...
    raise NotImplementedError(
        "download_file_with_retry() is a placeholder; paste the implementation "
        "from your download notebook before running the pipeline."
    )

def extract_apnea_events(xml_file_path):
    """Parse an RML file to get a list of apnea event intervals.

    This is still a placeholder: the real body is meant to be copied from the
    existing implementation. It raises instead of silently returning ``None``
    so a missing parser cannot be mistaken for "no events found".

    Args:
        xml_file_path: Path to the RML (XML) annotation file.

    Raises:
        NotImplementedError: Always, until the implementation is pasted in.
    """
    # ... (This function will be identical to your existing one) ...
    raise NotImplementedError(
        "extract_apnea_events() is a placeholder; paste your existing "
        "implementation before running the pipeline."
    )

def audio_to_spectrogram(audio_clip, sr):
    """Turn one audio clip into a dB-scaled mel-spectrogram.

    Args:
        audio_clip: Audio samples of a single clip.
        sr: Sample rate of ``audio_clip`` in Hz.

    Returns:
        The mel-spectrogram in decibels, cast to ``float16``. The number of
        mel bands and the hop size come from the module-level ``N_MELS`` and
        ``HOP_LENGTH`` settings.
    """
    power_spec = librosa.feature.melspectrogram(
        y=audio_clip,
        sr=sr,
        n_mels=N_MELS,
        hop_length=HOP_LENGTH,
    )
    # The dB reference is this clip's own maximum (ref=np.max).
    db_spec = librosa.power_to_db(power_spec, ref=np.max)
    # Half precision keeps the stored dataset small.
    return db_spec.astype(np.float16)

def process_patient_to_spectrograms(patient_id, temp_dir):
    """Core function: load patient audio, slice it, and convert to spectrograms.

    This is still a placeholder that only records the implementation plan. It
    raises instead of silently returning ``None`` so the main loop reports a
    clear "not implemented" error per patient rather than an opaque
    tuple-unpacking ``TypeError``.

    Args:
        patient_id: Identifier of the patient being processed.
        temp_dir: Directory holding this patient's downloaded RML/EDF files.

    Returns:
        Once implemented: a ``(spectrograms, labels)`` pair of lists, one
        entry per clip.

    Raises:
        NotImplementedError: Always, until the plan below is implemented.
    """
    # ... (This will be the main new function as described in our plan) ...
    # 1. Find RML and EDF files in temp_dir
    # 2. Parse RML for apnea events
    # 3. Load and concatenate all EDF 'Mic' channels into a single audio array
    # 4. Downsample the entire audio array to TARGET_SAMPLE_RATE
    # 5. Slice the downsampled audio into 30-second clips with 15-second overlap
    # 6. For each clip:
    #    a. Calculate its apnea label (0 or 1)
    #    b. Convert the audio clip to a mel-spectrogram
    #    c. Append the spectrogram and label to lists
    # 7. Return the lists of spectrograms and labels
    raise NotImplementedError(
        "process_patient_to_spectrograms() is a placeholder; implement the "
        "steps listed in its body before running the pipeline."
    )

# Confirms the cell ran to completion; this cell only defines the helpers and does not call them.
print("✅ Helper functions defined.")

In [None]:
# Cell 4: Main Processing Loop
# Runs the download -> process -> delete cycle one patient at a time, so each
# patient's temporary directory is removed before the next patient starts.
print("--- Starting main processing loop... ---")

# Dataset-wide accumulators; Cell 5 converts them to NumPy arrays and saves them.
# NOTE(review): every clip stays in RAM until the final save - with many long
# recordings this may exceed available memory; confirm, or save per patient.
all_spectrograms = []
all_labels = []

# 1. Group links from the uploaded file
grouped_links = group_links_by_patient(LINK_FILE)
# A patient is usable only when both its RML and EDF entries are non-empty.
valid_patients = [pid for pid, files in grouped_links.items() if files['rml'] and files['edf']]
patients_to_process = valid_patients[:NUM_PATIENTS_TO_PROCESS]

print(f"Found {len(valid_patients)} valid patients. Processing the first {len(patients_to_process)}.")

# 2. Loop through each patient
for patient_original_id in tqdm(patients_to_process, desc="Overall Patient Progress"):
    temp_patient_dir = os.path.join('/content/temp_data', patient_original_id)
    os.makedirs(temp_patient_dir, exist_ok=True)
    # The "\n" escape keeps the literal on one line; a raw line break inside
    # a single-quoted string is a SyntaxError.
    print(f"\nProcessing {patient_original_id}...")

    try:
        # --- DOWNLOAD ---
        # TODO: not implemented yet - nothing is downloaded here, so the
        # processing step below currently sees an empty temp_patient_dir.
        # ... Download RML and all EDFs for this patient into temp_patient_dir ...

        # --- PROCESS ---
        spectrograms, labels = process_patient_to_spectrograms(patient_original_id, temp_patient_dir)
        if spectrograms:
            all_spectrograms.extend(spectrograms)
            all_labels.extend(labels)
            print(f"  -> Added {len(labels)} clips for {patient_original_id}. Total clips: {len(all_labels)}")
        else:
            print(f"  -> No clips processed for {patient_original_id}.")

    except Exception as e:
        # Deliberately broad: one bad patient must not abort the whole run.
        print(f"  -> ERROR processing {patient_original_id}: {e}")

    finally:
        # --- DELETE ---
        # Always runs, so temporary files are removed even when processing failed.
        if os.path.exists(temp_patient_dir):
            shutil.rmtree(temp_patient_dir)
            print(f"  -> Cleaned up temporary files for {patient_original_id}.")

print("\n✅ All patients processed!")

In [None]:
# Cell 5: Final Save to Google Drive
# Converts the clips and labels accumulated by the main loop into NumPy arrays
# and writes them as two compressed .npz files under DRIVE_OUTPUT_PATH.
print("--- Saving final dataset to Google Drive... ---")

if all_spectrograms:
    # Convert lists to NumPy arrays
    spectrogram_array = np.array(all_spectrograms)
    # Labels are 0/1 flags, so one unsigned byte per clip is enough.
    label_array = np.array(all_labels, dtype=np.uint8)

    print(f"Final spectrograms array shape: {spectrogram_array.shape}")
    print(f"Final labels array shape: {label_array.shape}")

    # Define output file paths
    spectrogram_file = os.path.join(DRIVE_OUTPUT_PATH, 'spectrograms.npz')
    label_file = os.path.join(DRIVE_OUTPUT_PATH, 'labels.npz')

    # Save using NumPy's compressed format; the keyword names ("spectrograms",
    # "labels") are the keys to use when loading the files back.
    np.savez_compressed(spectrogram_file, spectrograms=spectrogram_array)
    np.savez_compressed(label_file, labels=label_array)

    # "\n" escapes keep each literal on one line; a raw line break inside a
    # single-quoted string is a SyntaxError.
    print("\n✅ Successfully saved dataset!")
    print(f"Spectrograms saved to: {spectrogram_file}")
    print(f"Labels saved to: {label_file}")
    print("\nYou are now ready to use these files in your model training notebook.")
else:
    print("❌ No data was processed, so no files were saved.")