# Sleep Apnea Data Preparation - GitHub Codespaces (Batch Processing)

**Rolling Download-Process-Delete Workflow for ~100 Patients**

This notebook processes PhysioNet sleep apnea data in batches optimized for GitHub Codespaces:
1. **Download 25 patients** → Process with 16kHz feature extraction → Save locally → Delete raw data
2. **Repeat for 4 batches** to process ~100 patients total
3. **Resumable**: Can start from any batch if interrupted
4. **Local storage**: Results saved to local filesystem (no cloud integration)

---
## 📋 **CONFIGURATION** - Modify these parameters to resume processing

```python
BATCH_SIZE = 25          # Patients per batch
START_BATCH = 1          # 🔧 CHANGE THIS to resume (1, 2, 3, or 4)
END_BATCH = 4            # Target final batch
TOTAL_PATIENTS = 100     # Total patients to process
```

**Examples:**
- Fresh start: `START_BATCH = 1` (processes patients 1-25)
- Resume after batch 1: `START_BATCH = 2` (processes patients 26-50)
- Final batch only: `START_BATCH = 4` (processes patients 76-100)

In [None]:
# Cell 1: Configuration and Setup
# Defines the notebook-wide globals (batch window, threading knobs, storage
# paths, audio framing) and echoes them so a run log records what was used.
print(
    "=== SLEEP APNEA DATA PREPARATION - CODESPACES BATCH PROCESSING ===\n"
    "Rolling Download-Process-Delete workflow for ~100 patients\n"
)

# ============================================================================
# 🔧 BATCH CONFIGURATION - MODIFY THESE TO RESUME PROCESSING
# ============================================================================
BATCH_SIZE = 25          # Patients per batch (optimal for Codespaces storage)
START_BATCH = 1          # 🔧 CHANGE THIS: 1, 2, 3, or 4 to resume processing
END_BATCH = 4            # Final batch number (4 batches = 100 patients)
TOTAL_PATIENTS = 100     # Total patients to process across all batches

# ============================================================================
# 🧵 DYNAMIC THREADING CONFIGURATION - ADJUST ANYTIME!
# ============================================================================
# 🔧 CHANGE THESE VALUES ANYTIME TO TEST DIFFERENT THREAD COUNTS:
MAX_CONCURRENT_PATIENTS = 4  # 🔧 START HERE: Try 4, then 6, 8, 10... until you hit limits
TIMEOUT_PER_PATIENT = 900     # 15 minutes timeout per patient
ENABLE_THREADING = True       # Set to False to use original sequential processing

# 💡 CODESPACES THREADING OPTIMIZATION TIPS:
print("\n".join((
    "🧵 CODESPACES THREADING OPTIMIZATION GUIDE:",
    "   💡 Codespaces typically has more CPU cores than Colab",
    "   💡 Start with 4 threads, then try 6, 8, 10, 12...",
    "   💡 Monitor: htop command or VS Code performance tab",
    "   💡 Sweet spot: Usually 4-12 threads for Codespaces",
    "   💡 Too many threads → resource contention, slower performance",
    "   💡 Signs of overload: High CPU usage, slower per-patient times",
)))

# ============================================================================
# CALCULATED VALUES - DO NOT MODIFY
# ============================================================================
# 1-based, inclusive patient window covered by START_BATCH; the upper bound
# is clamped so a partial final batch never runs past TOTAL_PATIENTS.
patients_start = (START_BATCH - 1) * BATCH_SIZE + 1
patients_end = min(START_BATCH * BATCH_SIZE, TOTAL_PATIENTS)
_batch_patient_count = patients_end - patients_start + 1

print("\n".join((
    "\n📊 BATCH CONFIGURATION:",
    f"   Current batch: {START_BATCH}/{END_BATCH}",
    f"   Patients in this batch: {patients_start}-{patients_end} ({_batch_patient_count} patients)",
    f"   Batch size: {BATCH_SIZE}",
    f"   Target total: {TOTAL_PATIENTS} patients",
)))

print("\n".join((
    "\n🧵 CURRENT THREADING SETTINGS:",
    f"   Threading enabled: {ENABLE_THREADING}",
    f"   🔧 Concurrent patients: {MAX_CONCURRENT_PATIENTS} (CHANGE THIS TO EXPERIMENT!)",
    f"   Timeout per patient: {TIMEOUT_PER_PATIENT/60:.1f} minutes",
    f"   Expected theoretical speedup: ~{MAX_CONCURRENT_PATIENTS}x faster",
    "   💡 To test different thread counts: Change MAX_CONCURRENT_PATIENTS and re-run Cell 8",
)))

# ============================================================================
# LOCAL STORAGE CONFIGURATION (CODESPACES)
# ============================================================================
OUTPUT_BASE_PATH = './sleep_apnea_data'  # Local directory for results
CURRENT_BATCH_FILE = f'codespaces_dataset_batch{START_BATCH}.csv'
LINKS_FILE = './download_links.txt'      # Local path to links file

print("\n".join((
    "\n💾 LOCAL STORAGE (CODESPACES):",
    f"   Save location: {OUTPUT_BASE_PATH}",
    f"   Current batch file: {CURRENT_BATCH_FILE}",
    f"   Links file: {LINKS_FILE}",
    "   💡 All results saved locally (no cloud integration)",
)))

# ============================================================================
# AUDIO PROCESSING SETTINGS
# ============================================================================
TARGET_SAMPLE_RATE = 16000  # 16kHz for optimized processing
FRAME_DURATION = 30.0       # seconds per frame
OVERLAP_RATIO = 0.5         # 50% overlap between frames
APNEA_THRESHOLD = 0.1       # 10% apnea overlap threshold for labeling
AUDIO_CHANNEL = 'Mic'       # Audio channel to extract

print("\n".join((
    "\n🎵 AUDIO PROCESSING:",
    f"   Sample rate: {TARGET_SAMPLE_RATE} Hz",
    f"   Frame duration: {FRAME_DURATION} seconds",
    f"   Frame overlap: {OVERLAP_RATIO * 100}%",
    f"   Apnea threshold: {APNEA_THRESHOLD * 100}%",
)))

print(f"\n✅ Configuration complete. Ready to process batch {START_BATCH}.")
print("🔧 TO EXPERIMENT: Change MAX_CONCURRENT_PATIENTS (4→6→8→10...) and re-run Cell 8!")

In [None]:
# Cell 2: Setup Local Directories (Codespaces)
# Creates the working folders, reports whether this batch's CSV and the links
# file are already present, lists earlier batch outputs and shows free disk.
print("📁 SETTING UP LOCAL DIRECTORIES...")

import os
import shutil

# Output, per-patient scratch and download staging folders.
for _required_dir in (OUTPUT_BASE_PATH, './temp_patient_data', './downloads'):
    os.makedirs(_required_dir, exist_ok=True)

print("\n".join((
    "✅ Local directories created successfully",
    "📂 Created directories:",
    f"   - {OUTPUT_BASE_PATH} (local save location)",
    "   - ./temp_patient_data (temporary processing)",
    "   - ./downloads (download staging)",
)))

# Warn early when the target CSV for this batch is already on disk.
batch_file_path = os.path.join(OUTPUT_BASE_PATH, CURRENT_BATCH_FILE)
if not os.path.exists(batch_file_path):
    print(f"\n✅ {CURRENT_BATCH_FILE} does not exist. Ready for fresh processing.")
else:
    print(f"\n⚠️  WARNING: {CURRENT_BATCH_FILE} already exists locally!")
    print(f"   File path: {batch_file_path}")
    print("   Consider changing START_BATCH if this batch is already complete.")

# Count the non-blank lines of the links file, if it is there.
if not os.path.exists(LINKS_FILE):
    print(f"\n❌ Links file not found: {LINKS_FILE}")
    print("   Please ensure download_links.txt is in the current directory")
else:
    print(f"\n✅ Links file found: {LINKS_FILE}")
    with open(LINKS_FILE, 'r') as f:
        link_count = sum(1 for line in f if line.strip())
    print(f"   Contains {link_count} download links")

# Summarise CSVs written by previous batches.
existing_batches = [
    name for name in os.listdir(OUTPUT_BASE_PATH)
    if name.startswith('codespaces_dataset_batch') and name.endswith('.csv')
]
if not existing_batches:
    print("\n📊 No existing batch files found. This appears to be a fresh start.")
else:
    print("\n📊 EXISTING BATCH FILES:")
    total_size_mb = 0
    for _batch_name in sorted(existing_batches):
        _size_mb = os.path.getsize(os.path.join(OUTPUT_BASE_PATH, _batch_name)) / (1024 * 1024)
        total_size_mb += _size_mb
        print(f"   - {_batch_name} ({_size_mb:.1f} MB)")
    print(f"   Total: {total_size_mb:.1f} MB")

# Free / total space of the filesystem holding the working directory.
disk_usage = shutil.disk_usage('.')
free_space_gb = disk_usage.free / (1024**3)
total_space_gb = disk_usage.total / (1024**3)
print("\n💾 DISK SPACE:")
print(f"   Available: {free_space_gb:.1f} GB / {total_space_gb:.1f} GB")
print("   Estimated space per batch: ~2-5 GB (depending on compression)")

In [None]:
# Cell 3: Install Dependencies and Import Libraries
# Installs the third-party packages this notebook needs, imports everything
# used by the later cells, and prints library versions plus basic host info.
print("📦 INSTALLING DEPENDENCIES...")

# Install required packages
# NOTE: the leading "!" is IPython shell magic, so this cell only runs inside a
# notebook kernel. requests, numpy and pandas are imported below but are not
# installed here — presumably they ship with the environment; verify.
!pip install librosa mne tqdm psutil

print("\n📚 IMPORTING LIBRARIES...")

# Core libraries
import os
import re
import time
import shutil
import requests
import numpy as np
import pandas as pd
from pathlib import Path

# Audio processing
import librosa
import mne

# Progress tracking
from tqdm.notebook import tqdm

# XML processing (will need to recreate extract_apnea_events function)
import xml.etree.ElementTree as ET

# Threading for parallel processing
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

# System monitoring
import psutil

# Suppress warnings for cleaner output
# NOTE(review): 'ignore' with no category silences every warning from every
# library for the rest of the session, including deprecation notices.
import warnings
warnings.filterwarnings('ignore')

# Echo library versions and host resources so they are captured in the run log.
print("✅ All libraries imported successfully")
print(f"🔊 librosa version: {librosa.__version__}")
print(f"🧠 mne version: {mne.__version__}")
print(f"🔢 numpy version: {np.__version__}")
print(f"🐼 pandas version: {pd.__version__}")
print(f"🧵 Threading support: {threading.active_count()} active threads")
print(f"🖥️ System info: {psutil.cpu_count()} CPUs, {psutil.virtual_memory().total / (1024**3):.1f} GB RAM")

In [None]:
# Cell 4: XML Annotation Parser (Recreated from working_with_xml.py)
print("🔍 DEFINING XML ANNOTATION PARSER...")

def extract_apnea_events(rml_file_path, output_csv=None):
    """
    Extract apnea events from RML annotation files.
    Recreated from working_with_xml.py for Codespaces compatibility.

    Every ``ScoredEvent`` element whose ``Name`` text contains one of
    'Apnea', 'Hypopnea', 'apnea' or 'hypopnea' is returned with its end time
    computed as ``Start + Duration``. An event with an empty name, or with a
    missing or non-numeric ``Start``/``Duration``, is skipped on its own; it
    no longer discards the events that were parsed successfully.

    Args:
        rml_file_path (str): Path to the RML file
        output_csv (str, optional): Path to save CSV output

    Returns:
        list: List of tuples (event_type, start_time, end_time). Empty when
        the file cannot be opened or is not well-formed XML.
    """
    try:
        root = ET.parse(rml_file_path).getroot()
    except (ET.ParseError, OSError, TypeError) as e:
        # Unreadable or malformed file: report and fall back to "no events".
        print(f"   ❌ Error parsing {rml_file_path}: {e}")
        return []

    apnea_keywords = ('Apnea', 'Hypopnea', 'apnea', 'hypopnea')
    apnea_events = []
    skipped = 0

    for scored_event in root.findall('.//ScoredEvent'):
        # findtext() yields None for a missing element and '' for an empty one.
        event_name = scored_event.findtext('Name')
        if not event_name:
            continue
        if not any(keyword in event_name for keyword in apnea_keywords):
            continue

        try:
            start_time = float(scored_event.findtext('Start'))
            duration = float(scored_event.findtext('Duration'))
        except (TypeError, ValueError):
            # Missing element (None) or non-numeric text: drop this event only.
            skipped += 1
            continue

        apnea_events.append((event_name, start_time, start_time + duration))

    if skipped:
        print(f"   ⚠️ Skipped {skipped} apnea event(s) with missing or invalid timing in {rml_file_path}")

    # Save to CSV if requested; a failed write must not lose the parsed events.
    if output_csv:
        try:
            df = pd.DataFrame(apnea_events, columns=['EventType', 'StartTime', 'EndTime'])
            df.to_csv(output_csv, index=False)
            print(f"   💾 Saved {len(apnea_events)} events to {output_csv}")
        except OSError as e:
            print(f"   ❌ Could not write {output_csv}: {e}")

    return apnea_events

print("✅ XML annotation parser defined")
print("   - Extracts apnea events from RML files")
print("   - Filters for Apnea/Hypopnea events")
print("   - Returns (event_type, start_time, end_time) tuples")

In [None]:
# Cell 5: Download Functions (Enhanced from file download.ipynb)
print("📥 DEFINING DOWNLOAD FUNCTIONS...")

def group_links_by_patient(links_content):
    """
    Bucket the download URLs in ``links_content`` by patient ID.

    ``links_content`` is the text of the links file (one URL per line), not a
    path. A patient ID is the first ``8 digits-6 digits`` run found in a URL;
    lines that are blank or carry no such ID are ignored.

    Returns a dict mapping each patient ID to
    ``{'rml': <url or None>, 'edf': [<url>, ...]}``. URLs ending in ``.rml``
    fill the 'rml' slot (the last one wins); URLs ending in ``.edf`` are
    appended to 'edf' in file order. A URL with any other ending still creates
    the patient's entry but adds nothing to it.
    """
    id_pattern = re.compile(r'(\d{8}-\d{6})')
    by_patient = {}

    for raw_line in links_content.strip().split('\n'):
        link = raw_line.strip()
        found = id_pattern.search(link) if link else None
        if found is None:
            continue

        entry = by_patient.setdefault(found.group(1), {'rml': None, 'edf': []})
        if link.endswith('.rml'):
            entry['rml'] = link
        elif link.endswith('.edf'):
            entry['edf'].append(link)

    return by_patient

def download_file_with_retry(url, local_path, max_retries=3, base_delay=2):
    """
    Download ``url`` to ``local_path``, retrying with exponential backoff.

    A file already present at ``local_path`` and larger than 1000 bytes is
    assumed to be complete and is not fetched again. This is only a size
    heuristic: interrupted downloads are NOT resumed, every attempt starts
    from the beginning. Data is streamed to ``local_path + '.tmp'`` and moved
    into place only after the whole body has been written, so a failed attempt
    never leaves a truncated file under the final name.

    Args:
        url (str): Source URL.
        local_path (str): Destination path; may be a bare file name.
        max_retries (int): Maximum number of attempts.
        base_delay (int | float): Seconds to wait after the first failure;
            doubled after each subsequent failure.

    Returns:
        bool: True if the file exists (per the heuristic) or was downloaded,
        False once all attempts have failed.
    """
    file_label = os.path.basename(local_path)
    temp_path = local_path + '.tmp'

    # Skip the download when a plausible copy is already on disk.
    try:
        local_size = os.path.getsize(local_path)
    except OSError:
        local_size = 0  # missing or unreadable -> download it
    if local_size > 1000:  # Assume files > 1KB are likely complete
        print(f"      ✓ File exists: {file_label} ({local_size/1024:.1f} KB)")
        return True

    # dirname is '' for a bare file name; os.makedirs('') would raise and make
    # every attempt fail, so only create the parent when there is one.
    parent_dir = os.path.dirname(local_path)

    for attempt in range(max_retries):
        try:
            print(f"      📥 Downloading: {file_label} (attempt {attempt + 1}/{max_retries})")

            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

            with requests.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()

                # Stream to a temporary file first
                with open(temp_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)

                # Publish under the final name only once fully written
                shutil.move(temp_path, local_path)
                file_size = os.path.getsize(local_path)
                print(f"      ✅ Downloaded: {file_label} ({file_size/1024:.1f} KB)")
                return True

        except Exception as e:
            # Retry boundary: any network or filesystem error counts as one
            # failed attempt and is reported rather than propagated.
            print(f"      ❌ Attempt {attempt + 1} failed: {str(e)[:100]}...")

            if attempt < max_retries - 1:
                delay = base_delay * (2 ** attempt)
                print(f"      ⏳ Retrying in {delay} seconds...")
                time.sleep(delay)

    # All attempts failed: remove any partial temp file left behind
    if os.path.exists(temp_path):
        os.remove(temp_path)

    return False

def _filename_from_url(url):
    """Return the flat local file name a download URL should be saved under."""
    name_match = re.search(r'fileName=([^&]+)', url)
    raw_name = name_match.group(1) if name_match else os.path.basename(url).split('?')[0]
    # Keep only the last path component after URL-decoding: the processing step
    # lists the patient folder non-recursively, so files must land directly in it.
    return os.path.basename(requests.utils.unquote(raw_name))


def download_patient_data(patient_original_id, patient_files, patient_folder):
    """
    Downloads all files for a single patient into ``patient_folder``.

    The RML annotation file is fetched first; without it the recordings cannot
    be labelled, so a missing RML URL or a failed RML download returns False
    immediately instead of downloading EDF files that cannot be used. For
    each EDF the matching ``.hea`` file is also requested, but only on a
    best-effort basis (single attempt): the feature-extraction code in this
    notebook reads just the ``.edf`` and ``.rml`` files, so a missing header
    no longer invalidates an EDF that downloaded correctly.

    Args:
        patient_original_id (str): Patient ID from the links file (unused
            here; kept for the caller's signature).
        patient_files (dict): ``{'rml': url or None, 'edf': [url, ...]}``.
        patient_folder (str): Destination directory.

    Returns:
        bool: True if the RML file and at least one EDF file are available,
        False otherwise.
    """
    print(f"   📂 Downloading to: {patient_folder}")

    # Download RML file
    rml_url = patient_files['rml']
    if not rml_url:
        print(f"      ❌ No RML annotation URL for this patient")
        return False

    rml_path = os.path.join(patient_folder, _filename_from_url(rml_url))
    if not download_file_with_retry(rml_url, rml_path):
        print(f"      ❌ Failed to download RML file")
        return False

    # Download EDF files
    edf_success_count = 0
    for edf_url in patient_files['edf']:
        edf_filename = _filename_from_url(edf_url)
        edf_path = os.path.join(patient_folder, edf_filename)

        if not download_file_with_retry(edf_url, edf_path):
            print(f"      ❌ Failed to download EDF: {edf_filename}")
            continue
        edf_success_count += 1

        # Companion .hea file: optional, so one attempt and only a warning.
        hea_url = edf_url.replace('.edf', '.hea')
        hea_filename = edf_filename.replace('.edf', '.hea')
        hea_path = os.path.join(patient_folder, hea_filename)
        if not download_file_with_retry(hea_url, hea_path, max_retries=1):
            print(f"      ⚠️ Optional header file not available: {hea_filename}")

    total_edf_count = len(patient_files['edf'])
    print(f"   📊 Downloaded {edf_success_count}/{total_edf_count} EDF files")

    return edf_success_count > 0

print("✅ Download functions defined")
print("   - group_links_by_patient(): Groups URLs by patient ID")
print("   - download_file_with_retry(): Downloads with retry logic")
print("   - download_patient_data(): Downloads all files for one patient")

In [None]:
# Cell 6: Feature Extraction Functions (Enhanced from parallel_feature_extraction.ipynb)
print("🎵 DEFINING FEATURE EXTRACTION FUNCTIONS...")

def extract_comprehensive_features(audio_frame, sample_rate):
    """
    Extract comprehensive audio features for sleep apnea detection.
    Enhanced from parallel_feature_extraction.ipynb with 16kHz optimization.

    Features returned (all plain floats, keys prefixed ``clean_``):
      * frame-level RMS, zero-crossing rate, spectral centroid, bandwidth
        and roll-off (mean over librosa's analysis frames);
      * mean and standard deviation of the first 8 MFCCs;
      * variability of RMS / ZCR across consecutive 5-second windows and a
        regularity score ``1 / (1 + rms_variability)`` (defaults 0.0 / 0.0 /
        0.5 when fewer than two full windows fit);
      * silence ratio, using the 20th percentile of ``|audio_frame|`` as the
        silence threshold;
      * mean and maximum length, in seconds, of contiguous below-threshold
        runs ("pauses").

    Pause runs are delimited on a mask padded with a non-silent sample at
    both ends, so every run has exactly one start and one end. Runs touching
    the first or last sample are therefore measured correctly instead of
    being dropped or paired with the wrong boundary.

    Args:
        audio_frame: 1-D array of audio samples.
        sample_rate: Sampling rate of ``audio_frame`` in Hz.

    Returns:
        dict | None: Feature dict, or None for an empty frame or when any
        step raises (the error is printed).
    """
    try:
        if len(audio_frame) == 0:
            return None

        # Basic acoustic features
        rms = float(librosa.feature.rms(y=audio_frame).mean())
        zcr = float(librosa.feature.zero_crossing_rate(y=audio_frame).mean())
        centroid = float(librosa.feature.spectral_centroid(y=audio_frame, sr=sample_rate).mean())
        bandwidth = float(librosa.feature.spectral_bandwidth(y=audio_frame, sr=sample_rate).mean())
        rolloff = float(librosa.feature.spectral_rolloff(y=audio_frame, sr=sample_rate).mean())

        # MFCCs (first 8 coefficients)
        mfccs = librosa.feature.mfcc(y=audio_frame, sr=sample_rate, n_mfcc=8)
        mfcc_means = mfccs.mean(axis=1)
        mfcc_stds = mfccs.std(axis=1)

        # Temporal features for breathing patterns (5-second windows)
        window_size = int(5 * sample_rate)  # 5 seconds
        num_windows = len(audio_frame) // window_size

        if num_windows >= 2:
            rms_windows = []
            zcr_windows = []

            for i in range(num_windows):
                start_idx = i * window_size
                end_idx = start_idx + window_size
                window = audio_frame[start_idx:end_idx]

                rms_windows.append(librosa.feature.rms(y=window).mean())
                zcr_windows.append(librosa.feature.zero_crossing_rate(y=window).mean())

            rms_variability = float(np.std(rms_windows))
            zcr_variability = float(np.std(zcr_windows))
            breathing_regularity = float(1.0 / (1.0 + rms_variability))
        else:
            rms_variability = 0.0
            zcr_variability = 0.0
            breathing_regularity = 0.5

        # Silence detection
        silence_threshold = np.percentile(np.abs(audio_frame), 20)
        silence_mask = np.abs(audio_frame) < silence_threshold
        silence_ratio = float(np.mean(silence_mask))

        # Breathing pause detection.
        # Padding with 0 (non-silent) on both sides guarantees the +1 / -1
        # edges alternate starting with a +1, so starts[i] pairs with ends[i]
        # and the two arrays always have equal length.
        padded_mask = np.concatenate(([0], silence_mask.astype(int), [0]))
        silence_edges = np.diff(padded_mask)
        pause_starts = np.flatnonzero(silence_edges == 1)
        pause_ends = np.flatnonzero(silence_edges == -1)

        if pause_starts.size > 0:
            pause_durations = (pause_ends - pause_starts) / sample_rate
            avg_pause_duration = float(np.mean(pause_durations))
            max_pause_duration = float(np.max(pause_durations))
        else:
            avg_pause_duration = 0.0
            max_pause_duration = 0.0

        # Combine all features
        features = {
            'clean_rms': rms,
            'clean_zcr': zcr,
            'clean_centroid': centroid,
            'clean_bandwidth': bandwidth,
            'clean_rolloff': rolloff,
            'clean_rms_variability': rms_variability,
            'clean_zcr_variability': zcr_variability,
            'clean_breathing_regularity': breathing_regularity,
            'clean_silence_ratio': silence_ratio,
            'clean_avg_pause_duration': avg_pause_duration,
            'clean_max_pause_duration': max_pause_duration
        }

        # Add MFCCs
        for i, (mean_val, std_val) in enumerate(zip(mfcc_means, mfcc_stds), 1):
            features[f'clean_mfcc_{i}_mean'] = float(mean_val)
            features[f'clean_mfcc_{i}_std'] = float(std_val)

        return features

    except Exception as e:
        # Per-frame best effort: one bad frame must not abort a whole recording.
        print(f"      ⚠️ Feature extraction error: {e}")
        return None

def get_apnea_label(timestamp, duration, apnea_events, threshold=0.1):
    """
    Calculate apnea label based on overlap with annotated events.
    Uses proportion-based labeling with configurable threshold.

    The proportion is the fraction of the frame ``[timestamp, timestamp +
    duration)`` covered by the *union* of the annotated events, so events that
    overlap one another are counted once and the result never exceeds 1.0.

    Args:
        timestamp: Frame start time in seconds.
        duration: Frame length in seconds.
        apnea_events: Iterable of ``(event_type, start_time, end_time)``.
        threshold: The frame is labelled 1 when proportion > threshold.

    Returns:
        tuple: ``(label, proportion)`` with label 0 or 1. ``(0, 0.0)`` is
        returned for a non-positive duration or for events that are not
        numeric 3-tuples.
    """
    try:
        if duration <= 0:
            return 0, 0.0

        frame_end = timestamp + duration

        # Clip every event to the frame, keeping only those that intersect it.
        clipped = []
        for _, start, end in apnea_events:
            overlap_start = max(timestamp, start)
            overlap_end = min(frame_end, end)
            if overlap_start < overlap_end:
                clipped.append((overlap_start, overlap_end))
    except (TypeError, ValueError):
        # Malformed input keeps the historical "no apnea" fallback.
        return 0, 0.0

    # Sweep the intervals in start order, adding only the part of each one
    # that lies beyond what has already been covered.
    clipped.sort()
    apnea_seconds = 0.0
    covered_until = timestamp
    for overlap_start, overlap_end in clipped:
        overlap_start = max(overlap_start, covered_until)
        if overlap_end > overlap_start:
            apnea_seconds += overlap_end - overlap_start
            covered_until = overlap_end

    proportion = apnea_seconds / duration
    label = 1 if proportion > threshold else 0
    return label, proportion

def process_patient_edf_files(patient_folder, patient_id):
    """
    Process all EDF files for a single patient and extract features.

    The ``.edf`` files in ``patient_folder`` are handled in sorted name order
    and treated as consecutive segments of one recording. Each segment is cut
    into FRAME_DURATION-second frames with OVERLAP_RATIO overlap, every frame
    is resampled to TARGET_SAMPLE_RATE, turned into features, and labelled
    against the events of the first ``.rml`` file found in the folder.

    A frame's timestamp is its position inside the segment plus the summed
    *actual* length of all earlier segments (samples / sampling rate), so the
    timestamps stay aligned with the annotation timeline even when a segment
    is not exactly one hour long. Only when a segment cannot be opened, and
    its length is therefore unknown, is it assumed to last one hour.

    Reads the notebook globals AUDIO_CHANNEL, FRAME_DURATION, OVERLAP_RATIO,
    TARGET_SAMPLE_RATE and APNEA_THRESHOLD.

    Args:
        patient_folder (str): Directory holding the patient's files.
        patient_id (str): Identifier stored in every output record.

    Returns:
        list: One dict per extracted frame; empty if files are missing or
        processing fails.
    """
    print(f"   🎵 Processing audio files for {patient_id}...")

    # Fallback length for a segment whose header could not be read.
    nominal_segment_seconds = 60 * 60

    try:
        # Find EDF and RML files
        folder_entries = os.listdir(patient_folder)
        edf_files = sorted(f for f in folder_entries if f.endswith('.edf'))
        rml_files = [f for f in folder_entries if f.endswith('.rml')]

        if not edf_files or not rml_files:
            print(f"      ❌ Missing files: {len(edf_files)} EDF, {len(rml_files)} RML")
            return []

        print(f"      📁 Found {len(edf_files)} EDF and {len(rml_files)} RML files")

        # Load apnea events
        rml_path = os.path.join(patient_folder, rml_files[0])
        apnea_events = extract_apnea_events(rml_path)
        print(f"      📋 Loaded {len(apnea_events)} apnea events")

        # Process each EDF file
        all_features = []
        elapsed_seconds = 0.0  # total length of the segments handled so far

        for edf_idx, edf_file in enumerate(edf_files, 1):
            print(f"      🎵 Processing EDF {edf_idx}/{len(edf_files)}: {edf_file}")

            # Time offset for multi-EDF processing
            time_offset = elapsed_seconds
            segment_seconds = nominal_segment_seconds

            try:
                edf_path = os.path.join(patient_folder, edf_file)
                raw = mne.io.read_raw_edf(edf_path, preload=False, verbose=False)

                # Header is readable: replace the nominal length by the real one.
                segment_seconds = raw.n_times / float(raw.info['sfreq'])

                if AUDIO_CHANNEL not in raw.ch_names:
                    print(f"         ⚠️ No {AUDIO_CHANNEL} channel, skipping")
                    continue

                raw.pick_channels([AUDIO_CHANNEL])
                original_sample_rate = int(raw.info['sfreq'])
                duration_min = raw.n_times / original_sample_rate / 60

                print(f"         ⏱️ Duration: {duration_min:.1f} min, {original_sample_rate} Hz → {TARGET_SAMPLE_RATE} Hz")

                # Frame parameters
                original_frame_samples = int(FRAME_DURATION * original_sample_rate)
                original_step_samples = int(original_frame_samples * (1 - OVERLAP_RATIO))

                frame_count = 0
                for frame_start in range(0, raw.n_times - original_frame_samples + 1, original_step_samples):
                    frame_end = frame_start + original_frame_samples
                    timestamp = (frame_start / original_sample_rate) + time_offset

                    # Load and downsample audio frame
                    try:
                        audio_frame, _ = raw[:, frame_start:frame_end]
                        audio_frame = audio_frame.flatten()

                        # Downsample to target rate
                        if original_sample_rate != TARGET_SAMPLE_RATE:
                            audio_frame = librosa.resample(
                                audio_frame,
                                orig_sr=original_sample_rate,
                                target_sr=TARGET_SAMPLE_RATE
                            )

                        # Extract features
                        features = extract_comprehensive_features(audio_frame, TARGET_SAMPLE_RATE)
                        if features is None:
                            continue

                        # Get apnea label
                        apnea_label, apnea_proportion = get_apnea_label(
                            timestamp, FRAME_DURATION, apnea_events, APNEA_THRESHOLD
                        )

                        # Create record
                        record = {
                            'patient_id': patient_id,
                            'edf_file': edf_file,
                            'timestamp': float(timestamp),
                            'frame_duration': FRAME_DURATION,
                            'sample_rate': TARGET_SAMPLE_RATE,
                            'apnea_label': int(apnea_label),
                            'apnea_proportion': float(apnea_proportion),
                            **features
                        }

                        all_features.append(record)
                        frame_count += 1

                    except Exception as e:
                        print(f"         ⚠️ Frame {frame_count} failed: {e}")
                        continue

                print(f"         ✅ Extracted {frame_count} frames")
                del raw  # Free memory

            except Exception as e:
                print(f"         ❌ EDF processing failed: {e}")

            finally:
                # Runs on success, on `continue` and on failure alike, so the
                # next segment always starts after this one on the timeline.
                elapsed_seconds += segment_seconds

        apnea_count = sum(1 for f in all_features if f['apnea_label'] == 1)
        print(f"   ✅ {patient_id}: {len(all_features)} frames, {apnea_count} apnea")
        return all_features

    except Exception as e:
        print(f"   ❌ {patient_id}: Processing failed: {e}")
        return []

print("✅ Feature extraction functions defined")
print("   - extract_comprehensive_features(): 27 audio features with 16kHz optimization")
print("   - get_apnea_label(): Proportion-based labeling with 10% threshold")
print("   - process_patient_edf_files(): Full patient processing pipeline")

In [None]:
# Cell 7: Load Download Links File
# Reads LINKS_FILE, groups its URLs per patient (grouped_links) and keeps the
# IDs that have both an RML and at least one EDF link (valid_patients).
print("📎 LOADING DOWNLOAD LINKS FILE")
print(f"Loading from: {LINKS_FILE}")

# Load and parse the links
try:
    with open(LINKS_FILE, 'r') as f:
        links_content = f.read()

    print(f"✅ {LINKS_FILE} loaded successfully ({len(links_content)} characters)")

    # Group links by patient
    grouped_links = group_links_by_patient(links_content)
    print(f"📊 Found {len(grouped_links)} unique patients in links file")

    # Only patients with annotations AND recordings can be processed
    valid_patients = [pid for pid, files in grouped_links.items() if files['rml'] and files['edf']]
    print(f"✅ {len(valid_patients)} patients have both RML and EDF files")

    if len(valid_patients) >= TOTAL_PATIENTS:
        print(f"🎯 Sufficient patients available for target of {TOTAL_PATIENTS}")
    else:
        print(f"⚠️ Only {len(valid_patients)} valid patients found, less than target {TOTAL_PATIENTS}")

    # Show first few patient IDs as example
    print(f"\n📋 Sample patient IDs:")
    for position, pid in enumerate(valid_patients[:5], 1):
        files_info = grouped_links[pid]
        print(f"   {position}. {pid}: {len(files_info['edf'])} EDF files, {'✓' if files_info['rml'] else '✗'} RML")

    if len(valid_patients) > 5:
        print(f"   ... and {len(valid_patients) - 5} more patients")

except FileNotFoundError as err:
    print(f"❌ {LINKS_FILE} not found")
    print(f"Please ensure download_links.txt is in the current directory")
    # Re-raise with the specific type, chained so the original OS error
    # (including the path) stays in the traceback.
    raise FileNotFoundError("Download links file is required to proceed") from err
except Exception as e:
    print(f"❌ Error loading links file: {e}")
    raise

print(f"\n✅ Links file processing complete")

In [None]:
# Cell 8: Main Batch Processing Loop (Dynamic Threading for Codespaces)
print(f"🚀 STARTING BATCH {START_BATCH} PROCESSING (CODESPACES)")
print(f"Processing patients {patients_start}-{patients_end}")
# One line describing the execution mode. (The previous single-expression form
# also carried a 'DISABLED' variant that could never be printed.)
if ENABLE_THREADING:
    print(f"Threading: ENABLED ({MAX_CONCURRENT_PATIENTS} concurrent)")
else:
    print("Sequential processing")
print(f"{'='*60}")

# 🔧 DYNAMIC THREADING: Read current configuration
# Snapshot of the Cell 1 settings taken each time this cell is (re-)run.
current_threads = MAX_CONCURRENT_PATIENTS
current_timeout = TIMEOUT_PER_PATIENT

print(f"🧵 DYNAMIC THREADING STATUS (CODESPACES):")
print(f"   Current thread count: {current_threads}")
print(f"   💡 TO EXPERIMENT: Change MAX_CONCURRENT_PATIENTS in Cell 1 and re-run this cell")
print(f"   💡 CODESPACES TRY: 4 → 6 → 8 → 10 → 12... until you see diminishing returns")
print(f"   💡 MONITOR: Use 'htop' command in terminal or VS Code performance tab")

def get_system_info():
    """
    Take a snapshot of CPU, memory and disk usage via psutil.

    CPU load is sampled over a one-second interval, so the call blocks for
    about that long. Disk figures refer to the filesystem that contains the
    current working directory. Sizes are reported in GiB (bytes / 1024**3).

    Returns:
        dict | None: Keys cpu_percent, cpu_count, memory_used_gb,
        memory_total_gb, memory_percent, disk_used_gb and disk_free_gb, or
        None (after printing a warning) if any psutil call fails.
    """
    bytes_per_gib = 1024**3
    try:
        cpu_load = psutil.cpu_percent(interval=1)
        ram = psutil.virtual_memory()
        volume = psutil.disk_usage('.')

        snapshot = dict(
            cpu_percent=cpu_load,
            cpu_count=psutil.cpu_count(),
            memory_used_gb=ram.used / bytes_per_gib,
            memory_total_gb=ram.total / bytes_per_gib,
            memory_percent=ram.percent,
            disk_used_gb=volume.used / bytes_per_gib,
            disk_free_gb=volume.free / bytes_per_gib,
        )
    except Exception as e:
        print(f"⚠️ Could not get system info: {e}")
        return None
    return snapshot

def process_single_patient_threading(patient_info):
    """
    Run download → feature extraction → cleanup for one patient.

    Args:
        patient_info: 3-tuple of (original patient id, patient number, index
            within the batch). The number is used to build the
            ``patient_NN`` id; the index is unpacked but not used here.

    Returns:
        (success, patient_features, stats). ``patient_features`` is the list
        returned by process_patient_edf_files on success and ``[]`` otherwise.
        ``stats`` is a dict of timings, frame counts and an error message; the
        ``finally`` clause fills in 'processing_time' and 'cleanup_time' on the
        same dict object that is returned.
    """
    patient_original_id, patient_number, _batch_index = patient_info
    thread_id = threading.current_thread().name
    patient_id = f"patient_{patient_number:02d}"
    patient_folder = f"./temp_patient_data/{patient_id}"

    print(f"\n[{thread_id}] --- Processing {patient_id} (Original: {patient_original_id}) ---")

    start_time = time.time()
    stats = {
        'patient_id': patient_id,
        'original_id': patient_original_id,
        'thread_id': thread_id,
        'thread_count_used': current_threads,
        'success': False,
        'frames_extracted': 0,
        'apnea_frames': 0,
        'processing_time': 0,
        'download_time': 0,
        'feature_extraction_time': 0,
        'cleanup_time': 0,
        'error_message': None
    }

    try:
        # --- Step 1: fetch the patient's files into the temp folder ---
        download_start = time.time()
        print(f"[{thread_id}] 📥 Step 1/3: Downloading {patient_id}...")
        patient_files = grouped_links[patient_original_id]

        download_success = download_patient_data(patient_original_id, patient_files, patient_folder)
        stats['download_time'] = time.time() - download_start

        if download_success:
            print(f"[{thread_id}] ✅ Download successful for {patient_id} (took {stats['download_time']:.1f}s)")
        else:
            stats['error_message'] = "Download failed"
            print(f"[{thread_id}] ❌ Download failed for {patient_id} (took {stats['download_time']:.1f}s)")
            return False, [], stats

        # --- Step 2: turn the downloaded recordings into feature rows ---
        extraction_start = time.time()
        print(f"[{thread_id}] 🎵 Step 2/3: Extracting features for {patient_id}...")
        patient_features = process_patient_edf_files(patient_folder, patient_id)
        stats['feature_extraction_time'] = time.time() - extraction_start

        if patient_features:
            # Record the yield before declaring success.
            stats['frames_extracted'] = len(patient_features)
            stats['apnea_frames'] = len([f for f in patient_features if f['apnea_label'] == 1])
            stats['success'] = True

            print(f"[{thread_id}] ✅ Features extracted for {patient_id}: {len(patient_features)} frames (took {stats['feature_extraction_time']:.1f}s)")

            return True, patient_features, stats

        # An empty/falsy result is treated as an extraction failure.
        stats['error_message'] = "Feature extraction failed"
        print(f"[{thread_id}] ❌ Feature extraction failed for {patient_id} (took {stats['feature_extraction_time']:.1f}s)")
        return False, [], stats

    except Exception as e:
        # Anything unexpected is reported through stats rather than raised.
        stats['error_message'] = str(e)
        print(f"[{thread_id}] ❌ Critical error processing {patient_id}: {e}")
        return False, [], stats

    finally:
        # --- Step 3: remove the temp folder; runs on every exit path above ---
        cleanup_start = time.time()
        # Measured before the folder is removed, so cleanup is not included.
        stats['processing_time'] = time.time() - start_time

        print(f"[{thread_id}] 🗑️ Step 3/3: Cleaning up {patient_id}...")
        if os.path.exists(patient_folder):
            try:
                shutil.rmtree(patient_folder)
                stats['cleanup_time'] = time.time() - cleanup_start
                print(f"[{thread_id}] ✅ Cleanup successful for {patient_id} (took {stats['cleanup_time']:.1f}s)")
            except Exception as cleanup_error:
                # A failed delete is only a warning; the result is still returned.
                stats['cleanup_time'] = time.time() - cleanup_start
                print(f"[{thread_id}] ⚠️ Cleanup warning for {patient_id}: {cleanup_error}")

# Main processing logic
# Start the batch clock and reset every accumulator this cell fills in.
batch_start_time = time.time()
batch_features, processing_stats = [], []
successful_patients = failed_patients = 0

# Baseline resource snapshot (None when monitoring is unavailable)
initial_system_info = get_system_info()
if initial_system_info:
    print(f"\n📊 INITIAL CODESPACES SYSTEM STATUS:")
    print(f"   CPUs: {initial_system_info['cpu_count']} cores")
    print(f"   CPU: {initial_system_info['cpu_percent']:.1f}%")
    print(f"   RAM: {initial_system_info['memory_used_gb']:.1f}/{initial_system_info['memory_total_gb']:.1f} GB ({initial_system_info['memory_percent']:.1f}%)")
    print(f"   Disk: {initial_system_info['disk_free_gb']:.1f} GB free")

# Keep only patients that have both an 'rml' and an 'edf' entry, then take
# this batch's slice (patients_start is 1-based, hence the -1).
valid_patients = [
    pid
    for pid, files in grouped_links.items()
    if files['rml'] and files['edf']
]
batch_patients = valid_patients[patients_start-1:patients_end]

print(f"\n📋 Selected {len(batch_patients)} patients for batch {START_BATCH}")

# One (original id, patient number, index within batch) tuple per patient;
# numbering starts at patients_start.
patient_info_list = [
    (patient_original_id, patient_number, patient_number - patients_start)
    for patient_number, patient_original_id in enumerate(batch_patients, start=patients_start)
]

# Run every patient job (threaded or sequential) and fold the outcomes into
# batch_features / processing_stats / the success and failure counters.
from concurrent.futures import TimeoutError as FuturesTimeoutError

# Threading lock for result accumulation. All accumulation below happens on the
# thread running this cell (as_completed is consumed here), so the lock is a
# safeguard rather than a requirement.
results_lock = threading.Lock()

# Running count of apnea-labelled frames collected so far. It is built from the
# per-patient stats instead of rescanning all of batch_features after every
# patient; the final value is the same.
apnea_frames = 0

if ENABLE_THREADING:
    print(f"\n🧵 STARTING THREADED PROCESSING (CODESPACES):")
    print(f"   🔧 Concurrent patients: {current_threads} (configurable in Cell 1)")
    print(f"   Timeout per patient: {current_timeout/60:.1f} minutes")
    print(f"   Total patients: {len(patient_info_list)}")
    print(f"   💡 CODESPACES TIP: Try higher thread counts than Colab!")

    # Overall deadline for collecting every result in this batch.
    batch_timeout = current_timeout * len(patient_info_list)

    # Use ThreadPoolExecutor for parallel processing
    with ThreadPoolExecutor(max_workers=current_threads, thread_name_prefix="Patient") as executor:
        # Submit all patient processing jobs
        future_to_patient = {
            executor.submit(process_single_patient_threading, patient_info): patient_info[0]
            for patient_info in patient_info_list
        }

        print(f"📤 Submitted {len(future_to_patient)} patient processing jobs to {current_threads} threads")

        try:
            # Collect results as they complete
            for future in as_completed(future_to_patient, timeout=batch_timeout):
                patient_original_id = future_to_patient[future]

                try:
                    # as_completed only yields finished futures, so result()
                    # returns (or re-raises the worker's exception) immediately.
                    success, patient_features, stats = future.result()

                    with results_lock:
                        processing_stats.append(stats)

                        if success:
                            batch_features.extend(patient_features)
                            successful_patients += 1
                            apnea_frames += stats['apnea_frames']

                            # Progress update with timing details
                            total_frames = len(batch_features)

                            print(f"🎯 [{stats['thread_id']}] SUCCESS: {stats['patient_id']} - {stats['frames_extracted']} frames")
                            print(f"   ⏱️ Timing: Download {stats['download_time']:.1f}s | Processing {stats['feature_extraction_time']:.1f}s | Cleanup {stats['cleanup_time']:.1f}s")
                            print(f"📊 Batch progress: {successful_patients + failed_patients}/{len(patient_info_list)} patients, {total_frames} total frames")
                        else:
                            failed_patients += 1
                            print(f"❌ [{stats['thread_id']}] FAILED: {stats['patient_id']} - {stats['error_message']}")

                        # Show system status every 5 patients
                        if (successful_patients + failed_patients) % 5 == 0:
                            current_system_info = get_system_info()
                            if current_system_info:
                                print(f"📊 CODESPACES STATUS: CPU {current_system_info['cpu_percent']:.1f}% | RAM {current_system_info['memory_percent']:.1f}% | Disk {current_system_info['disk_free_gb']:.1f}GB free")

                except Exception as e:
                    with results_lock:
                        failed_patients += 1
                    print(f"❌ Exception for {patient_original_id}: {e}")

        except FuturesTimeoutError:
            # The overall deadline passed with jobs still outstanding. Keep
            # what was collected instead of aborting the cell.
            unfinished_ids = [
                pid for fut, pid in future_to_patient.items() if not fut.done()
            ]
            # cancel() stops queued jobs from starting; it has no effect on
            # jobs that are already running or finished.
            for fut in future_to_patient:
                fut.cancel()

            # Every patient not yet tallied is counted as failed so the
            # totals still add up to the batch size.
            unaccounted = max(0, len(patient_info_list) - successful_patients - failed_patients)
            failed_patients += unaccounted

            print(f"⏰ Batch timeout after {batch_timeout/60:.1f} minutes: {unaccounted} patients not collected (counted as failed)")
            if unfinished_ids:
                print(f"   Unfinished patients: {', '.join(str(pid) for pid in unfinished_ids)}")
            # Leaving the 'with' block waits for running workers to return.
            print(f"   ⏳ Waiting for running worker threads to finish...")

else:
    print(f"\n📜 STARTING SEQUENTIAL PROCESSING:")

    # Sequential processing (original method)
    for patient_info in tqdm(patient_info_list, desc=f"Batch {START_BATCH} Progress"):
        success, patient_features, stats = process_single_patient_threading(patient_info)

        processing_stats.append(stats)

        if success:
            batch_features.extend(patient_features)
            successful_patients += 1
            apnea_frames += stats['apnea_frames']
        else:
            failed_patients += 1

        # Show progress
        total_frames = len(batch_features)
        print(f"📊 Progress: {successful_patients + failed_patients}/{len(patient_info_list)} patients, {total_frames} frames")

# Wall-clock duration of the whole batch, measured from batch_start_time.
batch_elapsed = time.time() - batch_start_time

# Final batch statistics with threading analysis
print(f"\n{'='*60}")
print(f"🏁 BATCH {START_BATCH} PROCESSING COMPLETE (CODESPACES)!")
print(f"⏱️ Total time: {batch_elapsed/60:.1f} minutes")
print(f"🧵 Threading: {'ENABLED' if ENABLE_THREADING else 'DISABLED'} (used {current_threads} threads)")
print(f"✅ Successful patients: {successful_patients}")
print(f"❌ Failed patients: {failed_patients}")
print(f"📊 Total frames extracted: {len(batch_features):,}")

if batch_features:
    apnea_count = sum(1 for f in batch_features if f['apnea_label'] == 1)
    apnea_rate = (apnea_count / len(batch_features)) * 100
    print(f"🚨 Apnea frames: {apnea_count:,} ({apnea_rate:.1f}%)")
    print(f"😴 Normal frames: {len(batch_features) - apnea_count:,} ({100-apnea_rate:.1f}%)")

    # Enhanced threading performance analysis for Codespaces
    if ENABLE_THREADING and processing_stats:
        successful_stats = [s for s in processing_stats if s['success']]
        if successful_stats:
            avg_total_time = sum(s['processing_time'] for s in successful_stats) / len(successful_stats)
            avg_download_time = sum(s['download_time'] for s in successful_stats) / len(successful_stats)
            avg_processing_time = sum(s['feature_extraction_time'] for s in successful_stats) / len(successful_stats)
            avg_cleanup_time = sum(s['cleanup_time'] for s in successful_stats) / len(successful_stats)

            # Sequential estimate = average per-patient time × patients in the
            # batch; speedup compares that against the measured wall clock.
            estimated_sequential_time = avg_total_time * len(patient_info_list)
            actual_speedup = estimated_sequential_time / batch_elapsed
            theoretical_speedup = current_threads
            efficiency = (actual_speedup / theoretical_speedup) * 100

            print(f"\n🚀 CODESPACES THREADING PERFORMANCE ANALYSIS:")
            print(f"   🔧 Thread count used: {current_threads}")
            print(f"   ⏱️ Average times per patient:")
            print(f"      - Download: {avg_download_time/60:.1f} minutes")
            print(f"      - Processing: {avg_processing_time/60:.1f} minutes")
            print(f"      - Cleanup: {avg_cleanup_time:.1f} seconds")
            print(f"      - Total: {avg_total_time/60:.1f} minutes")
            print(f"   🏃 Speedup achieved: {actual_speedup:.1f}x (vs sequential)")
            print(f"   🎯 Theoretical max: {theoretical_speedup}x")
            print(f"   📈 Threading efficiency: {efficiency:.1f}%")

            # Codespaces-specific recommendations
            print(f"\n💡 CODESPACES OPTIMIZATION SUGGESTIONS:")
            if efficiency > 80:
                print(f"   ✅ Excellent efficiency! Try increasing to {current_threads + 2} threads")
                print(f"   💡 Codespaces can often handle higher thread counts than Colab")
            elif efficiency > 60:
                print(f"   ⚠️ Good efficiency. Try {current_threads + 2} threads but monitor resources")
            elif efficiency > 40:
                print(f"   ⚠️ Moderate efficiency. Consider staying at {current_threads} threads")
            else:
                print(f"   ❌ Low efficiency. Try reducing to {max(2, current_threads - 2)} threads")

            print(f"   📊 Monitor with 'htop' command and adjust MAX_CONCURRENT_PATIENTS accordingly")
else:
    # Belongs to the batch_features check: it was previously attached to the
    # system-status check below, so it fired when monitoring was unavailable
    # and stayed silent when the batch was actually empty.
    print(f"⚠️ No features extracted in this batch")

# Final system status (shown whether or not any features were extracted)
final_system_info = get_system_info()
if final_system_info and initial_system_info:
    print(f"\n📊 FINAL CODESPACES SYSTEM STATUS:")
    print(f"   CPU: {final_system_info['cpu_percent']:.1f}% (was {initial_system_info['cpu_percent']:.1f}%)")
    print(f"   RAM: {final_system_info['memory_used_gb']:.1f}/{final_system_info['memory_total_gb']:.1f} GB ({final_system_info['memory_percent']:.1f}%)")
    print(f"   RAM change: {final_system_info['memory_used_gb'] - initial_system_info['memory_used_gb']:+.1f} GB")
    print(f"   Disk: {final_system_info['disk_free_gb']:.1f} GB free")

print(f"\n🔧 TO EXPERIMENT WITH DIFFERENT THREAD COUNTS (CODESPACES):")
print(f"   1. Change MAX_CONCURRENT_PATIENTS in Cell 1 (try {current_threads + 2})")
print(f"   2. Re-run this Cell 8 to test the new thread count")
print(f"   3. Compare the 'Threading efficiency' percentage")
print(f"   4. Use 'htop' in terminal to monitor system resources")
print(f"   5. Find the sweet spot for your Codespaces instance!")

# Store processing statistics for analysis
globals()['batch_processing_stats'] = processing_stats
globals()['threading_experiment_results'] = {
    'platform': 'codespaces',
    'thread_count': current_threads,
    'batch_time_minutes': batch_elapsed/60,
    'successful_patients': successful_patients,
    'failed_patients': failed_patients,
    'frames_extracted': len(batch_features)
}

In [None]:
# Cell 9: Save Batch Results to Local Storage (Codespaces)
# Writes this batch's feature rows to a CSV under OUTPUT_BASE_PATH and prints
# a short report (sample rows, correlations, per-patient counts).
print(f"💾 SAVING BATCH {START_BATCH} TO LOCAL STORAGE (CODESPACES)")
print(f"{'='*50}")

if not batch_features:
    print(f"⚠️ No data to save - batch processing failed")
else:
    # Build the table from the accumulated feature records
    print(f"📊 Converting {len(batch_features)} records to DataFrame...")
    df = pd.DataFrame(batch_features)

    # Quick overview of what was built
    print(f"✅ DataFrame created: {df.shape}")
    print(f"👥 Unique patients: {df['patient_id'].nunique()}")

    # Columns prefixed 'clean_' are the ones treated as features below
    feature_cols = [col for col in df.columns if col.startswith('clean_')]
    print(f"🎯 Feature columns: {len(feature_cols)}")

    # Write the CSV for this batch
    batch_file_path = os.path.join(OUTPUT_BASE_PATH, CURRENT_BATCH_FILE)
    print(f"📁 Saving to: {batch_file_path}")

    df.to_csv(batch_file_path, index=False)

    # Confirm the file exists before reporting on it
    if not os.path.exists(batch_file_path):
        print(f"❌ Error: File was not saved to {batch_file_path}")
    else:
        file_size_mb = os.path.getsize(batch_file_path) / (1024 * 1024)
        print(f"✅ Successfully saved: {CURRENT_BATCH_FILE} ({file_size_mb:.1f} MB)")

        # First few rows of the key columns
        print(f"\n📋 SAMPLE DATA:")
        print(df[['patient_id', 'timestamp', 'apnea_label', 'clean_rms', 'clean_zcr']].head())

        # Absolute correlation of each feature column with the label, strongest first
        print(f"\n🔗 TOP 5 FEATURE CORRELATIONS WITH APNEA:")
        label_correlations = df[feature_cols].corrwith(df['apnea_label'])
        correlations = label_correlations.abs().sort_values(ascending=False)
        for feature, corr in correlations.head().items():
            print(f"   {feature}: {corr:.3f}")

        # Frame count, apnea count and apnea rate for every patient
        print(f"\n👤 PER-PATIENT BREAKDOWN:")
        label_summary = df.groupby('patient_id').agg({
            'apnea_label': ['count', 'sum', 'mean']
        })
        patient_stats = label_summary.round(3)

        # Scalar lookups per (patient, column) keep each column's own dtype
        for patient in df['patient_id'].unique():
            count = patient_stats.loc[patient, ('apnea_label', 'count')]
            apnea = patient_stats.loc[patient, ('apnea_label', 'sum')]
            rate = patient_stats.loc[patient, ('apnea_label', 'mean')] * 100
            print(f"   {patient}: {count} frames, {apnea} apnea ({rate:.1f}%)")

# Show all existing batch files
# Lists every saved batch CSV, reports progress towards END_BATCH and, once
# enough batch files exist, concatenates them into one combined CSV.
print(f"\n📂 ALL BATCH FILES IN LOCAL STORAGE:")
existing_batches = [
    f
    for f in os.listdir(OUTPUT_BASE_PATH)
    if f.startswith('codespaces_dataset_batch') and f.endswith('.csv')
]

if not existing_batches:
    print(f"   No batch files found")
else:
    # Per-file sizes plus a running total
    total_size_mb = 0
    for batch_file in sorted(existing_batches):
        file_path = os.path.join(OUTPUT_BASE_PATH, batch_file)
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
        total_size_mb += file_size_mb
        print(f"   ✅ {batch_file} ({file_size_mb:.1f} MB)")

    print(f"\n📊 Total saved data: {total_size_mb:.1f} MB across {len(existing_batches)} batches")

    # Progress is judged by the number of batch files present
    completed_batches = len(existing_batches)
    remaining_batches = END_BATCH - completed_batches
    if remaining_batches <= 0:
        print(f"🎉 All {END_BATCH} batches completed! Dataset ready for analysis.")

        # Stitch every batch CSV into one dataset (filename order)
        print(f"\n🔄 Creating combined dataset...")
        all_batch_dfs = []
        for batch_file in sorted(existing_batches):
            file_path = os.path.join(OUTPUT_BASE_PATH, batch_file)
            batch_df = pd.read_csv(file_path)
            all_batch_dfs.append(batch_df)

        combined_df = pd.concat(all_batch_dfs, ignore_index=True)
        combined_file_path = os.path.join(OUTPUT_BASE_PATH, 'codespaces_complete_dataset.csv')
        combined_df.to_csv(combined_file_path, index=False)

        combined_size_mb = os.path.getsize(combined_file_path) / (1024 * 1024)
        print(f"✅ Combined dataset saved: codespaces_complete_dataset.csv ({combined_size_mb:.1f} MB)")
        print(f"📊 Total records: {len(combined_df):,}")
        print(f"👥 Total patients: {combined_df['patient_id'].nunique()}")
    else:
        print(f"🚀 Progress: {completed_batches}/{END_BATCH} batches complete")
        print(f"📋 Next: Set START_BATCH = {completed_batches + 1} to continue")

# Show disk usage
# Figures are for the filesystem holding the current directory, in units of
# 1024**3 bytes.
disk_usage = shutil.disk_usage('.')
bytes_per_gb = 1024**3
total_space_gb = disk_usage.total / bytes_per_gb
used_space_gb = disk_usage.used / bytes_per_gb
free_space_gb = disk_usage.free / bytes_per_gb

print(f"\n💾 CODESPACES DISK USAGE:")
print(f"   Used: {used_space_gb:.1f} GB")
print(f"   Available: {free_space_gb:.1f} GB")
print(f"   Total: {total_space_gb:.1f} GB")

# Closing banner for this batch
print(f"\n🎯 BATCH {START_BATCH} PROCESSING COMPLETE (CODESPACES)!")
print(f"Data safely saved to local storage: {OUTPUT_BASE_PATH}")
print(f"📁 Files are accessible from VS Code file explorer or terminal")