# Clean Feature Extraction for Sleep Apnea Detection
## Modified from augumented_data_prep.ipynb to focus on clean feature extraction only
### Noise injection and denoising code preserved but commented out for future use

In [1]:
# # Cell 2: Imports and Setup
# print("--- Importing libraries... ---")

# # Standard libraries
# import os
# import re
# import shutil
# import numpy as np
# import pandas as pd
# from tqdm.notebook import tqdm
# import soundfile as sf
# import warnings
# import time
# # import subprocess # For running external scripts - COMMENTED OUT for clean extraction

# # For audio and signal processing
# import librosa
# import mne
# from xml.etree import ElementTree as ET

# # --- Import your custom XML parser ---
# import sys
# sys.path.append('../src') # Add the directory to Python's path
# from working_with_xml import extract_apnea_events # Your specific function

# # Suppress minor warnings from libraries to keep output clean
# warnings.filterwarnings("ignore", category=RuntimeWarning)

# print("Libraries imported successfully.")

import pandas as pd

# Load all datasets
batch5 = pd.read_csv('../data/datasets/colab_dataset_batch5.csv')
batch6 = pd.read_csv('../data/datasets/colab_dataset_batch6.csv')
local = pd.read_csv('../data/datasets/final_local_dataset.csv')

# Combine into single dataset
combined_dataset = pd.concat([batch5, batch6, local], ignore_index=True)

# Save combined dataset
combined_dataset.to_csv('../data/datasets/combined_complete_dataset.csv', index=False)


In [8]:
# Cell 3: Configuration - Clean Feature Extraction Focus
print("--- Configuring clean feature extraction pipeline (30-second frames)... ---")

# --- FILE & PATH SETTINGS ---
# Path to your local patient data
RAW_PATIENT_DATA_BASE_DIR = "F:\Solo All In One Docs\Evaluating-Noise-Reduction-Techniques\data\sleep_data"  # Update this to your local path
CSV_OUTPUT_PATH = "F:\Solo All In One Docs\Evaluating-Noise-Reduction-Techniques\data\sleep_data"  # Where to save the final dataset

# COMMENTED OUT - Noise/Denoising paths for future use
# NOISE_AUDIO_DIR = "noise_audio" 
# DENOISER_SCRIPTS_DIR = os.path.abspath("../src")

# List the specific patient folders you want to process.
PATIENT_FOLDERS_TO_PROCESS = []  # Empty = process all found patient folders
# Example: ['patient_01', 'patient_02', 'patient_03']

# --- DEBUG MODE SETTINGS ---
DEBUG_MODE = True  # Set to False for full processing
DEBUG_PATIENT_COUNT = 6 # In debug mode, process only this many patients

# --- DATA PROCESSING SETTINGS (UPDATED FOR TEMPORAL ANALYSIS) ---
AUDIO_CHANNEL_NAME = 'Mic'  # The microphone channel name from your EDF files
FRAME_DURATION_SEC = 30.0  # 30-second frames for temporal pattern analysis
CHUNK_DURATION_MIN = 10  # Process audio in chunks to manage RAM
APNEA_THRESHOLD = 0.1  # 10% apnea content threshold for binary labeling

# --- TEMPORAL FEATURE SETTINGS ---
OVERLAP_RATIO = 0.5  # 50% overlap between consecutive frames
BREATHING_RATE_WINDOW = 10  # Window size for breathing rate analysis (seconds)

# COMMENTED OUT - Noise injection settings for future use
# NOISE_CATEGORIES = ['vacuum_cleaner', 'cat', 'door_wood_creaks']
# NOISE_LEVEL_RMS_RATIO = 0.3333
# SAVE_SNIPPETS = True
# SNIPPET_DURATION_SEC = 5

# COMMENTED OUT - Denoiser script mapping for future use
# DENOISER_SCRIPT_MAP = {
#     "spectral": "spec_subtraction_same_file.py",
#     "wiener": "wiener_filtering.py", 
#     "logmmse": "log_mmse.py",
# }

print(f"Configuration set for 30-second frame extraction with {APNEA_THRESHOLD*100}% apnea threshold.")

--- Configuring clean feature extraction pipeline (30-second frames)... ---
Configuration set for 30-second frame extraction with 10.0% apnea threshold.


  RAW_PATIENT_DATA_BASE_DIR = "F:\Solo All In One Docs\Evaluating-Noise-Reduction-Techniques\data\sleep_data"  # Update this to your local path
  CSV_OUTPUT_PATH = "F:\Solo All In One Docs\Evaluating-Noise-Reduction-Techniques\data\sleep_data"  # Where to save the final dataset


In [9]:
# Cell 4: Helper Functions + Main Processing Function
print("--- Defining helper functions and main processing function for 30-second temporal feature extraction... ---")

# --- Wrapper for your external RML parser ---
def parse_respironics_rml(rml_path):
    """Wraps your extract_apnea_events function to return (start_time, end_time) tuples."""
    apnea_event_data = extract_apnea_events(rml_path, output_csv=None)
    events_only_times = [(float(start), float(end)) for event_type, start, end in apnea_event_data]
    return events_only_times

def calculate_apnea_proportion(frame_start_sec, frame_duration_sec, apnea_events):
    """Calculate proportion of frame that contains apnea events."""
    frame_end_sec = frame_start_sec + frame_duration_sec
    apnea_seconds = 0
    
    for start, end in apnea_events:
        # Calculate overlap between frame and apnea event
        overlap_start = max(frame_start_sec, start)
        overlap_end = min(frame_end_sec, end)
        if overlap_start < overlap_end:
            apnea_seconds += (overlap_end - overlap_start)
    
    return apnea_seconds / frame_duration_sec

def label_temporal_frame(frame_start_sec, frame_duration_sec, apnea_events, threshold=0.1):
    """Label frame based on apnea proportion with threshold."""
    proportion = calculate_apnea_proportion(frame_start_sec, frame_duration_sec, apnea_events)
    return 1 if proportion > threshold else 0, proportion

def extract_basic_features(frame, sr):
    """Extract basic acoustic features from a frame."""
    # Ensure frame has data
    if len(frame) == 0:
        return {}
        
    rms = librosa.feature.rms(y=frame).mean()
    zcr = librosa.feature.zero_crossing_rate(y=frame).mean()
    
    try:
        centroid = librosa.feature.spectral_centroid(y=frame, sr=sr).mean()
        bandwidth = librosa.feature.spectral_bandwidth(y=frame, sr=sr).mean()
        rolloff = librosa.feature.spectral_rolloff(y=frame, sr=sr).mean()
    except:
        centroid = bandwidth = rolloff = 0
    
    # Extract MFCCs
    try:
        mfccs = librosa.feature.mfcc(y=frame, sr=sr, n_mfcc=13)
        mfccs_mean = mfccs.mean(axis=1)
        mfccs_std = mfccs.std(axis=1)
    except:
        mfccs_mean = np.zeros(13)
        mfccs_std = np.zeros(13)
    
    features = {
        'rms': rms,
        'zcr': zcr, 
        'centroid': centroid,
        'bandwidth': bandwidth,
        'rolloff': rolloff
    }
    
    # Add MFCC means and standard deviations
    for i, (mean_val, std_val) in enumerate(zip(mfccs_mean, mfccs_std), 1):
        features[f'mfcc_{i}_mean'] = mean_val
        features[f'mfcc_{i}_std'] = std_val
    
    return features

def extract_temporal_features(frame, sr, window_sec=5):
    """Extract temporal breathing pattern features from 30-second frame."""
    frame_duration = len(frame) / sr
    
    # Split frame into sub-windows for temporal analysis
    window_samples = int(window_sec * sr)
    n_windows = int(len(frame) // window_samples)
    
    if n_windows < 2:
        return {}
    
    # Extract features from each sub-window
    sub_window_features = []
    for i in range(n_windows):
        start_idx = i * window_samples
        end_idx = min((i + 1) * window_samples, len(frame))
        sub_frame = frame[start_idx:end_idx]
        
        if len(sub_frame) > sr * 0.5:  # At least 0.5 seconds
            sub_rms = librosa.feature.rms(y=sub_frame).mean()
            sub_zcr = librosa.feature.zero_crossing_rate(y=sub_frame).mean()
            sub_window_features.append({'rms': sub_rms, 'zcr': sub_zcr})
    
    if len(sub_window_features) < 2:
        return {}
    
    # Calculate temporal variability features
    rms_values = [f['rms'] for f in sub_window_features]
    zcr_values = [f['zcr'] for f in sub_window_features]
    
    temporal_features = {
        'rms_temporal_mean': np.mean(rms_values),
        'rms_temporal_std': np.std(rms_values),
        'rms_temporal_range': np.max(rms_values) - np.min(rms_values),
        'zcr_temporal_mean': np.mean(zcr_values),
        'zcr_temporal_std': np.std(zcr_values),
        'zcr_temporal_range': np.max(zcr_values) - np.min(zcr_values),
    }
    
    # Breathing regularity (coefficient of variation)
    if temporal_features['rms_temporal_mean'] > 0:
        temporal_features['rms_regularity'] = temporal_features['rms_temporal_std'] / temporal_features['rms_temporal_mean']
    else:
        temporal_features['rms_regularity'] = 0
        
    if temporal_features['zcr_temporal_mean'] > 0:
        temporal_features['zcr_regularity'] = temporal_features['zcr_temporal_std'] / temporal_features['zcr_temporal_mean']
    else:
        temporal_features['zcr_regularity'] = 0
    
    return temporal_features

def extract_silence_features(frame, sr, silence_threshold=0.01):
    """Extract silence/pause detection features."""
    # Identify silent segments
    frame_abs = np.abs(frame)
    silent_mask = frame_abs < silence_threshold
    
    # Calculate silence statistics
    total_samples = len(frame)
    silent_samples = np.sum(silent_mask)
    silence_ratio = silent_samples / total_samples
    
    # Find continuous silent segments
    silent_segments = []
    in_silence = False
    silence_start = 0
    
    for i, is_silent in enumerate(silent_mask):
        if is_silent and not in_silence:
            silence_start = i
            in_silence = True
        elif not is_silent and in_silence:
            silence_duration = (i - silence_start) / sr
            silent_segments.append(silence_duration)
            in_silence = False
    
    # Handle case where frame ends in silence
    if in_silence:
        silence_duration = (len(silent_mask) - silence_start) / sr
        silent_segments.append(silence_duration)
    
    silence_features = {
        'silence_ratio': silence_ratio,
        'silence_count': len(silent_segments),
        'silence_mean_duration': np.mean(silent_segments) if silent_segments else 0,
        'silence_max_duration': np.max(silent_segments) if silent_segments else 0,
        'silence_total_duration': np.sum(silent_segments) if silent_segments else 0,
    }
    
    return silence_features

def extract_comprehensive_features(frame, sr):
    """Extract all features for 30-second temporal analysis."""
    # Basic acoustic features
    basic_features = extract_basic_features(frame, sr)
    
    # Temporal breathing pattern features
    temporal_features = extract_temporal_features(frame, sr)
    
    # Silence/pause features
    silence_features = extract_silence_features(frame, sr)
    
    # Combine all features
    all_features = {}
    all_features.update(basic_features)
    all_features.update(temporal_features)
    all_features.update(silence_features)
    
    return all_features

def process_single_patient(patient_folder_name, patient_base_dir, patient_idx=0, total_patients=1):
    """Process a single patient with 30-second temporal frames."""
    start_time = time.time()
    print(f"\n{'='*60}")
    print(f"PATIENT {patient_idx+1}/{total_patients}: {patient_folder_name}")
    print(f"{'='*60}")
    
    patient_local_dir = os.path.join(patient_base_dir, patient_folder_name)
    patient_features = []
    
    # Find EDF and RML files
    print(f"üîç Scanning files in {patient_local_dir}...")
    edf_files = sorted([f for f in os.listdir(patient_local_dir) if f.endswith('.edf')])
    rml_files = sorted([f for f in os.listdir(patient_local_dir) if f.endswith('.rml')])
    
    print(f"   Found {len(edf_files)} EDF files: {edf_files}")
    print(f"   Found {len(rml_files)} RML files: {rml_files}")
    
    if not edf_files:
        print(f"‚ùå No EDF files found for {patient_folder_name}")
        return []
        
    if not rml_files:
        print(f"‚ùå No RML files found for {patient_folder_name}")
        return []
    
    # Process each EDF file
    for edf_idx, edf_file in enumerate(edf_files):
        print(f"\nüìÅ Processing EDF {edf_idx+1}/{len(edf_files)}: {edf_file}")
        edf_path = os.path.join(patient_local_dir, edf_file)
        rml_path = os.path.join(patient_local_dir, rml_files[0])  # Assume first RML file
        
        try:
            # Load apnea events
            print(f"   üìã Loading apnea events from {rml_files[0]}...")
            apnea_events = parse_respironics_rml(rml_path)
            print(f"   ‚úÖ Found {len(apnea_events)} apnea events")
            
            # Load EDF file
            print(f"   üéµ Loading EDF file...")
            raw = mne.io.read_raw_edf(edf_path, preload=False, verbose=False)
            fs = int(raw.info['sfreq'])
            total_duration_sec = raw.n_times / fs
            total_duration_min = total_duration_sec / 60
            
            print(f"   üìä Sample rate: {fs} Hz")
            print(f"   ‚è±Ô∏è  Total duration: {total_duration_min:.1f} minutes ({total_duration_sec:.0f} seconds)")
            print(f"   üé§ Available channels: {raw.ch_names}")
            
            # Check if microphone channel exists
            if AUDIO_CHANNEL_NAME not in raw.ch_names:
                print(f"   ‚ùå Channel '{AUDIO_CHANNEL_NAME}' not found in {edf_file}")
                continue
                
            raw.pick_channels([AUDIO_CHANNEL_NAME])
            print(f"   ‚úÖ Selected channel: {AUDIO_CHANNEL_NAME}")
            
            # Calculate frame parameters
            frame_size_samples = int(FRAME_DURATION_SEC * fs)
            overlap_samples = int(frame_size_samples * OVERLAP_RATIO)
            step_samples = frame_size_samples - overlap_samples
            
            # Estimate number of 30-second frames
            total_frames = max(1, (raw.n_times - frame_size_samples) // step_samples + 1)
            
            print(f"   üì¶ Processing in {FRAME_DURATION_SEC}-second frames with {OVERLAP_RATIO*100}% overlap")
            print(f"   üéûÔ∏è  Estimated frames: {total_frames}")
            
            frame_count = 0
            
            # Process in overlapping 30-second frames
            for frame_start in range(0, raw.n_times - frame_size_samples + 1, step_samples):
                frame_end = min(frame_start + frame_size_samples, raw.n_times)
                
                if frame_end - frame_start < frame_size_samples * 0.8:  # Skip short frames (< 80% of target)
                    continue
                
                frame_progress = (frame_count + 1) / total_frames * 100
                print(f"      üîÑ Frame {frame_count+1}/{total_frames} ({frame_progress:.1f}%): samples {frame_start}-{frame_end}")
                
                # Load frame
                audio_frame, _ = raw[:, frame_start:frame_end]
                audio_frame = audio_frame.flatten()
                
                # Calculate timestamp
                timestamp = frame_start / fs
                
                print(f"         ‚è∞ Time: {timestamp:.1f}s - {timestamp + FRAME_DURATION_SEC:.1f}s")
                
                # Calculate apnea label using proportion-based approach
                apnea_label, apnea_proportion = label_temporal_frame(
                    timestamp, FRAME_DURATION_SEC, apnea_events, APNEA_THRESHOLD
                )
                
                print(f"         üè∑Ô∏è  Apnea proportion: {apnea_proportion:.3f}, Label: {apnea_label}")
                
                # Extract comprehensive features
                print(f"         üéØ Extracting features...")
                features = extract_comprehensive_features(audio_frame, fs)
                
                # Create feature record
                feature_record = {
                    'patient_id': patient_folder_name,
                    'timestamp': timestamp,
                    'frame_duration': FRAME_DURATION_SEC,
                    'apnea_label': apnea_label,
                    'apnea_proportion': apnea_proportion,
                    **{f'clean_{k}': v for k, v in features.items()}
                }
                
                patient_features.append(feature_record)
                frame_count += 1
                
                print(f"         ‚úÖ Frame {frame_count} processed successfully")
                
        except Exception as e:
            print(f"   ‚ùå Error processing {edf_file}: {e}")
            import traceback
            traceback.print_exc()
            continue
    
    # Final patient summary
    elapsed_time = time.time() - start_time
    apnea_count = sum(1 for f in patient_features if f['apnea_label'] == 1)
    apnea_percentage = (apnea_count / len(patient_features) * 100) if patient_features else 0
    
    if patient_features:
        avg_apnea_proportion = np.mean([f['apnea_proportion'] for f in patient_features])
    else:
        avg_apnea_proportion = 0
    
    print(f"\nüèÅ PATIENT {patient_folder_name} COMPLETE:")
    print(f"   üìä Total 30-second frames extracted: {len(patient_features)}")
    print(f"   üö® Apnea frames (>{APNEA_THRESHOLD*100}% threshold): {apnea_count} ({apnea_percentage:.1f}%)")
    print(f"   üò¥ Normal frames: {len(patient_features) - apnea_count} ({100-apnea_percentage:.1f}%)")
    print(f"   üìà Average apnea proportion: {avg_apnea_proportion:.3f}")
    print(f"   ‚è±Ô∏è  Processing time: {elapsed_time:.1f} seconds")
    
    if elapsed_time > 0:
        print(f"   üìà Frames per minute: {len(patient_features)/(elapsed_time/60):.1f}")
    
    return patient_features

# COMMENTED OUT - Noise injection and denoising functions for future use
# def add_noise(clean_signal, noise_signal, sr, noise_level_rms_ratio):
#     """Mixes a clean signal with a noise signal using the provided RMS ratio logic."""
#     # Implementation preserved for future use
#     pass
# 
# def run_denoiser_script(script_name, input_wav_path, output_wav_path, denoiser_script_map, sr, current_temp_dir):
#     """Wrapper for calling external denoiser scripts"""
#     # Implementation preserved for future use
#     pass

print("Enhanced helper functions and process_single_patient function defined for 30-second temporal feature extraction.")

--- Defining helper functions and main processing function for 30-second temporal feature extraction... ---
Enhanced helper functions and process_single_patient function defined for 30-second temporal feature extraction.


In [10]:
# Cell 4: Parallel Processing Setup (Before Main Execution)
print("--- Setting up parallel processing capabilities... ---")

from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
import multiprocessing as mp
import sys
import os

def setup_parallel_processing():
    """Setup directories and tracking for parallel processing"""
    temp_dir = Path('./temp_features/')
    temp_dir.mkdir(exist_ok=True)
    
    final_dataset_path = './clean_features_dataset_30sec_incremental.csv'
    progress_file = temp_dir / 'progress.txt'
    
    return temp_dir, final_dataset_path, progress_file

def get_completed_patients(progress_file):
    """Get list of already processed patients"""
    if progress_file.exists():
        with open(progress_file, 'r') as f:
            return set(line.strip() for line in f)
    return set()

def append_to_final_dataset(patient_csv, final_dataset_path, patient_id, progress_file):
    """Thread-safe append to final dataset"""
    patient_df = pd.read_csv(patient_csv)
    
    # Lock-free append (pandas handles this well)
    if Path(final_dataset_path).exists():
        # Read, append, write
        existing_df = pd.read_csv(final_dataset_path)
        combined_df = pd.concat([existing_df, patient_df], ignore_index=True)
        combined_df.to_csv(final_dataset_path, index=False)
    else:
        # First patient - just copy
        patient_df.to_csv(final_dataset_path, index=False)
    
    # Update progress tracking
    with open(progress_file, 'a') as f:
        f.write(f"{patient_id}\n")
    
    current_size = len(pd.read_csv(final_dataset_path))
    print(f"üìä Dataset now has {current_size:,} total frames")

def process_patient_safe(patient_id, temp_dir, final_dataset_path, progress_file, patient_base_dir):
    """Safe wrapper for patient processing with proper resource management"""
    lock_file = temp_dir / f"{patient_id}.lock"
    patient_csv = temp_dir / f"{patient_id}_features.csv"
    
    try:
        # Check if already completed
        completed_patients = get_completed_patients(progress_file)
        if patient_id in completed_patients:
            print(f"‚úÖ {patient_id} already in final dataset")
            return True
        
        # Check if CSV exists (completed but not merged)
        if patient_csv.exists():
            print(f"üìÅ {patient_id} features found, merging to dataset...")
            append_to_final_dataset(patient_csv, final_dataset_path, patient_id, progress_file)
            patient_csv.unlink()  # Remove temp file after merging
            return True
        
        # Try to acquire processing lock
        try:
            lock_file.touch(exist_ok=False)
            print(f"üîÑ Processing {patient_id}...")
            
            # Process the patient with resource management
            patient_features = process_single_patient_safe(patient_id, patient_base_dir)
            
            if patient_features:
                # Save individual patient features
                patient_df = pd.DataFrame(patient_features)
                patient_df.to_csv(patient_csv, index=False)
                
                # Append to final dataset
                append_to_final_dataset(patient_csv, final_dataset_path, patient_id, progress_file)
                
                # Cleanup
                patient_csv.unlink()  # Remove temp file after merging
            
            lock_file.unlink()  # Remove lock
            print(f"‚úÖ {patient_id} completed and merged")
            return True
            
        except FileExistsError:
            print(f"‚è≥ {patient_id} being processed by another core...")
            return False
            
    except Exception as e:
        print(f"‚ùå Error processing {patient_id}: {e}")
        import traceback
        traceback.print_exc()
        if lock_file.exists():
            lock_file.unlink()
        return False

def process_single_patient_safe(patient_folder_name, patient_base_dir):
    """Memory-efficient version of process_single_patient for multiprocessing"""
    # Re-import everything needed in the subprocess
    import os
    import time
    import numpy as np
    import pandas as pd
    import librosa
    import mne
    import sys
    sys.path.append('../src')
    from working_with_xml import extract_apnea_events
    
    # Suppress warnings in subprocess
    import warnings
    warnings.filterwarnings('ignore')
    
    # Use configuration from global scope (these need to be defined)
    AUDIO_CHANNEL_NAME = 'Mic'
    FRAME_DURATION_SEC = 30.0
    OVERLAP_RATIO = 0.5
    APNEA_THRESHOLD = 0.1
    
    def parse_respironics_rml_local(rml_path):
        apnea_event_data = extract_apnea_events(rml_path, output_csv=None)
        return [(float(start), float(end)) for event_type, start, end in apnea_event_data]
    
    def calculate_apnea_proportion_local(frame_start_sec, frame_duration_sec, apnea_events):
        frame_end_sec = frame_start_sec + frame_duration_sec
        apnea_seconds = 0
        for start, end in apnea_events:
            overlap_start = max(frame_start_sec, start)
            overlap_end = min(frame_end_sec, end)
            if overlap_start < overlap_end:
                apnea_seconds += (overlap_end - overlap_start)
        return apnea_seconds / frame_duration_sec
    
    def label_temporal_frame_local(frame_start_sec, frame_duration_sec, apnea_events, threshold=0.1):
        proportion = calculate_apnea_proportion_local(frame_start_sec, frame_duration_sec, apnea_events)
        return 1 if proportion > threshold else 0, proportion
    
    def extract_features_local(frame, sr):
        """Simplified feature extraction for multiprocessing"""
        if len(frame) == 0:
            return {}
        
        try:
            # Basic features
            rms = librosa.feature.rms(y=frame).mean()
            zcr = librosa.feature.zero_crossing_rate(y=frame).mean()
            centroid = librosa.feature.spectral_centroid(y=frame, sr=sr).mean()
            bandwidth = librosa.feature.spectral_bandwidth(y=frame, sr=sr).mean()
            rolloff = librosa.feature.spectral_rolloff(y=frame, sr=sr).mean()
            
            # MFCCs
            mfccs = librosa.feature.mfcc(y=frame, sr=sr, n_mfcc=13)
            mfccs_mean = mfccs.mean(axis=1)
            mfccs_std = mfccs.std(axis=1)
            
            features = {
                'rms': rms, 'zcr': zcr, 'centroid': centroid,
                'bandwidth': bandwidth, 'rolloff': rolloff
            }
            
            # Add MFCCs
            for i, (mean_val, std_val) in enumerate(zip(mfccs_mean, mfccs_std), 1):
                features[f'mfcc_{i}_mean'] = mean_val
                features[f'mfcc_{i}_std'] = std_val
            
            return features
            
        except Exception as e:
            print(f"Feature extraction error: {e}")
            return {}
    
    # Main processing
    start_time = time.time()
    print(f"üîÑ Starting {patient_folder_name}...")
    
    patient_local_dir = os.path.join(patient_base_dir, patient_folder_name)
    patient_features = []
    
    try:
        # Find files
        edf_files = [f for f in os.listdir(patient_local_dir) if f.endswith('.edf')]
        rml_files = [f for f in os.listdir(patient_local_dir) if f.endswith('.rml')]
        
        if not edf_files or not rml_files:
            print(f"‚ùå Missing files for {patient_folder_name}")
            return []
        
        # Load apnea events
        rml_path = os.path.join(patient_local_dir, rml_files[0])
        apnea_events = parse_respironics_rml_local(rml_path)
        
        # Process EDF file
        edf_path = os.path.join(patient_local_dir, edf_files[0])
        raw = mne.io.read_raw_edf(edf_path, preload=False, verbose=False)
        
        if AUDIO_CHANNEL_NAME not in raw.ch_names:
            print(f"‚ùå No {AUDIO_CHANNEL_NAME} channel in {patient_folder_name}")
            return []
        
        raw.pick_channels([AUDIO_CHANNEL_NAME])
        fs = int(raw.info['sfreq'])
        
        # Frame parameters
        frame_size_samples = int(FRAME_DURATION_SEC * fs)
        step_samples = int(frame_size_samples * (1 - OVERLAP_RATIO))
        
        frame_count = 0
        max_frames = 200  # Limit frames per patient to prevent memory issues
        
        # Process frames
        for frame_start in range(0, raw.n_times - frame_size_samples + 1, step_samples):
            if frame_count >= max_frames:
                break
                
            frame_end = frame_start + frame_size_samples
            
            # Load frame data
            audio_frame, _ = raw[:, frame_start:frame_end]
            audio_frame = audio_frame.flatten()
            
            timestamp = frame_start / fs
            
            # Get labels
            apnea_label, apnea_proportion = label_temporal_frame_local(
                timestamp, FRAME_DURATION_SEC, apnea_events, APNEA_THRESHOLD
            )
            
            # Extract features
            features = extract_features_local(audio_frame, fs)
            
            if features:  # Only add if feature extraction succeeded
                feature_record = {
                    'patient_id': patient_folder_name,
                    'timestamp': timestamp,
                    'frame_duration': FRAME_DURATION_SEC,
                    'apnea_label': apnea_label,
                    'apnea_proportion': apnea_proportion,
                    **{f'clean_{k}': v for k, v in features.items()}
                }
                patient_features.append(feature_record)
                frame_count += 1
        
        elapsed = time.time() - start_time
        print(f"‚úÖ {patient_folder_name}: {len(patient_features)} frames in {elapsed:.1f}s")
        return patient_features
        
    except Exception as e:
        print(f"‚ùå {patient_folder_name} failed: {e}")
        return []

# Global variables for multiprocessing (required to avoid pickle errors)
_global_temp_dir = None
_global_final_dataset_path = None
_global_progress_file = None
_global_patient_base_dir = None

def _process_wrapper_global(patient_id):
    """Global wrapper function that can be pickled by multiprocessing"""
    return process_patient_safe(
        patient_id, 
        _global_temp_dir, 
        _global_final_dataset_path, 
        _global_progress_file, 
        _global_patient_base_dir
    )

def run_parallel_extraction(patient_list, patient_base_dir, max_workers=4):
    """Run parallel feature extraction with better resource management"""
    global _global_temp_dir, _global_final_dataset_path, _global_progress_file, _global_patient_base_dir
    
    # Reduce workers to prevent memory issues
    max_workers = min(max_workers, 4)  # Limit to 4 cores max
    
    # Setup
    temp_dir, final_dataset_path, progress_file = setup_parallel_processing()
    
    # Set globals for multiprocessing
    _global_temp_dir = temp_dir
    _global_final_dataset_path = final_dataset_path  
    _global_progress_file = progress_file
    _global_patient_base_dir = patient_base_dir
    
    print(f"üöÄ Starting parallel processing with {max_workers} workers...")
    print(f"üìÅ Temp dir: {temp_dir}")
    print(f"üìä Final dataset: {final_dataset_path}")
    print(f"üë• Processing patients: {patient_list}")
    
    completed = 0
    failed = 0
    
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Submit all jobs using the global wrapper
        futures = {executor.submit(_process_wrapper_global, pid): pid for pid in patient_list}
        
        for future in futures:
            try:
                result = future.result(timeout=600)  # 10 minute timeout per patient
                if result:
                    completed += 1
                else:
                    failed += 1
                patient_id = futures[future]
                print(f"üéØ Progress: {completed}/{len(patient_list)} completed, {failed} failed ({patient_id})")
            except Exception as e:
                patient_id = futures[future]
                print(f"‚ùå Error with {patient_id}: {e}")
                failed += 1
    
    # Final summary
    if Path(final_dataset_path).exists():
        final_df = pd.read_csv(final_dataset_path)
        print(f"\nüéâ PARALLEL PROCESSING COMPLETE!")
        print(f"üìä Final dataset: {final_df.shape}")
        print(f"üë• Patients: {final_df['patient_id'].nunique()}")
        print(f"‚è±Ô∏è  Total frames: {len(final_df):,}")
        print(f"‚úÖ Completed: {completed}, ‚ùå Failed: {failed}")
        if len(final_df) > 0:
            print(f"üö® Apnea frames: {final_df['apnea_label'].sum():,} ({final_df['apnea_label'].mean()*100:.1f}%)")
        return final_dataset_path
    else:
        print("‚ùå No final dataset created")
        return None

# Set the flag for parallel processing
USE_PARALLEL_PROCESSING = True  # Set to True to enable parallel processing

print("‚úÖ Improved parallel processing functions ready! (4 cores max, better error handling)")

--- Setting up parallel processing capabilities... ---
‚úÖ Improved parallel processing functions ready! (4 cores max, better error handling)


In [11]:
# Cell 5: Execute Processing - Choose Sequential or Parallel Mode
print("--- Executing clean feature extraction with progress tracking... ---")

# Find patient folders
if not PATIENT_FOLDERS_TO_PROCESS:
    # Auto-detect patient folders
    print("üîç Auto-detecting patient folders...")
    all_folders = [f for f in os.listdir(RAW_PATIENT_DATA_BASE_DIR) if os.path.isdir(os.path.join(RAW_PATIENT_DATA_BASE_DIR, f))]
    patient_folders = [f for f in all_folders if 'patient' in f.lower()]
    PATIENT_FOLDERS_TO_PROCESS = sorted(patient_folders)

print(f"üìÅ Found {len(PATIENT_FOLDERS_TO_PROCESS)} patient folders: {PATIENT_FOLDERS_TO_PROCESS}")

if DEBUG_MODE:
    PATIENT_FOLDERS_TO_PROCESS = PATIENT_FOLDERS_TO_PROCESS[:DEBUG_PATIENT_COUNT]
    print(f"üêõ DEBUG MODE: Processing only {len(PATIENT_FOLDERS_TO_PROCESS)} patients")

# Check processing mode
if USE_PARALLEL_PROCESSING and len(PATIENT_FOLDERS_TO_PROCESS) > 1:
    print(f"\nüöÄ PARALLEL PROCESSING MODE ENABLED")
    print(f"üë• Processing {len(PATIENT_FOLDERS_TO_PROCESS)} patients with 6 cores")
    print(f"‚ö° Expected to complete in one cycle!")
    
    # Run parallel processing
    final_dataset_path = run_parallel_extraction(
        PATIENT_FOLDERS_TO_PROCESS, 
        RAW_PATIENT_DATA_BASE_DIR, 
        max_workers=6
    )
    
    if final_dataset_path:
        # Load and display results
        df = pd.read_csv(final_dataset_path)
        print(f"\nüìä FINAL RESULTS:")
        print(f"‚úÖ Dataset saved to: {final_dataset_path}")
        print(f"üìã Dataset shape: {df.shape}")
        print(f"üë• Unique patients: {df['patient_id'].nunique()}")
        display(df.head())
        
else:
    print(f"\nüîÑ SEQUENTIAL PROCESSING MODE")
    print(f"üë• Processing {len(PATIENT_FOLDERS_TO_PROCESS)} patients one by one")
    
    # Sequential processing (original code)
    total_patients = len(PATIENT_FOLDERS_TO_PROCESS)
    all_features = []
    overall_start_time = time.time()

    print(f"\nüöÄ STARTING PROCESSING OF {total_patients} PATIENTS")
    print(f"üìä Estimated time: {total_patients * 2:.0f}-{total_patients * 5:.0f} minutes")
    print(f"{'='*80}")

    # Process all patients with enhanced progress tracking
    for patient_idx, patient_folder in enumerate(PATIENT_FOLDERS_TO_PROCESS):
        overall_progress = (patient_idx + 1) / total_patients * 100
        elapsed_overall = time.time() - overall_start_time
        
        print(f"\nüîÑ OVERALL PROGRESS: {patient_idx+1}/{total_patients} ({overall_progress:.1f}%)")
        print(f"‚è±Ô∏è  Overall elapsed time: {elapsed_overall/60:.1f} minutes")
        
        if patient_idx > 0:
            avg_time_per_patient = elapsed_overall / patient_idx
            estimated_remaining = avg_time_per_patient * (total_patients - patient_idx)
            print(f"‚è≥ Estimated remaining time: {estimated_remaining/60:.1f} minutes")
        
        # Process single patient with detailed tracking
        patient_features = process_single_patient(patient_folder, RAW_PATIENT_DATA_BASE_DIR, patient_idx, total_patients)
        all_features.extend(patient_features)
        
        # Running totals
        running_total_features = len(all_features)
        running_apnea_count = sum(1 for f in all_features if f['apnea_label'] == 1)
        running_apnea_percentage = (running_apnea_count / running_total_features * 100) if running_total_features else 0
        
        print(f"\nüìà RUNNING TOTALS AFTER {patient_idx+1} PATIENTS:")
        print(f"   üìä Total features so far: {running_total_features:,}")
        print(f"   üö® Total apnea frames: {running_apnea_count:,} ({running_apnea_percentage:.1f}%)")
        print(f"   üò¥ Total normal frames: {running_total_features - running_apnea_count:,} ({100-running_apnea_percentage:.1f}%)")

    # Final processing summary
    total_elapsed = time.time() - overall_start_time
    print(f"\n{'='*80}")
    print(f"üéâ ALL PROCESSING COMPLETE!")
    print(f"‚è±Ô∏è  Total processing time: {total_elapsed/60:.1f} minutes ({total_elapsed:.1f} seconds)")
    print(f"üìä Total features extracted: {len(all_features):,}")
    print(f"üë• Patients processed: {total_patients}")
    print(f"üìà Average features per patient: {len(all_features)/total_patients:.0f}")
    print(f"‚ö° Processing rate: {len(all_features)/total_elapsed:.1f} features/second")

    # Convert to DataFrame and save with enhanced reporting
    if all_features:
        print(f"\nüíæ Converting to DataFrame and saving...")
        df = pd.DataFrame(all_features)
        
        # Enhanced dataset statistics
        print(f"\nüìä FINAL DATASET STATISTICS:")
        print(f"{'='*50}")
        print(f"Total frames: {len(df):,}")
        print(f"Unique patients: {df['patient_id'].nunique()}")
        print(f"Average frames per patient: {len(df)/df['patient_id'].nunique():.0f}")
        print(f"Total duration: {df['timestamp'].max()/3600:.1f} hours")
        print(f"Apnea frames: {df['apnea_label'].sum():,} ({df['apnea_label'].mean()*100:.2f}%)")
        print(f"Non-apnea frames: {(df['apnea_label'] == 0).sum():,} ({(1-df['apnea_label'].mean())*100:.2f}%)")
        
        # Per-patient breakdown
        print(f"\nüë• PER-PATIENT BREAKDOWN:")
        print(f"{'='*50}")
        patient_stats = df.groupby('patient_id').agg({
            'apnea_label': ['count', 'sum', 'mean'],
            'timestamp': ['min', 'max']
        }).round(3)
        
        for patient in patient_stats.index:
            frame_count = patient_stats.loc[patient, ('apnea_label', 'count')]
            apnea_count = patient_stats.loc[patient, ('apnea_label', 'sum')]
            apnea_rate = patient_stats.loc[patient, ('apnea_label', 'mean')] * 100
            duration_hours = (patient_stats.loc[patient, ('timestamp', 'max')] - 
                             patient_stats.loc[patient, ('timestamp', 'min')]) / 3600
            print(f"{patient}: {frame_count:,} frames, {apnea_count:,} apnea ({apnea_rate:.1f}%), {duration_hours:.1f}h")
        
        # Save to CSV
        output_file = os.path.join(CSV_OUTPUT_PATH, 'clean_features_dataset_30sec.csv')
        print(f"\nüíæ Saving dataset to CSV...")
        df.to_csv(output_file, index=False)
        file_size_mb = os.path.getsize(output_file) / (1024*1024)
        
        print(f"‚úÖ Dataset saved successfully!")
        print(f"üìÅ File path: {output_file}")
        print(f"üìè File size: {file_size_mb:.1f} MB")
        print(f"üî¢ Columns: {len(df.columns)}")
        
        # Display sample data
        print(f"\nüìã SAMPLE DATA (First 5 rows):")
        display(df.head())
        
        print(f"\nüéØ Dataset is ready for model training!")
        
    else:
        print("‚ùå No features extracted. Check your patient data and configuration.")

print(f"\n{'='*80}")
print(f"üèÅ FEATURE EXTRACTION COMPLETE!")
print(f"{'='*80}")

--- Executing clean feature extraction with progress tracking... ---
üîç Auto-detecting patient folders...
üìÅ Found 24 patient folders: ['patient_01', 'patient_02', 'patient_03', 'patient_04', 'patient_05', 'patient_06', 'patient_07', 'patient_08', 'patient_09', 'patient_10', 'patient_11', 'patient_12', 'patient_13', 'patient_14', 'patient_15', 'patient_16', 'patient_17', 'patient_18', 'patient_19', 'patient_20', 'patient_21', 'patient_22', 'patient_23', 'patient_24']
üêõ DEBUG MODE: Processing only 6 patients

üöÄ PARALLEL PROCESSING MODE ENABLED
üë• Processing 6 patients with 6 cores
‚ö° Expected to complete in one cycle!
üöÄ Starting parallel processing with 4 workers...
üìÅ Temp dir: temp_features
üìä Final dataset: ./clean_features_dataset_30sec_incremental.csv
üë• Processing patients: ['patient_01', 'patient_02', 'patient_03', 'patient_04', 'patient_05', 'patient_06']


‚ùå Error with patient_01: A process in the process pool was terminated abruptly while the future was running or pending.
‚ùå Error with patient_02: A process in the process pool was terminated abruptly while the future was running or pending.
‚ùå Error with patient_03: A process in the process pool was terminated abruptly while the future was running or pending.
‚ùå Error with patient_04: A process in the process pool was terminated abruptly while the future was running or pending.
‚ùå Error with patient_05: A process in the process pool was terminated abruptly while the future was running or pending.
‚ùå Error with patient_06: A process in the process pool was terminated abruptly while the future was running or pending.
‚ùå No final dataset created

üèÅ FEATURE EXTRACTION COMPLETE!


In [None]:
# Cell 6: Execute Processing and Generate Dataset with Enhanced Progress Tracking
print("--- Executing clean feature extraction with detailed progress... ---")

# Find patient folders
if not PATIENT_FOLDERS_TO_PROCESS:
    # Auto-detect patient folders
    print("üîç Auto-detecting patient folders...")
    all_folders = [f for f in os.listdir(RAW_PATIENT_DATA_BASE_DIR) if os.path.isdir(os.path.join(RAW_PATIENT_DATA_BASE_DIR, f))]
    patient_folders = [f for f in all_folders if 'patient' in f.lower()]
    PATIENT_FOLDERS_TO_PROCESS = sorted(patient_folders)

print(f"üìÅ Found {len(PATIENT_FOLDERS_TO_PROCESS)} patient folders: {PATIENT_FOLDERS_TO_PROCESS}")

if DEBUG_MODE:
    PATIENT_FOLDERS_TO_PROCESS = PATIENT_FOLDERS_TO_PROCESS[:DEBUG_PATIENT_COUNT]
    print(f"üêõ DEBUG MODE: Processing only {len(PATIENT_FOLDERS_TO_PROCESS)} patients")

# Overall progress tracking
total_patients = len(PATIENT_FOLDERS_TO_PROCESS)
all_features = []
overall_start_time = time.time()

print(f"\nüöÄ STARTING PROCESSING OF {total_patients} PATIENTS")
print(f"üìä Estimated time: {total_patients * 2:.0f}-{total_patients * 5:.0f} minutes")
print(f"{'='*80}")

# Process all patients with enhanced progress tracking
for patient_idx, patient_folder in enumerate(PATIENT_FOLDERS_TO_PROCESS):
    overall_progress = (patient_idx + 1) / total_patients * 100
    elapsed_overall = time.time() - overall_start_time
    
    print(f"\nüîÑ OVERALL PROGRESS: {patient_idx+1}/{total_patients} ({overall_progress:.1f}%)")
    print(f"‚è±Ô∏è  Overall elapsed time: {elapsed_overall/60:.1f} minutes")
    
    if patient_idx > 0:
        avg_time_per_patient = elapsed_overall / patient_idx
        estimated_remaining = avg_time_per_patient * (total_patients - patient_idx)
        print(f"‚è≥ Estimated remaining time: {estimated_remaining/60:.1f} minutes")
    
    # Process single patient with detailed tracking
    patient_features = process_single_patient(patient_folder, RAW_PATIENT_DATA_BASE_DIR, patient_idx, total_patients)
    all_features.extend(patient_features)
    
    # Running totals
    running_total_features = len(all_features)
    running_apnea_count = sum(1 for f in all_features if f['apnea_label'] == 1)
    running_apnea_percentage = (running_apnea_count / running_total_features * 100) if running_total_features else 0
    
    print(f"\nüìà RUNNING TOTALS AFTER {patient_idx+1} PATIENTS:")
    print(f"   üìä Total features so far: {running_total_features:,}")
    print(f"   üö® Total apnea frames: {running_apnea_count:,} ({running_apnea_percentage:.1f}%)")
    print(f"   üò¥ Total normal frames: {running_total_features - running_apnea_count:,} ({100-running_apnea_percentage:.1f}%)")

# Final processing summary
total_elapsed = time.time() - overall_start_time
print(f"\n{'='*80}")
print(f"üéâ ALL PROCESSING COMPLETE!")
print(f"‚è±Ô∏è  Total processing time: {total_elapsed/60:.1f} minutes ({total_elapsed:.1f} seconds)")
print(f"üìä Total features extracted: {len(all_features):,}")
print(f"üë• Patients processed: {total_patients}")
print(f"üìà Average features per patient: {len(all_features)/total_patients:.0f}")
print(f"‚ö° Processing rate: {len(all_features)/total_elapsed:.1f} features/second")

# Convert to DataFrame and save with enhanced reporting
if all_features:
    print(f"\nüíæ Converting to DataFrame and saving...")
    df = pd.DataFrame(all_features)
    
    # Enhanced dataset statistics
    print(f"\nüìä FINAL DATASET STATISTICS:")
    print(f"{'='*50}")
    print(f"Total frames: {len(df):,}")
    print(f"Unique patients: {df['patient_id'].nunique()}")
    print(f"Average frames per patient: {len(df)/df['patient_id'].nunique():.0f}")
    print(f"Total duration: {df['timestamp'].max()/3600:.1f} hours")
    print(f"Apnea frames: {df['apnea_label'].sum():,} ({df['apnea_label'].mean()*100:.2f}%)")
    print(f"Non-apnea frames: {(df['apnea_label'] == 0).sum():,} ({(1-df['apnea_label'].mean())*100:.2f}%)")
    
    # Per-patient breakdown
    print(f"\nüë• PER-PATIENT BREAKDOWN:")
    print(f"{'='*50}")
    patient_stats = df.groupby('patient_id').agg({
        'apnea_label': ['count', 'sum', 'mean'],
        'timestamp': ['min', 'max']
    }).round(3)
    
    for patient in patient_stats.index:
        frame_count = patient_stats.loc[patient, ('apnea_label', 'count')]
        apnea_count = patient_stats.loc[patient, ('apnea_label', 'sum')]
        apnea_rate = patient_stats.loc[patient, ('apnea_label', 'mean')] * 100
        duration_hours = (patient_stats.loc[patient, ('timestamp', 'max')] - 
                         patient_stats.loc[patient, ('timestamp', 'min')]) / 3600
        print(f"{patient}: {frame_count:,} frames, {apnea_count:,} apnea ({apnea_rate:.1f}%), {duration_hours:.1f}h")
    
    # Save to CSV
    print(f"\nüíæ Saving dataset to CSV...")
    df.to_csv(CSV_OUTPUT_PATH, index=False)
    file_size_mb = os.path.getsize(CSV_OUTPUT_PATH) / (1024*1024)
    
    print(f"‚úÖ Dataset saved successfully!")
    print(f"üìÅ File path: {CSV_OUTPUT_PATH}")
    print(f"üìè File size: {file_size_mb:.1f} MB")
    print(f"üî¢ Columns: {len(df.columns)}")
    
    # Display sample data
    print(f"\nüìã SAMPLE DATA (First 5 rows):")
    display(df.head())
    
    print(f"\nüéØ Dataset is ready for model training!")
    
else:
    print("‚ùå No features extracted. Check your patient data and configuration.")

print(f"\n{'='*80}")
print(f"üèÅ CLEAN FEATURE EXTRACTION COMPLETE!")
print(f"{'='*80}")

--- Executing clean feature extraction with detailed progress... ---
üìÅ Found 6 patient folders: ['patient_01', 'patient_02', 'patient_03', 'patient_04', 'patient_05', 'patient_06']
üêõ DEBUG MODE: Processing only 6 patients

üöÄ STARTING PROCESSING OF 6 PATIENTS
üìä Estimated time: 12-30 minutes

üîÑ OVERALL PROGRESS: 1/6 (16.7%)
‚è±Ô∏è  Overall elapsed time: 0.0 minutes

PATIENT 1/6: patient_01
üîç Scanning files in F:\Solo All In One Docs\Evaluating-Noise-Reduction-Techniques\data\sleep_data\patient_01...
   Found 5 EDF files: ['00001000-100507[001].edf', '00001000-100507[002].edf', '00001000-100507[003].edf', '00001000-100507[004].edf', '00001000-100507[005].edf']
   Found 1 RML files: ['00001000-100507.rml']

üìÅ Processing EDF 1/5: 00001000-100507[001].edf
   üìã Loading apnea events from 00001000-100507.rml...
ObstructiveApnea: 41.0s to 54.5s (duration: 13.5s)
ObstructiveApnea: 91.5s to 110.5s (duration: 19.0s)
ObstructiveApnea: 436.0s to 448.0s (duration: 12.0s)
Obstru