In [1]:
# Cell 1: Imports and Configuration
print("--- Setting up MULTI-EDF threading-based feature extraction (16kHz) ---")

import os
import time
import numpy as np
import pandas as pd
import librosa
import mne
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Add your custom XML parser
import sys
sys.path.append('../src')

# Test import first
try:
    from working_with_xml import extract_apnea_events
    print("✅ XML parser imported successfully")
except Exception as e:
    print(f"❌ XML parser import failed: {e}")

# Configuration - Multi-EDF processing
PATIENT_DATA_DIR = "F:/Solo All In One Docs/Evaluating-Noise-Reduction-Techniques/data/sleep_data"
OUTPUT_FILE = "../data/datasets/final_local_dataset.csv" # New filename for 16kHz dataset
PATIENTS_TO_PROCESS = ['patient_01', 'patient_02', 'patient_03', 'patient_04', 'patient_05', 'patient_06','patient_07',
                       'patient_07','patient_08','patient_09','patient_10','patient_11','patient_12','patient_13','patient_14','patient_15'
                       ,'patient_16','patient_17','patient_18','patient_19','patient_20','patient_21','patient_22','patient_23','patient_24']
MAX_WORKERS = 4  # Keep at 4 for memory safety with multiple EDFs

# Audio processing settings
AUDIO_CHANNEL = 'Mic'
FRAME_DURATION = 30.0  # seconds
OVERLAP_RATIO = 0.5
APNEA_THRESHOLD = 0.1

print(f"✅ Configuration set for {len(PATIENTS_TO_PROCESS)} patients with {MAX_WORKERS} threads")
print(f"📁 Data directory: {PATIENT_DATA_DIR}")
print(f"📊 Output file: {OUTPUT_FILE}")
print(f"🎯 Processing: ALL EDF files per patient (5 EDFs × 6 patients = 30 hours)")
print(f"🧵 Memory-safe: Sequential EDF processing within each patient")
print(f"⚡ Threading: Between patients (4 concurrent patients)")
print(f"🎵 Audio: Downsampling to 16kHz for optimized processing")

--- Setting up MULTI-EDF threading-based feature extraction (16kHz) ---
✅ XML parser imported successfully
✅ Configuration set for 25 patients with 4 threads
📁 Data directory: F:/Solo All In One Docs/Evaluating-Noise-Reduction-Techniques/data/sleep_data
📊 Output file: ../data/datasets/final_local_dataset.csv
🎯 Processing: ALL EDF files per patient (5 EDFs × 6 patients = 30 hours)
🧵 Memory-safe: Sequential EDF processing within each patient
⚡ Threading: Between patients (4 concurrent patients)
🎵 Audio: Downsampling to 16kHz for optimized processing


In [2]:
# Cell 2: Threading-Compatible Processing Function (All EDFs)
def process_patient_threading(patient_id, data_dir):
    """Process a single patient - ALL EDF files sequentially for memory safety"""
    
    start_time = time.time()
    print(f"🔄 Starting {patient_id} (Thread)...")
    
    # Target sample rate for processing
    TARGET_SAMPLE_RATE = 16000  # 16kHz for optimized processing
    
    def extract_comprehensive_features(audio_frame, sample_rate):
        """Extract comprehensive features from audio frame"""
        try:
            if len(audio_frame) == 0:
                return None
                
            # Basic acoustic features
            rms = float(librosa.feature.rms(y=audio_frame).mean())
            zcr = float(librosa.feature.zero_crossing_rate(y=audio_frame).mean())
            centroid = float(librosa.feature.spectral_centroid(y=audio_frame, sr=sample_rate).mean())
            bandwidth = float(librosa.feature.spectral_bandwidth(y=audio_frame, sr=sample_rate).mean())
            rolloff = float(librosa.feature.spectral_rolloff(y=audio_frame, sr=sample_rate).mean())
            
            # MFCCs (first 8 coefficients)
            mfccs = librosa.feature.mfcc(y=audio_frame, sr=sample_rate, n_mfcc=8)
            mfcc_means = mfccs.mean(axis=1)
            mfcc_stds = mfccs.std(axis=1)
            
            # Temporal features for breathing patterns (5-second windows)
            window_size = int(5 * sample_rate)  # 5 seconds
            num_windows = len(audio_frame) // window_size
            
            if num_windows >= 2:
                rms_windows = []
                zcr_windows = []
                
                for i in range(num_windows):
                    start_idx = i * window_size
                    end_idx = start_idx + window_size
                    window = audio_frame[start_idx:end_idx]
                    
                    rms_windows.append(librosa.feature.rms(y=window).mean())
                    zcr_windows.append(librosa.feature.zero_crossing_rate(y=window).mean())
                
                rms_variability = float(np.std(rms_windows))
                zcr_variability = float(np.std(zcr_windows))
                breathing_regularity = float(1.0 / (1.0 + rms_variability))  # Higher = more regular
            else:
                rms_variability = 0.0
                zcr_variability = 0.0
                breathing_regularity = 0.5
            
            # Silence detection
            silence_threshold = np.percentile(np.abs(audio_frame), 20)  # Bottom 20% as silence
            silence_mask = np.abs(audio_frame) < silence_threshold
            silence_ratio = float(np.mean(silence_mask))
            
            # Breathing pause detection (continuous silence periods)
            silence_changes = np.diff(silence_mask.astype(int))
            pause_starts = np.where(silence_changes == 1)[0]
            pause_ends = np.where(silence_changes == -1)[0]
            
            if len(pause_starts) > 0 and len(pause_ends) > 0:
                if len(pause_ends) < len(pause_starts):
                    pause_ends = np.append(pause_ends, len(audio_frame))
                pause_durations = (pause_ends[:len(pause_starts)] - pause_starts) / sample_rate
                avg_pause_duration = float(np.mean(pause_durations))
                max_pause_duration = float(np.max(pause_durations))
            else:
                avg_pause_duration = 0.0
                max_pause_duration = 0.0
            
            # Combine all features
            features = {
                'clean_rms': rms,
                'clean_zcr': zcr,
                'clean_centroid': centroid,
                'clean_bandwidth': bandwidth,
                'clean_rolloff': rolloff,
                'clean_rms_variability': rms_variability,
                'clean_zcr_variability': zcr_variability,
                'clean_breathing_regularity': breathing_regularity,
                'clean_silence_ratio': silence_ratio,
                'clean_avg_pause_duration': avg_pause_duration,
                'clean_max_pause_duration': max_pause_duration
            }
            
            # Add MFCCs
            for i, (mean_val, std_val) in enumerate(zip(mfcc_means, mfcc_stds), 1):
                features[f'clean_mfcc_{i}_mean'] = float(mean_val)
                features[f'clean_mfcc_{i}_std'] = float(std_val)
            
            return features
            
        except Exception as e:
            print(f"   ⚠️  Feature extraction error in {patient_id}: {e}")
            return None
    
    def get_apnea_label(timestamp, duration, apnea_events, threshold=0.1):
        """Calculate apnea label based on overlap with events"""
        try:
            frame_end = timestamp + duration
            apnea_seconds = 0
            
            for start, end in apnea_events:
                overlap_start = max(timestamp, start)
                overlap_end = min(frame_end, end)
                if overlap_start < overlap_end:
                    apnea_seconds += (overlap_end - overlap_start)
            
            proportion = apnea_seconds / duration
            label = 1 if proportion > threshold else 0
            return label, proportion
        except:
            return 0, 0.0
    
    try:
        # Check if patient directory exists
        patient_dir = os.path.join(data_dir, patient_id)
        if not os.path.exists(patient_dir):
            print(f"❌ {patient_id}: Directory not found: {patient_dir}")
            return []
        
        # Find EDF and RML files
        try:
            edf_files = sorted([f for f in os.listdir(patient_dir) if f.endswith('.edf')])
            rml_files = [f for f in os.listdir(patient_dir) if f.endswith('.rml')]
        except Exception as e:
            print(f"❌ {patient_id}: Cannot list directory files: {e}")
            return []
        
        if not edf_files or not rml_files:
            print(f"❌ {patient_id}: Missing EDF ({len(edf_files)}) or RML ({len(rml_files)}) files")
            return []
        
        print(f"   📁 {patient_id}: Found {len(edf_files)} EDF and {len(rml_files)} RML files")
        
        # Load apnea events (same for all EDF files)
        try:
            rml_path = os.path.join(patient_dir, rml_files[0])
            apnea_data = extract_apnea_events(rml_path, output_csv=None)
            apnea_events = [(float(start), float(end)) for event_type, start, end in apnea_data]
            print(f"   📋 {patient_id}: Found {len(apnea_events)} apnea events")
        except Exception as e:
            print(f"❌ {patient_id}: Cannot load RML file: {e}")
            return []
        
        # Process ALL EDF files sequentially for memory safety
        all_patient_features = []
        total_frames = 0
        
        for edf_idx, edf_file in enumerate(edf_files, 1):
            print(f"   🎵 {patient_id}: Processing EDF {edf_idx}/{len(edf_files)} - {edf_file}")
            
            try:
                edf_path = os.path.join(patient_dir, edf_file)
                raw = mne.io.read_raw_edf(edf_path, preload=False, verbose=False)
                
                if AUDIO_CHANNEL not in raw.ch_names:
                    print(f"   ⚠️  {patient_id}: No {AUDIO_CHANNEL} channel in {edf_file}, skipping")
                    continue
                
                raw.pick_channels([AUDIO_CHANNEL])
                original_sample_rate = int(raw.info['sfreq'])
                duration_minutes = raw.n_times / original_sample_rate / 60
                print(f"      ⏱️  Duration: {duration_minutes:.1f} min, Original SR: {original_sample_rate} Hz → Target: {TARGET_SAMPLE_RATE} Hz")
                
                # Frame parameters using TARGET_SAMPLE_RATE
                frame_samples = int(FRAME_DURATION * TARGET_SAMPLE_RATE)
                step_samples = int(frame_samples * (1 - OVERLAP_RATIO))
                
                # Calculate original frame size for loading
                original_frame_samples = int(FRAME_DURATION * original_sample_rate)
                original_step_samples = int(original_frame_samples * (1 - OVERLAP_RATIO))
                
                # Calculate time offset for this EDF (assuming sequential recordings)
                time_offset = (edf_idx - 1) * 60 * 60  # Each EDF = 1 hour, offset in seconds
                
                # Extract features from frames in this EDF
                edf_frame_count = 0
                
                for frame_start in range(0, raw.n_times - original_frame_samples + 1, original_step_samples):
                    frame_end = frame_start + original_frame_samples
                    timestamp = (frame_start / original_sample_rate) + time_offset  # Add offset for multi-EDF
                    
                    # Load audio frame at original sample rate
                    try:
                        audio_frame, _ = raw[:, frame_start:frame_end]
                        audio_frame = audio_frame.flatten()
                        
                        # Downsample to target sample rate
                        if original_sample_rate != TARGET_SAMPLE_RATE:
                            audio_frame = librosa.resample(audio_frame, 
                                                         orig_sr=original_sample_rate, 
                                                         target_sr=TARGET_SAMPLE_RATE)
                        
                    except Exception as e:
                        print(f"      ⚠️  Frame {edf_frame_count} load/downsample failed: {e}")
                        continue
                    
                    # Extract features using TARGET_SAMPLE_RATE
                    features = extract_comprehensive_features(audio_frame, TARGET_SAMPLE_RATE)
                    if features is None:
                        continue
                    
                    # Get apnea label
                    apnea_label, apnea_proportion = get_apnea_label(timestamp, FRAME_DURATION, apnea_events, APNEA_THRESHOLD)
                    
                    # Create record
                    record = {
                        'patient_id': patient_id,
                        'edf_file': edf_file,  # Track which EDF file
                        'timestamp': float(timestamp),
                        'frame_duration': FRAME_DURATION,
                        'sample_rate': TARGET_SAMPLE_RATE,  # Track downsampled rate
                        'apnea_label': int(apnea_label),
                        'apnea_proportion': float(apnea_proportion),
                        **features
                    }
                    
                    all_patient_features.append(record)
                    edf_frame_count += 1
                    total_frames += 1
                    
                    # Progress every 100 frames per EDF
                    if edf_frame_count % 100 == 0:
                        print(f"      🔄 {edf_file}: {edf_frame_count} frames processed...")
                
                print(f"      ✅ {edf_file}: {edf_frame_count} frames extracted (downsampled to {TARGET_SAMPLE_RATE} Hz)")
                
                # Clear memory after each EDF
                del raw
                
            except Exception as e:
                print(f"   ❌ {patient_id}: Failed processing {edf_file}: {e}")
                continue
        
        elapsed = time.time() - start_time
        apnea_count = sum(1 for f in all_patient_features if f['apnea_label'] == 1)
        print(f"✅ {patient_id}: {total_frames} total frames, {apnea_count} apnea ({elapsed:.1f}s)")
        
        return all_patient_features
        
    except Exception as e:
        print(f"❌ {patient_id}: Critical failure: {e}")
        import traceback
        traceback.print_exc()
        return []

print("✅ Multi-EDF threading-compatible processing function defined (with 16kHz downsampling)")

✅ Multi-EDF threading-compatible processing function defined (with 16kHz downsampling)


In [3]:
# Cell 3: Run Threading-Based Processing
def run_threading_processing():
    """Execute threading-based processing of all patients"""
    
    print(f"🧵 STARTING THREADING-BASED PROCESSING")
    print(f"👥 Patients: {PATIENTS_TO_PROCESS}")
    print(f"🔧 Threads: {MAX_WORKERS}")
    print(f"{'='*60}")
    
    start_time = time.time()
    all_features = []
    completed = 0
    failed = 0
    
    # Use ThreadPoolExecutor for parallel processing
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Submit all jobs
        future_to_patient = {
            executor.submit(process_patient_threading, patient, PATIENT_DATA_DIR): patient 
            for patient in PATIENTS_TO_PROCESS
        }
        
        print(f"📤 Submitted {len(future_to_patient)} patient processing jobs")
        
        # Collect results as they complete
        for future in as_completed(future_to_patient):
            patient = future_to_patient[future]
            try:
                # Get result with timeout
                result = future.result(timeout=600)  # 10 minute timeout per patient
                
                if result:
                    all_features.extend(result)
                    completed += 1
                    print(f"🎯 Progress: {completed}/{len(PATIENTS_TO_PROCESS)} completed ({patient}) - {len(result)} frames")
                else:
                    failed += 1
                    print(f"⚠️  {patient}: No features extracted")
                    
            except Exception as e:
                failed += 1
                print(f"❌ {patient}: Exception occurred: {e}")
                import traceback
                traceback.print_exc()
    
    total_time = time.time() - start_time
    
    # Results summary
    print(f"\n{'='*60}")
    print(f"🏁 THREADING PROCESSING COMPLETE!")
    print(f"⏱️  Total time: {total_time:.1f} seconds ({total_time/60:.1f} minutes)")
    print(f"✅ Completed: {completed}")
    print(f"❌ Failed: {failed}")
    print(f"📊 Total features: {len(all_features):,}")
    
    if all_features:
        # Convert to DataFrame
        df = pd.DataFrame(all_features)
        
        # Save to CSV
        df.to_csv(OUTPUT_FILE, index=False)
        
        # Dataset statistics
        apnea_count = df['apnea_label'].sum()
        apnea_rate = df['apnea_label'].mean() * 100
        
        print(f"\n📋 DATASET STATISTICS:")
        print(f"📄 File saved: {OUTPUT_FILE}")
        print(f"📊 Shape: {df.shape}")
        print(f"👥 Unique patients: {df['patient_id'].nunique()}")
        print(f"🚨 Apnea frames: {apnea_count:,} ({apnea_rate:.1f}%)")
        print(f"😴 Normal frames: {len(df) - apnea_count:,} ({100-apnea_rate:.1f}%)")
        
        # Feature columns
        feature_cols = [col for col in df.columns if col.startswith('clean_')]
        print(f"🎯 Feature count: {len(feature_cols)}")
        
        # Per-patient breakdown
        print(f"\n👤 PER-PATIENT BREAKDOWN:")
        patient_stats = df.groupby('patient_id').agg({
            'apnea_label': ['count', 'sum', 'mean']
        }).round(3)
        
        for patient in df['patient_id'].unique():
            count = patient_stats.loc[patient, ('apnea_label', 'count')]
            apnea = patient_stats.loc[patient, ('apnea_label', 'sum')]
            rate = patient_stats.loc[patient, ('apnea_label', 'mean')] * 100
            print(f"   {patient}: {count} frames, {apnea} apnea ({rate:.1f}%)")
        
        # Display sample
        print(f"\n📋 SAMPLE DATA:")
        print(df.head())
        
        # Feature correlation analysis
        if len(feature_cols) > 0:
            correlations = df[feature_cols].corrwith(df['apnea_label']).abs().sort_values(ascending=False)
            print(f"\n🔗 TOP 5 FEATURE CORRELATIONS WITH APNEA:")
            for feature, corr in correlations.head().items():
                print(f"   {feature}: {corr:.3f}")
        
        return df
    else:
        print(f"❌ No features extracted from any patient")
        return None

print("✅ Threading processing function ready")

✅ Threading processing function ready


In [4]:
# Cell 4: Execute Threading Processing
import time
print("🧵 EXECUTING THREADING-BASED FEATURE EXTRACTION")
print(f"Time started: {time.strftime('%Y-%m-%d %H:%M:%S')}")

# Run the threading processing
dataset = run_threading_processing()

if dataset is not None:
    print(f"\n🎉 SUCCESS! Dataset ready for model training.")
    print(f"📁 Saved as: {OUTPUT_FILE}")
    print(f"🚀 Threading approach avoided Windows multiprocessing issues!")
else:
    print(f"\n❌ Processing failed. Check error messages above.")

print(f"\nTime finished: {time.strftime('%Y-%m-%d %H:%M:%S')}")

🧵 EXECUTING THREADING-BASED FEATURE EXTRACTION
Time started: 2025-07-28 14:51:31
🧵 STARTING THREADING-BASED PROCESSING
👥 Patients: ['patient_01', 'patient_02', 'patient_03', 'patient_04', 'patient_05', 'patient_06', 'patient_07', 'patient_07', 'patient_08', 'patient_09', 'patient_10', 'patient_11', 'patient_12', 'patient_13', 'patient_14', 'patient_15', 'patient_16', 'patient_17', 'patient_18', 'patient_19', 'patient_20', 'patient_21', 'patient_22', 'patient_23', 'patient_24']
🔧 Threads: 4
🔄 Starting patient_01 (Thread)...
🔄 Starting patient_02 (Thread)...
🔄 Starting patient_03 (Thread)...
   📁 patient_01: Found 5 EDF and 1 RML files
🔄 Starting patient_04 (Thread)...
   📁 patient_02: Found 5 EDF and 1 RML files
📤 Submitted 25 patient processing jobs
   📁 patient_03: Found 4 EDF and 1 RML files
   📁 patient_04: Found 5 EDF and 1 RML files
ObstructiveApnea: 41.0s to 54.5s (duration: 13.5s)
ObstructiveApnea: 91.5s to 110.5s (duration: 19.0s)
ObstructiveApnea: 436.0s to 448.0s (duration: 1