# Phase 2: EEG Data Preprocessing
## CHB-MIT Dataset - Data Cleaning and Feature Extraction

This notebook preprocesses EEG data from selected subjects (5-6) for model training.

## 1. Import Libraries

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# EEG processing
import mne
from scipy import signal
from scipy.stats import skew, kurtosis

# Data storage
import h5py
import joblib

print("✓ Libraries imported")

✓ Libraries imported


## 2. Configuration

In [8]:
# Paths
BASE_DIR = Path(r"c:\Users\Pranaav_Prasad\OneDrive\Desktop\Projects\Epilepsy-Detection")
RAW_DATA_DIR = BASE_DIR / "data" / "raw" / "chb-mit-scalp-eeg-database-1.0.0"
PROCESSED_DATA_DIR = BASE_DIR / "data" / "processed"

# Selected subjects (top 6 with most seizures)
SELECTED_SUBJECTS = ['chb01', 'chb02', 'chb03', 'chb04', 'chb05', 'chb24']

# EEG parameters - OPTIMIZED FOR DISK SPACE
ORIGINAL_SAMPLING_RATE = 256  # Hz (original)
SAMPLING_RATE = 128  # Hz (downsampled to save space - still adequate for seizure detection)
DOWNSAMPLE_FACTOR = ORIGINAL_SAMPLING_RATE // SAMPLING_RATE
WINDOW_SIZE = 4  # seconds
OVERLAP = 2  # seconds
N_SAMPLES_PER_WINDOW = SAMPLING_RATE * WINDOW_SIZE  # 512 samples (was 1024)

# Frequency bands
FREQ_BANDS = {
    'delta': (0.5, 4),
    'theta': (4, 8),
    'alpha': (8, 13),
    'beta': (13, 30),
    'gamma': (30, 50)
}

print(f"Base Directory: {BASE_DIR}")
print(f"Selected Subjects: {SELECTED_SUBJECTS}")
print(f"Sampling Rate: {ORIGINAL_SAMPLING_RATE} Hz → {SAMPLING_RATE} Hz (downsampled {DOWNSAMPLE_FACTOR}x)")
print(f"Window: {WINDOW_SIZE}s with {OVERLAP}s overlap ({N_SAMPLES_PER_WINDOW} samples)")
print(f"Memory savings: ~50% from downsampling + 50% from float32 = ~75% total reduction")

Base Directory: c:\Users\Pranaav_Prasad\OneDrive\Desktop\Projects\Epilepsy-Detection
Selected Subjects: ['chb01', 'chb02', 'chb03', 'chb04', 'chb05', 'chb24']
Sampling Rate: 256 Hz → 128 Hz (downsampled 2x)
Window: 4s with 2s overlap (512 samples)
Memory savings: ~50% from downsampling + 50% from float32 = ~75% total reduction


## 3. Parse Summary Files for Seizure Information

In [3]:
def parse_summary_file(summary_path):
    """Extract seizure information from subject summary file."""
    seizure_info = []
    
    with open(summary_path, 'r') as f:
        content = f.read()
        lines = content.split('\n')
        
        current_file = None
        for i, line in enumerate(lines):
            if 'File Name:' in line:
                current_file = line.split(':')[1].strip()
            elif 'Seizure Start Time:' in line and current_file:
                start_time = int(line.split(':')[1].strip().split()[0])
                # Find end time in next line
                if i + 1 < len(lines) and 'Seizure End Time:' in lines[i + 1]:
                    end_time = int(lines[i + 1].split(':')[1].strip().split()[0])
                    seizure_info.append({
                        'file': current_file,
                        'start': start_time,
                        'end': end_time
                    })
    
    return seizure_info

# Load seizure information for selected subjects
seizure_data = {}
for subject in SELECTED_SUBJECTS:
    summary_file = RAW_DATA_DIR / subject / f"{subject}-summary.txt"
    if summary_file.exists():
        seizure_data[subject] = parse_summary_file(summary_file)
        print(f"{subject}: {len(seizure_data[subject])} seizure events")

print(f"\n✓ Loaded seizure information")

chb01: 7 seizure events
chb02: 3 seizure events
chb03: 7 seizure events
chb04: 2 seizure events
chb05: 5 seizure events
chb24: 16 seizure events

✓ Loaded seizure information


## 4. Preprocessing Functions

In [9]:
def preprocess_eeg(raw_data, sampling_rate=256, target_rate=128):
    """Apply bandpass filter, downsample, and normalization."""
    # Bandpass filter (0.5-50 Hz)
    nyquist = sampling_rate / 2
    low, high = 0.5 / nyquist, 50 / nyquist
    b, a = signal.butter(4, [low, high], btype='band')
    filtered = signal.filtfilt(b, a, raw_data, axis=1)
    
    # Downsample to save space (256 Hz -> 128 Hz)
    if sampling_rate > target_rate:
        downsample_factor = sampling_rate // target_rate
        filtered = filtered[:, ::downsample_factor]
    
    # Normalization (z-score)
    mean = np.mean(filtered, axis=1, keepdims=True)
    std = np.std(filtered, axis=1, keepdims=True)
    normalized = (filtered - mean) / (std + 1e-8)
    
    return normalized

def extract_spectral_features(data, sampling_rate, freq_bands):
    """Extract power in frequency bands."""
    features = []
    
    for ch_data in data:
        freqs, psd = signal.welch(ch_data, fs=sampling_rate, nperseg=256)
        
        ch_features = []
        for band_name, (low, high) in freq_bands.items():
            idx = np.logical_and(freqs >= low, freqs <= high)
            band_power = np.mean(psd[idx])
            ch_features.append(band_power)
        
        features.extend(ch_features)
    
    return np.array(features)

def extract_statistical_features(data):
    """Extract statistical features from each channel."""
    features = []
    
    for ch_data in data:
        features.extend([
            np.mean(ch_data),
            np.std(ch_data),
            skew(ch_data),
            kurtosis(ch_data),
            np.max(ch_data) - np.min(ch_data)  # range
        ])
    
    return np.array(features)

print("✓ Preprocessing functions defined (with downsampling)")

✓ Preprocessing functions defined (with downsampling)


## 5. Process EEG Files

In [5]:
def process_edf_file(edf_path, seizure_times=None):
    """Process single EDF file and extract windows."""
    raw = mne.io.read_raw_edf(str(edf_path), preload=True, verbose=False)
    data = raw.get_data()
    
    # Preprocess
    data = preprocess_eeg(data, SAMPLING_RATE)
    
    # Create sliding windows
    n_channels, n_samples = data.shape
    step_size = (WINDOW_SIZE - OVERLAP) * SAMPLING_RATE
    
    windows = []
    labels = []
    
    for start_idx in range(0, n_samples - N_SAMPLES_PER_WINDOW, step_size):
        end_idx = start_idx + N_SAMPLES_PER_WINDOW
        window_data = data[:, start_idx:end_idx]
        
        # Determine label
        window_time_start = start_idx / SAMPLING_RATE
        window_time_end = end_idx / SAMPLING_RATE
        
        is_seizure = False
        if seizure_times:
            for sz in seizure_times:
                if (window_time_start >= sz['start'] and window_time_start < sz['end']) or \
                   (window_time_end > sz['start'] and window_time_end <= sz['end']):
                    is_seizure = True
                    break
        
        windows.append(window_data)
        labels.append(1 if is_seizure else 0)
    
    return np.array(windows), np.array(labels)

print("✓ EDF processing function defined")

✓ EDF processing function defined


## 6. Process All Selected Subjects

In [10]:
# Memory-efficient processing: Stream data directly to HDF5
# Process ALL files from ALL 6 selected subjects
# Key optimizations:
# 1. Process one file at a time (never load all data into RAM)
# 2. Use float32 instead of float64 (50% memory reduction)
# 3. Process in chunks and write directly to HDF5
# 4. Use compression in HDF5 to reduce file size

output_file = PROCESSED_DATA_DIR / "preprocessed_data.h5"

print("Processing FULL dataset with memory-efficient streaming...\\n")
print(f"Selected subjects: {SELECTED_SUBJECTS}")
print(f"Processing ALL .edf files from each subject\\n")

# Initialize counters
total_windows = 0
total_seizure = 0
total_files = 0

# Create HDF5 file
with h5py.File(output_file, 'w') as hdf:
    first_write = True
    
    # Process ALL selected subjects
    for subject in tqdm(SELECTED_SUBJECTS, desc="Subjects"):
        subject_path = RAW_DATA_DIR / subject
        edf_files = sorted([f for f in subject_path.glob("*.edf") if not f.name.endswith('+')])
        
        seizure_info = seizure_data.get(subject, [])
        
        print(f"\\n{subject}: Found {len(edf_files)} files")
        
        # Process ALL files for this subject
        for edf_file in tqdm(edf_files, desc=f"  {subject}", leave=False):
            try:
                # Find seizure times for this file
                file_seizures = [s for s in seizure_info if s['file'] == edf_file.name]
                
                # Load data WITHOUT preloading (memory efficient)
                raw = mne.io.read_raw_edf(str(edf_file), preload=False, verbose=False)
                
                # Process in chunks to avoid loading all data at once
                chunk_duration = 300  # 5 minutes per chunk
                total_duration = raw.times[-1]
                n_chunks = int(np.ceil(total_duration / chunk_duration))
                
                for chunk_idx in range(n_chunks):
                    start_time = chunk_idx * chunk_duration
                    end_time = min((chunk_idx + 1) * chunk_duration, total_duration)
                    
                    # Load only this chunk
                    start_sample = int(start_time * ORIGINAL_SAMPLING_RATE)
                    stop_sample = int(end_time * ORIGINAL_SAMPLING_RATE)
                    
                    data = raw.get_data(start=start_sample, stop=stop_sample)
                    
                    # Preprocess with downsampling and convert to float32 (memory efficient)
                    data = preprocess_eeg(data, ORIGINAL_SAMPLING_RATE, SAMPLING_RATE).astype(np.float32)
                    
                    # Create windows for this chunk
                    n_channels, n_samples = data.shape
                    step_size = int((WINDOW_SIZE - OVERLAP) * SAMPLING_RATE)
                    
                    windows_list = []
                    labels_list = []
                    
                    for start_idx in range(0, n_samples - N_SAMPLES_PER_WINDOW + 1, step_size):
                        end_idx = start_idx + N_SAMPLES_PER_WINDOW
                        window_data = data[:, start_idx:end_idx]
                        
                        # Calculate absolute time for labeling
                        window_time_start = start_time + (start_idx / SAMPLING_RATE)
                        window_time_end = start_time + (end_idx / SAMPLING_RATE)
                        
                        # Determine label
                        is_seizure = False
                        if file_seizures:
                            for sz in file_seizures:
                                if (window_time_start >= sz['start'] and window_time_start < sz['end']) or \
                                   (window_time_end > sz['start'] and window_time_end <= sz['end']):
                                    is_seizure = True
                                    break
                        
                        windows_list.append(window_data)
                        labels_list.append(1 if is_seizure else 0)
                    
                    # Write this chunk to HDF5 immediately
                    if windows_list:
                        windows = np.array(windows_list, dtype=np.float32)
                        labels = np.array(labels_list, dtype=np.int8)
                        
                        if first_write:
                            maxshape_X = (None, windows.shape[1], windows.shape[2])
                            maxshape_y = (None,)
                            
                            hdf.create_dataset('X', data=windows, maxshape=maxshape_X, 
                                             chunks=(min(100, windows.shape[0]), windows.shape[1], windows.shape[2]), 
                                             compression='gzip', compression_opts=4)
                            hdf.create_dataset('y', data=labels, maxshape=maxshape_y,
                                             chunks=True, compression='gzip', compression_opts=4)
                            first_write = False
                        else:
                            current_size = hdf['X'].shape[0]
                            hdf['X'].resize((current_size + windows.shape[0]), axis=0)
                            hdf['X'][current_size:] = windows
                            
                            hdf['y'].resize((current_size + labels.shape[0]), axis=0)
                            hdf['y'][current_size:] = labels
                        
                        total_windows += len(windows)
                        total_seizure += np.sum(labels)
                    
                    # Clear memory
                    del data, windows_list, labels_list
                    if 'windows' in locals():
                        del windows, labels
                
                total_files += 1
                
            except Exception as e:
                print(f"\\n  ⚠ Error processing {edf_file.name}: {str(e)[:80]}")
                continue
    
    # Save metadata
    if 'X' in hdf:
        hdf.attrs['n_samples'] = hdf['X'].shape[0]
        hdf.attrs['n_channels'] = hdf['X'].shape[1]
        hdf.attrs['window_size'] = WINDOW_SIZE
        hdf.attrs['sampling_rate'] = SAMPLING_RATE
        hdf.attrs['subjects'] = ','.join(SELECTED_SUBJECTS)
        hdf.attrs['total_files_processed'] = total_files

if total_windows > 0:
    print(f"\\n{'='*70}")
    print(f"✓ FULL DATASET PROCESSING COMPLETE")
    print(f"{'='*70}")
    print(f"Total files processed: {total_files}")
    print(f"Total windows: {total_windows:,}")
    print(f"Seizure windows: {total_seizure:,} ({total_seizure/total_windows*100:.2f}%)")
    print(f"Normal windows: {total_windows - total_seizure:,} ({(total_windows-total_seizure)/total_windows*100:.2f}%)")
    print(f"{'='*70}")
    print(f"\\nSaved to: {output_file}")
    print(f"File size: {output_file.stat().st_size / (1024**2):.2f} MB")
else:
    print("\\n✗ No data processed - check for errors above")

Processing FULL dataset with memory-efficient streaming...\n
Selected subjects: ['chb01', 'chb02', 'chb03', 'chb04', 'chb05', 'chb24']
Processing ALL .edf files from each subject\n


Subjects:   0%|          | 0/6 [00:00<?, ?it/s]

\nchb01: Found 42 files


  chb01:   0%|          | 0/42 [00:00<?, ?it/s]

\nchb02: Found 36 files


  chb02:   0%|          | 0/36 [00:00<?, ?it/s]

\nchb03: Found 38 files


  chb03:   0%|          | 0/38 [00:00<?, ?it/s]

\nchb04: Found 42 files


  chb04:   0%|          | 0/42 [00:00<?, ?it/s]

\n  ⚠ Error processing chb04_07.edf: Can't broadcast (149, 24, 512) -> (149, 23, 512)
\n  ⚠ Error processing chb04_08.edf: Can't broadcast (149, 24, 512) -> (149, 23, 512)
\n  ⚠ Error processing chb04_09.edf: Can't broadcast (149, 24, 512) -> (149, 23, 512)
\n  ⚠ Error processing chb04_10.edf: Can't broadcast (149, 24, 512) -> (149, 23, 512)
\n  ⚠ Error processing chb04_09.edf: Can't broadcast (149, 24, 512) -> (149, 23, 512)
\n  ⚠ Error processing chb04_10.edf: Can't broadcast (149, 24, 512) -> (149, 23, 512)
\n  ⚠ Error processing chb04_11.edf: Can't broadcast (149, 24, 512) -> (149, 23, 512)
\n  ⚠ Error processing chb04_12.edf: Can't broadcast (149, 24, 512) -> (149, 23, 512)
\n  ⚠ Error processing chb04_11.edf: Can't broadcast (149, 24, 512) -> (149, 23, 512)
\n  ⚠ Error processing chb04_12.edf: Can't broadcast (149, 24, 512) -> (149, 23, 512)
\n  ⚠ Error processing chb04_13.edf: Can't broadcast (149, 24, 512) -> (149, 23, 512)
\n  ⚠ Error processing chb04_14.edf: Can't broadcast (

  chb05:   0%|          | 0/39 [00:00<?, ?it/s]

\nchb24: Found 22 files


  chb24:   0%|          | 0/22 [00:00<?, ?it/s]

✓ FULL DATASET PROCESSING COMPLETE
Total files processed: 183
Total windows: 351,815
Seizure windows: 1,116 (0.32%)
Normal windows: 350,699 (99.68%)
\nSaved to: c:\Users\Pranaav_Prasad\OneDrive\Desktop\Projects\Epilepsy-Detection\data\processed\preprocessed_data.h5
File size: 13781.49 MB


## 7. Balance Dataset (Handle Class Imbalance)

In [None]:
from sklearn.utils import resample

# Separate classes
seizure_indices = np.where(y == 1)[0]
normal_indices = np.where(y == 0)[0]

print(f"Original - Seizure: {len(seizure_indices)}, Normal: {len(normal_indices)}")

# Undersample normal class to balance (or use a ratio like 1:3)
n_normal_samples = min(len(seizure_indices) * 3, len(normal_indices))
normal_indices_sampled = resample(normal_indices, n_samples=n_normal_samples, random_state=42)

# Combine
balanced_indices = np.concatenate([seizure_indices, normal_indices_sampled])
np.random.shuffle(balanced_indices)

X_balanced = X[balanced_indices]
y_balanced = y[balanced_indices]

print(f"Balanced - Total: {len(X_balanced)}")
print(f"Seizure: {np.sum(y_balanced)} ({np.sum(y_balanced)/len(y_balanced)*100:.2f}%)")
print(f"Normal: {len(y_balanced) - np.sum(y_balanced)} ({(len(y_balanced)-np.sum(y_balanced))/len(y_balanced)*100:.2f}%)")

## 8. Save Processed Data

In [11]:
# Verify the saved data
output_file = PROCESSED_DATA_DIR / "preprocessed_data.h5"
print(f"Verifying {output_file}...\n")

with h5py.File(output_file, 'r') as f:
    print(f"Available datasets: {list(f.keys())}")
    print(f"Available attributes: {list(f.attrs.keys())}\n")
    
    if 'X' in f:
        X_data = f['X'][:]
        y_data = f['y'][:]
        
        print(f"Data shapes:")
        print(f"  X: {X_data.shape} (samples, channels, time_points)")
        print(f"  y: {y_data.shape} (samples,)")
        print(f"\nClass distribution:")
        print(f"  Seizure (1): {np.sum(y_data == 1)} ({np.sum(y_data == 1) / len(y_data) * 100:.2f}%)")
        print(f"  Normal (0): {np.sum(y_data == 0)} ({np.sum(y_data == 0) / len(y_data) * 100:.2f}%)")
        print(f"\nData statistics:")
        print(f"  X dtype: {X_data.dtype}")
        print(f"  X range: [{X_data.min():.3f}, {X_data.max():.3f}]")
        print(f"  X mean: {X_data.mean():.3f}")
        print(f"  X std: {X_data.std():.3f}")
        
        print(f"\nPreprocessed data verified successfully!")
    else:
        print("ERROR: No data found in HDF5 file!")

Verifying c:\Users\Pranaav_Prasad\OneDrive\Desktop\Projects\Epilepsy-Detection\data\processed\preprocessed_data.h5...

Available datasets: ['X', 'y']
Available attributes: ['n_channels', 'n_samples', 'sampling_rate', 'subjects', 'total_files_processed', 'window_size']

Data shapes:
  X: (357179, 23, 512) (samples, channels, time_points)
  y: (357179,) (samples,)

Class distribution:
  Seizure (1): 1116 (0.31%)
  Normal (0): 356063 (99.69%)

Data statistics:
  X dtype: float32
Data shapes:
  X: (357179, 23, 512) (samples, channels, time_points)
  y: (357179,) (samples,)

Class distribution:
  Seizure (1): 1116 (0.31%)
  Normal (0): 356063 (99.69%)

Data statistics:
  X dtype: float32
  X range: [-65.408, 59.200]
  X range: [-65.408, 59.200]
  X mean: -0.000
  X mean: -0.000
  X std: 0.992

Preprocessed data verified successfully!
  X std: 0.992

Preprocessed data verified successfully!


## 9. Summary

**Preprocessing Complete:**
- Filtered EEG signals (0.5-50 Hz bandpass)
- Normalized data (z-score)
- Created sliding windows (4s with 2s overlap)
- Labeled windows as seizure/normal
- Balanced dataset
- Saved processed data for training

**Next: Phase 3 - Model Training**