# Phase 2: EEG Data Preprocessing
## CHB-MIT Dataset - Data Cleaning and Feature Extraction

This notebook preprocesses EEG data from selected subjects (5-6) for model training.

## 1. Import Libraries

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# EEG processing
import mne
from scipy import signal
from scipy.stats import skew, kurtosis

# Data storage
import h5py
import joblib

print("✓ Libraries imported")

✓ Libraries imported


## 2. Configuration

In [2]:
# Paths
BASE_DIR = Path(r"c:\Users\Pranaav_Prasad\OneDrive\Desktop\Projects\Epilepsy-Detection")
RAW_DATA_DIR = BASE_DIR / "data" / "raw" / "chb-mit-scalp-eeg-database-1.0.0"
PROCESSED_DATA_DIR = BASE_DIR / "data" / "processed"

# Selected subjects (top 6 with most seizures)
SELECTED_SUBJECTS = ['chb01', 'chb02', 'chb03', 'chb05', 'chb24']

# EEG parameters - OPTIMIZED FOR IMBALANCED LEARNING
ORIGINAL_SAMPLING_RATE = 256  # Hz (original)
SAMPLING_RATE = 64  # Hz (downsampled 4x - still captures seizure patterns <30 Hz)
DOWNSAMPLE_FACTOR = ORIGINAL_SAMPLING_RATE // SAMPLING_RATE
WINDOW_SIZE = 4  # seconds (shorter windows = more samples)
OVERLAP = 3  # seconds (HIGH overlap for seizures to create more positive examples)
N_SAMPLES_PER_WINDOW = SAMPLING_RATE * WINDOW_SIZE  # 256 samples

# Balancing strategy
TARGET_IMBALANCE_RATIO = 10  # 1:10 (Seizure:Normal) - practical for deep learning
# This gives us more training data while keeping imbalance manageable for focal loss

# Frequency bands
FREQ_BANDS = {
    'delta': (0.5, 4),
    'theta': (4, 8),
    'alpha': (8, 13),
    'beta': (13, 30)
}

print(f"Base Directory: {BASE_DIR}")
print(f"Selected Subjects: {SELECTED_SUBJECTS}")
print(f"Sampling Rate: {ORIGINAL_SAMPLING_RATE} Hz → {SAMPLING_RATE} Hz (downsampled {DOWNSAMPLE_FACTOR}x)")
print(f"Window: {WINDOW_SIZE}s with {OVERLAP}s overlap ({N_SAMPLES_PER_WINDOW} samples)")
print(f"High overlap strategy: Maximize seizure windows via overlapping")
print(f"Target ratio: 1:{TARGET_IMBALANCE_RATIO} (Seizure:Normal)")
print(f"Expected dataset size: ~1 GB")

Base Directory: c:\Users\Pranaav_Prasad\OneDrive\Desktop\Projects\Epilepsy-Detection
Selected Subjects: ['chb01', 'chb02', 'chb03', 'chb05', 'chb24']
Sampling Rate: 256 Hz → 64 Hz (downsampled 4x)
Window: 4s with 3s overlap (256 samples)
High overlap strategy: Maximize seizure windows via overlapping
Target ratio: 1:10 (Seizure:Normal)
Expected dataset size: ~1 GB


## 3. Parse Summary Files for Seizure Information

In [3]:
def parse_summary_file(summary_path):
    """Extract seizure information from subject summary file."""
    seizure_info = []
    
    with open(summary_path, 'r') as f:
        content = f.read()
        lines = content.split('\n')
        
        current_file = None
        for i, line in enumerate(lines):
            if 'File Name:' in line:
                current_file = line.split(':')[1].strip()
            elif 'Seizure Start Time:' in line and current_file:
                start_time = int(line.split(':')[1].strip().split()[0])
                # Find end time in next line
                if i + 1 < len(lines) and 'Seizure End Time:' in lines[i + 1]:
                    end_time = int(lines[i + 1].split(':')[1].strip().split()[0])
                    seizure_info.append({
                        'file': current_file,
                        'start': start_time,
                        'end': end_time
                    })
    
    return seizure_info

# Load seizure information for selected subjects
seizure_data = {}
total_seizure_duration = 0
for subject in SELECTED_SUBJECTS:
    summary_file = RAW_DATA_DIR / subject / f"{subject}-summary.txt"
    if summary_file.exists():
        seizure_data[subject] = parse_summary_file(summary_file)
        subject_duration = sum([sz['end'] - sz['start'] for sz in seizure_data[subject]])
        total_seizure_duration += subject_duration
        print(f"{subject}: {len(seizure_data[subject])} seizure events ({subject_duration}s total)")

print(f"\n✓ Loaded seizure information")
print(f"Total seizure duration across all subjects: {total_seizure_duration}s ({total_seizure_duration/60:.1f} minutes)")
print(f"Note: Actual window ratio will be calculated after processing")

chb01: 7 seizure events (442s total)
chb02: 3 seizure events (172s total)
chb03: 7 seizure events (402s total)
chb05: 5 seizure events (558s total)
chb24: 16 seizure events (511s total)

✓ Loaded seizure information
Total seizure duration across all subjects: 2085s (34.8 minutes)
Note: Actual window ratio will be calculated after processing


## 4. Preprocessing Functions

In [4]:
def preprocess_eeg(raw_data, sampling_rate=256, target_rate=128):
    """Apply bandpass filter, downsample, and normalization."""
    # Bandpass filter (0.5-50 Hz)
    nyquist = sampling_rate / 2
    low, high = 0.5 / nyquist, 50 / nyquist
    b, a = signal.butter(4, [low, high], btype='band')
    filtered = signal.filtfilt(b, a, raw_data, axis=1)
    
    # Downsample to save space (256 Hz -> 128 Hz)
    if sampling_rate > target_rate:
        downsample_factor = sampling_rate // target_rate
        filtered = filtered[:, ::downsample_factor]
    
    # Normalization (z-score)
    mean = np.mean(filtered, axis=1, keepdims=True)
    std = np.std(filtered, axis=1, keepdims=True)
    normalized = (filtered - mean) / (std + 1e-8)
    
    return normalized

def augment_seizure_window(window_data):
    """Augment seizure windows with slight variations to balance dataset."""
    augmented = []
    
    # Original
    augmented.append(window_data)
    
    # Time shift (circular shift by 10% of window)
    shift_amount = window_data.shape[1] // 10
    shifted = np.roll(window_data, shift_amount, axis=1)
    augmented.append(shifted)
    
    # Add small noise (0.05 std)
    noise = np.random.normal(0, 0.05, window_data.shape)
    noisy = window_data + noise
    augmented.append(noisy)
    
    return augmented

def extract_spectral_features(data, sampling_rate, freq_bands):
    """Extract power in frequency bands."""
    features = []
    
    for ch_data in data:
        freqs, psd = signal.welch(ch_data, fs=sampling_rate, nperseg=256)
        
        ch_features = []
        for band_name, (low, high) in freq_bands.items():
            idx = np.logical_and(freqs >= low, freqs <= high)
            band_power = np.mean(psd[idx])
            ch_features.append(band_power)
        
        features.extend(ch_features)
    
    return np.array(features)

def extract_statistical_features(data):
    """Extract statistical features from each channel."""
    features = []
    
    for ch_data in data:
        features.extend([
            np.mean(ch_data),
            np.std(ch_data),
            skew(ch_data),
            kurtosis(ch_data),
            np.max(ch_data) - np.min(ch_data)  # range
        ])
    
    return np.array(features)

print("✓ Preprocessing functions defined (with augmentation for seizures)")

✓ Preprocessing functions defined (with augmentation for seizures)


## 5. Process EEG Files

In [5]:
def process_edf_file(edf_path, seizure_times=None):
    """Process single EDF file and extract windows."""
    raw = mne.io.read_raw_edf(str(edf_path), preload=True, verbose=False)
    data = raw.get_data()
    
    # Preprocess
    data = preprocess_eeg(data, SAMPLING_RATE)
    
    # Create sliding windows
    n_channels, n_samples = data.shape
    step_size = (WINDOW_SIZE - OVERLAP) * SAMPLING_RATE
    
    windows = []
    labels = []
    
    for start_idx in range(0, n_samples - N_SAMPLES_PER_WINDOW, step_size):
        end_idx = start_idx + N_SAMPLES_PER_WINDOW
        window_data = data[:, start_idx:end_idx]
        
        # Determine label
        window_time_start = start_idx / SAMPLING_RATE
        window_time_end = end_idx / SAMPLING_RATE
        
        is_seizure = False
        if seizure_times:
            for sz in seizure_times:
                if (window_time_start >= sz['start'] and window_time_start < sz['end']) or \
                   (window_time_end > sz['start'] and window_time_end <= sz['end']):
                    is_seizure = True
                    break
        
        windows.append(window_data)
        labels.append(1 if is_seizure else 0)
    
    return np.array(windows), np.array(labels)

print("✓ EDF processing function defined")

✓ EDF processing function defined


## 6. Process All Selected Subjects

In [6]:
# Memory-efficient processing: Stream data directly to HDF5
# STRATEGY: Smart imbalanced learning with 1:10 ratio
# Key optimizations:
# 1. HIGH OVERLAP (3s) for seizure windows → maximize positive examples
# 2. SMART UNDERSAMPLING of normals → keep contextual negatives (pre/post seizure)
# 3. Target ratio 1:10 (manageable for Focal Loss, enough data for deep learning)
# 4. Label window as positive if ANY seizure sample present
# 5. Target: ~1GB dataset with ~20K-40K windows

output_file = PROCESSED_DATA_DIR / "preprocessed_data.h5"

print("Processing with SMART IMBALANCED LEARNING (1:10 ratio)...\n")
print(f"Selected subjects: {SELECTED_SUBJECTS}")
print(f"Strategy: High overlap for seizures + contextual negatives + smart undersampling\n")

# Initialize counters
total_windows = 0
total_seizure = 0
total_files = 0

# Temporary storage for balancing
all_seizure_windows = []
all_normal_windows = []
contextual_normal_windows = []  # Pre/post seizure windows (keep all)

# Create HDF5 file
with h5py.File(output_file, 'w') as hdf:
    first_write = True
    
    # PHASE 1: Collect all windows with contextual awareness
    print("PHASE 1: Collecting windows with high overlap...")
    for subject in tqdm(SELECTED_SUBJECTS, desc="Subjects"):
        subject_path = RAW_DATA_DIR / subject
        if not subject_path.exists():
            print(f"\n  ⚠ Skipping {subject} - directory not found")
            continue
            
        edf_files = sorted([f for f in subject_path.glob("*.edf") if not f.name.endswith('+')])
        seizure_info = seizure_data.get(subject, [])
        
        print(f"\n{subject}: Processing {len(edf_files)} files...")
        
        for edf_file in tqdm(edf_files, desc=f"  {subject}", leave=False):
            try:
                file_seizures = [s for s in seizure_info if s['file'] == edf_file.name]
                
                raw = mne.io.read_raw_edf(str(edf_file), preload=False, verbose=False)
                
                # Process in chunks
                chunk_duration = 300  # 5 minutes per chunk
                total_duration = raw.times[-1]
                n_chunks = int(np.ceil(total_duration / chunk_duration))
                
                for chunk_idx in range(n_chunks):
                    start_time = chunk_idx * chunk_duration
                    end_time = min((chunk_idx + 1) * chunk_duration, total_duration)
                    
                    start_sample = int(start_time * ORIGINAL_SAMPLING_RATE)
                    stop_sample = int(end_time * ORIGINAL_SAMPLING_RATE)
                    
                    data = raw.get_data(start=start_sample, stop=stop_sample)
                    data = preprocess_eeg(data, ORIGINAL_SAMPLING_RATE, SAMPLING_RATE).astype(np.float32)
                    
                    n_channels, n_samples = data.shape
                    step_size = int((WINDOW_SIZE - OVERLAP) * SAMPLING_RATE)
                    
                    for start_idx in range(0, n_samples - N_SAMPLES_PER_WINDOW + 1, step_size):
                        end_idx = start_idx + N_SAMPLES_PER_WINDOW
                        window_data = data[:, start_idx:end_idx]
                        
                        window_time_start = start_time + (start_idx / SAMPLING_RATE)
                        window_time_end = start_time + (end_idx / SAMPLING_RATE)
                        
                        # Label as positive if ANY overlap with seizure
                        is_seizure = False
                        is_contextual = False  # Near seizure (±60s)
                        
                        if file_seizures:
                            for sz in file_seizures:
                                # Check for seizure overlap
                                if (window_time_start >= sz['start'] and window_time_start < sz['end']) or \
                                   (window_time_end > sz['start'] and window_time_end <= sz['end']) or \
                                   (window_time_start <= sz['start'] and window_time_end >= sz['end']):
                                    is_seizure = True
                                    break
                                
                                # Check if contextual (within ±60s of seizure)
                                if not is_seizure:
                                    if (window_time_start >= sz['start'] - 60 and window_time_start <= sz['end'] + 60) or \
                                       (window_time_end >= sz['start'] - 60 and window_time_end <= sz['end'] + 60):
                                        is_contextual = True
                        
                        if is_seizure:
                            all_seizure_windows.append(window_data)
                        elif is_contextual:
                            contextual_normal_windows.append(window_data)
                        else:
                            all_normal_windows.append(window_data)
                    
                    del data
                
                total_files += 1
                
            except Exception as e:
                print(f"\n  ⚠ Error processing {edf_file.name}: {str(e)[:80]}")
                continue
    
    # PHASE 2: Smart balancing with contextual negatives
    print(f"\n\nPHASE 2: Smart balancing...")
    print(f"Collected: {len(all_seizure_windows)} seizures, {len(contextual_normal_windows)} contextual, {len(all_normal_windows)} far normals")
    
    n_seizures = len(all_seizure_windows)
    target_normals = n_seizures * TARGET_IMBALANCE_RATIO
    
    # Keep ALL contextual windows (important for learning)
    selected_normals = contextual_normal_windows.copy()
    remaining_slots = target_normals - len(selected_normals)
    
    # Fill remaining with random far normals
    if remaining_slots > 0 and len(all_normal_windows) > 0:
        np.random.seed(42)
        n_to_sample = min(remaining_slots, len(all_normal_windows))
        normal_indices = np.random.choice(len(all_normal_windows), size=n_to_sample, replace=False)
        selected_normals.extend([all_normal_windows[i] for i in normal_indices])
    
    print(f"Balanced: {n_seizures} seizures, {len(selected_normals)} normals")
    print(f"  - Contextual normals: {len(contextual_normal_windows)}")
    print(f"  - Random normals: {len(selected_normals) - len(contextual_normal_windows)}")
    print(f"Ratio: 1:{len(selected_normals)/n_seizures:.1f}")
    
    # Combine and shuffle
    all_windows = all_seizure_windows + selected_normals
    all_labels = [1] * n_seizures + [0] * len(selected_normals)
    
    # Shuffle
    combined = list(zip(all_windows, all_labels))
    np.random.shuffle(combined)
    all_windows, all_labels = zip(*combined)
    
    # Write to HDF5
    print(f"\nWriting {len(all_windows)} windows to HDF5...")
    windows = np.array(all_windows, dtype=np.float32)
    labels = np.array(all_labels, dtype=np.int8)
    
    hdf.create_dataset('X', data=windows, 
                      compression='gzip', compression_opts=4)
    hdf.create_dataset('y', data=labels,
                      compression='gzip', compression_opts=4)
    
    total_windows = len(windows)
    total_seizure = np.sum(labels)
    
    # Save metadata
    if 'X' in hdf:
        hdf.attrs['n_samples'] = hdf['X'].shape[0]
        hdf.attrs['n_channels'] = hdf['X'].shape[1]
        hdf.attrs['window_size'] = WINDOW_SIZE
        hdf.attrs['sampling_rate'] = SAMPLING_RATE
        hdf.attrs['subjects'] = ','.join(SELECTED_SUBJECTS)
        hdf.attrs['total_files_processed'] = total_files

if total_windows > 0:
    print(f"\n{'='*70}")
    print(f"✓ SMART IMBALANCED DATASET PROCESSING COMPLETE")
    print(f"{'='*70}")
    print(f"Total files processed: {total_files}")
    print(f"Total windows: {total_windows:,}")
    print(f"Seizure windows: {total_seizure:,} ({total_seizure/total_windows*100:.2f}%)")
    print(f"Normal windows: {total_windows - total_seizure:,} ({(total_windows-total_seizure)/total_windows*100:.2f}%)")
    print(f"Seizure:Normal ratio: 1:{(total_windows-total_seizure)/max(1,total_seizure):.1f}")
    print(f"{'='*70}")
    print(f"\nSaved to: {output_file}")
    print(f"File size: {output_file.stat().st_size / (1024**3):.2f} GB")
else:
    print("\n✗ No data processed - check for errors above")

Processing with SMART IMBALANCED LEARNING (1:10 ratio)...

Selected subjects: ['chb01', 'chb02', 'chb03', 'chb05', 'chb24']
Strategy: High overlap for seizures + contextual negatives + smart undersampling

PHASE 1: Collecting windows with high overlap...


Subjects:   0%|          | 0/5 [00:00<?, ?it/s]


chb01: Processing 42 files...


  chb01:   0%|          | 0/42 [00:00<?, ?it/s]


chb02: Processing 36 files...


  chb02:   0%|          | 0/36 [00:00<?, ?it/s]


chb03: Processing 38 files...


  chb03:   0%|          | 0/38 [00:00<?, ?it/s]


chb05: Processing 39 files...


  chb05:   0%|          | 0/39 [00:00<?, ?it/s]


chb24: Processing 22 files...


  chb24:   0%|          | 0/22 [00:00<?, ?it/s]



PHASE 2: Smart balancing...
Collected: 2181 seizures, 4427 contextual, 613943 far normals
Balanced: 2181 seizures, 21810 normals
  - Contextual normals: 4427
  - Random normals: 17383
Ratio: 1:10.0

Writing 23991 windows to HDF5...

✓ SMART IMBALANCED DATASET PROCESSING COMPLETE
Total files processed: 177
Total windows: 23,991
Seizure windows: 2,181 (9.09%)
Normal windows: 21,810 (90.91%)
Seizure:Normal ratio: 1:10.0

Saved to: c:\Users\Pranaav_Prasad\OneDrive\Desktop\Projects\Epilepsy-Detection\data\processed\preprocessed_data.h5
File size: 0.49 GB

✓ SMART IMBALANCED DATASET PROCESSING COMPLETE
Total files processed: 177
Total windows: 23,991
Seizure windows: 2,181 (9.09%)
Normal windows: 21,810 (90.91%)
Seizure:Normal ratio: 1:10.0

Saved to: c:\Users\Pranaav_Prasad\OneDrive\Desktop\Projects\Epilepsy-Detection\data\processed\preprocessed_data.h5
File size: 0.49 GB


## 8. Save Processed Data

In [7]:
# Verify the saved data
output_file = PROCESSED_DATA_DIR / "preprocessed_data.h5"
print(f"Verifying {output_file}...\n")

with h5py.File(output_file, 'r') as f:
    print(f"Available datasets: {list(f.keys())}")
    print(f"Available attributes: {list(f.attrs.keys())}\n")
    
    if 'X' in f:
        X_data = f['X'][:]
        y_data = f['y'][:]
        
        print(f"Data shapes:")
        print(f"  X: {X_data.shape} (samples, channels, time_points)")
        print(f"  y: {y_data.shape} (samples,)")
        print(f"\nClass distribution:")
        print(f"  Seizure (1): {np.sum(y_data == 1)} ({np.sum(y_data == 1) / len(y_data) * 100:.2f}%)")
        print(f"  Normal (0): {np.sum(y_data == 0)} ({np.sum(y_data == 0) / len(y_data) * 100:.2f}%)")
        print(f"\nData statistics:")
        print(f"  X dtype: {X_data.dtype}")
        print(f"  X range: [{X_data.min():.3f}, {X_data.max():.3f}]")
        print(f"  X mean: {X_data.mean():.3f}")
        print(f"  X std: {X_data.std():.3f}")
        
        print(f"\nPreprocessed data verified successfully!")
    else:
        print("ERROR: No data found in HDF5 file!")

Verifying c:\Users\Pranaav_Prasad\OneDrive\Desktop\Projects\Epilepsy-Detection\data\processed\preprocessed_data.h5...

Available datasets: ['X', 'y']
Available attributes: ['n_channels', 'n_samples', 'sampling_rate', 'subjects', 'total_files_processed', 'window_size']

Data shapes:
  X: (23991, 23, 256) (samples, channels, time_points)
  y: (23991,) (samples,)

Class distribution:
  Seizure (1): 2181 (9.09%)
  Normal (0): 21810 (90.91%)

Data statistics:
  X dtype: float32
  X range: [-45.386, 44.584]
  X mean: 0.000
Data shapes:
  X: (23991, 23, 256) (samples, channels, time_points)
  y: (23991,) (samples,)

Class distribution:
  Seizure (1): 2181 (9.09%)
  Normal (0): 21810 (90.91%)

Data statistics:
  X dtype: float32
  X range: [-45.386, 44.584]
  X mean: 0.000
  X std: 1.069

Preprocessed data verified successfully!
  X std: 1.069

Preprocessed data verified successfully!


## 9. Summary

**Preprocessing Complete:**
- Filtered EEG signals (0.5-50 Hz bandpass)
- Normalized data (z-score)
- Created sliding windows (4s with 2s overlap)
- Labeled windows as seizure/normal
- Balanced dataset
- Saved processed data for training

**Next: Phase 3 - Model Training**