# Sleep-EDF Dataset Exploration Notebook

This notebook illustrates how to use the `SleepEDFDataset` loader to explore EEG epochs and sleep stage annotations from the Sleep-EDF dataset. We'll cover:
1. Dataset structure and basic stats
2. Mapping numeric labels to sleep stages
3. Distribution of sleep stages
4. Hypnogram plotting over time
5. Examples of EEG epochs for each sleep stage


In [1]:
import os
import sys
# Adjust this path to where your loader module resides
import sys
sys.path.append(os.path.dirname(os.getcwd()))
from time_series_datasets.sleep_edf.sleepedf_loader import SleepEDFDataset, get_sleepedf_data
import numpy as np
import matplotlib.pyplot as plt

# Initialize dataset
data_path = get_sleepedf_data()
dataset = SleepEDFDataset(data_dir=data_path)


## 1. Dataset Structure and Basic Stats

In [2]:
# Number of recordings (PSG files) available
num_recs = len(dataset)
print(f"Number of recordings: {num_recs}")

# Show first few file identifiers
print("First few PSG files:")
print(dataset.data_files[:5])

Number of recordings: 8
First few PSG files:
['SC4002E0-PSG.edf', 'SC4031E0-PSG.edf', 'SC4022E0-PSG.edf', 'SC4011E0-PSG.edf', 'SC4021E0-PSG.edf']


## 2. Sample Shape and Label Info

In [3]:
# Load the first recording's epochs and labels
data, labels = dataset[0]
print(f"Epoch data shape: {data.shape}  (n_epochs, 1 channel, n_times)")
print(f"Labels shape: {labels.shape}")

# Unique numeric labels in this recording
unique_labels = np.unique(labels)
print(f"Unique numeric labels: {unique_labels}")

Extracting EDF parameters from /Users/planger/Development/EmbedHealth/time_series_datasets/raw_data/sleep-edf-database-1.0.0/physionet.org/files/sleep-edfx/1.0.0/sleep-cassette/SC4002E0-PSG.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 8489999  =      0.000 ... 84899.990 secs...


  raw = mne.io.read_raw_edf(psg_path, preload=True)
  raw = mne.io.read_raw_edf(psg_path, preload=True)
  raw = mne.io.read_raw_edf(psg_path, preload=True)


Used Annotations descriptions: [np.str_('Movement time'), np.str_('Sleep stage 1'), np.str_('Sleep stage 2'), np.str_('Sleep stage 3'), np.str_('Sleep stage 4'), np.str_('Sleep stage ?'), np.str_('Sleep stage R'), np.str_('Sleep stage W')]
Not setting metadata
151 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 151 events and 3000 original time points ...
0 bad epochs dropped
Epoch data shape: (151, 1, 3000)  (n_epochs, 1 channel, n_times)
Labels shape: (151,)
Unique numeric labels: [1 2 3 4 5 7 8]


  raw.set_annotations(ann)


## 3. Mapping Numeric Labels to Sleep Stages

In [5]:
# --- 1. Setup imports and path ---
import os
import sys
import mne

# Replace this with your actual path to the folder containing sleepedf_loader.py
PROJECT_DIR = '/absolute/path/to/time_series_datasets/sleep-edf'
os.chdir(PROJECT_DIR)              
sys.path.insert(0, PROJECT_DIR)

from sleepedf_loader import SleepEDFDataset, get_sleepedf_data

# --- 2. Initialize the dataset ---
data_path = get_sleepedf_data()
dataset   = SleepEDFDataset(data_dir=data_path)

# --- 3. Load the first PSG and its hypnogram ---
psg_file = dataset.data_files[0]
raw      = mne.io.read_raw_edf(os.path.join(data_path, psg_file), preload=True)

# Find the matching Hypnogram file (handles E0→EC naming)
prefix   = psg_file.split('-')[0][:6]
hyp_files = [f for f in os.listdir(data_path)
             if f.startswith(prefix) and f.endswith("Hypnogram.edf")]
if not hyp_files:
    raise FileNotFoundError(f"No Hypnogram found for {psg_file}")
ann = mne.read_annotations(os.path.join(data_path, hyp_files[0]))
raw.set_annotations(ann)

# --- 4. Extract events and build mapping ---
events, event_id = mne.events_from_annotations(raw)
code_to_stage   = {code: desc for desc, code in event_id.items()}

print("Annotation code → Sleep stage:")
for code, desc in code_to_stage.items():
    print(f"  {code}: {desc}")

# --- 5. Show the first 5 epochs and their stages ---
data, labels = dataset[0]
print("\nFirst 5 epochs and their stages:")
for i in range(5):
    print(f"  Epoch {i}: code={labels[i]} → {code_to_stage[labels[i]]}")


FileNotFoundError: [Errno 2] No such file or directory: '/absolute/path/to/time_series_datasets/sleep-edf'

## 4. Distribution of Sleep Stages

In [None]:
plt.figure(figsize=(8,4))
plt.hist(labels, bins=np.arange(unique_labels.min(), unique_labels.max()+2)-0.5)
plt.xticks(unique_labels, [code_to_stage[l] for l in unique_labels], rotation=45)
plt.xlabel('Sleep Stage')
plt.ylabel('Count of Epochs')
plt.title('Distribution of Sleep Stages in Recording 1')
plt.tight_layout()
plt.show()

## 5. Hypnogram Over Time

In [None]:
# Build hypnogram: times in seconds and stage codes
sfreq = raw.info['sfreq']
times = events[:, 0] / sfreq  # event onsets
stages = events[:, 2]

plt.figure(figsize=(12,4))
plt.step(times, stages, where='post')
plt.yticks(unique_labels, [code_to_stage[l] for l in unique_labels])
plt.xlabel('Time (s)')
plt.ylabel('Sleep Stage')
plt.title('Hypnogram for Recording 1')
plt.tight_layout()
plt.show()

## 6. Example Epochs for Each Sleep Stage

In [None]:
# For each unique stage, pick the first epoch and plot its waveform
for code in unique_labels:
    idx = np.where(labels == code)[0][0]
    epoch = data[idx, 0, :]
    plt.figure(figsize=(10,3))
    plt.plot(np.arange(epoch.size)/sfreq, epoch)
    plt.title(f"Stage: {code_to_stage[code]} (code={code}) - Epoch index {idx}")
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.tight_layout()
    plt.show()