In [1]:
import mne
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

In [2]:
sleep_data_dir = Path("../../sleep-edf-database-expanded-1.0.0/sleep-cassette/")
file_idx = 200
recordings = [x[:6] for x in os.listdir(sleep_data_dir)]
psg_files = list(sleep_data_dir.glob("*.edf"))
hypnogram_files = list(sleep_data_dir.glob("*-Hypnogram.edf"))

# #file that starts with recordings[file_idx]
psg_file = [x.name for x in psg_files if x.name.startswith(recordings[0]) and x.name.endswith("PSG.edf")][0]
print(psg_file)

data = mne.io.read_raw_edf(sleep_data_dir / psg_file)
raw_data = data.get_data()
# you can get the metadata included in the file and a list of all channels:
info = data.info
channels = data.ch_names

SC4412E0-PSG.edf
Extracting EDF parameters from /home/annareisz/Documents/DeepLearningSignalProcessing/sleep-edf-database-expanded-1.0.0/sleep-cassette/SC4412E0-PSG.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...


  data = mne.io.read_raw_edf(sleep_data_dir / psg_file)
  data = mne.io.read_raw_edf(sleep_data_dir / psg_file)
  data = mne.io.read_raw_edf(sleep_data_dir / psg_file)


In [3]:
samples = 7950000
sampling_rate_hz = 100
duration = samples / sampling_rate_hz
hours = duration / 3600
print(f"Duration: {hours:.0f} hours")

Duration: 22 hours


# Preprocessing

In [25]:
# turn hypnogram into 30 seconds labelled data
def hypnogram_to_labelled_data(hypnogram, sampling_rate_hz):
    data = []
    for i in range(len(hypnogram)):
        onset = hypnogram.iloc[i]['onset']
        duration = hypnogram.iloc[i]['duration']
        description = hypnogram.iloc[i]['description']
        data.extend([description]*int(duration))
    return data

def chunk_data(data, window_size):
    chunks = []
    for i in range(0, len(data), window_size):
        chunk = data[i:i+window_size]
        if len(chunk) == window_size:
            chunks.append(chunk)
        else:
            print('chunk is not window size, chunk size:', len(chunk), 'window size:', window_size)
    return chunks

def remove_consecutive_duplicates(arr):
    return [arr[i] for i in range(len(arr)) if i == 0 or arr[i] != arr[i-1]]


def find_switch_idx(arr):
    return [i for i in range(len(arr)) if i == 0 or arr[i] != arr[i-1]]



In [30]:
for idx, recording in enumerate(recordings):
    print(idx)
    try:
        psg_file = [x.name for x in psg_files if x.name.startswith(recording) and x.name.endswith("PSG.edf")][0]
        hypnogram_file = [x.name for x in hypnogram_files if x.name.startswith(recording) and x.name.endswith("Hypnogram.edf")][0]
        data = mne.io.read_raw_edf(sleep_data_dir / psg_file)
        raw_data = data.get_data()
        
        # Deal with hypnogram
        hypnogram = mne.read_annotations(sleep_data_dir / hypnogram_file)
        onsets = hypnogram.onset * sampling_rate_hz
        durations = hypnogram.duration * sampling_rate_hz
        descriptions = hypnogram.description

        # get  'EEG Fpz-Cz', 'EEG Pz-Oz', 'EOG horizontal', 'EMG submental' from raw data
        eeg_fpz_cz = raw_data[channels.index("EEG Fpz-Cz"), :]
        eeg_pz_oz = raw_data[channels.index("EEG Pz-Oz"), :]
        eog_horizontal = raw_data[channels.index("EOG horizontal"), :]
        emg_submental = raw_data[channels.index("EMG submental"), :]

        # trucate last stage - I assume the last duration is actually longer than the data
        trucate_amount = ((onsets[-1]+durations[-1])) - eeg_fpz_cz.shape[0]
        durations[-1] = durations[-1]-trucate_amount

        hypnogram = pd.DataFrame({"onset": onsets, "duration": durations, "description": descriptions})
        stages = hypnogram_to_labelled_data(hypnogram, sampling_rate_hz)         
        all_data = pd.DataFrame({"eeg_fpz_cz": eeg_fpz_cz, "eeg_pz_oz": eeg_pz_oz, "eog_horizontal": eog_horizontal, "emg_submental": emg_submental, "stage": stages})
        
        # Aggregate data to 30 second windows
        WINDOW_SIZE_SEC = 30
        
        
        def most_frequent(x):
            return x.value_counts().index[0]
        agg_rules = {'eeg_fpz_cz': 'mean', 'eeg_pz_oz': 'mean', 'eog_horizontal': 'mean', 'emg_submental': 'mean', 'stage': most_frequent}
        agg_data = all_data.groupby(all_data.index // (WINDOW_SIZE_SEC*sampling_rate_hz)).agg(agg_rules)

        seconds_in_30_minutes = 30*60

        # Split data into 30 minute chunks
        
        chunks = chunk_data(agg_data, seconds_in_30_minutes // WINDOW_SIZE_SEC)

        
        # save chunks
        print("Num chunks", len(chunks))
        for i, chunk in enumerate(chunks):
            if len(chunk['stage'].unique()) == 1 and chunk['stage'].unique()[0] in ['Sleep stage W', 'Movement time', 'Sleep stage ?']:
                continue
            #print('saving', f"sleep/{recording}-{i}.npy")
            np.save(f"sleep/{recording}-{i}.npy", chunk[['eeg_fpz_cz', 'eeg_pz_oz', 'eog_horizontal', 'emg_submental']].values)

            idxs = find_switch_idx(chunk['stage'].values)
            np.save(f"sleep/{recording}-{i}-labels.npy", chunk['stage'].values[idxs])

            np.save(f"sleep/{recording}-{i}-timestamps.npy", np.array(idxs))
            print('found one!')
    except Exception as e:
        print(e)
        print('error', recording)
        continue

    

    



0
Extracting EDF parameters from /home/annareisz/Documents/DeepLearningSignalProcessing/sleep-edf-database-expanded-1.0.0/sleep-cassette/SC4412E0-PSG.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...


  data = mne.io.read_raw_edf(sleep_data_dir / psg_file)
  data = mne.io.read_raw_edf(sleep_data_dir / psg_file)
  data = mne.io.read_raw_edf(sleep_data_dir / psg_file)


chunk is not window size, chunk size: 38 window size: 60
Num chunks 47
list indices must be integers or slices, not str
error SC4412
1
Extracting EDF parameters from /home/annareisz/Documents/DeepLearningSignalProcessing/sleep-edf-database-expanded-1.0.0/sleep-cassette/SC4242E0-PSG.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...


  data = mne.io.read_raw_edf(sleep_data_dir / psg_file)
  data = mne.io.read_raw_edf(sleep_data_dir / psg_file)
  data = mne.io.read_raw_edf(sleep_data_dir / psg_file)


chunk is not window size, chunk size: 10 window size: 60
Num chunks 45
list indices must be integers or slices, not str
error SC4242
2
Extracting EDF parameters from /home/annareisz/Documents/DeepLearningSignalProcessing/sleep-edf-database-expanded-1.0.0/sleep-cassette/SC4272F0-PSG.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...


  data = mne.io.read_raw_edf(sleep_data_dir / psg_file)
  data = mne.io.read_raw_edf(sleep_data_dir / psg_file)
  data = mne.io.read_raw_edf(sleep_data_dir / psg_file)


KeyboardInterrupt: 

In [33]:
idxs = find_switch_idx(chunk['stage'].values)
chunk['stage'].values[idxs]

array(['Sleep stage W', 'Sleep stage 1', 'Sleep stage W', 'Sleep stage 1',
       'Sleep stage 2', 'Sleep stage 1', 'Sleep stage 2', 'Sleep stage 1',
       'Sleep stage 2', 'Sleep stage 1', 'Sleep stage 2', 'Sleep stage W'],
      dtype=object)

In [24]:
chunks[20].reset_index().iloc[idxs]

Unnamed: 0,index,eeg_fpz_cz,eeg_pz_oz,eog_horizontal,emg_submental,stage
0,1200,1.132891e-06,1.009902e-06,5e-06,3e-06,Sleep stage 3
1,1201,1.629748e-06,7.367082e-07,6e-06,3e-06,Sleep stage 2
2,1202,1.450323e-06,8.746676e-07,5e-06,3e-06,Sleep stage 3
6,1206,1.919015e-06,9.342603e-07,5e-06,4e-06,Sleep stage W
7,1207,6.93405e-07,7.892454e-07,6e-06,3e-06,Sleep stage 2
9,1209,5.781559e-07,1.182775e-06,5e-06,3e-06,Sleep stage 3
12,1212,1.40228e-06,9.091272e-07,5e-06,3e-06,Sleep stage 2
29,1229,4.556276e-07,1.026344e-06,5e-06,3e-06,Sleep stage 3
30,1230,1.351356e-06,9.080674e-07,5e-06,3e-06,Sleep stage 2
34,1234,1.47941e-06,1.000757e-06,6e-06,3e-06,Sleep stage 3


In [16]:
for chunk in chunks:
    print(chunk['stage'].unique())

['Sleep stage W']
['Sleep stage W']
['Sleep stage W']
['Sleep stage W']
['Sleep stage W']
['Sleep stage W']
['Sleep stage W']
['Sleep stage W']
['Sleep stage W']
['Sleep stage W']
['Sleep stage W']
['Sleep stage W']
['Sleep stage W']
['Sleep stage W']
['Sleep stage W' 'Sleep stage 1' 'Sleep stage 2']
['Sleep stage 2' 'Sleep stage 1' 'Sleep stage 3' 'Sleep stage 4']
['Sleep stage 4' 'Sleep stage 3' 'Sleep stage 1' 'Sleep stage 2'
 'Sleep stage R']
['Sleep stage R' 'Sleep stage 1' 'Sleep stage W' 'Sleep stage 2'
 'Sleep stage 3']
['Sleep stage 3' 'Sleep stage 2' 'Sleep stage 4' 'Sleep stage W']
['Sleep stage W' 'Sleep stage 1' 'Sleep stage 2' 'Sleep stage 3']
['Sleep stage 3' 'Sleep stage 2' 'Sleep stage W' 'Sleep stage 1']
['Sleep stage 1' 'Sleep stage 2' 'Sleep stage R' 'Sleep stage W']
['Sleep stage 2' 'Sleep stage 1' 'Sleep stage W']
['Sleep stage 2' 'Sleep stage 3' 'Sleep stage W' 'Sleep stage 1']
['Sleep stage 2' 'Sleep stage 3' 'Sleep stage 4']
['Sleep stage 2' 'Sleep stage 3' 'Sl

In [18]:
len(eeg_fpz_cz)
len(eeg_pz_oz)
len(emg_submental)


6798000

In [17]:
len(stages)

8310000