In [2]:
# !pip install pyedflib mne lxml

# Imports

In [41]:
import os
import xml.etree.ElementTree as ET
import pyedflib
import pandas as pd
import numpy as np
from scipy.signal import resample

# Loading Data (LEARN)

## Annotations

In [78]:
annotations_path = "/kaggle/input/nsrr-dataset/datasets/learn/polysomnography/annotations-events-profusion"

# Quick sanity check: collect the <EpochLength> text of every XML annotation file.
# os.listdir() returns entries in arbitrary order, so the names are sorted to make
# the patient numbering reproducible between runs.
xml_files = sorted(f for f in os.listdir(annotations_path) if f.endswith('.xml'))

annotations_data = {}

# Only XML files are enumerated, so patient numbers stay contiguous (1, 2, 3, ...)
# even when the directory contains other kinds of files.
for idx, xml_file in enumerate(xml_files, start=1):
    root = ET.parse(os.path.join(annotations_path, xml_file)).getroot()

    # A file without an <EpochLength> element yields None instead of raising AttributeError.
    epoch_node = root.find('EpochLength')
    epoch_length = epoch_node.text if epoch_node is not None else None
    annotations_data[f'patient_{idx}'] = epoch_length

print(annotations_data)

{'patient_1': '30', 'patient_2': '30', 'patient_3': '30'}


In [92]:
annotations_path = "/kaggle/input/nsrr-dataset/datasets/learn/polysomnography/annotations-events-profusion"


def _child_text(parent, tag):
    """Return the text of the first direct child `tag` of `parent`, or None if there is no such child."""
    child = parent.find(tag)
    return child.text if child is not None else None


# os.listdir() returns entries in arbitrary order; sorting the file names keeps the
# "patient_<n>" keys stable between runs.
xml_files = sorted(f for f in os.listdir(annotations_path) if f.endswith('.xml'))

# Maps "patient_<n>" (1-based, in sorted file-name order) to the parsed annotation dict.
annotations_data = {}

for idx, xml_file in enumerate(xml_files, start=1):
    root = ET.parse(os.path.join(annotations_path, xml_file)).getroot()

    patient_data = {
        # Text of <EpochLength>, or None when the element is missing.
        "epoch_length": _child_text(root, 'EpochLength'),
        "step_channels": [],
        "scored_events": [],
        "sleep_stages": []
    }

    # Extract Step Channels: one {"Input", "Labels"} record per <StepChannel>.
    step_channels_section = root.find('StepChannels')
    if step_channels_section is not None:
        for step_channel in step_channels_section.findall('StepChannel'):
            labels_node = step_channel.find('Labels')
            labels = (
                [label.text for label in labels_node.findall('Label')]
                if labels_node is not None
                else []
            )
            patient_data['step_channels'].append({
                "Input": _child_text(step_channel, 'Input'),
                "Labels": labels
            })

    # Extract Scored Events: values are kept as the raw XML strings (no numeric conversion).
    scored_events_section = root.find('ScoredEvents')
    if scored_events_section is not None:
        for event in scored_events_section.findall('ScoredEvent'):
            patient_data['scored_events'].append({
                "Name": _child_text(event, 'Name'),
                "Start": _child_text(event, 'Start'),
                "Duration": _child_text(event, 'Duration'),
                "Input": _child_text(event, 'Input')
            })

    # Extract Sleep Stages (if applicable): one record per <SleepStage> element, in document order.
    sleep_stages_section = root.find('SleepStages')
    if sleep_stages_section is not None:
        for stage in sleep_stages_section.findall('SleepStage'):
            patient_data['sleep_stages'].append({
                'SleepStage': stage.text
            })

    # Store the patient's data
    annotations_data[f'patient_{idx}'] = patient_data


In [96]:
# len(annotations_data['patient_2']['sleep_stages'])*30

In [98]:
# for patient, data in annotations_data.items():
#     print(f"Patient: {patient}")
#     print(f"Epoch Length: {data['epoch_length']}")
#     print("Step Channels:")
#     for channel in data['step_channels']:
#         print(f"  Input: {channel['Input']}, Labels: {channel['Labels']}")
#     print("Scored Events:")
#     for event in data['scored_events']:
#         print(f"  Name: {event['Name']}, Start: {event['Start']}, Duration: {event['Duration']}, Input: {event['Input']}")
#     print("Sleep Stages:")
#     for stage in data['sleep_stages']:
#         print(f"  Sleep Stage: {stage['SleepStage']}")
#     print("-" * 50)

## EDFs

In [43]:
edf_path = "/kaggle/input/nsrr-dataset/datasets/learn/polysomnography/edfs"
# os.listdir() returns entries in arbitrary order; sorting makes the position of each
# recording in `signals` reproducible between runs.
edf_files = sorted(os.path.join(edf_path, f) for f in os.listdir(edf_path) if f.endswith('.edf'))

# One entry per EDF file: {"file": path, "signals": [one array per channel], "labels": [channel names]}.
signals = []
for edf_file in edf_files:
    with pyedflib.EdfReader(edf_file) as edf:
        n_signals = edf.signals_in_file
        # NOTE: after the loop `signal_labels` holds the labels of the LAST file only;
        # use signals[i]['labels'] when the labels of a specific recording are needed.
        signal_labels = edf.getSignalLabels()
        # Read all signals
        data = [edf.readSignal(i) for i in range(n_signals)]
        signals.append({"file": edf_file, "signals": data, "labels": signal_labels})

# print(signals)

In [40]:
# Show the number of samples in every channel of the first recording.
# The labels are taken from the same recording as the signals; the loose `signal_labels`
# variable belongs to whichever file was read last.
for label, signal in zip(signals[0]['labels'], signals[0]['signals']):
    print(f"Label: {label} Length of singal: {len(signal)}")

Label: SaO2 Length of singal: 40920
Label: PR Length of singal: 40920
Label: EEG(sec) Length of singal: 5115000
Label: ECG Length of singal: 10230000
Label: EMG Length of singal: 5115000
Label: EOG(L) Length of singal: 2046000
Label: EOG(R) Length of singal: 2046000
Label: EEG Length of singal: 5115000
Label: AIRFLOW Length of singal: 409200
Label: THOR RES Length of singal: 409200
Label: ABDO RES Length of singal: 409200
Label: POSITION Length of singal: 40920
Label: LIGHT Length of singal: 40920
Label: OX STAT Length of singal: 40920


## Resampling

As seen above, the signals do not all have the same length, so we resample every signal to the length of the shortest one.
> The following transformations are first shown for a single patient; the same steps are then applied to all three patients.

In [44]:
# Resample every channel of the first recording to the length of its shortest channel.
# The target is computed from the data rather than hard-coded, so the cell keeps working
# if the first recording changes. `resample` is imported in the imports cell at the top.
# NOTE(review): scipy.signal.resample is Fourier-based; it may not be appropriate for
# categorical channels such as POSITION / OX STAT. Confirm before modelling on them.
first_recording = signals[0]['signals']
target_length = min(len(channel) for channel in first_recording)
resampled_signals = [resample(channel, target_length) for channel in first_recording]

In [68]:
# for label, signal in zip(signal_labels, resampled_signals):
#     print(f"Label: {label} Length of singal: {len(signal)}")

Now we repeat the operation for all three patients.

In [61]:
# Per-recording resampling target: the sample count of the shortest channel.
# Keys are "patient_<i>" with i being the 0-based position of the recording in `signals`.
# The minimum is taken over the recording's own channels (not zipped with the loose
# `signal_labels` list, which could silently truncate the channel list).
# A recording without any channel gets 0 instead of an arbitrary sentinel.
target_length = {
    f"patient_{i}": min((len(channel) for channel in recording['signals']), default=0)
    for i, recording in enumerate(signals)
}

print("The minimum length of the signal is:\n--------------------------------------------")
for key, value in target_length.items():
    print(f"Patient: {key} | length of signal: {value}")

The minimum length of the signal is:
--------------------------------------------
Patient: patient_0 | length of signal: 40920
Patient: patient_1 | length of signal: 35850
Patient: patient_2 | length of signal: 40920


In [62]:
# Resample each channel of each recording to that recording's target length.
# resampled_signals[i][j] is channel j of signals[i]; every channel of a recording is kept
# (the channels are iterated directly instead of being zipped with the loose
# `signal_labels` list, which could silently drop channels).
resampled_signals = [
    [resample(channel, target_length[f"patient_{i}"]) for channel in recording['signals']]
    for i, recording in enumerate(signals)
]

In [67]:
# for label, signal in zip(signal_labels, resampled_signals[2]):
#     print(f"Label: {label} Length of singal: {len(signal)}")

## Creating Dataset

In [71]:
# Build one long DataFrame: one row per resampled sample, one column per channel,
# plus a leading "patient_number" column (0-based position of the recording in `signals`).
# Each patient gets its own numeric frame and the frames are concatenated once; this keeps
# the channel columns numeric instead of the object dtype produced by exploding arrays,
# and it does not overwrite the `patient_data` dict built while parsing the annotations.
patient_frames = []

for i, patient_signals in enumerate(resampled_signals):
    # Use the labels stored with this recording rather than the loose `signal_labels` list.
    labels = signals[i]['labels']
    frame = pd.DataFrame(dict(zip(labels, patient_signals)))
    frame.insert(0, "patient_number", i)
    patient_frames.append(frame)

df = pd.concat(patient_frames, ignore_index=True)

In [74]:
# Preview a few random rows; the fixed seed makes the preview reproducible on re-run.
df.sample(5, random_state=42)

Unnamed: 0,patient_number,SaO2,PR,EEG(sec),ECG,EMG,EOG(L),EOG(R),EEG,AIRFLOW,THOR RES,ABDO RES,POSITION,LIGHT,OX STAT
24295,0,96.092164,59.378958,-2.313922,0.003665,-0.085757,-64.449142,-67.149368,-4.855278,-0.138056,-0.150654,-0.766859,-0.0,1.0,-0.0
93066,2,95.115587,67.191577,1.260527,0.010967,-4.246998,0.038715,-1.15414,2.761921,-0.009033,0.014754,0.066553,3.0,1.0,-0.0
75130,1,0.10071,0.201419,0.556535,0.002384,0.585643,8.586867,0.073933,-5.510984,-0.025754,0.066968,0.027355,1.0,1.0,2.0
23489,0,92.185855,62.30869,1.187482,0.006115,0.643623,15.587332,16.193776,-4.05238,-0.170391,0.07557,-0.435677,1.0,1.0,0.0
112103,2,0.10071,0.201419,-9.751605,-0.356309,-18.880773,2.022371,2.08805,6.172137,0.052598,-0.003255,0.013863,3.0,1.0,2.0
