<a href="https://www.kaggle.com/code/rishitjakharia/working-with-nsrr?scriptVersionId=212514330" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [4]:
# !pip install pyedflib mne lxml

# Imports

In [5]:
import os
import xml.etree.ElementTree as ET
import pyedflib
import pandas as pd
import numpy as np
from scipy.signal import resample

# Loading Data (LEARN)

## Annotations

In [6]:
# Quick sanity check: collect only the epoch length from every annotation XML file.
annotations_path = "/kaggle/input/nsrr-dataset/datasets/learn/polysomnography/annotations-events-profusion"

annotations_data = {}

# os.listdir returns entries in arbitrary order, so sort for a stable
# file -> patient_N mapping, and number only the XML files so that any other
# directory entry does not consume a patient index.
xml_files = sorted(name for name in os.listdir(annotations_path) if name.endswith('.xml'))

for idx, xml_file in enumerate(xml_files, start=1):
    root = ET.parse(os.path.join(annotations_path, xml_file)).getroot()

    # findtext yields None when <EpochLength> is missing instead of raising
    # AttributeError on `.text` of a None element.
    epoch_length = root.findtext('EpochLength')
    annotations_data[f'patient_{idx}'] = epoch_length

print(annotations_data)

{'patient_1': '30', 'patient_2': '30', 'patient_3': '30'}


In [7]:
annotations_path = "/kaggle/input/nsrr-dataset/datasets/learn/polysomnography/annotations-events-profusion"


def _child_text(parent, tag):
    """Return the text of the first direct child named ``tag``, or None if there is no such child."""
    child = parent.find(tag)
    return child.text if child is not None else None


def _patient_number(xml_name):
    """Return the patient number encoded in an annotation file name.

    The number is read from the trailing digits of the second dash-separated
    token of the name (presumably names look like ``learn-nsrr01-profusion.xml``
    -- TODO confirm against the dataset). All trailing digits are used, so
    ids of 10 and above do not collapse onto their last digit.

    Raises:
        IndexError: if the name contains no dash.
        ValueError: if the token does not end in a digit.
    """
    token = xml_name.split('-')[1]
    trailing_digits = token[len(token.rstrip('0123456789')):]
    return int(trailing_digits)


def _parse_step_channels(root):
    """Collect ``{"Input", "Labels"}`` dicts for every StepChannel under <StepChannels>."""
    section = root.find('StepChannels')
    if section is None:
        return []

    channels = []
    for step_channel in section.findall('StepChannel'):
        labels_section = step_channel.find('Labels')
        if labels_section is None:
            labels = []
        else:
            labels = [label.text for label in labels_section.findall('Label')]
        channels.append({
            "Input": _child_text(step_channel, 'Input'),
            "Labels": labels
        })
    return channels


def _parse_scored_events(root):
    """Collect Name/Start/Duration/Input (as raw text or None) for every ScoredEvent."""
    section = root.find('ScoredEvents')
    if section is None:
        return []

    return [
        {
            "Name": _child_text(event, 'Name'),
            "Start": _child_text(event, 'Start'),
            "Duration": _child_text(event, 'Duration'),
            "Input": _child_text(event, 'Input')
        }
        for event in section.findall('ScoredEvent')
    ]


def _parse_sleep_stages(root):
    """Return the <SleepStage> values under <SleepStages> as a list of ints (empty if absent)."""
    section = root.find('SleepStages')
    if section is None:
        return []
    return [int(stage.text) for stage in section.findall('SleepStage')]


annotations_data = {}

# Sorted listing: os.listdir order is arbitrary, and a stable order keeps the
# insertion order of annotations_data (and anything printed from it) reproducible.
for xml_file in sorted(os.listdir(annotations_path)):
    if not xml_file.endswith('.xml'):
        continue

    idx = _patient_number(xml_file)
    root = ET.parse(os.path.join(annotations_path, xml_file)).getroot()

    # epoch_length stays as the raw XML text (or None when the element is missing).
    patient_data = {
        "epoch_length": _child_text(root, 'EpochLength'),
        "step_channels": _parse_step_channels(root),
        "scored_events": _parse_scored_events(root),
        "sleep_stages": _parse_sleep_stages(root)
    }

    # Store the patient's data
    annotations_data[f'patient_{idx}'] = patient_data


In [8]:
# Epoch length times the number of scored epochs for patient 2.
int(annotations_data['patient_2']['epoch_length']) * len(annotations_data['patient_2']['sleep_stages'])

35850

size of the apnea events

In [15]:
# Print each patient's epoch length and step-channel definitions.
# Sorting by key keeps the printout in a stable order regardless of the order
# in which the annotation files were inserted into annotations_data.
for patient, data in sorted(annotations_data.items()):
    print(f"Patient: {patient}")
    print(f"Epoch Length: {data['epoch_length']}")
    print("Step Channels:")
    for channel in data['step_channels']:
        print(f"  Input: {channel['Input']}, Labels: {channel['Labels']}")

Patient: patient_1
Epoch Length: 30
Step Channels:
  Input: POSITION, Labels: ['RIGHT', 'LEFT', 'BACK', 'FRONT']
  Input: LIGHT, Labels: ['ON', 'OFF']
Patient: patient_3
Epoch Length: 30
Step Channels:
  Input: POSITION, Labels: ['RIGHT', 'LEFT', 'BACK', 'FRONT']
  Input: LIGHT, Labels: ['ON', 'OFF']
Patient: patient_2
Epoch Length: 30
Step Channels:
  Input: POSITION, Labels: ['RIGHT', 'LEFT', 'BACK', 'FRONT']
  Input: LIGHT, Labels: ['ON', 'OFF']


## EDFs

In [18]:
edf_path = "/kaggle/input/nsrr-dataset/datasets/learn/polysomnography/edfs"

# Later cells treat signals[i] as patient i+1, so the file order must be
# deterministic; os.listdir alone returns entries in arbitrary order.
edf_files = sorted(os.path.join(edf_path, f) for f in os.listdir(edf_path) if f.endswith('.edf'))

signals = []
for edf_file in edf_files:
    with pyedflib.EdfReader(edf_file) as edf:
        n_signals = edf.signals_in_file
        signal_labels = edf.getSignalLabels()
        # Read every channel of the recording into memory.
        data = [edf.readSignal(i) for i in range(n_signals)]
    # Keep the labels next to the samples so each recording is self-describing.
    signals.append({"file": edf_file, "signals": data, "labels": signal_labels})

# print(signals)

In [19]:
# Use the labels stored with the first recording itself; the loop variable
# `signal_labels` left over from loading holds the labels of the last file read.
for label, signal in zip(signals[0]['labels'], signals[0]['signals']):
    print(f"Label: {label} Length of singal: {len(signal)}")

Label: SaO2 Length of singal: 40920
Label: PR Length of singal: 40920
Label: EEG(sec) Length of singal: 5115000
Label: ECG Length of singal: 10230000
Label: EMG Length of singal: 5115000
Label: EOG(L) Length of singal: 2046000
Label: EOG(R) Length of singal: 2046000
Label: EEG Length of singal: 5115000
Label: AIRFLOW Length of singal: 409200
Label: THOR RES Length of singal: 409200
Label: ABDO RES Length of singal: 409200
Label: POSITION Length of singal: 40920
Label: LIGHT Length of singal: 40920
Label: OX STAT Length of singal: 40920


### Resampling
----

As seen above, the signals have different lengths, so we resample every signal to the length of the shortest one.
> The following transformations are shown for one patient; the same can be done for all three.

In [20]:
# Resample every channel of the first recording to the length of its shortest
# channel, computed from the data instead of being hard-coded.
# NOTE(review): scipy.signal.resample is FFT-based; for step-like channels such
# as POSITION / LIGHT it may produce non-integer values -- confirm this is acceptable.
target_length = min(len(signal) for signal in signals[0]['signals'])
resampled_signals = [resample(signal, target_length) for signal in signals[0]['signals']]

In [21]:
# for label, signal in zip(signal_labels, resampled_signals):
#     print(f"Label: {label} Length of singal: {len(signal)}")

Repeating the operation for all three patients

In [22]:
# Shortest channel length per recording, keyed as "patient_<position + 1>".
target_length = {}

for i, recording in enumerate(signals):
    # Look at every channel of this recording directly. Pairing the channels
    # with the leftover `signal_labels` (labels of the last file read) would
    # silently skip channels whenever that file has fewer labels.
    # min() raises ValueError for a recording that has no channels at all.
    target_length[f"patient_{i + 1}"] = min(len(signal) for signal in recording['signals'])

print("The minimum length of the signal is:\n--------------------------------------------")
for key, value in target_length.items():
    print(f"Patient: {key} | length of signal: {value}")

The minimum length of the signal is:
--------------------------------------------
Patient: patient_1 | length of signal: 40920
Patient: patient_2 | length of signal: 35850
Patient: patient_3 | length of signal: 40920


In [23]:
# resampled_signals[i] holds every channel of recording i, resampled to that
# patient's target length (channels stay in their original order).
resampled_signals = []

for i, patient_signals in enumerate(signals):
    patient_target = target_length[f"patient_{i + 1}"]
    # Iterate over the recording's own channels; zipping with the leftover
    # `signal_labels` would drop channels if the label list were shorter.
    resampled_signals.append(
        [resample(signal, patient_target) for signal in patient_signals['signals']]
    )

In [24]:
# for label, signal in zip(signal_labels, resampled_signals[2]):
#     print(f"Label: {label} Length of singal: {len(signal)}")

## Creating Dataset

In [25]:
# Build one long table: one row per resampled sample, one column per channel,
# plus the patient number. Each patient gets its own numeric DataFrame and the
# frames are concatenated; packing whole arrays into cells and calling
# DataFrame.explode would leave every signal column with object dtype.
patient_frames = []

for i, patient_signals in enumerate(resampled_signals):
    # Labels come from the recording the signals were read from, not from the
    # `signal_labels` variable left over from the loading loop.
    labels = signals[i]['labels']
    frame = pd.DataFrame(dict(zip(labels, patient_signals)))
    frame.insert(0, "patient_number", i + 1)
    patient_frames.append(frame)

# ignore_index gives a fresh 0..n-1 index across all patients.
df = pd.concat(patient_frames, ignore_index=True)

In [26]:
# Peek at ten randomly chosen rows of the combined table.
df.sample(n=10)

Unnamed: 0,patient_number,SaO2,PR,EEG(sec),ECG,EMG,EOG(L),EOG(R),EEG,AIRFLOW,THOR RES,ABDO RES,POSITION,LIGHT,OX STAT
20094,1,97.1664,75.199512,11.400699,0.01514,1.186073,-72.883889,-22.128796,1.379608,-0.031284,0.259018,-0.011857,1.0,1.0,-0.0
1943,1,96.092164,76.371405,-3.747687,0.008462,0.126059,7.463829,10.093149,-5.120556,-0.104572,-0.910536,0.972904,3.0,1.0,2.0
91594,3,93.162432,71.293202,-3.758788,0.013601,-4.341337,-3.500618,-2.473959,-6.732017,0.061581,-0.156272,-0.091307,3.0,1.0,0.0
116174,3,0.10071,0.201419,-5.925154,0.020457,-23.597339,1.875596,2.232586,1.72567,0.062342,-0.009624,0.014386,3.0,1.0,2.0
26787,1,93.162432,51.371023,4.157637,0.006206,15.829904,33.195414,34.725387,2.500218,-0.034158,-0.015536,0.194163,1.0,1.0,-0.0
79696,3,98.142977,66.215,5.300126,0.014954,-4.433508,-1.832471,0.308201,2.244643,0.106339,-0.093067,-0.239942,1.0,1.0,-0.0
48086,2,96.092164,66.215,6.566322,0.009536,-0.526687,6.715629,4.619665,4.884675,-0.054888,0.071874,0.02768,1.0,1.0,-0.0
56605,2,96.092164,64.261845,-0.932198,0.011057,-4.143928,1.100131,-5.864859,-1.129132,-0.225917,0.067079,0.023256,3.0,1.0,0.0
42278,2,98.142977,90.238804,2.008813,0.000597,-1.756405,20.17126,8.303078,0.568653,0.196933,0.067833,0.027449,1.0,1.0,0.0
52156,2,96.092164,63.285267,2.907349,0.001152,0.799518,-4.789319,-0.133206,-12.002417,0.173963,0.066938,0.021354,3.0,1.0,0.0


### Merging Sleep Stage

In [27]:
# Number of scored epochs times the epoch length read from the annotation file
# (instead of a hard-coded 30), for patient 1.
len(annotations_data['patient_1']['sleep_stages']) * int(annotations_data['patient_1']['epoch_length'])

40920

In [28]:
# Expand the per-epoch sleep stages to one label per row of df.
sleep_stage = []

# Walk the patients in the order their rows appear in df, so the labels line
# up with the rows without hard-coding the list of patient numbers.
for patient_number in df['patient_number'].unique():
    annotation = annotations_data[f'patient_{patient_number}']
    # epoch_length is stored as XML text; convert it explicitly rather than
    # relying on NumPy to coerce a string repeat count.
    repeats_per_epoch = int(annotation['epoch_length'])
    sleep_stage.extend(np.repeat(annotation['sleep_stages'], repeats_per_epoch))

In [29]:
# Attach the expanded sleep-stage labels as a new column of df (mutates df in place).
# NOTE(review): this relies on sleep_stage having exactly one entry per row of df,
# in the same patient order -- confirm the lengths match before trusting the column.
df['Sleep Stage'] = sleep_stage

In [30]:
# Show the first five rows, now including the "Sleep Stage" column.
df.head(n=5)

Unnamed: 0,patient_number,SaO2,PR,EEG(sec),ECG,EMG,EOG(L),EOG(R),EEG,AIRFLOW,THOR RES,ABDO RES,POSITION,LIGHT,OX STAT,Sleep Stage
0,1,95.115587,79.301137,22.843402,0.026001,3.186218,-11.48473,-5.661157,12.348128,-0.547104,-0.096514,0.122403,0.0,0.0,0.0,0
1,1,95.115587,79.301137,5.438446,0.005783,-1.289172,-3.234281,-2.739006,-5.178353,-1.1149,0.206255,0.496497,0.0,0.0,0.0,0
2,1,95.115587,79.301137,-5.536805,0.00516,-2.063624,-2.835898,-3.02865,-0.34692,-0.870635,0.099319,-0.028908,0.0,0.0,0.0,0
3,1,94.13901,79.301137,-2.501587,-0.00126,-0.55537,10.290779,3.453601,-8.69889,-0.97568,-0.168435,-0.302637,0.0,0.0,0.0,0
4,1,94.13901,79.301137,5.645748,0.005081,1.114292,-7.547912,-5.755376,6.664345,-0.687685,-0.255385,-0.129724,0.0,0.0,0.0,0


In [31]:
# Spot-check five random rows of the labelled table.
df.sample(n=5)

Unnamed: 0,patient_number,SaO2,PR,EEG(sec),ECG,EMG,EOG(L),EOG(R),EEG,AIRFLOW,THOR RES,ABDO RES,POSITION,LIGHT,OX STAT,Sleep Stage
90752,3,95.115587,70.316625,0.985405,0.010581,-4.215213,-4.429904,-3.716188,1.582441,0.105009,0.198948,0.281245,3.0,1.0,-0.0,0
71481,2,0.10071,0.201419,-2.213157,-0.00045,-1.726723,2.587153,-2.266924,-17.291991,-0.025882,0.070843,0.027386,1.0,1.0,2.0,0
45516,2,95.115587,65.238422,0.436645,-0.000872,-0.421382,0.863867,-0.903371,-2.876717,-0.079459,0.070766,0.027505,1.0,1.0,0.0,3
4291,1,93.162432,66.215,6.238547,0.005316,0.010704,-6.453237,3.153733,-3.318819,-0.20141,-0.071228,0.010984,3.0,1.0,0.0,2
12240,1,0.10071,0.201419,9.617947,0.020918,-1.809881,-60.0832,-129.781733,-1.099843,0.099859,-0.54554,0.613787,1.0,1.0,2.0,0


### Merging Scored Events

In [32]:
# Tabulate patient 1's scored events: one row per event, one column per field.
scored_events = pd.DataFrame(
    data=annotations_data['patient_1']['scored_events'],
)

In [33]:
# Distinct event names present in patient 1's annotations.
scored_events.loc[:, 'Name'].unique()

array(['SpO2 artifact', 'Hypopnea', 'SpO2 desaturation', 'Arousal ()',
       'Obstructive Apnea'], dtype=object)