In [4]:
# !pip install pyedflib mne lxml

# Imports

In [5]:
import os
import xml.etree.ElementTree as ET
import pyedflib
import pandas as pd
import numpy as np
from scipy.signal import resample

# Loading Data (LEARN)

## Annotations

In [6]:
annotations_path = "/kaggle/input/nsrr-dataset/datasets/learn/polysomnography/annotations-events-profusion"

# Quick look at the annotation files: map each patient to the raw text of its
# <EpochLength> element.
# Only XML files are numbered, and they are numbered in sorted filename order,
# because os.listdir() returns entries in arbitrary order and the directory may
# contain other files.
xml_files = sorted(name for name in os.listdir(annotations_path) if name.endswith('.xml'))

annotations_data = {}
for idx, xml_file in enumerate(xml_files, start=1):
    root = ET.parse(os.path.join(annotations_path, xml_file)).getroot()
    # findtext() yields None instead of raising when <EpochLength> is absent.
    annotations_data[f'patient_{idx}'] = root.findtext('EpochLength')

print(annotations_data)

{'patient_1': '30', 'patient_2': '30', 'patient_3': '30'}


In [84]:
annotations_path = "/kaggle/input/nsrr-dataset/datasets/learn/polysomnography/annotations-events-profusion"


def _child_text(parent, tag):
    """Return the text of the first `tag` child of `parent`, or None if there is no such child."""
    child = parent.find(tag)
    return child.text if child is not None else None


def _patient_number(xml_name):
    """Return the integer formed by the trailing digits of the second '-'-separated token.

    All trailing digits are used (a token such as 'nsrr12' gives 12), so patient
    numbers above 9 do not collide with single-digit ones.

    Raises:
        IndexError: if `xml_name` contains no '-'.
        ValueError: if the second token does not end in a digit.
    """
    token = xml_name.split('-')[1]
    trailing_digits = token[len(token.rstrip('0123456789')):]
    return int(trailing_digits)


# Maps 'patient_<n>' to a dict with the keys 'epoch_length', 'step_channels',
# 'scored_events' and 'sleep_stages' parsed from that patient's XML file.
annotations_data = {}

# sorted() makes the processing order reproducible; os.listdir() order is arbitrary.
for xml_file in sorted(os.listdir(annotations_path)):
    if not xml_file.endswith('.xml'):
        continue

    idx = _patient_number(xml_file)
    root = ET.parse(os.path.join(annotations_path, xml_file)).getroot()

    # Extract Step Channels
    step_channels = []
    step_channels_section = root.find('StepChannels')
    if step_channels_section is not None:
        for step_channel in step_channels_section.findall('StepChannel'):
            labels_section = step_channel.find('Labels')
            if labels_section is not None:
                labels = [label.text for label in labels_section.findall('Label')]
            else:
                labels = []
            step_channels.append({
                "Input": _child_text(step_channel, 'Input'),
                "Labels": labels
            })

    # Extract Scored Events (all fields are kept as raw XML text)
    scored_events = []
    scored_events_section = root.find('ScoredEvents')
    if scored_events_section is not None:
        for event in scored_events_section.findall('ScoredEvent'):
            scored_events.append({
                "Name": _child_text(event, 'Name'),
                "Start": _child_text(event, 'Start'),
                "Duration": _child_text(event, 'Duration'),
                "Input": _child_text(event, 'Input')
            })

    # Extract Sleep Stages (if applicable); each <SleepStage> text is parsed as an int
    sleep_stages = []
    sleep_stages_section = root.find('SleepStages')
    if sleep_stages_section is not None:
        sleep_stages = [int(stage.text) for stage in sleep_stages_section.findall('SleepStage')]

    # Store the patient's data; epoch_length stays a string (or None when absent)
    annotations_data[f'patient_{idx}'] = {
        "epoch_length": _child_text(root, 'EpochLength'),
        "step_channels": step_channels,
        "scored_events": scored_events,
        "sleep_stages": sleep_stages
    }


In [85]:
# Number of sleep-stage entries for patient 2 multiplied by the epoch length
# (stored as text, hence the int() conversion).
patient_2 = annotations_data['patient_2']
len(patient_2['sleep_stages']) * int(patient_2['epoch_length'])

35850

In [10]:
# for patient, data in annotations_data.items():
#     print(f"Patient: {patient}")
#     print(f"Epoch Length: {data['epoch_length']}")
#     print("Step Channels:")
#     for channel in data['step_channels']:
#         print(f"  Input: {channel['Input']}, Labels: {channel['Labels']}")
#     print("Scored Events:")
#     for event in data['scored_events']:
#         print(f"  Name: {event['Name']}, Start: {event['Start']}, Duration: {event['Duration']}, Input: {event['Input']}")
#     print("Sleep Stages:")
#     for stage in data['sleep_stages']:
#         print(f"  Sleep Stage: {stage['SleepStage']}")
#     print("-" * 50)

## EDFs

In [11]:
edf_path = "/kaggle/input/nsrr-dataset/datasets/learn/polysomnography/edfs"
# Sorted by filename: os.listdir() returns entries in arbitrary order, and the
# cells below identify patients by their position in `signals`
# (signals[i] -> patient_{i+1}), so the order has to be deterministic.
edf_files = sorted(os.path.join(edf_path, f) for f in os.listdir(edf_path) if f.endswith('.edf'))

# One entry per EDF file: {"file": path, "signals": [one array per channel], "labels": [...]}
signals = []
for edf_file in edf_files:
    with pyedflib.EdfReader(edf_file) as edf:
        n_signals = edf.signals_in_file
        signal_labels = edf.getSignalLabels()
        # Read all signals
        data = [edf.readSignal(i) for i in range(n_signals)]
    # Appended after the reader is closed; everything needed has already been read.
    signals.append({"file": edf_file, "signals": data, "labels": signal_labels})

# print(signals)

In [36]:
# Show the number of samples in each channel of the first recording.
# The labels come from that recording's own entry rather than from the
# `signal_labels` variable, which holds the labels of the last file read.
first_recording = signals[0]
for label, signal in zip(first_recording['labels'], first_recording['signals']):
    print(f"Label: {label} Length of singal: {len(signal)}")

Label: SaO2 Length of singal: 40920
Label: PR Length of singal: 40920
Label: EEG(sec) Length of singal: 5115000
Label: ECG Length of singal: 10230000
Label: EMG Length of singal: 5115000
Label: EOG(L) Length of singal: 2046000
Label: EOG(R) Length of singal: 2046000
Label: EEG Length of singal: 5115000
Label: AIRFLOW Length of singal: 409200
Label: THOR RES Length of singal: 409200
Label: ABDO RES Length of singal: 409200
Label: POSITION Length of singal: 40920
Label: LIGHT Length of singal: 40920
Label: OX STAT Length of singal: 40920


### Resampling
----

As seen above, the signals have different lengths, so we resample every signal to the length of the shortest one.
> The following transformations are shown for one patient; the same steps apply to all three.

In [13]:
# Resample every channel of the first recording to the length of its shortest
# channel. The length is derived from the data instead of being hard-coded;
# `resample` is already imported in the imports cell.
# NOTE(review): scipy.signal.resample is FFT-based, so step-like channels may
# show ringing after resampling — confirm this is acceptable.
first_patient_signals = signals[0]['signals']
target_length = min(len(signal) for signal in first_patient_signals)
resampled_signals = [resample(signal, target_length) for signal in first_patient_signals]

In [14]:
# for label, signal in zip(signal_labels, resampled_signals):
#     print(f"Label: {label} Length of singal: {len(signal)}")

Repeating the operation for all three patients.

In [60]:
# Shortest channel length per recording, keyed by 'patient_<position in signals + 1>'.
# Each recording's own channel list is scanned in full; min() raises ValueError
# for a recording that has no channels.
target_length = {
    f"patient_{position + 1}": min(len(signal) for signal in recording['signals'])
    for position, recording in enumerate(signals)
}

print("The minimum length of the signal is:\n--------------------------------------------")
for key, value in target_length.items():
    print(f"Patient: {key} | length of signal: {value}")

The minimum length of the signal is:
--------------------------------------------
Patient: patient_1 | length of signal: 40920
Patient: patient_2 | length of signal: 35850
Patient: patient_3 | length of signal: 40920


In [62]:
# For every recording, resample each of its channels to that recording's target
# length. The result is a list (one entry per recording) of lists of arrays.
# Every channel of the recording is processed; nothing is paired with the
# `signal_labels` variable left over from the loading loop.
resampled_signals = [
    [resample(signal, target_length[f"patient_{position + 1}"]) for signal in recording['signals']]
    for position, recording in enumerate(signals)
]

In [63]:
# for label, signal in zip(signal_labels, resampled_signals[2]):
#     print(f"Label: {label} Length of singal: {len(signal)}")

## Creating Dataset

In [66]:
# Build one long table: one row per resampled sample, one column per channel,
# plus a leading 'patient_number' column.
# Each patient gets its own numeric frame and the frames are concatenated once,
# so channel columns keep a numeric dtype instead of becoming object columns.
patient_frames = []
for position, patient_signals in enumerate(resampled_signals):
    # Label the columns with this recording's own channel labels.
    channel_labels = signals[position]['labels']
    patient_frame = pd.DataFrame(dict(zip(channel_labels, patient_signals)))
    patient_frame.insert(0, "patient_number", position + 1)
    patient_frames.append(patient_frame)

# ignore_index=True gives a fresh 0..N-1 index across all patients.
df = pd.concat(patient_frames, ignore_index=True)

In [69]:
# Draw ten random rows to inspect the frame.
df.sample(n=10)

Unnamed: 0,patient_number,SaO2,PR,EEG(sec),ECG,EMG,EOG(L),EOG(R),EEG,AIRFLOW,THOR RES,ABDO RES,POSITION,LIGHT,OX STAT
72017,2,0.10071,0.201419,4.975156,-0.012764,-0.288513,1.266604,-2.515327,-10.246594,-0.024036,0.071522,0.027375,1.0,1.0,2.0
10555,1,92.185855,67.191577,4.823356,0.003826,-0.34629,-0.609919,-1.975137,0.659791,-0.351819,0.001739,-0.884191,1.0,1.0,-0.0
12329,1,92.185855,67.191577,10.415663,0.013184,-0.334749,-17.524515,-17.681472,3.237692,0.050468,0.051396,0.248675,1.0,1.0,0.0
92772,3,94.13901,69.340047,6.529228,0.00861,-4.429327,2.174475,0.911175,6.279098,-0.022043,0.013852,0.022189,3.0,1.0,0.0
76996,3,94.13901,72.26978,-7.326974,0.016844,-4.329423,-7.081065,-33.745336,-2.000202,0.088411,-0.040346,0.042764,2.0,1.0,0.0
81913,3,92.185855,68.36347,-1.436559,0.008226,-4.302564,-29.644552,-5.280726,-12.486643,-0.000194,0.075108,0.025875,1.0,1.0,0.0
57089,2,97.1664,59.378958,3.184193,0.010505,1.30229,0.523529,2.367944,-3.991763,-0.156844,0.066798,0.021045,3.0,1.0,0.0
57999,2,94.13901,58.207065,-2.177036,0.009868,0.078572,-1.036095,0.485845,-8.173093,0.218736,0.066778,0.023131,3.0,1.0,0.0
113961,3,0.10071,0.201419,-20.362758,0.161753,-22.436918,21.672765,22.224944,2.006552,0.062213,-0.004624,0.015585,3.0,1.0,2.0
22341,1,89.158465,73.246357,2.842326,0.00112,0.461607,2.131907,-0.340889,1.656541,-0.149059,0.078902,0.017854,1.0,1.0,-0.0


In [86]:
# Number of sleep-stage entries for patient 1 multiplied by the epoch length
# parsed from the annotation file (stored as text), rather than a hard-coded 30.
patient_1 = annotations_data['patient_1']
len(patient_1['sleep_stages']) * int(patient_1['epoch_length'])

40920

In [87]:
sleep_stage = []

# Visit every parsed patient in ascending patient-number order (keys look like
# 'patient_<n>') instead of a hard-coded [1, 2, 3].
patient_keys = sorted(annotations_data, key=lambda key: int(key.split('_')[1]))
for patient_key in patient_keys:
    annotation = annotations_data[patient_key]
    # epoch_length is stored as the raw XML text, so convert it explicitly
    # before using it as a repeat count.
    epoch_length = int(annotation['epoch_length'])
    # Each stage label is repeated epoch_length times.
    # NOTE(review): this gives one label per DataFrame row only when the resampled
    # signals hold epoch_length samples per epoch — it matches the lengths printed
    # earlier in this notebook, but confirm for new data.
    sleep_stage.extend(np.repeat(annotation['sleep_stages'], epoch_length))

In [88]:
# Attach the per-row sleep-stage labels as a new 'Sleep Stage' column (modifies
# df in place). The list must have exactly as many entries as df has rows.
# NOTE(review): the pairing of labels to rows is purely positional — confirm
# that sleep_stage was built in the same patient order as df.
df['Sleep Stage'] = sleep_stage

In [89]:
# Draw ten random rows to inspect the frame.
df.sample(n=10)

Unnamed: 0,patient_number,SaO2,PR,EEG(sec),ECG,EMG,EOG(L),EOG(R),EEG,AIRFLOW,THOR RES,ABDO RES,POSITION,LIGHT,OX STAT,Sleep Stage
82524,3,98.142977,69.340047,-1.408439,0.013106,-4.496996,-7.081603,-3.384595,-4.120019,0.061703,-0.137189,-0.007135,1.0,1.0,-0.0,0
109527,3,0.10071,0.201419,6.683335,-0.008391,-8.443765,-46.832935,-43.337545,-46.287565,-0.222747,0.029103,-0.106379,1.0,1.0,2.0,0
91569,3,94.13901,67.191577,2.286391,0.013247,-4.377282,4.625919,3.976359,2.263674,0.051051,0.127752,0.107534,3.0,1.0,-0.0,0
26750,1,98.142977,55.277333,2.171898,-0.005759,21.632781,-20.717683,-25.616364,0.677167,-0.028645,0.152449,0.218892,1.0,1.0,-0.0,5
51049,2,95.115587,64.261845,-4.920186,0.004337,-0.932796,2.892668,-2.50446,1.452101,-0.385737,0.066708,0.020468,3.0,1.0,0.0,3
17188,1,92.185855,69.340047,2.346989,0.00568,0.721782,1.749083,-0.172194,1.714285,0.001874,-0.028546,0.027091,3.0,1.0,0.0,0
44674,2,96.092164,67.191577,5.986238,-0.009094,-0.366621,18.392529,35.176181,4.492081,-0.169092,0.07115,0.026879,1.0,1.0,0.0,0
49013,2,95.115587,61.332113,-7.695762,0.001341,-0.372275,-1.777098,-8.97571,-1.770696,0.185168,0.069948,0.027542,1.0,1.0,-0.0,3
94966,3,96.092164,67.191577,2.174837,0.011966,-4.315348,1.239018,1.21781,-0.165311,0.081438,-0.077289,0.066388,1.0,1.0,-0.0,0
114952,3,0.10071,0.201419,16.363111,0.060599,-24.06672,1.160008,1.536156,-0.254056,0.078319,0.009462,0.039121,3.0,1.0,2.0,0
