In [3]:
# !pip install pyedflib

In [4]:
import pyedflib
import pandas as pd
import numpy as np
from scipy.signal import butter, filtfilt
import matplotlib.pyplot as plt
import os
import pandas as pd
from sklearn.metrics import cohen_kappa_score
import re

# Trying and building everything on 1 patient
----
## Extract
----
### Signals

In [5]:
# Read every channel of the recording into memory, keyed by the channel label
# stored in the file header.
edf_file = '/kaggle/input/svuh-dataset/files/ucddb002.rec'
signals = {}
# Use the reader as a context manager: the file handle is released even if a
# read raises, and the private `_close()` method no longer has to be called.
with pyedflib.EdfReader(edf_file) as edf:
    for i in range(edf.signals_in_file):
        label = edf.getLabel(i)
        signal = edf.readSignal(i)
        signals[label] = signal

### Annotations

In [47]:
import pandas as pd
import re

# Define the path to the file
# NOTE(review): this annotation file belongs to recording ucddb006, while the
# signals cell reads ucddb002.rec — confirm both are meant to be the same patient.
txt_file = '/kaggle/input/svuh-dataset/files/ucddb006_respevt.txt'

# Columns you expect in the dataframe
# (ten names; rows with fewer fields are padded with NaN on the right by read_csv)
columns = ['Time', 'Type', 'PB/CS', 'Duration', 'Low', '%Drop', 'Snore', 'Arousal', 'Rate', 'Change']

# Step 1: Read the file with a custom function to handle splitting issues
def read_custom_file(txt_file, skip_lines=3):
    """Read an annotation text file and return its rows as space-delimited strings.

    Each returned row has its leading/trailing whitespace removed and every run
    of whitespace collapsed to a single space, so the result can be parsed with
    a single-space delimiter. The two-word event name 'PB EVENT' is fused into
    the single token 'PBEVENT' so that it occupies one field instead of two.

    Parameters
    ----------
    txt_file : str or path-like
        Path of the text file to read.
    skip_lines : int, default 3
        Number of leading lines (the file header) to discard.

    Returns
    -------
    list of str
        One normalised string per remaining line, in file order. Blank lines
        are kept as empty strings.
    """
    processed_lines = []
    with open(txt_file, 'r') as f:
        for line_no, line in enumerate(f):
            # Drop the header block at the top of the file.
            if line_no < skip_lines:
                continue

            # Collapse the column padding so fields are separated by one space.
            line = re.sub(r'\s+', ' ', line.strip())

            # Fuse 'PB EVENT' into one token. The lookarounds require whitespace
            # or a line boundary on each side, so the token is also handled when
            # it is the last field on the line (strip() removed the trailing
            # whitespace the previous pattern depended on).
            line = re.sub(r'(?<!\S)PB EVENT(?!\S)', 'PBEVENT', line)

            processed_lines.append(line)

    return processed_lines

# Step 2: normalise the raw annotation file into single-space-delimited rows
processed_lines = read_custom_file(txt_file)

# Step 3: hand the rows to pandas through an in-memory text buffer.
# The file has no header row of its own at this point, so the column names are
# supplied explicitly.
from io import StringIO
data = "\n".join(processed_lines)
annotations_1_1 = pd.read_csv(
    StringIO(data),
    sep=' ',
    header=None,
    names=columns,
)

# Preview ten randomly chosen rows
annotations_1_1.sample(10)

Unnamed: 0,Time,Type,PB/CS,Duration,Low,%Drop,Snore,Arousal,Rate,Change
43,02:55:54,PBEVENT,PB,12.0,-,-,,,,
66,03:10:44,HYP-O,8,86.8,4,-,-,,,
222,,,,,,,,,,
143,05:18:47,APNEA-C,13,86.8,5.2,-,-,,,
110,05:00:33,APNEA-C,15,88.0,2.9,-,-,,,
85,04:17:40,HYP-C,11,83.9,7,+,-,,,
79,04:04:29,HYP-C,18,88.0,4,-,-,,,
71,03:42:36,HYP-C,23,86.7,2.1,+,-,,,
127,05:09:33,HYP-C,15,87.3,2.7,-,-,,,
35,02:51:03,APNEA-C,12,86.8,3.2,-,+,,,


In [48]:
import re

def clean_annotations(df):
    """Realign annotation rows whose values landed in the wrong columns.

    Two repairs are applied to each row, in this order:

    1. If 'PB/CS' is not a string, or is a string made only of digits, the
       field was absent in the source line and every later value sits one
       column too far left. All values from 'PB/CS' onward are shifted one
       column to the right and 'PB/CS' is cleared.
    2. If 'Low' (as it stands after step 1) is a string starting with '-' or
       '+', the 'Low' and '%Drop' fields were absent. All values from 'Low'
       onward are shifted two columns to the right and 'Low' / '%Drop' are
       cleared.

    Values shifted past the last column are discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns 'PB/CS', 'Low' and '%Drop'.
        It is modified in place; pass a copy to keep the original.

    Returns
    -------
    pandas.DataFrame
        The same frame object. Columns from 'PB/CS' onward are left with
        object dtype because they can hold a mix of strings and numbers.
    """
    int_pattern = re.compile(r"^\d+$")   # string made only of digits
    flag_pattern = re.compile(r"[-+]")   # string starting with '-' or '+'

    n_cols = len(df.columns)
    pbcs_pos = df.columns.get_loc('PB/CS')
    low_pos = df.columns.get_loc('Low')
    drop_pos = df.columns.get_loc('%Drop')

    # Shifting moves strings into columns that were parsed as numeric (and the
    # reverse). Upcast the affected columns once so each cell write is
    # dtype-compatible instead of relying on pandas' deprecated silent upcast.
    for col in df.columns[pbcs_pos:]:
        df[col] = df[col].astype(object)

    # Address rows by position, so the frame does not need a default
    # 0..n-1 index for the writes to hit the intended row.
    for row_pos in range(len(df)):
        pbcs = df.iat[row_pos, pbcs_pos]
        if not isinstance(pbcs, str) or int_pattern.match(pbcs):
            # Walk right-to-left so each value is copied before it is overwritten.
            for col_pos in range(n_cols - 1, pbcs_pos, -1):
                df.iat[row_pos, col_pos] = df.iat[row_pos, col_pos - 1]
            df.iat[row_pos, pbcs_pos] = None

        # Read 'Low' from the frame itself so the test sees the value produced
        # by the shift above, not the value the row had before it.
        low = df.iat[row_pos, low_pos]
        if isinstance(low, str) and flag_pattern.match(low):
            for col_pos in range(n_cols - 1, low_pos + 1, -1):
                df.iat[row_pos, col_pos] = df.iat[row_pos, col_pos - 2]
            df.iat[row_pos, low_pos] = None
            df.iat[row_pos, drop_pos] = None

    return df

# Clean a copy: clean_annotations writes into the frame it receives, so the raw
# `annotations_1_1` frame is left as parsed.
annotations_1_1_cleaned = clean_annotations(annotations_1_1.copy())

  df.iloc[idx, col_idx] = df.iloc[idx, col_idx - 1]


In [49]:
# Spot-check ten random rows of the cleaned frame (no random_state is set, so
# the rows shown differ from run to run).
annotations_1_1_cleaned.sample(10)

Unnamed: 0,Time,Type,PB/CS,Duration,Low,%Drop,Snore,Arousal,Rate,Change
179,05:38:17,PBEVENT,PB,14,,,-,-,,
153,05:23:55,HYP-C,,14,85.9,4.1,-,+,,
39,02:53:47,HYP-C,,14,88.0,4.0,-,+,,
105,04:58:12,HYP-C,,13,88.8,3.2,-,-,71.8,10.4
185,05:41:36,HYP-C,,8,85.9,2.1,-,-,,
149,05:21:48,HYP-C,,17,86.7,4.2,-,+,73.2,7.5
29,02:47:43,HYP-C,,15,88.0,2.8,-,+,85.1,5.9
119,05:05:34,APNEA-C,,12,86.8,4.0,-,-,72.3,9.6
78,04:03:56,APNEA-C,,16,85.9,2.9,-,+,,
67,03:26:54,APNEA-C,,15,86.7,2.1,-,-,,


In [50]:
# Look up index labels 171 and 191 in the raw (pre-cleaning) frame.
# NOTE(review): the labels are hard-coded for this particular file — confirm
# whether the cleaned frame was meant to be inspected here instead.
annotations_1_1.loc[[171, 191]]

Unnamed: 0,Time,Type,PB/CS,Duration,Low,%Drop,Snore,Arousal,Rate,Change
171,05:34:06,PBEVENT,PB,12,-,-,,,,
191,06:01:15,PBEVENT,PB,12,-,-,,,,
