# MIT-BIH Long-Term ECG Database (_ltdb_)

Part of the ECG Database Collection:

| Short Name | Long Name |
| :--- | :--- |
| _mitdb_ | MIT-BIH Arrhythmia Database |
| _svdb_ | MIT-BIH Supraventricular Arrhythmia Database |
| _ltdb_ | MIT-BIH Long-Term ECG Database |

[Documentation](https://wfdb.readthedocs.io/en/latest) of the `wfdb` package.

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import wfdb
import os
from typing import Final
from collections.abc import Callable
from config import data_raw_folder, data_processed_folder

In [2]:
# Short name of this collection; used below as the collection key in the
# metadata and as the name of the output subfolder.
dataset_collection_name = "LTDB"
# Folder holding the raw records (including the `RECORDS` listing file).
source_folder = os.path.join(data_raw_folder, "MIT-BIH Long-Term ECG Database")
# Root folder that receives the processed CSV files and the metadata file.
target_folder = data_processed_folder

Metadata handling

In [3]:
# Schema of a single metadata record as accepted by
# `DatasetMetadata.add_datasets` (field name -> expected Python type).
DatasetMetadataRecord = {
    "collection_name": str,
    "dataset_name": str,
    "train_path": str,
    "test_path": str,
    "dataset_type": str,
    "datetime_index": bool,
    "split_at": int,
    "train_type": str,
    "train_is_normal": bool,
    "input_type": str,
    "length": int
}

class DatasetMetadata:
    """
    In-memory view of the `datasets.csv` metadata file of a target folder.

    The table is indexed by `(collection_name, dataset_name)`. Changes made via
    `add_dataset` / `add_datasets` are kept in memory until `save` is called
    (or a `with` block is left).

    ATTENTION: Not thread-safe! There is no check for changes to the underlying
    `datasets.csv` file while this class is loaded.
    """

    FILENAME: Final[str] = "datasets.csv"

    # names of the two index levels of the metadata table
    _INDEX_NAMES = ("collection_name", "dataset_name")

    _filepath: str
    _df: pd.DataFrame
    _dirty: bool

    def __init__(self, target_folder: str):
        """Load `datasets.csv` from `target_folder`, creating an empty file if it is missing."""
        self._filepath = os.path.join(target_folder, self.FILENAME)
        self._dirty = False
        if not os.path.isfile(self._filepath):
            self._df = self._create_metadata_file()
        else:
            self.refresh(force = True)

    def __enter__(self) -> 'DatasetMetadata':
        return self

    def __exit__(self, exception_type, exception_value, exception_traceback) -> None:
        """Persist the metadata when the `with` block is left."""
        self.save()
        # Return None (falsy) on purpose: a truthy return value would suppress
        # any exception raised inside the `with` block.
        return None

    def __repr__(self) -> str:
        return repr(self._df)

    def __str__(self) -> str:
        return str(self._df)

    def _create_metadata_file(self) -> pd.DataFrame:
        """Write an empty metadata file (header only) and return the matching empty frame."""
        df_temp = pd.DataFrame(columns=["dataset_name", "collection_name", "train_path", "test_path", "type", "datetime_index", "split_at", "train_type", "train_is_normal", "input_type", "length"])
        df_temp.set_index(list(self._INDEX_NAMES), inplace=True)
        df_temp.to_csv(self._filepath)
        return df_temp

    def _append(self, df_new: pd.DataFrame) -> None:
        """Append rows; for duplicated index entries the most recently added row wins."""
        df = pd.concat([self._df, df_new], axis=0)
        self._df = df[~df.index.duplicated(keep = "last")]
        self._dirty = True

    def add_dataset(self,
        dataset_name: str,
        collection_name: str,
        train_path: str,
        test_path: str,
        dataset_type: str,
        datetime_index: bool,
        split_at: int,
        train_type: str,
        train_is_normal: bool,
        input_type: str,
        dataset_length: int
    ) -> 'DatasetMetadata':
        """
        Add (or replace) the metadata entry of a single dataset.

        The entry is keyed by `(collection_name, dataset_name)`; an existing
        entry with the same key is overwritten. Returns `self` to allow chaining.
        """
        # Use the `collection_name` argument for the key (not a notebook global)
        # and name the index levels so they match the table on disk.
        index = pd.MultiIndex.from_tuples([(collection_name, dataset_name)], names=list(self._INDEX_NAMES))
        df_new = pd.DataFrame({
            "train_path": train_path,
            "test_path": test_path,
            "type": dataset_type,
            "datetime_index": datetime_index,
            "split_at": split_at,
            "train_type": train_type,
            "train_is_normal": train_is_normal,
            "input_type": input_type,
            "length": dataset_length
        }, index=index)
        self._append(df_new)
        return self

    def add_datasets(self, datasets: list[DatasetMetadataRecord]) -> 'DatasetMetadata':
        """
        Add (or replace) several metadata entries at once.

        Each record must contain at least `collection_name` and `dataset_name`.
        An empty list is a no-op. Returns `self` to allow chaining.
        """
        if not datasets:
            return self
        df_new = pd.DataFrame(datasets)
        # Records use the key "dataset_type" while the table column written by
        # `add_dataset` is called "type"; map it so both entry points agree.
        if "dataset_type" in df_new.columns and "type" not in df_new.columns:
            df_new = df_new.rename(columns={"dataset_type": "type"})
        df_new = df_new.set_index(list(self._INDEX_NAMES))
        self._append(df_new)
        return self

    def refresh(self, force: bool = False) -> None:
        """
        Re-read the metadata from disk.

        Raises an `Exception` if there are unsaved in-memory changes, unless
        `force` is set, in which case those changes are discarded.
        """
        if not force and self._dirty:
            raise Exception("There are unsaved changes in memory that would get lost by reading from disk again!")
        self._df = pd.read_csv(self._filepath, index_col=list(self._INDEX_NAMES))
        # memory and disk are in sync again
        self._dirty = False

    def save(self) -> None:
        """Write the in-memory metadata to disk."""
        self._df.to_csv(self._filepath)
        self._dirty = False

In [4]:
def load_dataset_names(folder: "str | None" = None) -> list[str]:
    """
    Read the record names listed in the `RECORDS` file of a database folder.

    `folder` defaults to the notebook-level `source_folder`. Each line holds
    one record name; surrounding whitespace is stripped and blank lines are
    skipped so that they do not end up as empty record names.
    """
    base_folder = source_folder if folder is None else folder
    with open(os.path.join(base_folder, "RECORDS"), 'r', encoding="utf-8") as f:
        names = (line.strip() for line in f)
        return [name for name in names if name]

For an explanation of the following transformation function `transform_and_label`, see the transformation walk-through in the _MIT-BIH Arrhythmia Database.ipynb_ notebook.

The following annotations are present in this dataset:

| Annotation | Description |
| :--------- | :---------- |
|| **Considered normal** |
| `N` | Normal beat |
|| **Anomalous beats** (use double-window labeling) |
| `F` | Fusion of ventricular and normal beat |
| `S` | Supraventricular premature or ectopic beat |
| `a` | Aberrated atrial premature beat |
| `V` | Premature ventricular contraction |
| `J` | Nodal (junctional) premature beat |
|| **Anomaly from `x` until next beat window start** |
| - ||
|| **Entire section of fibrillation is regarded anomalous** (a single window from `[` to `]`) |
| - ||
|| **External anomalies** (single window labeling) |
| - ||
|| **Ignored, because hard to parse and to label** |
| `~` | Change in signal quality (usually noise level changes) |

In [5]:
# --- annotation symbol groups used by `transform_and_label` ---

# fibrillation sections are delimited by a start and an end marker
ann_fibr_start, ann_fibr_end = "[", "]"
ann_fibr = [ann_fibr_start, "!", ann_fibr_end]

# beats that are considered normal
ann_normal = ["N", "/", "L", "R"]

# anomalous beats (double-window labeling)
ann_beat = [
    "F", "f",
    "S",
    "A", "a",
    "V",
    "J", "j",
    "E", "e",
]

# anomaly lasts from the annotation until the next beat window starts
ann_no_beat = ["x"]

# external anomalies (single-window labeling)
ann_ext = ["Q", "|"]

# annotations that are dropped before labeling
ann_ignore = ["+", "~", '"']

def transform_and_label(source_file: str, target: str) -> int:
    """
    Convert one WFDB record into a labeled CSV time series.

    The signal of `source_file` (record path without file suffix) and its
    `atr` annotations are loaded, anomaly windows are derived from the
    annotation symbols (see the module-level `ann_*` lists), and every sample
    inside a window gets `is_anomaly = 1`. The result, indexed by timestamps
    reconstructed from the sampling frequency, is written to `target`.

    Returns the number of samples of the record (`record.sig_len`).
    """
    print(f"Transforming {os.path.basename(source_file)}")
    # load dataset
    record = wfdb.rdrecord(source_file)
    df_record = pd.DataFrame(record.p_signal, columns=record.sig_name)
    print(f"  record {record.file_name[0]} loaded")

    # load annotation file
    atr = wfdb.rdann(source_file, "atr")
    assert record.fs == atr.fs, "Sample frequency of records and annotations does not match!"
    df_annotation = pd.DataFrame({"position": atr.sample, "label": atr.symbol})
    # remove ignored annotations
    df_annotation = df_annotation[~df_annotation["label"].isin(ann_ignore)]
    df_annotation = df_annotation.reset_index(drop=True)
    print(f"  {len(df_annotation)}/{atr.ann_len} beat annotations for {source_file} loaded (others were ignored)")

    # calculate normal beat length: median distance between two consecutive normal beats
    print("  preparing windows for labeling...")
    df_normal_beat = df_annotation.copy()
    df_normal_beat["prev_position"] = df_annotation["position"].shift()
    df_normal_beat["prev_label"] = df_annotation["label"].shift()
    df_normal_beat = df_normal_beat[(df_normal_beat["label"].isin(ann_normal)) & (df_normal_beat["prev_label"].isin(ann_normal))]
    df_normal_beat = df_normal_beat.drop(columns=["label", "prev_label"])
    s_normal_beat_lengths = df_normal_beat["position"] - df_normal_beat["prev_position"]
    print(f"    normal beat distance samples = {len(s_normal_beat_lengths)}")
    normal_beat_length = s_normal_beat_lengths.median()
    # an even length is bumped by one so that the window has a center sample
    if (normal_beat_length % 2) == 0:
        normal_beat_length += 1
    beat_window_size = int(normal_beat_length)
    beat_window_margin = (beat_window_size - 1)//2
    del df_normal_beat
    del s_normal_beat_lengths
    print(f"    window size = {beat_window_size}")
    print(f"    window margins (left and right) = {beat_window_margin}")

    # calculate beat windows
    ## ~ and other annotations are ignored!
    ## for fibrillation
    # we only need start and end marked with `[` and `]` respectively
    s_fibr_start = df_annotation.loc[df_annotation["label"] == ann_fibr_start, "position"]
    s_index = s_fibr_start.index
    s_fibr_start = s_fibr_start.reset_index(drop=True)
    s_fibr_end = df_annotation.loc[df_annotation["label"] == ann_fibr_end, "position"]
    s_fibr_end = s_fibr_end.reset_index(drop=True)
    df_fibr = pd.DataFrame({"index": s_index, "window_start": s_fibr_start, "window_end": s_fibr_end})
    df_fibr = df_fibr.set_index("index")
    df_fibr["position"] = df_fibr["window_start"]
    print(f"    {len(df_fibr)} windows for fibrillation anomalies ({','.join(ann_fibr)})")
    ## for external anomalies
    df_ext = df_annotation[df_annotation["label"].isin(ann_ext)].copy()
    df_ext["window_start"] = np.maximum(0, df_ext["position"]-beat_window_margin)
    df_ext["window_end"] = np.minimum(record.sig_len - 1, df_ext["position"]+beat_window_margin)
    df_ext = df_ext[["position", "window_start", "window_end"]]
    print(f"    {len(df_ext)} windows for external anomalies ({','.join(ann_ext)})")
    ## anomalous beats
    # exclude additional non-beat annotations
    df_svf = df_annotation[~df_annotation["label"].isin(["|", ann_fibr_start, ann_fibr_end])].copy()
    df_svf["position_next"] = df_svf["position"].shift(-1)
    df_svf["position_prev"] = df_svf["position"].shift(1)
    df_svf = df_svf[df_svf["label"].isin(ann_beat)]
    # The first/last annotation has no previous/next neighbour (NaN after the
    # shift). fmin/fmax ignore NaN, so such a beat falls back to the plain
    # margin window instead of getting a NaN window that labels nothing.
    df_svf["window_start"] = np.maximum(0, np.fmin(df_svf["position"].values-beat_window_margin, df_svf["position_prev"].values+beat_window_margin))
    df_svf["window_end"] = np.minimum(record.sig_len - 1, np.fmax(df_svf["position"].values+beat_window_margin, df_svf["position_next"].values-beat_window_margin))
    df_svf = df_svf[["position", "window_start", "window_end"]]
    print(f"    {len(df_svf)} windows for anomalous beats ({','.join(ann_beat)})")
    # missing beats
    df_no_beat = df_annotation[df_annotation["label"].isin(ann_no_beat)].drop(columns=["label"]).copy()
    df_no_beat["window_start"] = df_no_beat["position"]
    if not df_no_beat.empty:
        df_normal_windows = df_annotation[df_annotation["label"].isin(ann_normal)].copy()
        df_normal_windows = df_normal_windows.drop(columns=["label"])
        df_normal_windows["window_start"] = np.maximum(0, df_normal_windows["position"]-beat_window_margin)
        df_normal_windows["window_end"] = np.minimum(record.sig_len - 1, df_normal_windows["position"]+beat_window_margin)
        df_lut = df_annotation[~df_annotation["label"].isin(ann_no_beat)].merge(pd.concat([df_ext, df_svf, df_fibr, df_normal_windows]), on="position", how="left")
        def find_next_window_start(pos: int):
            # NOTE(review): `.iloc[0]` raises IndexError when no annotation
            # follows `pos`; confirm whether that case can occur in the data.
            next_window_start = df_lut.loc[df_lut["position"] > pos, "window_start"].iloc[0]
            return max(pos, next_window_start)
        df_no_beat["window_end"] = df_no_beat["position"].transform(find_next_window_start)
        del df_normal_windows
        del df_lut
    else:
        df_no_beat["window_end"] = df_no_beat["position"]
    print(f"    {len(df_no_beat)} windows for missing beats ({','.join(ann_no_beat)})")
    ## merge
    df_windows = pd.concat([df_ext, df_svf, df_fibr, df_no_beat])
    df_windows.sort_index(inplace=True)
    print(f"  ...done.")

    # add labels based on anomaly windows
    print("  labeling")
    # df_record has a default 0..n-1 index, so "index >= t1 and index <= t2"
    # is the position slice [ceil(t1), floor(t2)]. Writing into a plain numpy
    # buffer avoids two full-length boolean masks per window and does not rely
    # on `Series.values` being writable.
    labels = np.zeros(len(df_record), dtype=np.int64)
    last_sample = len(df_record) - 1
    for t1, t2 in df_windows[["window_start", "window_end"]].itertuples(index=False, name=None):
        if pd.isna(t1) or pd.isna(t2):
            # an undefined bound never matched any sample before either
            continue
        first = max(0, int(np.ceil(t1)))
        last = min(last_sample, int(np.floor(t2)))
        if first <= last:
            labels[first:last + 1] = 1
    df_record["is_anomaly"] = labels

    # reconstruct timestamps and set as index
    print("  reconstructing timestamps")
    df_record["timestamp"] = pd.to_datetime(df_record.index.values * 1e+9/record.fs, unit='ns')
    df_record = df_record.set_index("timestamp")
    df_record.to_csv(target)
    print(f"Dataset {os.path.basename(source_file)} transformed and saved!")

    # return dataset length
    return record.sig_len

In [6]:
# metadata values that are identical for every dataset of this collection
dataset_type, input_type = "real", "multivariate"
datetime_index = True
train_type, train_is_normal = "unsupervised", False

dm = DatasetMetadata(target_folder)

# make sure the output directory <target>/<input_type>/<collection> exists
dataset_subfolder = os.path.join(target_folder, input_type, dataset_collection_name)
try:
    os.makedirs(dataset_subfolder)
except FileExistsError:
    print(f"Directories {dataset_subfolder} already exist")
else:
    print(f"Created directories {dataset_subfolder}")

Directories data-processed/multivariate/LTDB already exist


In [7]:
# transformation applied to every record of the collection
transform_file: Callable[[str, str], int] = transform_and_label

# metadata fields that do not change from record to record
shared_metadata = dict(
    collection_name = dataset_collection_name,
    train_path = None,
    dataset_type = dataset_type,
    datetime_index = datetime_index,
    split_at = None,
    train_type = train_type,
    train_is_normal = train_is_normal,
    input_type = input_type,
)

for dataset_name in load_dataset_names():
    # record path is passed without a file suffix (.dat) on purpose
    source_file = os.path.join(source_folder, dataset_name)
    filename = f"{dataset_name}.test.csv"
    path = os.path.join(dataset_subfolder, filename)

    # convert and label the record; the returned value is its length
    dataset_length = transform_file(source_file, path)
    print(f"Processed source dataset {source_file} -> {path}")

    # register the processed file in the metadata table
    dm.add_dataset(
        dataset_name = dataset_name,
        test_path = path,
        dataset_length = dataset_length,
        **shared_metadata
    )

# persist the metadata once all records are processed
dm.save()

Transforming 14046
  record 14046.dat loaded
  115278/115278 beat annotations for data-raw/MIT-BIH Long-Term ECG Database/14046 loaded (others were ignored)
  preparing windows for labeling...
    normal beat distance samples = 96721
    window size = 95
    window margins (left and right) = 47
    0 windows for fibrillation anomalies ([,!,])
    0 windows for external anomalies (Q,|)
    9864 windows for anomalous beats (F,f,S,A,a,V,J,j,E,e)
    0 windows for missing beats (x)
  ...done.
  labeling
  reconstructing timestamps
Dataset 14046 transformed and saved!
Processed source dataset data-raw/MIT-BIH Long-Term ECG Database/14046 -> data-processed/multivariate/LTDB/14046.test.csv
Transforming 14134
  record 14134.dat loaded
  49632/49769 beat annotations for data-raw/MIT-BIH Long-Term ECG Database/14134 loaded (others were ignored)
  preparing windows for labeling...
    normal beat distance samples = 27925
    window size = 125
    window margins (left and right) = 62
    0 windows

In [8]:
# Re-read the metadata file from disk; this raises if `dm` still holds unsaved changes.
dm.refresh()
# Display the full metadata table (last expression of the cell).
dm._df

Unnamed: 0_level_0,Unnamed: 1_level_0,train_path,test_path,type,datetime_index,split_at,train_type,train_is_normal,input_type,length
collection_name,dataset_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
WebscopeS5,A1Benchmark-6,,data-processed/univariate/WebscopeS5/A1Benchma...,real,True,,unsupervised,False,univariate,1439
WebscopeS5,A1Benchmark-3,,data-processed/univariate/WebscopeS5/A1Benchma...,real,True,,unsupervised,False,univariate,1461
WebscopeS5,A1Benchmark-40,,data-processed/univariate/WebscopeS5/A1Benchma...,real,True,,unsupervised,False,univariate,1427
WebscopeS5,A1Benchmark-20,,data-processed/univariate/WebscopeS5/A1Benchma...,real,True,,unsupervised,False,univariate,1422
WebscopeS5,A1Benchmark-4,,data-processed/univariate/WebscopeS5/A1Benchma...,real,True,,unsupervised,False,univariate,1423
...,...,...,...,...,...,...,...,...,...,...
LTDB,14149,,data-processed/multivariate/LTDB/14149.test.csv,real,True,,unsupervised,False,multivariate,10997760
LTDB,14157,,data-processed/multivariate/LTDB/14157.test.csv,real,True,,unsupervised,False,multivariate,9454080
LTDB,14172,,data-processed/multivariate/LTDB/14172.test.csv,real,True,,unsupervised,False,multivariate,9753600
LTDB,14184,,data-processed/multivariate/LTDB/14184.test.csv,real,True,,unsupervised,False,multivariate,10252800


## Experimentation

In [15]:
# Names of all records listed in the `RECORDS` file of the source folder.
records = load_dataset_names()
# Display the list (last expression of the cell).
records

['14046', '14134', '14149', '14157', '14172', '14184', '15814']

In [16]:
# find all annotations: map each annotation symbol to the records it occurs in
annotations = {}
for r in records:
    atr = wfdb.rdann(os.path.join(source_folder, r), "atr")
    df_annotation = pd.DataFrame(atr.symbol, index=atr.sample, columns=["Label"])
    for an in df_annotation["Label"].unique():
        annotations.setdefault(an, set()).add(atr.record_name)

# Render each record set as a comma-separated string. Sets have no stable
# iteration order, so sort the names to get the same output on every run.
annotations = {an: ", ".join(sorted(names)) for an, names in annotations.items()}
annotations

{'N': '14149, 14172, 14184, 14157, 14134, 14046, 15814',
 'V': '14149, 14172, 14184, 14157, 14134, 14046, 15814',
 'F': '14172, 14184, 14157, 14134, 14046, 15814',
 'J': '14172, 14046',
 'S': '14172, 14184, 14157, 14134, 14046, 15814',
 '~': '14149, 14172, 14184, 14157, 14134, 15814',
 'a': '14172'}