# MIT-BIH Long-Term ECG Database (_ltdb_)

Part of the ECG Database Collection:

| Short Name | Long Name |
| :--- | :--- |
| _mitdb_ | MIT-BIH Arrhythmia Database |
| _svdb_ | MIT-BIH Supraventricular Arrhythmia Database |
| _ltdb_ | MIT-BIH Long-Term ECG Database |

[Documentation](https://wfdb.readthedocs.io/en/latest) of the `wfdb` package.

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import wfdb
import os
from typing import Final
from collections.abc import Callable
from config import data_raw_folder, data_processed_folder
from timeeval import Datasets

In [2]:
from pathlib import Path

# name of this collection and the folders the notebook reads from / writes to
dataset_collection_name = "LTDB"
source_folder = os.path.join(data_raw_folder, "MIT-BIH Long-Term ECG Database")
target_folder = data_processed_folder

# show the absolute locations so that a wrong configuration is spotted early
print(
    f"Looking for source datasets in {Path(source_folder).absolute()} and\n"
    f"saving processed datasets in {Path(target_folder).absolute()}"
)

Looking for source datasets in /home/projects/akita/data/benchmark-data/data-raw/MIT-BIH Long-Term ECG Database and
saving processed datasets in /home/projects/akita/data/benchmark-data/data-processed


In [3]:
def load_dataset_names() -> list[str]:
    """Return the record names listed in the ``RECORDS`` file of ``source_folder``.

    Every non-blank line of the file is treated as one record name.
    Surrounding whitespace (including a stray carriage return) is removed and
    blank lines are skipped, so that no empty or padded record name is handed
    to the transformation step.

    Returns:
        The record names in file order.

    Raises:
        OSError: If the ``RECORDS`` file cannot be opened.
    """
    with open(os.path.join(source_folder, "RECORDS"), 'r') as f:
        stripped_lines = (line.strip() for line in f)
        return [name for name in stripped_lines if name]

For the explanation of the following transformation function `transform_and_label`, see the transformation walk-through in the _MIT-BIH Arrhythmia Database.ipynb_ notebook.

The following annotations are present in this dataset:

| Annotation | Description |
| :--------- | :---------- |
|| **Considered normal** |
| `N` | Normal beat |
|| **Anomalous beats** (use double-window labeling) |
| `F` | Fusion of ventricular and normal beat |
| `S` | Supraventricular premature or ectopic beat |
| `a` | Aberrated atrial premature beat |
| `V` | Premature ventricular contraction |
| `J` | Nodal (junctional) premature beat |
|| **Anomaly from `x` until next beat window start** |
| - ||
|| **Entire section of fibrillation is regarded anomalous** (a single window from `[` to `]`) |
| - ||
|| **External anomalies** (single window labeling) |
| - ||
|| **Ignored, because hard to parse and to label** |
| `~` | Change in signal quality (usually noise level changes) |

In [4]:
# Annotation symbols grouped by the way they are turned into anomaly labels.

# beats that are considered normal
ann_normal: list[str] = ["N", "/", "L", "R"]
# anomalous beats (double-window labeling)
ann_beat: list[str] = ["F", "f", "S", "A", "a", "V", "J", "j", "E", "e"]
# missing beats: anomalous from the annotation until the next beat window starts
ann_no_beat: list[str] = ["x"]
# fibrillation: a single window from the start marker to the end marker
ann_fibr_start: str = "["
ann_fibr_end: str = "]"
ann_fibr: list[str] = [ann_fibr_start, "!", ann_fibr_end]
# external anomalies (single-window labeling)
ann_ext: list[str] = ["Q", "|"]
# annotations that are dropped before any window is computed
ann_ignore: list[str] = ["+", "~", "\""]

def transform_and_label(source_file: str, target: str) -> int:
    """Convert one WFDB record into a labeled CSV file.

    The record and its ``atr`` annotation file are loaded, anomaly windows are
    derived from the annotation symbols (see the module-level ``ann_*`` lists),
    every sample inside a window gets ``is_anomaly = 1``, timestamps are
    reconstructed from the sample index and the sampling frequency, and the
    result is written to ``target``.

    Args:
        source_file: Path of the record without file suffix.
        target: Path of the CSV file to write.

    Returns:
        The length of the record in samples (``record.sig_len``).

    Raises:
        AssertionError: If record and annotations report different sampling
            frequencies.
    """
    print(f"Transforming {os.path.basename(source_file)}")
    # load dataset
    record = wfdb.rdrecord(source_file)
    df_record = pd.DataFrame(record.p_signal, columns=record.sig_name)
    print(f"  record {record.file_name[0]} loaded")

    # load annotation file
    atr = wfdb.rdann(source_file, "atr")
    assert record.fs == atr.fs, "Sample frequency of records and annotations does not match!"
    df_annotation = pd.DataFrame({"position": atr.sample, "label": atr.symbol})
    # remove ignored annotations
    df_annotation = df_annotation[~df_annotation["label"].isin(ann_ignore)]
    df_annotation = df_annotation.reset_index(drop=True)
    print(f"  {len(df_annotation)}/{atr.ann_len} beat annotations for {source_file} loaded (others were ignored)")

    # calculate normal beat length:
    # median distance between two directly consecutive normal beats
    print("  preparing windows for labeling...")
    df_normal_beat = df_annotation.copy()
    df_normal_beat["prev_position"] = df_annotation["position"].shift()
    df_normal_beat["prev_label"] = df_annotation["label"].shift()
    df_normal_beat = df_normal_beat[(df_normal_beat["label"].isin(ann_normal)) & (df_normal_beat["prev_label"].isin(ann_normal))]
    df_normal_beat = df_normal_beat.drop(columns=["label", "prev_label"])
    s_normal_beat_lengths = df_normal_beat["position"] - df_normal_beat["prev_position"]
    print(f"    normal beat distance samples = {len(s_normal_beat_lengths)}")
    normal_beat_length = s_normal_beat_lengths.median()
    # force an odd window size so that the window is symmetric around the beat
    if (normal_beat_length % 2) == 0:
        normal_beat_length += 1
    beat_window_size = int(normal_beat_length)
    beat_window_margin = (beat_window_size - 1)//2
    del df_normal_beat
    del s_normal_beat_lengths
    print(f"    window size = {beat_window_size}")
    print(f"    window margins (left and right) = {beat_window_margin}")

    # calculate beat windows
    ## ~ and other annotations are ignored!
    ## for fibrillation
    # we only need start and end marked with `[` and `]` respectively;
    # the i-th start marker is paired with the i-th end marker
    s_fibr_start = df_annotation.loc[df_annotation["label"] == ann_fibr_start, "position"]
    s_index = s_fibr_start.index
    s_fibr_start = s_fibr_start.reset_index(drop=True)
    s_fibr_end = df_annotation.loc[df_annotation["label"] == ann_fibr_end, "position"]
    s_fibr_end = s_fibr_end.reset_index(drop=True)
    df_fibr = pd.DataFrame({"index": s_index, "window_start": s_fibr_start, "window_end": s_fibr_end})
    df_fibr = df_fibr.set_index("index")
    df_fibr["position"] = df_fibr["window_start"]
    print(f"    {len(df_fibr)} windows for fibrillation anomalies ({','.join(ann_fibr)})")
    ## for external anomalies
    # single window centered on the annotation, clipped to the record
    df_ext = df_annotation[df_annotation["label"].isin(ann_ext)].copy()
    df_ext["window_start"] = np.maximum(0, df_ext["position"]-beat_window_margin)
    df_ext["window_end"] = np.minimum(record.sig_len - 1, df_ext["position"]+beat_window_margin)
    df_ext = df_ext[["position", "window_start", "window_end"]]
    print(f"    {len(df_ext)} windows for external anomalies ({','.join(ann_ext)})")
    ## anomalous beats
    # exclude additional non-beat annotations
    df_svf = df_annotation[~df_annotation["label"].isin(["|", ann_fibr_start, ann_fibr_end])].copy()
    df_svf["position_next"] = df_svf["position"].shift(-1)
    df_svf["position_prev"] = df_svf["position"].shift(1)
    #df_svf = df_svf[(df_svf["position_prev"].notnull()) & (df_svf["position_next"].notnull())]
    df_svf = df_svf[df_svf["label"].isin(ann_beat)]
    # double-window labeling: the window reaches from the end of the previous
    # beat's window to the start of the next beat's window, but always covers
    # at least the beat's own window.
    # NOTE(review): a missing neighbour (first/last annotation) yields a NaN
    # border here; such windows are skipped during labeling below.
    df_svf["window_start"] = np.maximum(0, np.minimum(df_svf["position"].values-beat_window_margin, df_svf["position_prev"].values+beat_window_margin))
    df_svf["window_end"] = np.minimum(record.sig_len - 1, np.maximum(df_svf["position"].values+beat_window_margin, df_svf["position_next"].values-beat_window_margin))
    df_svf = df_svf[["position", "window_start", "window_end"]]
    print(f"    {len(df_svf)} windows for anomalous beats ({','.join(ann_beat)})")
    # missing beats
    df_no_beat = df_annotation[df_annotation["label"].isin(ann_no_beat)].drop(columns=["label"]).copy()
    df_no_beat["window_start"] = df_no_beat["position"]
    if not df_no_beat.empty:
        df_normal_windows = df_annotation[df_annotation["label"].isin(ann_normal)].copy()
        df_normal_windows = df_normal_windows.drop(columns=["label"])
        df_normal_windows["window_start"] = np.maximum(0, df_normal_windows["position"]-beat_window_margin)
        df_normal_windows["window_end"] = np.minimum(record.sig_len - 1, df_normal_windows["position"]+beat_window_margin)
        # lookup table: every remaining annotation together with its window (if it has one)
        df_lut = df_annotation[~df_annotation["label"].isin(ann_no_beat)].merge(pd.concat([df_ext, df_svf, df_fibr, df_normal_windows]), on="position", how="left")
        def find_next_window_start(pos: int):
            # the missing-beat window ends where the window of the next annotation starts
            # NOTE(review): raises IndexError if no annotation follows `pos`
            next_window_start = df_lut.loc[df_lut["position"] > pos, "window_start"].iloc[0]
            return max(pos, next_window_start)
        df_no_beat["window_end"] = df_no_beat["position"].transform(find_next_window_start)
        del df_normal_windows
        del df_lut
    else:
        df_no_beat["window_end"] = df_no_beat["position"]
    print(f"    {len(df_no_beat)} windows for missing beats ({','.join(ann_no_beat)})")
    ## merge
    df_windows = pd.concat([df_ext, df_svf, df_fibr, df_no_beat])
    df_windows.sort_index(inplace=True)
    print(f"  ...done.")

    # add labels based on anomaly windows
    print("  labeling")
    # Labels are collected in a plain array and written with slice assignment
    # (both window borders are inclusive). This works for records without any
    # window and avoids scanning the whole record once per window.
    n_samples = len(df_record)
    is_anomaly = np.zeros(n_samples, dtype=np.int64)
    for t1, t2 in df_windows[["window_start", "window_end"]].itertuples(index=False, name=None):
        # a window with an undefined border cannot be placed
        if pd.isna(t1) or pd.isna(t2):
            continue
        first = max(int(np.ceil(t1)), 0)
        last = min(int(np.floor(t2)), n_samples - 1)
        if first <= last:
            is_anomaly[first:last + 1] = 1
    df_record["is_anomaly"] = is_anomaly
    del is_anomaly

    # reconstruct timestamps and set as index
    print("  reconstructing timestamps")
    # sample number -> nanoseconds since the epoch, using the sampling frequency
    df_record["timestamp"] = pd.to_datetime(df_record.index.values * 1e+9/record.fs, unit='ns')
    df_record = df_record.set_index("timestamp")
    df_record.to_csv(target)
    print(f"Dataset {os.path.basename(source_file)} transformed and saved!")

    # return dataset length
    return record.sig_len

In [5]:
# shared by all datasets
# metadata values that are identical for every dataset of this collection
dataset_type = "real"
input_type = "multivariate"
datetime_index = True
train_type = "unsupervised"
train_is_normal = False

# processed files go to <target_folder>/<input_type>/<collection name>
dataset_subfolder = os.path.join(input_type, dataset_collection_name)
target_subfolder = os.path.join(target_folder, dataset_subfolder)
try:
    os.makedirs(target_subfolder)
except FileExistsError:
    print(f"Directories {target_subfolder} already exist")
else:
    print(f"Created directories {target_subfolder}")

# dataset metadata manager rooted at the target folder
dm = Datasets(target_folder)

Directories /home/projects/akita/data/benchmark-data/data-processed/multivariate/LTDB already exist


In [6]:
# dataset transformation
# function used to transform a single record: (source path, target path) -> length
transform_file: Callable[[str, str], int] = transform_and_label

for dataset_name in load_dataset_names():
    filename = f"{dataset_name}.test.csv"
    # the record path is passed without the `.dat` suffix on purpose
    source_file = os.path.join(source_folder, dataset_name)
    target_filepath = os.path.join(target_subfolder, filename)
    # path that is stored in the metadata (relative to the target folder)
    path = os.path.join(dataset_subfolder, filename)

    # transform the record, label it and write it to the target file
    dataset_length = transform_file(source_file, target_filepath)
    print(f"Processed source dataset {source_file} -> {target_filepath}")

    # register the dataset in the metadata (test data only, no training file)
    dm.add_dataset(
        (dataset_collection_name, dataset_name),
        train_path=None,
        test_path=path,
        dataset_type=dataset_type,
        datetime_index=datetime_index,
        split_at=None,
        train_type=train_type,
        train_is_normal=train_is_normal,
        input_type=input_type,
        dataset_length=dataset_length,
    )

# persist the metadata of the whole benchmark
dm.save()

Transforming 14046
  record 14046.dat loaded
  115278/115278 beat annotations for /home/projects/akita/data/benchmark-data/data-raw/MIT-BIH Long-Term ECG Database/14046 loaded (others were ignored)
  preparing windows for labeling...
    normal beat distance samples = 96721
    window size = 95
    window margins (left and right) = 47
    0 windows for fibrillation anomalies ([,!,])
    0 windows for external anomalies (Q,|)
    9864 windows for anomalous beats (F,f,S,A,a,V,J,j,E,e)
    0 windows for missing beats (x)
  ...done.
  labeling
  reconstructing timestamps
Dataset 14046 transformed and saved!
Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/MIT-BIH Long-Term ECG Database/14046 -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/LTDB/14046.test.csv
Transforming 14134
  record 14134.dat loaded
  49632/49769 beat annotations for /home/projects/akita/data/benchmark-data/data-raw/MIT-BIH Long-Term ECG Database/14134 loaded (others wer

In [8]:
dm.refresh()
# show only the metadata rows of this collection (all dataset names)
dm.df().loc[pd.IndexSlice[dataset_collection_name:dataset_collection_name, :]]

Unnamed: 0_level_0,Unnamed: 1_level_0,train_path,test_path,dataset_type,datetime_index,split_at,train_type,train_is_normal,input_type,length
collection_name,dataset_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
LTDB,14046,,multivariate/LTDB/14046.test.csv,real,True,,unsupervised,False,multivariate,10828800
LTDB,14134,,multivariate/LTDB/14134.test.csv,real,True,,unsupervised,False,multivariate,6420480
LTDB,14149,,multivariate/LTDB/14149.test.csv,real,True,,unsupervised,False,multivariate,10997760
LTDB,14157,,multivariate/LTDB/14157.test.csv,real,True,,unsupervised,False,multivariate,9454080
LTDB,14172,,multivariate/LTDB/14172.test.csv,real,True,,unsupervised,False,multivariate,9753600
LTDB,14184,,multivariate/LTDB/14184.test.csv,real,True,,unsupervised,False,multivariate,10252800
LTDB,15814,,multivariate/LTDB/15814.test.csv,real,True,,unsupervised,False,multivariate,10237440


## Experimentation

In [None]:
# record names of this collection; the bare name displays them as the cell result
records = load_dataset_names()
records

In [None]:
# find all annotations
annotations = {}
for r in records:
    atr = wfdb.rdann(os.path.join(source_folder, r), "atr")
    df_annotation = pd.DataFrame(atr.symbol, index=atr.sample, columns=["Label"])
    for an in df_annotation["Label"].unique():
        if an not in annotations:
            annotations[an] = set()
        annotations[an].add(atr.record_name)

for an in annotations:
    annotations[an] = ", ".join(annotations[an])
annotations