# Typical training data for ECG classifiers

## Notebook setup

In [None]:
import configparser
import os
from pathlib import Path
from typing import List, Optional, TypedDict

import numpy as np
import pandas as pd
import wfdb

In [None]:
#|include: false
# When the notebook is launched from inside nbs/, move up one level so that
# relative paths resolve against the project root.

cwd = Path.cwd()
if cwd.stem == "nbs":
    os.chdir(cwd.parent)
print(f"The current working directory is {Path.cwd()}")

In [None]:
#|include: false
# Load project settings (such as where the datasets live) from config.ini.
config = configparser.ConfigParser()
config_file = Path("config.ini")
if config_file.exists():
    config.read(config_file)
    # "~" in the configured path is expanded to the user's home directory.
    data_dir = Path(config["datasets"]["path"]).expanduser()
    print(f"Datasets are located at {data_dir.resolve()}")
else:
    # Without config.ini, `data_dir` stays undefined for the cells below.
    print("WARNING: Please generate a config.ini file by running scripts/get_datasets.py")

## Dataset quickstart

***Table: Characteristics of 12-lead ECG recordings used for PhysioNet Challenge 2020***

| Dataset | Sampling frequency \[Hz\] | Recording length \[s\] | Notes |
|-|-|-|-|
| `cpsc`/`cpsc_extra` | 500 | 6 to 60 | Per HIPAA guidelines ages over 89 are removed |
| `incart` | 257 | 1800 | Holter recordings, annotations included |
| `ptb` | 1000 | ? |  |
| `ptbxl` | 500 | 10 |  |
| `georgia` | 500 | 10 | Southeastern US patient demographic |

Probably just use `ptbxl` and `georgia` for this project.


In [None]:
# Locations of the individual training datasets, all below one shared
# challenge "training" folder.

training_dir = data_dir.joinpath("challenge-2020", "1.0.2", "training")

georgia_dir = training_dir.joinpath("georgia")
cpsc_dir = training_dir.joinpath("cpsc_2018")
cpscextra_dir = training_dir.joinpath("cpsc_2018_extra")
ptb_dir = training_dir.joinpath("ptb")
ptbxl_dir = training_dir.joinpath("ptb-xl")
incart_dir = training_dir.joinpath("st_petersburg_incart")

Use `wfdb.rdheader` to read only a record's metadata (its header).

Use `wfdb.rdrecord` to read the record's signal data.

In [None]:
# Header-only alternatives, kept for reference:
# record = wfdb.rdheader(georgia_dir / "g1" / "E00001")
# record = wfdb.rdheader(ptbxl_dir / "g1" / "HR00001")

# Read one PTB-XL record with `rdrecord`, which also loads the signal data.
record = wfdb.rdrecord(ptbxl_dir / "g1" / "HR00001")

In [None]:
# Signal data as a pandas DataFrame; `head()` previews the first rows.
signals = record.to_dataframe()
signals.head()

In [None]:
# Selecting a single column (here the one named "I") gives a `Series` object
signals["I"].head()

In [None]:
# Units reported for each channel (expected to all be mV)
record.units

In [None]:
# Sampling frequency of the record (Hz)
record.fs

# TODO: records that are not sampled at 500 Hz should be resampled. See:
# https://pandas.pydata.org/docs/getting_started/intro_tutorials/09_timeseries.html#resample-a-time-series-to-another-frequency

In [None]:
# Length of recording (should always be 10 s).
# Number of samples divided by the sampling frequency; `sig_len` comes from the
# record itself, so no DataFrame has to be built just to count rows.
print(f"Length (s): {record.sig_len / record.fs}")

## Medical terminologies and codes

SNOMED-CT codes are provided as labels for each training dataset.

`Dx` is the standard medical abbreviation for diagnosis.

In [None]:
# Re-read the same record as a header only; this rebinds `record`.
record = wfdb.rdheader(ptbxl_dir / "g1" / "HR00001")
record.comments  # index 2 holds the diagnosis ("Dx: ...") comment

We could use a package such as [PyMedTermino2](https://owlready2.readthedocs.io/en/latest/pymedtermino2.html) to access medical terminologies from a UMLS database. 
However, we only need a very small subset of diagnosis codes. We'll just keep a 
Python dictionary of ~10 codes.

In [None]:
# Lookup table: SNOMED-CT code -> human-readable diagnosis name.
# Only the handful of codes this project needs are listed.
diagnosis_codes = {
    426783006:  "Normal Sinus Rhythm",
}

In [None]:
def extract_snomed_ct_codes_from_comment(dx_comment: str) -> List[int]:
    """Returns a list of SNOMED-CT codes related to ECG diagnoses.

    Assumes that `dx_comment` is in the form "Dx: code1,code2,etc." The space
    after the colon and whitespace around each code are optional. Blank
    entries (a bare "Dx:" with no codes, or a trailing comma) are skipped, so
    a comment without any codes yields an empty list.

    Raises `IndexError` if `dx_comment` contains no colon, and `ValueError` if
    an entry is not an integer.

    Example usage:
    ```
    record = wfdb.rdheader(ptbxl_dir / "g1" / "HR00001")
    dx_comment = record.comments[2]
    dx_comment
    > 'Dx: 251146004,426783006'

    extract_snomed_ct_codes_from_comment(dx_comment)
    > [251146004, 426783006]
    ```
    """
    # Just the diagnosis codes (everything after the first colon). Splitting
    # on ':' rather than ': ' also accepts comments with no space after it.
    codes_text = dx_comment.split(':', 1)[1]

    # Split codes into list, drop blank entries, convert to integers.
    # `int` itself tolerates surrounding whitespace.
    return [int(code) for code in codes_text.split(',') if code.strip()]


In [None]:
# Example usage: parse the diagnosis comment (index 2) of the header read above
extract_snomed_ct_codes_from_comment(record.comments[2])

## Extract demographics and labels for population analysis

TODO.

Demographics: age, sex

Labels: subset of SNOMED-CT codes

In [None]:
def get_all_records(dataset_dir: Path) -> List[Path]:
    """Returns a sorted list of every record in a PhysioNet Challenge 2020 dataset.

    A record is identified by its `.hea` header file. Only the immediate
    sub-folders of `dataset_dir` (e.g. g1/, g2/, etc.) are searched; header
    files directly inside `dataset_dir` are ignored.

    Each returned path is the header path without its `.hea` suffix. The list
    is sorted so that positional indexing into it is reproducible;
    `Path.iterdir` alone yields entries in arbitrary order.
    """
    records: List[Path] = []
    # For every folder (e.g. g1/, g2/, etc.)
    for folder in dataset_dir.iterdir():
        if not folder.is_dir():
            continue
        # For every record (each record has a `.hea` header file)
        records.extend(
            folder / file.stem
            for file in folder.iterdir()
            if file.suffix == '.hea'
        )
    return sorted(records)

In [None]:
# Collect the path of every record in the PTB-XL training set.
ptbxl_records = get_all_records(ptbxl_dir)

In [None]:
class DemographicInfo(TypedDict):
    """Demographic fields parsed from the comments of one record header."""
    # Age parsed from the header comments; None when the header does not
    # contain a numeric age.
    age: Optional[int]
    # Sex, stored as the raw text found in the header comments.
    sex: str

def records_to_demographics_table(records: List[Path]) -> pd.DataFrame:
    """Returns a table of demographics with one row per record.

    Reads the header of every record in `records` with `wfdb.rdheader` and
    takes the age from the first header comment and the sex from the second.
    Both comments are assumed to be in the form "Name: value".

    The result always has the columns `age` and `sex`, even when `records` is
    empty. An age that is not a plain decimal number (e.g. "NaN") is stored as
    a missing value.
    """
    data = []
    for record in records:
        header = wfdb.rdheader(record)

        # Extract demographic info (the text after the "Name: " prefix)
        age_str = header.comments[0].split(': ')[1]
        sex = header.comments[1].split(': ')[1]

        # `isdecimal` only accepts digits that `int` can actually parse;
        # `isnumeric` also accepts characters such as '²' that make `int` fail.
        age = int(age_str) if age_str.isdecimal() else None

        demographics: DemographicInfo = {
            'age': age,
            'sex': sex,
        }
        data.append(demographics)

        # # Extract diagnostic info
        # diagnoses = extract_snomed_ct_codes_from_comment( header.comments[2] )
        # # TODO: save relevant ecg results as flags

    # Naming the columns keeps the table shape stable for an empty input.
    return pd.DataFrame(data, columns=['age', 'sex'])

In [None]:
# Demographics table (age, sex) for every PTB-XL record; reads one header per record.
ptbxl_df = records_to_demographics_table(ptbxl_records)

### Data integrity issues

In [None]:
# Note: Some header comments look wrong at first glance, like this 300 year
# old lady.
# NOTE(review): the PTB-XL dataset description reportedly records ages above 89
# as 300 for HIPAA compliance, so this may be deliberate rather than an error
# -- confirm against the dataset documentation.
# The index 248 depends on the order of `ptbxl_records`.
header = wfdb.rdheader(ptbxl_records[248])
header.comments

In [None]:
# Some records don't have an age recorded. Represented with "NaN".
# Built from `ptbxl_dir` so the lookup follows the configured dataset location
# instead of a hard-coded relative "data/" path.
header = wfdb.rdheader(ptbxl_dir / "g6" / "HR05040")
header.comments

In [None]:
# Preview a few of the rows whose age is missing (NaN)
missing_age = ptbxl_df["age"].isna()
ptbxl_df.loc[missing_age].head()