# Misclassification of athlete ECG by GE Marquette SL12 algorithm

## Notebook setup

In [None]:
#| code-fold: true
#| code-summary: "Click to see packages imported"
import os
import configparser
from pathlib import Path
from typing import TypedDict, List
from enum import Enum

import wfdb
import pandas as pd

In [None]:
#|include: false
# When the notebook is started from inside the nbs/ folder, step up one level
# so that the working directory is the project root.

notebook_dir = Path.cwd()
if notebook_dir.stem == "nbs":
    os.chdir(notebook_dir.parent)
print(f"The current working directory is {Path.cwd()}")

In [None]:
#|include: false
# Load settings (e.g. where the datasets live) from config.ini in the current
# working directory. If the file is missing, only a warning is printed and
# `data_dir` is left undefined.
config = configparser.ConfigParser()
if Path("config.ini").exists():
    config.read("config.ini")
    # expanduser() lets the configured path start with "~".
    data_dir = Path(config["datasets"]["path"]).expanduser()
    print(f"Datasets are located at {data_dir.resolve()}")
else:
    print("WARNING: Please generate a config.ini file by running scripts/get_datasets.py")

## The norwegian-athlete-ecg dataset

The [Norwegian Endurance Athlete ECG Database](https://physionet.org/content/norwegian-athlete-ecg/1.0.0/) (norwegian-athlete-ecg) contains 12-lead ECG recordings from 28 elite athletes from various sports in Norway. All recordings are 10-second resting ECGs recorded with a General Electric (GE) MAC VUE 360 electrocardiograph. All ECGs were interpreted by both the GE Marquette SL12 algorithm (version 23 (v243)) and one cardiologist with training in the interpretation of athletes' ECGs. The data was collected at the University of Oslo in February and March 2020.

In [None]:
# Folder holding version 1.0.0 of the dataset, inside the configured data directory.
athlete_ecg_dir = data_dir.joinpath("norwegian-athlete-ecg", "1.0.0")

In [None]:
#| code-fold: show
#| code-summary: "12-lead ECG recording from subject ath_001"
# Read record ath_001 from the dataset folder and plot it.
record = wfdb.rdrecord(athlete_ecg_dir / "ath_001")
wfdb.plot_wfdb(
    record=record,
    title="ath_001 from Norwegian Athlete ECG database",
)

In [None]:
#| code-fold: show
#| code-summary: "Machine (SL12) and Cardiologist (C) interpretation of ath_001 ECG recording"
# Read only the header of ath_001 and display its comment lines, which hold
# the machine (SL12) and cardiologist (C) interpretations.
record = wfdb.rdheader(athlete_ecg_dir / "ath_001")
vars(record)["comments"]

In [None]:
#| code-fold: true
#| code-summary: Put ECG finding reports into a pandas dataframe
class AthleteReport(TypedDict):
    """Dictionary holding the two single-line ECG reports of one athlete.

    Declared as a ``TypedDict`` because instances are built as plain dict
    literals and handed to ``pd.DataFrame``; a regular class with bare
    annotations would not describe those dicts.
    """
    # Record name, e.g. "ath_001".
    athlete_id: str
    # Single-line report written by the cardiologist.
    cardiologist: str
    # Single-line report produced by the machine algorithm.
    machine: str

# Build one row per record (ath_001 .. ath_028) from the header comments:
# the first comment line is stored as the machine report and the second as
# the cardiologist report.
reports_list = []
for i in range(1, 29):
    athlete_id = f"ath_{i:03d}"     # zero-padded to three digits
    record = wfdb.rdheader(athlete_ecg_dir / athlete_id)
    comments = vars(record)["comments"]
    report: AthleteReport = dict(
        athlete_id=athlete_id,
        cardiologist=comments[1],
        machine=comments[0],
    )
    reports_list.append(report)
athlete_ecg_df = pd.DataFrame(reports_list)

In [None]:
athlete_ecg_df.head()

## Findings from ECG reports

In the norwegian-athlete-ecg dataset, findings in ECG reports are delimited by 
a comma (`,`). However, some machine findings also make use of a comma to make 
a follow-up comment on a finding. This is not done in any of the human 
cardiologist reports in the dataset.

***Table: Examples of findings with follow-up comment***

| Finding with follow-up comment | Record |
|-|-|
| `Minimal voltage criteria for LVH, may be normal variant` | `ath_024` |
| `ST elevation, probably due to early repolarization` | `ath_024` |
| `ST elevation, consider early repolarization, pericarditis, or injury` | `ath_027` |

Follow-up comments from SL12 all seem to start with a lower-case letter, so 
they can be detected this way.

In [None]:
#| code-fold: true
#| code-summary: Click to see function for extracting a list of findings from a single line report

def extract_findings(report: str, follow_on: bool=True) -> List[str]:
    """Extract the list of findings from a single-line ECG report.

    The report is expected to look like ``"<source>: <finding>, <finding>"``.
    Everything up to and including the first ``": "`` is dropped, the rest is
    split on ``", "`` and each piece is stripped of surrounding whitespace.
    If the report contains no ``": "`` at all, the whole string is treated as
    the list of findings.

    Args:
        report: The single-line report text.
        follow_on: When true (default), a piece that does not start with an
            upper-case letter or ``*`` is treated as a follow-on comment and
            is re-joined (with ``", "``) to the finding before it. Empty
            pieces are skipped in this mode. When false, every piece is
            returned as its own finding, unchanged apart from stripping.

    Returns:
        The findings, in the order they appear in the report.
    """
    # Drop the "<source>: " prefix when present.
    _, separator, body = report.partition(': ')
    if not separator:
        body = report

    # Cleanup (e.g. remove leading/trailing whitespace)
    comments = [comment.strip() for comment in body.split(', ')]

    if not follow_on:
        return comments     # i.e. assume every comment is a new finding

    # Combine follow-on comments with parent comment to produce full finding
    # for SL12 machine comments.
    #
    # e.g. ST elevation, consider early repolarization, pericarditis, or injury
    findings: List[str] = []
    for comment in comments:
        if not comment:
            # An empty piece (e.g. from a trailing ", ") carries no finding
            # and has no first character to inspect.
            continue
        starts_new_finding = comment[0].isupper() or comment[0] == '*'
        # A follow-on with no parent to attach to becomes its own finding.
        if starts_new_finding or not findings:
            findings.append(comment)
        else:
            findings[-1] = f"{findings[-1]}, {comment}"
    return findings

In [None]:
# Example usage of `extract_findings()` on the machine report stored at row
# label 23 of the dataframe.
report = athlete_ecg_df.loc[23, "machine"]
extract_findings(report)

In [None]:
#| code-summary: Find every unique finding in dataset
# Collect every distinct finding in order of first appearance. Dicts keep
# insertion order and ignore repeated keys, so they act as ordered sets here.
seen_sl12 = {}
seen_c = {}
for i in range(1, 29):
    athlete_id = f"ath_{i:03d}"
    record = wfdb.rdheader(athlete_ecg_dir / athlete_id)
    comments = vars(record)["comments"]

    # Machine algorithm findings (follow-on comments merged into their parent)
    findings_sl12 = extract_findings(comments[0])
    seen_sl12.update(dict.fromkeys(findings_sl12))

    # Cardiologist findings (every comma-separated piece is its own finding)
    findings_c = extract_findings(comments[1], follow_on=False)
    seen_c.update(dict.fromkeys(findings_c))

unique_findings_sl12 = list(seen_sl12)
unique_findings_c = list(seen_c)


In [None]:
unique_findings_c

In [None]:
unique_findings_sl12

## Disagreement between machine and cardiologist

In [None]:
# Classifying findings by the type of abnormality

class AbnormalityClass(Enum):
    """Categories for grouping individual ECG findings by type of abnormality.

    Each member's value is the capitalised display name of the category.
    """
    # NOTE(review): the member below is disabled; the overall verdict appears
    # to be handled by the separate `OverallFinding` enum -- confirm.
    # overall = "Overall ECG recording"   # Normal/Abnormal/Borderline etc.
    rhythm = "Rhythm"                   # e.g. sinus rhythm
    conduction = "Conduction"           # e.g. bundle branch block, AV block
    ischemia = "Ischemia"               # e.g. ST-segment, T-wave inversion
    structural = "Structural"           # e.g. chamber enlargement, hypertrophy
    measurement = "Measurement"         # e.g. axis deviation, wide QRS, PR interval
    equipment = "Equipment"             # e.g. Misplaced electrodes
    other = "Other"                     # anything not covered above


In [None]:
# The final finding in each report is an "overall" classification for the 
# entire ECG recording.

# We can use the difference between the machine's and the cardiologist's
# overall classification to quantify how strongly they disagree.

class OverallFinding(Enum):
    """Overall verdict of an ECG report.

    Normal, Borderline and Abnormal map to 0, 1 and 2, so subtracting two
    verdict values gives a signed difference between them. Unknown marks a
    report whose verdict could not be determined.
    """
    Unknown = -5
    Normal = 0
    Borderline = 1
    Abnormal = 2


def classifyOverallFinding(findings: List[str]) -> OverallFinding:
    """Classifies the overall finding for an ECG recording.

    Assumes that the final finding in `findings` list comments on overall
    finding. The match is a case-insensitive substring search.

    Args:
        findings: Findings of one report, in report order.

    Returns:
        The matching `OverallFinding`, or `OverallFinding.Unknown` when
        `findings` is empty or its final entry mentions none of
        "abnormal", "borderline" or "normal".
    """
    if not findings:
        # Nothing to classify; avoid indexing into an empty list.
        return OverallFinding.Unknown

    overall = findings[-1].lower()
    # "abnormal" contains "normal", so it has to be tested first.
    if "abnormal" in overall:
        return OverallFinding.Abnormal
    if "borderline" in overall:
        return OverallFinding.Borderline
    if "normal" in overall:
        return OverallFinding.Normal
    return OverallFinding.Unknown

# Example usage of `classifyOverallFinding`:
# Quantify the "overall disagreement" between cardiologist and SL12 algorithm.
# A positive value means the machine's verdict ranks higher (more abnormal)
# than the cardiologist's.
for i in range(1, 29):
    athlete_id = f"ath_{i:03d}"

    record = wfdb.rdheader(athlete_ecg_dir / athlete_id)
    comments = record.comments

    # Only the machine report uses follow-on comments; cardiologist findings
    # are split on every comma, as in the unique-findings cell.
    findings_sl12 = extract_findings(comments[0])
    findings_c = extract_findings(comments[1], follow_on=False)

    overall_sl12 = classifyOverallFinding(findings_sl12)
    overall_c = classifyOverallFinding(findings_c)

    print(f"{athlete_id} disagreement = {overall_sl12.value - overall_c.value}")

TODO: Disagreement between cardiologist and machine for individual abnormality 
classes.