# Final evaluation tests

## Notebook setup

In [1]:
#| code-fold: true
#| code-summary: "Click to see packages imported"
import os
import configparser
import random
import shutil
from pathlib import Path

import torch
import wfdb
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

In [2]:
#|include: false
# When the notebook is launched from inside the nbs/ folder, step up one level
# so that the project root becomes the working directory.
launch_dir = Path.cwd()
if launch_dir.stem == "nbs":
    os.chdir(launch_dir.parent)
print(f"The current working directory is {Path.cwd()}")

The current working directory is c:\Users\Shaun\source\Thesis\MisdiagnosisOfAthleteECG


In [3]:
from src.data.util import get_all_records, codes_to_label_vector, get_predicted_findings
from src.data.challenge2020 import extract_snomed_ct_codes_from_comment
import src.data.norwegian as norwegian

In [4]:
#|include: false
# Load project settings (such as the location of the data directory) from
# config.ini in the current working directory.
config = configparser.ConfigParser()
config_file = Path("config.ini")
if config_file.exists():
    config.read("config.ini")
    data_dir = Path(config["datasets"]["path"]).expanduser()
    print(f"Datasets are located at {data_dir.resolve()}")
else:
    # data_dir is not defined on this path; only a warning is printed.
    print("WARNING: Please generate a config.ini file by running scripts/get_datasets.py")

Datasets are located at C:\Users\Shaun\source\Thesis\MisdiagnosisOfAthleteECG\data


## Eval setup

In [5]:
# SNOMED CT codes of the findings that are evaluated (names per the
# PhysioNet/CinC Challenge 2020 Dx mapping).
# Sinus bradycardia, normal sinus rhythm, sinus tachycardia, sinus arrhythmia
sinus_labels = [426177001, 426783006, 427084000, 427393009]
# sinus_labels = [426177001, 426783006]       # Bradycardia or Normal
rbbb_labels = [713427006, 713426002]        # Complete RBBB, Incomplete RBBB
# T-wave inversion is not evaluated, because no output for the lead number is
# provided.
athlete_labels = sinus_labels + rbbb_labels

In [6]:
config_dir = Path.cwd() / "config"

# Make sure the benchmark directory exists. exist_ok=True makes the cell safe to
# re-run without a separate existence check, and still raises if the path is
# occupied by a regular file instead of a directory.
benchmark_dir = data_dir / "benchmark"
benchmark_dir.mkdir(exist_ok=True)

### Evaluation dataset

In [7]:
# Note: Change eval_dataset_dir to run the benchmark on a different dataset.
# Leave exactly one of the assignments below uncommented.
# eval_dataset_dir = data_dir / "pf12red" / "extracted"
# eval_dataset_dir = data_dir / "challenge-2020" / "1.0.2" / "training" / "ptb" / "g1"
eval_dataset_dir = data_dir / "challenge-2020" / "1.0.2" / "training" / "st_petersburg_incart" / "g1"
# eval_dataset_dir = data_dir / "norwegian-athlete-ecg" / "1.0.0"

In [8]:
# Sanity check: number of entries get_all_records returns for the evaluation dataset
len(get_all_records(eval_dataset_dir))

74

## Run benchmarks

In [9]:
# Note: Change weights_dir directory to run benchmark on different model
weights_dir = Path.cwd() / "checkpoints" / "finetune_1"
output_dir = benchmark_dir / eval_dataset_dir.parent / weights_dir.stem

if output_dir.exists():
    print(f"{output_dir} already exists. Benchmark has already been run.")
else:
    # Run benchmark using modified PhysioNet Challenge 2020 driver.py
    output_dir.mkdir(parents=True)
    !python PhysioNet2020_driver.py {weights_dir} {config_dir} {eval_dataset_dir} {output_dir}

data\benchmark\data\challenge-2020\1.0.2\training\st_petersburg_incart\finetune_1 already exists. Benchmark has already been run.


## Results

In [10]:
def print_classifier_metrics(tn, fp, fn, tp):
    """Print population counts and summary metrics of a binary classifier.

    Args:
        tn: Number of true negatives.
        fp: Number of false positives.
        fn: Number of false negatives.
        tp: Number of true positives.

    Prints the total, positive and negative population sizes followed by
    accuracy, precision, F1-score and false positive rate. A metric whose
    denominator is zero (for example precision when nothing was predicted
    positive) is printed as ``nan`` instead of raising ``ZeroDivisionError``.
    """
    def ratio(numerator, denominator):
        # An undefined ratio is reported as nan rather than aborting the report.
        return numerator / denominator if denominator else float("nan")

    P = tp + fn     # actual positives
    N = fp + tn     # actual negatives
    print("Population")
    print("----------")
    print(f"Total population: {P+N}")
    print(f"Positive: {P}")
    print(f"Negative: {N}\n")

    acc = ratio(tp + tn, P + N)
    ppv = ratio(tp, tp + fp)
    f1 = ratio(2 * tp, 2*tp + fp + fn)
    fpr = ratio(fp, N)
    print("Performance")
    print("-----------")
    print(f"Accuracy: {acc}")
    print(f"Precision: {ppv}")
    print(f"F1-Score: {f1}")
    print(f"False Positive Rate: {fpr}")

In [11]:
# Accumulate a single 2x2 confusion matrix over every record of the evaluation
# dataset, comparing the actual and predicted label vectors element by element.

# The dataset check does not change between records, so evaluate it once.
is_norwegian = eval_dataset_dir == data_dir / "norwegian-athlete-ecg" / "1.0.0"
# Position of normal sinus rhythm (426783006) within the label vectors.
normal_sinus_idx = athlete_labels.index(426783006)

total_confusion = np.zeros((2,2), dtype=int)
for entry in get_all_records(eval_dataset_dir):
    record = wfdb.rdrecord(eval_dataset_dir / entry)

    if is_norwegian:
        # Actual finding labels (norwegian dataset only)
        comments_c = record.comments[1]
        findings_c = norwegian.extract_findings(comments_c)
        actual_findings = norwegian.classify_relevant_findings(findings_c)
        actual_labels = codes_to_label_vector(actual_findings, athlete_labels)
        # Float copy of the label vector; it is not read again in this cell.
        actual_scores = np.array(actual_labels, dtype=float)
    else:
        # Actual finding labels; a bare 'Dx:' comment carries no codes.
        if record.comments[2] == 'Dx:':
            finding_codes = []
        else:
            finding_codes = extract_snomed_ct_codes_from_comment(record.comments[2])
        actual_labels = codes_to_label_vector(finding_codes, athlete_labels)

    # Predicted label from model
    file = output_dir / (entry+'.csv')
    predicted_findings = get_predicted_findings(file)
    predicted_labels = codes_to_label_vector(predicted_findings, athlete_labels)

    # Hack: If no findings at all, assume normal sinus rhythm (426783006).
    # NOTE(review): the check covers the whole athlete label vector (sinus and
    # RBBB codes), so a record whose only finding is an RBBB code does not get
    # the normal-sinus default -- confirm this is intended.
    if sum(actual_labels) == 0:
        actual_labels[normal_sinus_idx] = 1
    if sum(predicted_labels) == 0:
        predicted_labels[normal_sinus_idx] = 1

    # Calculate confusion matrix for entry, add to total. labels=[0, 1] pins the
    # result to 2x2: without it a record whose vectors hold a single class gives
    # a 1x1 matrix that would be broadcast into all four cells of the total.
    # Assumes the label vectors contain only 0 and 1, which the 2x2 accumulator
    # already requires.
    total_confusion += confusion_matrix(actual_labels, predicted_labels, labels=[0, 1])

In [12]:
# Rows are the actual classes and columns the predicted classes: [[TN, FP], [FN, TP]]
total_confusion

array([[349,  21],
       [ 21,  53]])

In [13]:
# ravel() flattens the 2x2 matrix row by row, which yields TN, FP, FN, TP in that order.
tn, fp, fn, tp = total_confusion.ravel()
print_classifier_metrics(tn, fp, fn, tp)

Population
----------
Total population: 444
Positive: 74
Negative: 370

Performance
-----------
Accuracy: 0.9054054054054054
Precision: 0.7162162162162162
F1-Score: 0.7162162162162162
False Positive Rate: 0.05675675675675676
