# Finetuning classifier output

## Notebook setup

In [1]:
#| code-fold: true
#| code-summary: "Click to see packages imported"
import os
import configparser
import random
from pathlib import Path

import torch
import wfdb
import numpy as np
import dsail
from dsail.model.model_utils import get_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#|include: false
# Paths used later in this notebook (config.ini, checkpoints/, config/) are
# resolved against the current working directory, so step up to the project
# root when the kernel was started inside nbs/.
# The directory's full name is compared (not `.stem`, which drops a suffix) so
# that a folder such as "nbs.bak" is not mistaken for nbs/.
if Path.cwd().name == "nbs":
    os.chdir(Path.cwd().parent)
print(f"The current working directory is {Path.cwd()}")

The current working directory is /Users/shaun/source/Thesis/PhysioNetChallenge2020


In [3]:
#| code-fold: true
#| code-summary: "Click to see local packages imported"
from src.run_12ECG_classifier import load_12ECG_model, run_12ECG_classifier
from src.data.util import get_all_records, get_predicted_findings, diagnosis_codes, codes_to_label_vector
import src.data.norwegian as norwegian

In [4]:
#|include: false
# Read project settings (currently the dataset location) from config.ini in the
# working directory. When the file is missing only a warning is printed and
# `data_dir` is left undefined.
config = configparser.ConfigParser()
if Path("config.ini").exists():
    config.read("config.ini")
    # "~" in the configured path is expanded to the user's home directory.
    data_dir = Path(config["datasets"]["path"]).expanduser()
    print(f"Datasets are located at {data_dir.resolve()}")
else:
    print("WARNING: Please generate a config.ini file by running scripts/get_datasets.py")

Datasets are located at /Users/shaun/source/Thesis/PhysioNetChallenge2020/data


## Scoring settings

In [5]:
# Diagnosis codes that are scored in this notebook.
# A wider sinus set was considered but is disabled:
#   [426177001, 426783006, 427084000, 427393009]
sinus_labels = [
    426177001,
    426783006,
]  # Bradycardia or Normal
# NOTE(review): the original note described these two codes as
# "Incomplete RBBB, Complete RBBB". The PhysioNet/CinC 2020 Dx mapping appears
# to list 713427006 as complete and 713426002 as incomplete RBBB, so confirm
# which code is which before relying on the order.
rbbb_labels = [
    713427006,
    713426002,
]
# T-wave inversion is left out because no output for lead number is provided.
athlete_labels = [*sinus_labels, *rbbb_labels]

## Scratch

In [6]:
# Checkpoint locations, relative to the working directory: the original weights
# are loaded from checkpoints/original, and checkpoints/finetune_1 is this
# notebook's output directory.
original_weights_dir = Path.cwd() / "checkpoints" / "original"
finetune_dir = Path.cwd() / "checkpoints" / "finetune_1"

# Model configuration files and dataset locations (data_dir comes from config.ini).
config_dir = Path.cwd() / "config"
training_data_dir = data_dir / "challenge-2020" / "1.0.2" / "training"
target_data_dir = data_dir / "norwegian-athlete-ecg" / "1.0.0"

# Ensure output directory exists
if finetune_dir.exists():
    print(f"{finetune_dir} already exists. Are we overwriting an existing finetune?")
else:
    # parents=True: a bare mkdir() raises FileNotFoundError when checkpoints/
    # itself has not been created yet.
    finetune_dir.mkdir(parents=True)

/Users/shaun/source/Thesis/PhysioNetChallenge2020/checkpoints/finetune_1 already exists. Are we overwriting an existing finetune?


In [7]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build each dsail config object from its JSON file in config_dir.
data_cfg, preprocess_cfg, model_cfg, run_cfg = (
    config_cls(config_dir / json_name)
    for config_cls, json_name in (
        (dsail.config.DataConfig, "data.json"),
        (dsail.config.PreprocessConfig, "preprocess.json"),
        (dsail.config.ModelConfig, "model.json"),
        (dsail.config.RunConfig, "run.json"),
    )
)

In [8]:
# Borrowed from DSAIL_SNU

def set_seeds(seed):
    """Seed every random number generator used here and pin cuDNN settings.

    The same `seed` is passed, in order, to Python's `random`, NumPy's global
    generator, and PyTorch's CPU and CUDA generators. cuDNN is then switched to
    deterministic mode with benchmarking turned off.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [9]:
# for fold in range(10):
#     set_seeds(2020)

#     # Load data for finetuning and evaluation

#     # Initialize model
#     model, params = get_model(model_cfg, data_cfg.num_channels, len(data_cfg.scored_classes))

#     # Training loop

#     # Save network (note that full model is ensemble of 10 networks)

In [10]:
# Load the classifier, passing the original-weights directory and the config
# directory. The returned `model` is an indexable container; later cells read
# its element at index 3.
print('Loading 12ECG model...')
model = load_12ECG_model(original_weights_dir, config_dir)

Loading 12ECG model...


In [11]:
# model[3] is treated as the collection of individual networks; its length is
# reported as the ensemble size.
# NOTE(review): index 3 is a positional convention of load_12ECG_model's
# return value — confirm against that function.
print(f"This model is an ensemble of {len(model[3])} networks")

This model is an ensemble of 10 networks


In [12]:
# Inspect the `linear` layer of the first network in the ensemble.
# For each classification output, there are weights for 256 inputs to adjust.
# Can we use partial backpropagation for this layer only?
model[3][0].linear.weight.shape

torch.Size([24, 256])

In [13]:
# Alternatively, is adjusting only the bias of the same `linear` layer enough?
# It has one bias term per classification output.
model[3][0].linear.bias.shape

torch.Size([24])

In [14]:
for entry in get_all_records(target_data_dir):
    record_path = target_data_dir / entry

    # Classifier inputs: the signal matrix (transposed from wfdb's p_signal
    # layout) and the raw lines of the record's .hea header file.
    record = wfdb.rdrecord(record_path)
    signals = record.p_signal.transpose()
    with open(record_path.with_suffix(".hea"), 'r') as header_file:
        header_data = header_file.readlines()

    # Ground truth: the cardiologist's findings are taken from the second
    # comment line of the record and encoded as a vector over athlete_labels.
    # NOTE(review): assumes record.comments[1] is always the cardiologist line — confirm.
    comments_c = record.comments[1]
    findings_c = norwegian.extract_findings(comments_c)
    actual_findings = norwegian.classify_relevant_findings(findings_c)
    actual_labels = codes_to_label_vector(actual_findings, athlete_labels)
    actual_scores = np.array(actual_labels, dtype=float)

    # Predictions: run the classifier and copy the score of every class that
    # is in athlete_labels into the matching slot; other classes are ignored.
    current_label, current_score, classes = run_12ECG_classifier(signals, header_data, model)
    predicted_scores = np.zeros(len(athlete_labels))
    for class_pos, class_code in enumerate(classes):
        code_value = int(class_code)
        if code_value not in athlete_labels:
            continue
        predicted_scores[athlete_labels.index(code_value)] = current_score[class_pos]

    # TODO: Find error


    # TODO: Adjust output stem

In [15]:
# `predicted_scores` is reassigned on every loop iteration, so this shows the
# scores for the last record processed only (one entry per athlete_labels code).
predicted_scores

array([0.03358536, 0.35342378, 0.59899798, 0.69184965])

In [16]:
# Label vector for the same (last processed) record, in athlete_labels order.
actual_labels

[0, 1, 0, 0]

In [17]:
# Machine epsilon of NumPy's default float type.
# NOTE(review): presumably wanted as a small constant for the error
# calculation that is still to be written — confirm intent.
print(np.finfo(float).eps)

2.220446049250313e-16
