# Reweighting source datasets

## Notebook setup

In [1]:
#| code-fold: true
#| code-summary: "Click to see packages imported"
import os
import configparser
import random
import shutil
from pathlib import Path

import torch
import wfdb
import numpy as np
import dsail
from dsail.model.model_utils import get_model
from dsail.train import Trainer
from dsail.data import get_loss_weights_and_flags

In [2]:
#|include: false
# The notebook may be launched from inside nbs/; paths used later (such as
# config.ini) are resolved against the working directory, so step up to the
# project root in that case.
_launch_dir = Path.cwd()
if _launch_dir.stem == "nbs":
    os.chdir(_launch_dir.parent)
print(f"The current working directory is {Path.cwd()}")

The current working directory is c:\Users\Shaun\source\Thesis\MisdiagnosisOfAthleteECG


In [3]:
#| code-fold: true
#| code-summary: "Click to see local packages imported"
from src.run_12ECG_classifier import load_12ECG_model, run_12ECG_classifier
from src.data.util import get_all_records, get_predicted_findings, diagnosis_codes, codes_to_label_vector

In [4]:
#|include: false
# Load project settings from config.ini; the [datasets] "path" entry gives
# the location of the data directory.
config = configparser.ConfigParser()
if Path("config.ini").exists():
    config.read("config.ini")
    # expanduser() lets the configured path start with "~".
    data_dir = Path(config["datasets"]["path"]).expanduser()
    print(f"Datasets are located at {data_dir.resolve()}")
else:
    # NOTE: data_dir is left undefined on this branch, so later cells that
    # use it will fail until config.ini has been generated.
    print("WARNING: Please generate a config.ini file by running scripts/get_datasets.py")

Datasets are located at C:\Users\Shaun\source\Thesis\MisdiagnosisOfAthleteECG\data


## Young subset of training datasets

In [None]:
# Root of the challenge-2020 (v1.0.2) training data, followed by one
# sub-directory per source dataset.
physionet_data_dir = data_dir.joinpath("challenge-2020", "1.0.2", "training")
georgia_dataset_dir = physionet_data_dir.joinpath("georgia")
ptbxl_dataset_dir = physionet_data_dir.joinpath("ptb-xl")
cpsc_dataset_dir = physionet_data_dir.joinpath("cpsc_2018")
cpscextra_dataset_dir = physionet_data_dir.joinpath("cpsc_2018_extra")

In [20]:
def _parse_age(comments):
    """Return the age stored in a header's first comment line, or None.

    The first comment is split on ': ' and its second field is read as the
    age (presumably lines of the form 'Age: 53' -- verify against the data).

    Returns None when there are no comments, when the first comment has no
    ': ' separator, or when the value is not a plain decimal integer
    (for example 'NaN').
    """
    if not comments:
        return None
    fields = comments[0].split(": ")
    if len(fields) < 2:
        return None
    age_str = fields[1].strip()
    # isdecimal() matches exactly the digit strings int() can parse;
    # isnumeric() also accepts characters such as '½' that make int() raise.
    return int(age_str) if age_str.isdecimal() else None


# Records with a known age strictly below this value count as "young".
YOUNG_AGE_LIMIT = 40

# Pool the records of all four source datasets into a single list.
source_records = (
    get_all_records(georgia_dataset_dir)
    + get_all_records(ptbxl_dataset_dir)
    + get_all_records(cpsc_dataset_dir)
    + get_all_records(cpscextra_dataset_dir)
)

young_subset = []   # Every record under a certain age
vampires = 0        # Vampires don't age (records whose age could not be parsed)
for entry in source_records:
    header = wfdb.rdheader(entry)
    age = _parse_age(header.comments)
    if age is None:
        vampires += 1
    elif age < YOUNG_AGE_LIMIT:
        young_subset.append(entry)

In [21]:
# Total number of records pooled from the four source datasets.
len(source_records)

41414

In [16]:
# Number of records whose parsed age is below the young-subset threshold.
len(young_subset)

1089

## Model setup

In [7]:
# Checkpoint locations, all relative to the current working directory.
# Judging by the names, "original" holds the starting weights and
# "reweight_1" receives this notebook's output -- confirm against the
# training cells.
original_weights_dir = Path.cwd() / "checkpoints" / "original"
reweight_dir = Path.cwd() / "checkpoints" / "reweight_1"

config_dir = Path.cwd() / "config"
# Same location as physionet_data_dir defined in an earlier cell.
training_data_dir = data_dir / "challenge-2020" / "1.0.2" / "training"
target_data_dir = data_dir / "norwegian-athlete-ecg" / "1.0.0"

# Ensure output directory exists
if reweight_dir.exists():
    print(f"{reweight_dir} already exists. Are we overwriting an existing model?")
else:
    # parents=True also creates checkpoints/ itself on a fresh checkout;
    # a bare mkdir() would raise FileNotFoundError in that case.
    reweight_dir.mkdir(parents=True)

## Training loop