# Pro-Football 12-lead Resting Electrocardiogram Database

**Data collected**

The data were gathered from La Liga, Spain, from professional football players. 

Resting in supine position, each participantʼs 12-lead electrocardiogram (ECG) was captured with General Electrics (GE) USB-CAM 14 for a duration of 10 s at 500 Hz using the GE CardioSoft software.

## Notebook setup

In [1]:
#| code-fold: true
#| code-summary: "Click to see packages imported"
import os
import configparser
import random
import shutil
import base64
from pathlib import Path

import torch
import wfdb
import xmltodict
import numpy as np
import pandas as pd

In [2]:
#|include: false
# If the current working directory is the nbs/ folder, change to the project 
# root directory instead.
if Path.cwd().stem == "nbs":
    os.chdir(Path.cwd().parent)
print(f"The current working directory is {Path.cwd()}")

The current working directory is c:\Users\Shaun\source\Thesis\MisdiagnosisOfAthleteECG


In [3]:
#|include: false
# Import configuration settings, like location of data directory.
config = configparser.ConfigParser()
if not Path("config.ini").exists():
    print("WARNING: Please generate a config.ini file by running scripts/get_datasets.py")
else:
    config.read("config.ini")
    data_dir = Path((config["datasets"]["path"])).expanduser()
    print(f"Datasets are located at {data_dir.resolve()}")

Datasets are located at C:\Users\Shaun\source\Thesis\MisdiagnosisOfAthleteECG\data


## Convert from GE Muse XML to WFDB

https://github.com/PierreElias/IntroECG/tree/master/1-Waveform%20Extraction

In [4]:
pf12red_xml_dir = data_dir / "pf12red" / "5 163XML"
pf12red_extracted_dir  = data_dir / "pf12red" / "extracted"
pf12red_labels_file = data_dir / "pf12red" / "labels.csv"

if not pf12red_extracted_dir.exists():
    pf12red_extracted_dir.mkdir()

In [5]:
labels_df = pd.read_csv(pf12red_labels_file)
labels_df.head()

Unnamed: 0,AthleteID,SR,SB,iRBBB
0,1,,X,X
1,2,X,,
2,3,,X,X
3,4,,X,
4,5,,X,


In [6]:
labels_df[labels_df.AthleteID == 10]

Unnamed: 0,AthleteID,SR,SB,iRBBB
9,10,,X,


In [7]:
(labels_df[labels_df.AthleteID == 10].SB == "X").item()

True

In [8]:
for file in pf12red_xml_dir.iterdir():
    # Only process XML files
    if file.suffix != '.XML':
        print(f"How did {file} get there?")
        continue
    
    # Attempt to parse XML file using xmltodict package
    with open(file, 'rb') as f:
        xml_str = f.read().decode('utf8')
    try:
        xml_dict = xmltodict.parse(xml_str)
    except:
        print(f"Couldn't parse {file}")
    
    # Extract patient info
    age = xml_dict['CardiologyXML']['PatientInfo']['Age']['#text']
    gender = xml_dict['CardiologyXML']['PatientInfo']['Gender']

    # Extract diagnosis info (save as SNOMED-CT codes)
    athlete_id = int(file.stem.split('_')[0])
    labels = labels_df[labels_df.AthleteID == athlete_id]
    dx_comment = "Dx: "
    # TODO: What does SR label mean?
    # if labels.SR == 'X':
    #     dx_comment += "426783006"   # Normal sinus rhythm
    #     dx_comment += "427393009"   # Sinus arrhythmia
    if (labels.SB == 'X').item():
        dx_comment += "426177001,"
    if (labels.iRBBB == 'X').item():
        dx_comment += "713426002,"

    # Extract lead waveforms (version 1)
    lead_samples = []
    for i in range(12):
        # sample_str = xml_dict['CardiologyXML']['RestingECGMeasurements']['MedianSamples']['WaveformData'][i]['#text']
        sample_str = xml_dict['CardiologyXML']['StripData']['WaveformData'][0]['#text']
        sample_list = list(map(int, sample_str.split(',')))
        lead_samples.append(np.array(sample_list, dtype=int))
    p_signal = np.stack(lead_samples) * 0.005
    p_signal= p_signal.transpose()
    
    # Write new WFDB record
    # print(f"Writing {file.stem}")
    try:
        wfdb.wrsamp(
            file.stem, 
            fs=500, 
            units=['mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV'],
            sig_name=['I', 'II', 'III', 'aVR', 'aVL', 'aVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6'],
            p_signal=p_signal,
            comments=[f"age: {age}", f"gender: {gender}", {dx_comment}],
            write_dir=pf12red_extracted_dir,
        )
    except:
        print(f"Couldn't write {file.stem}")

Couldn't write 10_1819Pst
Couldn't write 11_1819Pst
Couldn't write 11_1920Pre
Couldn't write 12_1819Pst
Couldn't write 12_1920Pre
Couldn't write 12_1920Pst
Couldn't write 12_2021Pre
Couldn't write 12_2122Pre
Couldn't write 13_1819Pst
Couldn't write 13_1920Pre
Couldn't write 13_1920Pst
Couldn't write 13_2021Pre
Couldn't write 13_2122Pre
Couldn't write 14_1819Pst
Couldn't write 14_1920Pre
Couldn't write 14_1920Pst
Couldn't write 15_1920Pre
Couldn't write 15_1920Pst
Couldn't write 15_2021Pre
Couldn't write 15_2122Pre
Couldn't write 16_1920Pre
Couldn't write 16_2021Pre
Couldn't write 17_1819Pst
Couldn't write 18_1819Pst
Couldn't write 18_1920Pre
Couldn't write 18_1920Pst
Couldn't write 18_2021Pre
Couldn't write 18_2122Pre
Couldn't write 19_1819Pst
Couldn't write 19_1920Pre
Couldn't write 19_1920Pst
Couldn't write 19_2021Pre
Couldn't write 19_2122Pre
Couldn't write 1_1819Pst
Couldn't write 20_1819Pst
Couldn't write 20_1920Pre
Couldn't write 21_1819Pst
Couldn't write 21_1920Pre
Couldn't writ

In [9]:
# What should the p_signal look like?
norwegian_dataset_dir = data_dir / "norwegian-athlete-ecg" / "1.0.0"
record = wfdb.rdrecord(norwegian_dataset_dir / "ath_001")
record.p_signal.shape

(5000, 12)

## Extracting waveform from PDF