# PhysioNet/Computing in Cardiology Challenge 2020
## Classification of 12-lead ECGs
### 1. Download and format dataset

# Setup Notebook

In [3]:
# Import standard-library and 3rd party libraries
import os
import sys
import json
import numpy as np
import pandas as pd
from urllib.parse import quote
from urllib.request import urlopen

# Import local libraries
sys.path.insert(0, os.path.dirname(os.path.abspath(os.getcwd())))
from kardioml import DATA_PATH, EXTRACTED_FOLDER_NAMES

# Configure Notebook
import warnings
# Silence every warning category for the rest of the session. This keeps cell
# output compact, but it also hides deprecation and runtime warnings.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Reload imported modules before each cell executes, so edits to local packages
# (e.g. kardioml) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Find SNOMED-CT Code Mappings

In [2]:
# SNOMED-CT request
baseUrl = 'https://browser.ihtsdotools.org/snowstorm/snomed-ct'
edition = 'MAIN'
version = '2019-07-31'

def get_concept_by_id(id_number, timeout=30):
    """Return the fully specified name (FSN) of a SNOMED-CT concept.

    Builds the concept URL from the module-level `baseUrl`, `edition` and
    `version` settings, fetches it, and returns the `fsn.term` field of the
    JSON response.

    Parameters
    ----------
    id_number : str or int
        SNOMED-CT concept identifier, e.g. '426783006'. Integers (as read
        from a CSV by pandas) are accepted and converted to text.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    str
        The concept's fully specified name.

    Raises
    ------
    urllib.error.URLError
        If the request fails (includes HTTP errors and timeouts).
    KeyError
        If the response does not contain an `fsn.term` entry.
    """
    # Coerce to text so integer codes work, drop stray whitespace, and escape
    # the value so it cannot alter the path of the request URL.
    concept_id = quote(str(id_number).strip(), safe='')
    url = '{}/browser/{}/{}/concepts/{}'.format(baseUrl, edition, version, concept_id)

    # The context manager closes the HTTP connection even if reading fails.
    with urlopen(url, timeout=timeout) as response:
        data = json.loads(response.read().decode('utf-8'))
    return data['fsn']['term']

### Tranche 1 (February 7, 2020)

In [3]:
# Folder holding the raw tranche 1 recordings and their header (.hea) files
raw_folder = os.path.join(DATA_PATH, 'physionet_2020_1', 'raw', EXTRACTED_FOLDER_NAMES[0])

# Get a list of .hea files (record names without the extension).
# endswith() avoids picking up unrelated files whose name merely contains
# 'hea'; splitext() keeps record names that contain dots intact; sorting
# makes the processing order reproducible across machines.
filenames = [os.path.splitext(filename)[0] for filename in sorted(os.listdir(raw_folder))
             if filename.endswith('.hea')]

# List of all labels
labels = list()

# Loop through files and extract #Dx: label
for filename in filenames:

    # Load file (the context manager closes the handle even if reading fails)
    with open(os.path.join(raw_folder, '{}.hea'.format(filename)), 'r') as file:
        content = file.read().split('\n')

    # Locate the diagnosis line by its '#Dx' tag rather than a fixed line
    # index, so headers with a different number of signal lines still work.
    dx_lines = [line for line in content if line.strip().startswith('#Dx')]
    if not dx_lines:
        raise ValueError('No #Dx line found in {}.hea'.format(filename))

    # Get patient attributes: comma-separated codes after the colon, with
    # surrounding whitespace and empty entries removed
    labels.extend(label.strip() for label in dx_lines[0].split(':')[-1].split(',') if label.strip())

In [4]:
# Get list of unique labels, sorted as strings.
# set() also accepts the dict this cell assigns to `labels` below (it then
# iterates the keys), so re-running the cell repeats the lookup instead of failing.
label_codes = sorted(set(labels))

# Get label names from SNOMED-CT (one HTTP request per code)
labels = {code: get_concept_by_id(id_number=code) for code in label_codes}
for key, value in labels.items():
    print(key, value)

164884008 Electrocardiogram: ventricular ectopics (finding)
164889003 Electrocardiographic atrial fibrillation (finding)
164909002 Electrocardiographic left bundle branch block (finding)
164931005 ST elevation (observable entity)
270492004 First degree atrioventricular block (disorder)
284470004 Premature atrial contraction (disorder)
426783006 Electrocardiogram: sinus rhythm (finding)
429622005 ST Depression (observable entity)
59118001 Right bundle branch block (disorder)


### Tranche 2 (May 20, 2020)

In [8]:
# Folder holding the raw tranche 2 recordings and their header (.hea) files
raw_folder = os.path.join(DATA_PATH, 'physionet_2020_2', 'raw', EXTRACTED_FOLDER_NAMES[1])

# Get a list of .hea files (record names without the extension).
# endswith() avoids picking up unrelated files whose name merely contains
# 'hea'; splitext() keeps record names that contain dots intact; sorting
# makes the processing order reproducible across machines.
filenames = [os.path.splitext(filename)[0] for filename in sorted(os.listdir(raw_folder))
             if filename.endswith('.hea')]

# List of all labels
labels = list()

# Loop through files and extract #Dx: label
for filename in filenames:

    # Load file (the context manager closes the handle even if reading fails)
    with open(os.path.join(raw_folder, '{}.hea'.format(filename)), 'r') as file:
        content = file.read().split('\n')

    # Locate the diagnosis line by its '#Dx' tag rather than a fixed line
    # index, so headers with a different number of signal lines still work.
    dx_lines = [line for line in content if line.strip().startswith('#Dx')]
    if not dx_lines:
        raise ValueError('No #Dx line found in {}.hea'.format(filename))

    # Get patient attributes: comma-separated codes after the colon, with
    # surrounding whitespace and empty entries removed
    labels.extend(label.strip() for label in dx_lines[0].split(':')[-1].split(',') if label.strip())

In [9]:
# Get list of unique labels, sorted as strings.
# set() also accepts the dict this cell assigns to `labels` below (it then
# iterates the keys), so re-running the cell repeats the lookup instead of failing.
label_codes = sorted(set(labels))

# Get label names from SNOMED-CT (one HTTP request per code)
labels = {code: get_concept_by_id(id_number=code) for code in label_codes}
for key, value in labels.items():
    print(key, value)

10370003 Rhythm from artificial pacing (finding)
111288001 Ventricular flutter (disorder)
11157007 Ventricular bigeminy (disorder)
111975006 Prolonged QT interval (finding)
164861001 Electrocardiographic myocardial ischemia (finding)
164865005 Electrocardiographic myocardial infarction (finding)
164867002 Electrocardiographic old myocardial infarction (finding)
164873001 Electrocardiographic left ventricle hypertrophy (finding)
164889003 Electrocardiographic atrial fibrillation (finding)
164890007 Electrocardiographic atrial flutter (finding)
164895002 Electrocardiographic ventricular tachycardia (finding)
164896001 Electrocardiographic ventricular fibrillation (finding)
164909002 Electrocardiographic left bundle branch block (finding)
164917005 Electrocardiographic Q wave abnormal (finding)
164921003 Electrocardiographic R wave abnormal (finding)
164930006 Electrocardiographic ST interval abnormal (finding)
164931005 ST elevation (observable entity)
164934002 Electrocardiographic T wa

# Format SNOMED-CT Codes

In [5]:
# Load the label table supplied by the competition organizers
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests
# the CSV carries a saved index; consider index_col=0 once it is confirmed
# that nothing downstream relies on that column.
labels_csv = os.path.join(DATA_PATH, 'labels.csv')
mappings = pd.read_csv(labels_csv)

# Preview the first rows
mappings.head()

Unnamed: 0,dx,SNOMED code,Abbreviation
0,1st degree av block,270492004,IAVB
1,2nd degree av block,195042002,IIAVB
2,abnormal QRS,164951009,abQRS
3,accelerated idioventricular rhythm,61277005,AIVR
4,accelerated junctional rhythm,426664006,AJR


In [24]:
# Index the organizers' table by SNOMED code; each entry keeps the
# abbreviation ('label') and the full diagnosis name ('label_full').
# If a code occurs more than once, the last row wins.
mappings_list = {
    snomed_code: {'label': abbreviation, 'label_full': diagnosis}
    for snomed_code, abbreviation, diagnosis
    in zip(mappings['SNOMED code'], mappings['Abbreviation'], mappings['dx'])
}