# PhysioNet/Computing in Cardiology Challenge 2020
## Classification of 12-lead ECGs
### 2. Create Training Dataset

# Setup Notebook

In [69]:
# Import 3rd party libraries
import os
import sys
import time
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer

# Import local Libraries
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(os.getcwd()))))))
from kardioml import DATA_PATH, ECG_LEADS

# Configure Notebook
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Process Meta Data

In [None]:
# Directory that holds the assembled training set
save_path = os.path.join(DATA_PATH, 'training', 'physionet_2017')

# Reload the previously saved meta data table instead of rebuilding it.
# NOTE(review): list-valued columns (e.g. 'labels_train') are read back from
# CSV as strings, not lists - confirm before feeding this copy to later cells.
meta_data_csv = os.path.join(save_path, 'meta_data.csv')
meta_data = pd.read_csv(meta_data_csv)

In [124]:
# Build the meta data table: one row per formatted recording, with integer and
# multi-hot encoded labels, then save it as meta_data.csv.

# Import labels
labels_scored = pd.read_csv(os.path.join(DATA_PATH, 'labels_scored.csv'))

# Set save path
save_path = os.path.join(DATA_PATH, 'training', 'physionet_2017')

# Create directory for formatted data
os.makedirs(save_path, exist_ok=True)

# Keys copied from every formatted JSON record into the meta data table
meta_keys = ['filename', 'age', 'sex', 'fs', 'length', 'labels_SNOMEDCT', 'labels', 'labels_full']

# List of meta data records (one dict per file)
meta_records = list()

# Loop through datasets
for dataset in ['A', 'B', 'C', 'D', 'E', 'F']:

    # Set load path
    load_path = os.path.join(DATA_PATH, dataset, 'formatted')

    # Get list of files (match the extension, not any name containing 'json')
    filenames = [file for file in os.listdir(load_path) if file.endswith('.json')]

    # Open files and extract meta data
    for filename in filenames:

        # Import features (context manager so the file handle is always closed)
        with open(os.path.join(load_path, filename)) as file_handle:
            loaded_data = json.load(file_handle)

        record = {'dataset': dataset}
        record.update({key: loaded_data[key] for key in meta_keys})
        meta_records.append(record)

# Generate DataFrame
meta_data = pd.DataFrame(meta_records)

# os.listdir returns entries in arbitrary order; sort so that the row order of
# meta_data.csv / labels.csv is reproducible and follows the same
# dataset/filename ordering that the features table is sorted by.
meta_data = meta_data.sort_values(by=['dataset', 'filename']).reset_index(drop=True)

# Set numeric (unparseable ages become NaN)
meta_data['age'] = pd.to_numeric(meta_data['age'], errors='coerce')

# Map each SNOMED CT code to its row position in labels_scored. Built once
# instead of scanning the table for every label; setdefault keeps the first
# occurrence if a code were listed twice.
code_to_index = dict()
for row_index, code in zip(labels_scored.index, labels_scored['SNOMED CT Code']):
    code_to_index.setdefault(code, row_index)

# Add int labels (a code missing from labels_scored raises KeyError)
meta_data['labels_int'] = meta_data['labels_SNOMEDCT'].map(
    lambda val: None if val is None else [code_to_index[label] for label in val]
)

# Drop no labels. reset_index() without drop=True keeps the pre-filter row
# position as an 'index' column, which is part of the saved CSV.
meta_data = meta_data.loc[meta_data['labels'].notnull()].reset_index()

# Add training labels: one multi-hot vector per row, fitted once for all rows
mlb = MultiLabelBinarizer(classes=labels_scored.index.tolist())
labels_train = mlb.fit_transform(meta_data['labels_int'].tolist()).tolist()
meta_data['labels_train'] = pd.Series(labels_train, index=meta_data.index, dtype=object)

# Save
meta_data.to_csv(os.path.join(save_path, 'meta_data.csv'), index=False)

# View DataFrame
meta_data.head()

Unnamed: 0,index,dataset,filename,age,sex,fs,length,labels_SNOMEDCT,labels,labels_full,labels_int,labels_train
0,0,A,A0001,74.0,male,500,7500,[59118001],[RBBB],[right bundle branch block],[18],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,A,A0002,49.0,female,500,5000,[426783006],[SNR],[sinus rhythm],[21],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,A,A0003,81.0,female,500,5000,[164889003],[AF],[atrial fibrillation],[1],"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,A,A0004,45.0,male,500,5974,[164889003],[AF],[atrial fibrillation],[1],"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,5,A,A0006,29.0,male,500,7000,[59118001],[RBBB],[right bundle branch block],[18],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# Process Labels

In [122]:
# Directory that holds the assembled training set
save_path = os.path.join(DATA_PATH, 'training', 'physionet_2017')

# Reload the previously saved label matrix instead of rebuilding it
labels_csv = os.path.join(save_path, 'labels.csv')
labels = pd.read_csv(labels_csv)

In [136]:
# Stack the per-row 'labels_train' vectors into a single int32 matrix
label_matrix = np.array(meta_data['labels_train'].tolist(), dtype=np.int32)

# One column per scored SNOMED CT code, in labels_scored order
label_columns = labels_scored['SNOMED CT Code'].tolist()
labels = pd.DataFrame(data=label_matrix, columns=label_columns)

# Write the label table next to the meta data
labels.to_csv(os.path.join(save_path, 'labels.csv'), index=False)

# Preview the first rows
labels.head()

Unnamed: 0,270492004,164889003,164890007,426627000,713427006,713426002,445118002,39732003,164909002,251146004,...,47665007,59118001,427393009,426177001,426783006,427084000,63593006,164934002,59931005,17338001
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


# Process Features

In [None]:
# Directory that holds the assembled training set
save_path = os.path.join(DATA_PATH, 'training', 'physionet_2017')

# Load the previously saved feature table instead of rebuilding it
features_csv = os.path.join(save_path, 'features.csv')
features = pd.read_csv(features_csv)

In [130]:
# Build the feature table: one row per (recording, lead) for which a feature
# JSON file exists, then save it as features.csv.

# List of feature records (one dict per recording/lead)
feature_records = list()

# Loop through recordings listed in the meta data
for values in meta_data.to_dict(orient='records'):

    # Set load path
    load_path = os.path.join(DATA_PATH, values['dataset'], 'features_physionet_2017')

    # Loop through leads
    for lead in ['I', 'II', 'III', 'aVR', 'aVL', 'aVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6']:

        # Build the path once; leads without a feature file are skipped
        feature_file = os.path.join(load_path, '{}_lead_{}.json'.format(values['filename'], lead))
        if not os.path.isfile(feature_file):
            continue

        # Import features (context manager so the file handle is always closed)
        data_features = {'dataset': values['dataset'], 'filename': values['filename'], 'lead': lead}
        with open(feature_file) as file_handle:
            data_features.update(json.load(file_handle))
        feature_records.append(data_features)

# Generate DataFrame
features = pd.DataFrame(feature_records)
features = features.sort_values(by=['dataset', 'filename', 'lead'], ascending=[True, True, True])

# Set numeric (unparseable ages become NaN)
features['age'] = pd.to_numeric(features['age'], errors='coerce')

# Binarize: 'male' -> 1, every other value (including missing) -> 0
features['sex'] = features['sex'].map(lambda val: 1 if val == 'male' else 0)

# Save
features.to_csv(os.path.join(save_path, 'features.csv'), index=False)

# View DataFrame
features.head()

Unnamed: 0,dataset,filename,lead,full_waveform_min,full_waveform_max,full_waveform_mean,full_waveform_median,full_waveform_std,full_waveform_skew,full_waveform_kurtosis,...,rpeak_entropy,rpeak_higuchi_fractal_dimension,template_corr_coeff_mean,template_corr_coeff_std,qrs_corr_coeff_mean,qrs_corr_coeff_std,p_wave_corr_coeff_mean,p_wave_corr_coeff_std,t_wave_corr_coeff_mean,t_wave_corr_coeff_std
0,A,A0001,I,-0.976816,1.061664,0.000247,-0.001915,0.267624,0.035163,4.168482,...,3.317816,2.465088,0.987405,0.003816,0.995713,0.002541,0.866373,0.067781,0.942713,0.022492
1,A,A0002,I,-0.625051,1.044477,-0.000199,-0.005681,0.167347,2.358342,15.457311,...,3.091042,,0.979281,0.006906,0.990972,0.006404,0.886879,0.045675,0.688547,0.088916
2,A,A0003,I,-0.360254,1.081835,0.000477,-0.025866,0.196894,3.331854,12.586657,...,3.135494,2.286132,0.67216,0.178956,0.984642,0.007182,0.471329,0.307302,0.060507,0.327653
3,A,A0004,I,-0.595681,1.099412,0.000537,-0.015885,0.208745,2.669416,9.934822,...,2.877468,2.719416,0.849063,0.125237,0.977497,0.01182,0.376888,0.213236,0.121306,0.238374
4,A,A0006,I,-0.671019,1.047365,0.000157,-0.010219,0.150394,2.383495,18.528395,...,2.862201,,0.98607,0.006084,0.992082,0.005623,0.945174,0.030141,0.94174,0.030778


In [132]:
# Data check: print the (rows, columns) shape of each table, one per line
print(meta_data.shape, labels.shape, features.shape, sep='\n')

(37749, 12)
(37749, 27)
(37749, 158)
