# PhysioNet/Computing in Cardiology Challenge 2020
## Classification of 12-lead ECGs
### 2. Create Training Dataset

# Setup Notebook

In [12]:
# Import standard-library modules
import os
import sys
import json
import random
# Import 3rd party libraries
import pandas as pd
from sklearn.model_selection import train_test_split

# Import local Libraries
# Prepend the directory four levels above the current working directory to the
# import search path; presumably this is the repository root that contains the
# `kardioml` package (depends on where the notebook is launched from -- verify).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(os.getcwd()))))))
from kardioml import DATA_PATH

# Configure Notebook
import warnings
# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')
%matplotlib inline
# Reload imported modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Split Physionet 2020 Training Data
## Create Training Lookup File

In [6]:
# Directory holding the formatted Physionet 2020 records
formatted_dir = os.path.join(DATA_PATH, 'physionet_2020', 'formatted')

# Get meta data files.
# Sorted because os.listdir returns entries in arbitrary order, and the seeded
# train/val split is only reproducible when the row order is stable.
# Matching on the '.json' extension (instead of the substring 'json') avoids
# picking up unrelated files whose names merely contain 'json'.
filenames = sorted(os.path.splitext(file)[0] for file in os.listdir(formatted_dir)
                   if file.endswith('.json'))

# Get label for each file
labels = list()
for filename in filenames:
    # Context manager guarantees the file handle is closed after parsing
    with open(os.path.join(formatted_dir, '{}.json'.format(filename))) as meta_file:
        meta_data = json.load(meta_file)
    record_labels = meta_data['labels']
    # Stratification key: the first two labels joined by '-' for multi-label
    # records, otherwise the single label (any further labels are ignored)
    labels.append('{}-{}'.format(record_labels[0], record_labels[1]) if len(record_labels) > 1
                  else record_labels[0])

# Combine
data = pd.DataFrame({'filename': filenames, 'labels': labels})

# Add path (extension-less path of each record inside the formatted directory)
data['path'] = data['filename'].map(lambda filename: os.path.join(formatted_dir, filename))

# View DataFrame
data.head()

Unnamed: 0,filename,labels,path
0,A0001,RBBB,C:\Users\sebig\Documents\Code\physionet-challe...
1,A0002,Normal,C:\Users\sebig\Documents\Code\physionet-challe...
2,A0003,AF,C:\Users\sebig\Documents\Code\physionet-challe...
3,A0004,AF,C:\Users\sebig\Documents\Code\physionet-challe...
4,A0005,PVC,C:\Users\sebig\Documents\Code\physionet-challe...


## Split Data 

In [8]:
# Set split ratios
train_size = 0.8
val_size = 0.2
# Compare with a tolerance: exact float equality would reject valid ratio pairs
# whose binary sum is not exactly 1 (e.g. 0.7 + 0.3 == 0.9999999999999999)
assert abs(train_size + val_size - 1) < 1e-9, 'train_size and val_size must sum to 1'

# Split dataset into train/validation, stratified on the label column so both
# subsets keep the same label distribution; random_state makes the split repeatable
data_train, data_val = train_test_split(data, test_size=val_size, stratify=data['labels'], random_state=0, shuffle=True)

# Sanity check: the two subsets partition the full dataset
assert len(data_train) + len(data_val) == len(data)

# Lookup file: record paths keyed by subset name
training_lookup = {'train': data_train['path'].tolist(), 'val': data_val['path'].tolist()}

# Add Augmentation Datasets
## Zheng 2020

In [13]:
# Directory holding the formatted Zheng 2020 records
zheng_dir = os.path.join(DATA_PATH, 'zheng_2020', 'formatted')

# Get file paths (extension-less, one per '.json' meta data file).
# Sorted because os.listdir returns entries in arbitrary order.
paths = sorted(os.path.join(zheng_dir, os.path.splitext(file)[0])
               for file in os.listdir(zheng_dir) if file.endswith('.json'))

# Add to lookup dictionary.
# The train list is rebuilt from data_train instead of extended in place, so
# re-running this cell does not append the augmentation paths a second time.
training_lookup['train'] = data_train['path'].tolist() + paths

# Shuffle with a dedicated seeded generator so the saved order is reproducible
# and the global `random` state is left untouched
random.Random(0).shuffle(training_lookup['train'])

# Save Lookup JSON

In [15]:
# Make sure the output directory exists, then write the lookup as JSON
lookup_dir = os.path.join(DATA_PATH, 'deepecg')
os.makedirs(lookup_dir, exist_ok=True)

lookup_path = os.path.join(lookup_dir, 'training_lookup.json')
with open(lookup_path, 'w') as lookup_file:
    # Keys are sorted so the file layout does not depend on dict insertion order
    json.dump(training_lookup, fp=lookup_file, sort_keys=True)