# PhysioNet/Computing in Cardiology Challenge 2020
## Classification of 12-lead ECGs
### 2. Create Training Dataset

# Setup Notebook

In [2]:
# Import 3rd party libraries
import os
import sys
import json
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Import local Libraries
# Prepend the directory four levels above the current working directory to the module
# search path, presumably so that the `kardioml` package resolves from the repository root.
# NOTE(review): this depends on the directory the kernel was started in - confirm.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(os.getcwd()))))))
from kardioml import DATA_PATH

# Configure Notebook
import warnings
# Silence every warning for the rest of the session; note this also hides deprecation notices.
warnings.filterwarnings('ignore')
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Split Physionet 2020 Training Data
## Create Training Lookup File

In [10]:
# Set sample frequencies
sample_frequencies = [250, 300, 350, 400, 450, 500]

# Set datasets
datasets = ['A', 'B', 'C', 'D', 'E', 'F']

# Loop through sample frequencies
# NOTE: the [0:1] slice restricts this run to the first sample frequency only. `lookup` is
# re-created on every iteration, so widening the slice would keep just the last frequency's records.
for fs in sample_frequencies[0:1]:

    # Create directory for formatted data
    os.makedirs(os.path.join(DATA_PATH, 'training', str(fs)), exist_ok=True)

    # Create list
    lookup = list()

    # Loop through datasets
    for dataset in datasets:

        # Directory holding this dataset's files at the current sample frequency
        dataset_path = os.path.join(DATA_PATH, dataset, str(fs))

        # Get filenames: match on the '.json' extension (not any name containing 'json')
        # and sort, because os.listdir returns entries in arbitrary order.
        filenames = sorted(os.path.splitext(filename)[0] for filename in os.listdir(dataset_path)
                           if filename.endswith('.json'))

        # Loop through filenames
        for filename in filenames:

            # Import meta data (the context manager closes the file handle straight away)
            with open(os.path.join(dataset_path, '{}.json'.format(filename))) as meta_file:
                meta_data = json.load(meta_file)

            # Save label; records whose 'labels_training' entry is empty/falsy are skipped
            if meta_data['labels_training']:
                lookup.append({'filename': filename, 'labels': meta_data['labels_training'], 'dataset': dataset, 'fs': fs,
                               'labels_merged': meta_data['labels_training_merged'],
                               'path': dataset_path})
                
            
            
            
            
        
        

In [11]:
# Turn the collected record dictionaries into a table (one row per record)
lookup = pd.DataFrame(data=lookup)

# Preview the first five rows
lookup.head(n=5)

Unnamed: 0,filename,labels,dataset,fs,labels_merged,path
0,A0001,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A,250,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",C:\Users\sebig\Documents\Code\physionet-challe...
1,A0002,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A,250,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",C:\Users\sebig\Documents\Code\physionet-challe...
2,A0003,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A,250,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",C:\Users\sebig\Documents\Code\physionet-challe...
3,A0004,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A,250,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",C:\Users\sebig\Documents\Code\physionet-challe...
4,A0006,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A,250,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",C:\Users\sebig\Documents\Code\physionet-challe...


#### Tranche 1

In [None]:
# Directory holding the formatted tranche 1 files. A single path is used for listing,
# loading and the stored record path so the three can never point at different folders.
tranche_1_path = os.path.join(DATA_PATH, 'physionet_2020_1', 'formatted')

# Get meta data files: match on the '.json' extension and sort, because os.listdir
# returns entries in arbitrary order.
filenames = sorted(os.path.splitext(file)[0] for file in os.listdir(tranche_1_path)
                   if file.endswith('.json'))

# Get label for each file
records_1 = []
for filename in filenames:
    # The context manager closes the file handle as soon as the JSON is parsed
    with open(os.path.join(tranche_1_path, '{}.json'.format(filename))) as meta_file:
        meta_data = json.load(meta_file)
    # NOTE(review): the filter reads 'labels' while the stored value is 'label_train' -
    # confirm both keys exist in the meta data schema.
    if meta_data['labels']:
        records_1.append({'filename': filename, 'labels': meta_data['label_train']})

# Combine (explicit columns keep the frame well-formed even when no record qualifies)
data_1 = pd.DataFrame(records_1, columns=['filename', 'labels'])

# Add path
data_1['path'] = data_1['filename'].map(lambda filename: os.path.join(tranche_1_path, filename))

# View DataFrame
data_1.head()

#### Tranche 2

In [None]:
# Directory holding the formatted tranche 2 files. A single path is used for listing,
# loading and the stored record path so the three can never point at different folders.
tranche_2_path = os.path.join(DATA_PATH, 'physionet_2020_2', 'formatted')

# Get meta data files: match on the '.json' extension and sort, because os.listdir
# returns entries in arbitrary order.
filenames = sorted(os.path.splitext(file)[0] for file in os.listdir(tranche_2_path)
                   if file.endswith('.json'))

# Get label for each file
records_2 = []
for filename in filenames:
    # The context manager closes the file handle as soon as the JSON is parsed
    with open(os.path.join(tranche_2_path, '{}.json'.format(filename))) as meta_file:
        meta_data = json.load(meta_file)
    # NOTE(review): the filter reads 'labels' while the stored value is 'label_train' -
    # confirm both keys exist in the meta data schema.
    if meta_data['labels']:
        records_2.append({'filename': filename, 'labels': meta_data['label_train']})

# Combine (explicit columns keep the frame well-formed even when no record qualifies)
data_2 = pd.DataFrame(records_2, columns=['filename', 'labels'])

# Add path
data_2['path'] = data_2['filename'].map(lambda filename: os.path.join(tranche_2_path, filename))

# View DataFrame
data_2.head()

#### Combine Tranche 1 and 2

In [None]:
# Stack both tranches row-wise; ignore_index renumbers the rows from 0 upwards
tranches = [data_1, data_2]
data = pd.concat(tranches, axis=0, ignore_index=True)

# Preview the first five rows
data.head(n=5)

## Split Data 

In [None]:
# Split dataset into train/evaluate
# Stack the per-record label vectors once; the same array serves as both X and y because
# only the labels drive the stratification.
label_matrix = np.stack(data['labels'].values)

# shuffle=True is needed for random_state to be honoured; scikit-learn based splitters
# raise a ValueError when random_state is set while shuffle is left at False.
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Keep only the last of the five folds as the train/validation split
*_, (train_index, val_index) = mskf.split(label_matrix, label_matrix)

# Lookup file (split() yields positional indices, so select rows with iloc)
training_lookup = {'train': data['path'].iloc[train_index].tolist(),
                   'val': data['path'].iloc[val_index].tolist()}

# Save Lookup JSON

In [None]:
# Save file
os.makedirs(os.path.join(DATA_PATH, 'deepecg'), exist_ok=True)
with open(os.path.join(DATA_PATH, 'deepecg', 'training_lookup.json'), 'w') as file:
    json.dump(training_lookup, file, sort_keys=True)