# PhysioNet/Computing in Cardiology Challenge 2020
## Classification of 12-lead ECGs
### 2. Create Cross-Validation Dataset

# Setup Notebook

In [1]:
# Import 3rd party libraries
import os
import sys
import json
import random
import pandas as pd
from sklearn.model_selection import StratifiedKFold

# Import local Libraries
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(os.getcwd()))))))
from kardioml import DATA_PATH

# Configure Notebook
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Add Augmentation Datasets
## Zheng 2020

In [2]:
# Get file paths
# Each entry is a record's location inside the formatted Zheng 2020 folder with
# the '.json' extension removed. Only files that actually end in '.json' are
# used, and the listing is sorted because os.listdir returns entries in
# arbitrary order.
zheng_dir = os.path.join(DATA_PATH, 'zheng_2020', 'formatted')
paths = [os.path.join(zheng_dir, os.path.splitext(file)[0])
         for file in sorted(os.listdir(zheng_dir)) if file.endswith('.json')]

# Split Physionet 2020 Training Data
## Create Training Lookup File
#### Tranche 1

In [3]:
# Get meta data files
# Sorted so the row order of the lookup table (and therefore the seeded
# cross-validation split built from it) does not depend on the arbitrary
# order returned by os.listdir.
tranche_1_dir = os.path.join(DATA_PATH, 'physionet_2020_1', 'formatted')
filenames = sorted(os.path.splitext(file)[0] for file in os.listdir(tranche_1_dir)
                   if file.endswith('.json'))

# Get label for each file
lookup_1 = {'filename': [], 'labels': []}
for filename in filenames:
    # Context manager guarantees each meta data file is closed after reading
    with open(os.path.join(tranche_1_dir, '{}.json'.format(filename))) as meta_file:
        meta_data = json.load(meta_file)
    lookup_1['filename'].append(filename)
    # Binary target: 'Normal' when the record's labels are non-empty and
    # contain 'Normal', otherwise 'Other'
    lookup_1['labels'].append('Normal' if meta_data['labels'] and 'Normal' in meta_data['labels'] else 'Other')

# Combine
data_1 = pd.DataFrame(lookup_1)

# Add path
# Path to the record inside the formatted folder, without a file extension
data_1['path'] = data_1['filename'].map(lambda filename: os.path.join(tranche_1_dir, filename))

# View DataFrame
data_1.head()

Unnamed: 0,filename,labels,path
0,A0001,Other,C:\Users\sebastian goodfellow\Documents\code\p...
1,A0002,Normal,C:\Users\sebastian goodfellow\Documents\code\p...
2,A0003,Other,C:\Users\sebastian goodfellow\Documents\code\p...
3,A0004,Other,C:\Users\sebastian goodfellow\Documents\code\p...
4,A0005,Other,C:\Users\sebastian goodfellow\Documents\code\p...


#### Tranche 2

In [4]:
# Get meta data files
# Sorted so the row order of the lookup table (and therefore the seeded
# cross-validation split built from it) does not depend on the arbitrary
# order returned by os.listdir.
tranche_2_dir = os.path.join(DATA_PATH, 'physionet_2020_2', 'formatted')
filenames = sorted(os.path.splitext(file)[0] for file in os.listdir(tranche_2_dir)
                   if file.endswith('.json'))

# Get label for each file
lookup_2 = {'filename': [], 'labels': []}
for filename in filenames:
    # Context manager guarantees each meta data file is closed after reading
    with open(os.path.join(tranche_2_dir, '{}.json'.format(filename))) as meta_file:
        meta_data = json.load(meta_file)
    lookup_2['filename'].append(filename)
    # Binary target: 'Normal' when the record's labels are non-empty and
    # contain 'Normal', otherwise 'Other'
    lookup_2['labels'].append('Normal' if meta_data['labels'] and 'Normal' in meta_data['labels'] else 'Other')

# Combine
data_2 = pd.DataFrame(lookup_2)

# Add path
# Path to the record inside the formatted folder, without a file extension
data_2['path'] = data_2['filename'].map(lambda filename: os.path.join(tranche_2_dir, filename))

# View DataFrame
data_2.head()

Unnamed: 0,filename,labels,path
0,Q0001,Other,C:\Users\sebastian goodfellow\Documents\code\p...
1,Q0002,Other,C:\Users\sebastian goodfellow\Documents\code\p...
2,Q0003,Other,C:\Users\sebastian goodfellow\Documents\code\p...
3,Q0004,Other,C:\Users\sebastian goodfellow\Documents\code\p...
4,Q0005,Other,C:\Users\sebastian goodfellow\Documents\code\p...


#### Combine Tranche 1 and 2

In [5]:
# Merge
# Stack both tranche tables row-wise; ignore_index renumbers the rows 0..n-1
tranches = [data_1, data_2]
data = pd.concat(tranches, axis=0, ignore_index=True)

# View DataFrame
data.head()

Unnamed: 0,filename,labels,path
0,A0001,Other,C:\Users\sebastian goodfellow\Documents\code\p...
1,A0002,Normal,C:\Users\sebastian goodfellow\Documents\code\p...
2,A0003,Other,C:\Users\sebastian goodfellow\Documents\code\p...
3,A0004,Other,C:\Users\sebastian goodfellow\Documents\code\p...
4,A0005,Other,C:\Users\sebastian goodfellow\Documents\code\p...


## Split and Save

In [6]:
# Initialize K-Folds
# Fixed random_state so the shuffled, label-stratified folds are repeatable
skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

# Root folder that receives one numbered sub-folder per fold
cv_path = os.path.join(DATA_PATH, 'deepecg_binary', 'cross_validation')

# Loop through folds
for cv_fold, (train_index, val_index) in enumerate(skf.split(data, data['labels'])):

    # Lookup file
    # skf.split yields positional row indices, so select by position (iloc)
    # rather than by index label
    training_lookup = {'train': data['path'].iloc[train_index].tolist(),
                       'val': data['path'].iloc[val_index].tolist()}

    # NOTE: the augmentation records (the Zheng 2020 `paths` list) are currently
    # not added to the lookup.

    # Save file
    # Fold folders are numbered from 1
    fold_path = os.path.join(cv_path, str(cv_fold + 1))
    os.makedirs(fold_path, exist_ok=True)
    with open(os.path.join(fold_path, 'training_lookup.json'), 'w') as file:
        json.dump(training_lookup, file, sort_keys=True)