# PhysioNet/Computing in Cardiology Challenge 2020
## Classification of 12-lead ECGs
### 2. Create Cross-Validation Dataset

# Setup Notebook

In [14]:
# Import 3rd party libraries
import os
import sys
import json
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Import local Libraries
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(os.getcwd()))))))
from kardioml import DATA_PATH

# Configure Notebook
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Split Physionet 2020 Training Data
## Create Training Lookup File

In [15]:
# Set sample frequencies
sample_frequencies = [350]

# Set datasets
datasets = ['A', 'B', 'D', 'E', 'F']

# Collect one record (dict) per recording; converted to a DataFrame once at the end
records = list()

# Loop through sample frequencies
for fs in sample_frequencies:

    # Loop through datasets
    for dataset in datasets:

        # Directory holding this dataset's files at this sample frequency
        dataset_path = os.path.join(DATA_PATH, dataset, str(fs))

        # Get filenames (extension stripped). os.listdir returns entries in arbitrary,
        # filesystem-dependent order, so sort them: the row order of the DataFrame feeds
        # the positional cross-validation splits and must be reproducible across machines.
        filenames = sorted(filename.split('.')[0] for filename in os.listdir(dataset_path)
                           if 'json' in filename)

        # Loop through filenames
        for filename in filenames:

            # Import meta data (context manager closes the file handle after each read)
            with open(os.path.join(dataset_path, '{}.json'.format(filename))) as file:
                meta_data = json.load(file)

            # Save label; recordings whose 'labels_training' entry is falsy (e.g. empty/None) are skipped
            if meta_data['labels_training']:
                records.append({'filename': filename, 'labels': meta_data['labels_training'], 'dataset': dataset,
                                'fs': fs, 'labels_merged': meta_data['labels_training_merged'],
                                'path': os.path.join(dataset_path, filename)})

# Create DataFrame
data = pd.DataFrame(records)

# View DataFrame
data.head()

Unnamed: 0,filename,labels,dataset,fs,labels_merged,path
0,A0001,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A,350,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",C:\Users\sebastian goodfellow\Documents\code\p...
1,A0002,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A,350,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",C:\Users\sebastian goodfellow\Documents\code\p...
2,A0003,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A,350,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",C:\Users\sebastian goodfellow\Documents\code\p...
3,A0004,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A,350,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",C:\Users\sebastian goodfellow\Documents\code\p...
4,A0006,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A,350,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",C:\Users\sebastian goodfellow\Documents\code\p...


# Cross-Validation 1
## iterative-stratification

In [16]:
# Loop through sample frequencies
for fs in sample_frequencies:

    # Filter by sample frequency; reset_index gives a 0..n-1 index matching the positional fold indices
    df = data[data['fs'] == fs].reset_index()

    # Stack the per-recording 'labels_merged' entries into one array (one row per recording).
    # Built once and passed as both X and y: only the labels drive the stratification here.
    labels = np.stack(df['labels_merged'].values)

    # Split dataset into train/evaluate.
    # shuffle=True is needed for random_state to be accepted: recent scikit-learn K-fold base
    # classes raise a ValueError when random_state is set while shuffle is False.
    # NOTE(review): fold membership will differ from folds generated without shuffling.
    rmskf = MultilabelStratifiedKFold(n_splits=6, shuffle=True, random_state=0)
    for cv_fold, (train_index, val_index) in enumerate(rmskf.split(labels, labels)):

        # Lookup file: split() yields positional indices, so select rows by position
        training_lookup = {'train': df['path'].iloc[train_index].tolist(),
                           'val': df['path'].iloc[val_index].tolist()}

        # Output directory for this sample frequency and fold
        fold_path = os.path.join(DATA_PATH, 'training', 'deepecg', 'cross_validation',
                                 'iterative_stratification', str(fs), str(cv_fold))

        # Save file
        os.makedirs(fold_path, exist_ok=True)
        with open(os.path.join(fold_path, 'training_lookup.json'), 'w') as file:
            json.dump(training_lookup, file, sort_keys=False, indent=4)

# Cross-Validation 2
## Split by dataset

In [17]:
# One entry per fold: which source datasets are used for training and which are held out for validation
cv_splits = [
    {'train': ['D', 'E', 'F'], 'val': ['A', 'B']},
    {'train': ['A', 'B', 'F'], 'val': ['D', 'E']},
    {'train': ['A', 'B', 'D', 'E'], 'val': ['F']},
]

# Write one lookup file per (sample frequency, fold)
for fs in sample_frequencies:

    # Keep only the recordings at this sample frequency
    df = data[data['fs'] == fs].reset_index()

    for cv_fold, cv_split in enumerate(cv_splits):

        # Paths of the recordings whose dataset belongs to each side of the split
        training_lookup = {
            role: df.loc[df['dataset'].isin(cv_split[role]), 'path'].tolist()
            for role in ('train', 'val')
        }

        # Destination directory for this fold
        fold_dir = os.path.join(DATA_PATH, 'training', 'deepecg', 'cross_validation',
                                'dataset_split', str(fs), str(cv_fold))
        os.makedirs(fold_dir, exist_ok=True)

        # Serialise the lookup as indented JSON, keeping the train/val key order
        with open(os.path.join(fold_dir, 'training_lookup.json'), 'w') as file:
            json.dump(training_lookup, file, sort_keys=False, indent=4)