# PhysioNet/Computing in Cardiology Challenge 2020
## Classification of 12-lead ECGs
### 2. Create Cross-Validation Dataset

# Setup Notebook

In [1]:
# Import standard library and 3rd party libraries
import os
import sys
import json
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Import local Libraries
# Prepend the directory four levels above the current working directory to sys.path
# (presumably the repository root, so that the kardioml package resolves - this depends
# on the folder the notebook kernel is started from).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(os.getcwd()))))))
from kardioml import DATA_PATH

# Configure Notebook
import warnings
# NOTE(review): this silences every warning for the whole session, including deprecation
# warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
%matplotlib inline
# Reload edited local modules automatically before each cell execution
%load_ext autoreload
%autoreload 2

# Split Physionet 2020 Training Data
## Create Training Lookup File

In [2]:
# Set datasets (sub-folders of DATA_PATH that contain a 'formatted' folder)
datasets = ['A', 'B', 'C', 'D', 'E', 'F']

# Length of the all-zero label vector used for records without training labels
num_labels = 27

# Create list (one dict per record; converted to a DataFrame once at the end)
records = list()

# Loop through datasets
for dataset in datasets:

    # Folder holding the formatted meta data files of this dataset
    formatted_path = os.path.join(DATA_PATH, dataset, 'formatted')

    # Get filenames without extension.
    # os.listdir returns entries in arbitrary order, so sort them: the row order of the
    # lookup table feeds the cross-validation split and must not depend on the file system.
    filenames = sorted(os.path.splitext(filename)[0] for filename in os.listdir(formatted_path)
                       if filename.endswith('.json'))

    # Loop through filenames
    for filename in filenames:

        # Import meta data (context manager closes the file handle right away)
        with open(os.path.join(formatted_path, '{}.json'.format(filename))) as file:
            meta_data = json.load(file)

        # Save label
        if meta_data['labels_training']:
            records.append({'filename': filename, 'labels': meta_data['labels_training'], 'dataset': dataset,
                            'labels_merged': meta_data['labels_training_merged']})
        else:
            # Missing/empty training labels: fall back to all-zero vectors for both label columns
            records.append({'filename': filename, 'labels': [0] * num_labels, 'dataset': dataset,
                            'labels_merged': [0] * num_labels})

# Create DataFrame
data = pd.DataFrame(records)

# View DataFrame
data.head()

Unnamed: 0,filename,labels,dataset,labels_merged
0,A0001,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,A0002,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,A0003,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,A0004,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,A0005,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# Cross-Validation 1
## iterative-stratification

In [11]:
# Output folder for the lookup files (created once, before the loop)
cv_path = os.path.join(DATA_PATH, 'training', 'deepecg', 'cross_validation', 'iterative_stratification')
os.makedirs(cv_path, exist_ok=True)

# Stack the per-record label lists into one array (one row per record); it serves both as
# the "X" placeholder and as the multi-label stratification target of split().
labels_merged = np.stack(data['labels_merged'].values)

# Split dataset into train/evaluate
# shuffle=True is needed for the seed to be accepted: recent scikit-learn K-fold base
# classes raise a ValueError when random_state is set while shuffle is False.
rmskf = MultilabelStratifiedKFold(n_splits=6, shuffle=True, random_state=0)
for cv_fold, (train_index, val_index) in enumerate(rmskf.split(labels_merged, labels_merged)):

    # Lookup file
    # split() yields positional indices, so select rows by position (.iloc) instead of by label
    training_lookup = {'train': data['filename'].iloc[train_index].tolist(),
                       'val': data['filename'].iloc[val_index].tolist()}

    # Save file
    with open(os.path.join(cv_path, 'cv_{}.json'.format(cv_fold)), 'w') as file:
        json.dump(training_lookup, file, sort_keys=False, indent=4)

C:\Users\sebig\Documents\Code\physionet-challenge-2020\data\training\deepecg\cross_validation\iterative_stratification
C:\Users\sebig\Documents\Code\physionet-challenge-2020\data\training\deepecg\cross_validation\iterative_stratification
C:\Users\sebig\Documents\Code\physionet-challenge-2020\data\training\deepecg\cross_validation\iterative_stratification
C:\Users\sebig\Documents\Code\physionet-challenge-2020\data\training\deepecg\cross_validation\iterative_stratification
C:\Users\sebig\Documents\Code\physionet-challenge-2020\data\training\deepecg\cross_validation\iterative_stratification
C:\Users\sebig\Documents\Code\physionet-challenge-2020\data\training\deepecg\cross_validation\iterative_stratification


# Cross-Validation 2
## Split by dataset

In [None]:
# Set cv splits (each fold trains on some source datasets and validates on others)
# NOTE(review): dataset 'C' is part of the lookup table but appears in no split - confirm
# that leaving it out is intentional.
cv_splits = [{'train': ['D', 'E', 'F'], 'val': ['A', 'B']},
             {'train': ['A', 'B', 'F'], 'val': ['D', 'E']},
             {'train': ['A', 'B', 'D', 'E'], 'val': ['F']}]

# NOTE(review): the previous version of this cell looped over `sample_frequencies` and read
# the columns 'fs' and 'path', none of which exist in the cells visible in this notebook
# (the lookup table has 'filename', 'labels', 'dataset' and 'labels_merged'), so it could
# not run on a fresh kernel. The split is now built from 'dataset'/'filename' only and the
# files follow the 'cv_{fold}.json' naming of the iterative-stratification split - confirm
# that downstream loaders expect this layout.

# Output folder for the lookup files (created once, before the loop)
cv_path = os.path.join(DATA_PATH, 'training', 'deepecg', 'cross_validation', 'dataset_split')
os.makedirs(cv_path, exist_ok=True)

# Split dataset into train/evaluate
for cv_fold, cv_split in enumerate(cv_splits):

    # Filter train and val
    is_train = data['dataset'].isin(cv_split['train'])
    is_val = data['dataset'].isin(cv_split['val'])

    # Lookup file
    training_lookup = {'train': data.loc[is_train, 'filename'].tolist(),
                       'val': data.loc[is_val, 'filename'].tolist()}

    # Save file
    with open(os.path.join(cv_path, 'cv_{}.json'.format(cv_fold)), 'w') as file:
        json.dump(training_lookup, file, sort_keys=False, indent=4)