# PhysioNet/Computing in Cardiology Challenge 2020
## Classification of 12-lead ECGs
### 1. Extract Features

# Setup Notebook

In [23]:
# Import standard library and 3rd party libraries
import os
import sys
import time
import numpy as np
from joblib import Parallel, delayed

# Import local Libraries
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(os.getcwd()))))))
from kardioml.models.physionet2017.features.feature_extractor import Features
from kardioml import DATA_PATH, ECG_LEADS, FILTER_BAND_LIMITS

# Configure Notebook
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Extract Features

In [24]:
def process_lead(idx, filenames, lead, load_path, save_path):
    """Extract and save the features of one recording for one ECG lead.

    Designed to be dispatched as a single unit of work (one call per
    recording), so it takes the full list plus an index rather than a name.

    Parameters
    ----------
    idx : int
        Position in `filenames` of the recording to process.
    filenames : sequence of str
        Recording identifiers; the element at `idx` is handed to `Features`
        as `filename`.
    lead
        Lead identifier forwarded unchanged to `Features`.
    load_path : str
        Forwarded to `Features` as `load_path`.
    save_path : str
        Forwarded to `Features` as `save_path`.

    Returns
    -------
    None
        The result is persisted through `Features.save_features()`.
    """
    feature_groups = ['full_waveform_features', 'rri_features', 'template_features']

    # Build the extractor for the selected recording
    extractor = Features(
        filename=filenames[idx],
        load_path=load_path,
        save_path=save_path,
        lead=lead,
    )

    # Compute the requested feature groups with the filter band imported from kardioml
    extractor.extract_features(
        feature_groups=feature_groups,
        filter_bandwidth=FILTER_BAND_LIMITS,
    )

    # Persist the extracted features
    extractor.save_features()

In [25]:
# Loop through datasets: for every dataset, extract and save the features of
# each formatted recording, one parallel job per recording.
for dataset in ['A', 'B', 'C', 'D', 'E', 'F']:

    print('Processing dataset {}'.format(dataset))

    # Set save path (created on first run; sibling of the 'formatted' folder)
    save_path = os.path.join(DATA_PATH, dataset, 'features_physionet_2017')
    os.makedirs(save_path, exist_ok=True)

    # Set load path
    load_path = os.path.join(DATA_PATH, dataset, 'formatted')

    # Get list of files. Match on the actual '.json' extension (a substring
    # test would also accept names such as 'x.json.bak') and strip only that
    # extension so names containing extra dots are not truncated. Sorting
    # removes the dependence on os.listdir's arbitrary ordering.
    filenames = sorted(os.path.splitext(file)[0] for file in os.listdir(load_path)
                       if file.endswith('.json'))

    # Loop through leads (the slice restricts the run to the first entry of ECG_LEADS)
    for lead in ECG_LEADS[0:1]:
        print('Processing {} lead {} signals.'.format(len(filenames), lead))
        start_time = time.time()
        # Loop through filenames; n_jobs=-1 asks joblib to use all available CPUs
        _ = Parallel(n_jobs=-1)(delayed(process_lead)(idx, filenames, lead, load_path, save_path)
                                for idx in range(len(filenames)))
        print('Completed in {} minutes.'.format(np.round((time.time() - start_time) / 60, 2)))

    print('Completed dataset {}\n'.format(dataset))

Processing dataset A
Processing 6877 lead I signals.
Completed in 16.05 minutes.
Completed dataset A

Processing dataset B
Processing 3453 lead I signals.
Completed in 9.0 minutes.
Completed dataset B

Processing dataset C
Processing 74 lead I signals.
Completed in 9.92 minutes.
Completed dataset C

Processing dataset D
Processing 516 lead I signals.
Completed in 13.03 minutes.
Completed dataset D

Processing dataset E
Processing 21837 lead I signals.
Completed in 59.17 minutes.
Completed dataset E

Processing dataset F
Processing 10344 lead I signals.
Completed in 14.2 minutes.
Completed dataset F



# Extract Features (Debug: serial run on a 200-recording subset of dataset B)

In [11]:
# Debug run: same extraction as the parallel job, but executed serially on a
# small subset so that exceptions surface with a readable traceback.
# Loop through datasets
for dataset in ['B']:

    # Set save path
    # NOTE(review): this is the same output directory used by the full run, so
    # features saved here may replace existing ones - confirm this is intended.
    save_path = os.path.join(DATA_PATH, dataset, 'features_physionet_2017')
    os.makedirs(save_path, exist_ok=True)

    # Set load path
    load_path = os.path.join(DATA_PATH, dataset, 'formatted')

    # Get list of files. Match on the actual '.json' extension (a substring
    # test would also accept names such as 'x.json.bak') and strip only that
    # extension. Sorting makes the subset below the same on every run, since
    # os.listdir returns entries in arbitrary order.
    filenames = sorted(os.path.splitext(file)[0] for file in os.listdir(load_path)
                       if file.endswith('.json'))

    # Loop through leads (the slice restricts the run to the first entry of ECG_LEADS)
    for lead in ECG_LEADS[0:1]:

        # Loop through filenames (first 200 only, to keep the debug run short)
        for filename in filenames[0:200]:

            # Extract features
            # NOTE(review): the band is hard-coded here while the parallel run
            # uses FILTER_BAND_LIMITS - verify the two agree.
            features = Features(filename=filename, load_path=load_path, save_path=save_path, lead=lead)
            features.extract_features(feature_groups=['full_waveform_features', 'rri_features', 'template_features'],
                                      filter_bandwidth=[3, 45])

            # Save features
            features.save_features()