# ECG DB 2 - MIT-BIH Atrial Fibrillation
### Sebastian D. Goodfellow, Ph.D.

# Setup Notebook

In [1]:
# Import 3rd party libraries
import os
import sys
import wfdb
import numpy as np
import pandas as pd
from scipy import interpolate
import matplotlib.pylab as plt

# Import local Libraries
sys.path.insert(0, r'C:\Users\sebig\Documents\Code\ecg_db')
from ecgdb.config.config import DATA_DIR

# Configure Notebook
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Download Dataset

In [2]:
# Set database name
db_name = 'db2'

# Set raw data path
raw_path = os.path.join(DATA_DIR, db_name, 'raw')

# Download data set (one-off step; uncomment to fetch 'afdb' into raw_path)
# wfdb.dl_database('afdb', raw_path)

# Get list of recordings: one id per signal file whose name ends in '.dat'.
# - endswith() avoids matching names that merely contain '.dat' somewhere
# - splitext() removes only the final extension
# - sorted() makes the processing order independent of the file system
recording_ids = sorted(os.path.splitext(file)[0] for file in os.listdir(raw_path)
                       if file.endswith('.dat'))

# Fail early with a readable message instead of an IndexError further down
if not recording_ids:
    raise FileNotFoundError('No .dat recordings found in {}'.format(raw_path))

# Print list of recordings
print(recording_ids)

# Labels: rhythm codes that are kept, mapped to a readable description
label_dict = {'AFIB': 'atrial fibrillation', 'AFL': 'atrial flutter', 'J': 'AV junctional rhythm'}

# Get sample frequency (Hz)
# Only the header is read here; loading the complete signal just to look up
# the sampling frequency is unnecessary. The frequency of the first recording
# is used for the whole database.
record = wfdb.rdheader(os.path.join(raw_path, recording_ids[0]))
fs = record.fs

['04015', '04043', '04048', '04126', '04746', '04908', '04936', '05091', '05121', '05261', '06426', '06453', '06995', '07162', '07859', '07879', '07910', '08215', '08219', '08378', '08405', '08434', '08455']


# Get Continuous Arrhythmia Sections

In [3]:
# Empty list for arrhythmia sections.
# Each entry is a dict describing one continuous episode on one channel:
# 'label', 'waveform' (1 or 2), 'record', 'db' and 'data' (1-D signal slice).
sections = list()

# Loop through recordings
for recording_id in recording_ids:

    # Import recording
    record = wfdb.rdrecord(os.path.join(raw_path, recording_id))

    # Import annotations
    annotation = wfdb.rdann(os.path.join(raw_path, recording_id), 'atr')

    # Get waveform data (indexed below as [sample point, channel])
    data = record.p_signal

    # Labels: drop the first character of every aux_note
    # (presumably the leading '(' of a rhythm annotation such as '(AFIB' - confirm)
    labels = [label[1:] for label in annotation.aux_note]

    # Sample index at which each annotation starts
    sample = annotation.sample

    # Loop through labels
    for idx, label in enumerate(labels):

        # Keep only the rhythms listed in label_dict. This must be an exact
        # key match: a substring test would also accept '' or fragments such
        # as 'A' or 'FL'.
        if label not in label_dict:
            continue

        # An episode runs from its own annotation up to the next one; the
        # last annotation of a recording runs to the end of the signal.
        start = sample[idx]
        stop = sample[idx + 1] if idx < len(labels) - 1 else None

        # One section per channel ('waveform' is the 1-based channel number)
        for channel in (0, 1):
            sections.append({'label': label, 'waveform': channel + 1, 'record': recording_id,
                             'db': 'afdb', 'data': data[start:stop, channel]})

# Get Samples

In [4]:
# Empty list for arrhythmia samples.
# Each entry copies the section's metadata and adds 'sample', the running
# index of the chunk inside its section (it restarts at 0 for every section).
samples = list()

# Set sample length in seconds
sample_length = 60

# Get sample length in sample points.
# Cast to int because the value is used as a slice bound and range step
# below, which must be integers even when fs is stored as a float.
sample_length_sp = int(round(sample_length * fs))

# Loop through sections
for section in sections:

    section_data = section['data']

    # Walk through the section in steps of one sample length. Slicing stops at
    # the end of the array, so the final chunk simply holds whatever is left
    # and may be shorter than sample_length_sp. Empty sections yield nothing.
    for sample_id, start in enumerate(range(0, len(section_data), sample_length_sp)):
        samples.append({'label': section['label'], 'waveform': section['waveform'], 'record': section['record'],
                        'sample': sample_id, 'db': section['db'],
                        'data': section_data[start:start + sample_length_sp]})

# Resample

In [5]:
# Set resample frequency (Hz)
fs_rs = 300

# Samples that were resampled successfully (replaces `samples` at the end)
samples_rs = list()

# Loop through samples
for sample in samples:

    # Already resampled by a previous run of this cell: keep as is, so that
    # re-running the cell does not resample the data a second time.
    if sample.get('fs') == fs_rs:
        samples_rs.append(sample)
        continue

    # Linear interpolation needs at least two points; shorter samples cannot
    # be resampled and are dropped.
    if len(sample['data']) < 2:
        continue

    # Get time array of the original sample points (seconds)
    time = np.arange(len(sample['data'])) / fs

    # Generate new resampling time array (end point excluded)
    times_rs = np.arange(0, time[-1], 1 / fs_rs)

    # Interpolate contiguous segment (linear).
    # np.interp clamps to the end values, so a resampling time that lands a
    # rounding error beyond time[-1] does not raise an out-of-range error.
    values_rs = np.interp(times_rs, time, sample['data'])

    # Save resampled waveform and record its new sampling frequency
    sample['data'] = values_rs
    sample['fs'] = fs_rs
    samples_rs.append(sample)

# Continue with the resampled samples only
samples = samples_rs

# Save Training Dataset

In [6]:
# Set processed data path
processed_path = os.path.join(DATA_DIR, db_name, 'processed')
waveform_path = os.path.join(processed_path, 'waveforms')
label_path = os.path.join(processed_path, 'labels')

# Make sure the output folders exist before writing into them
os.makedirs(waveform_path, exist_ok=True)
os.makedirs(label_path, exist_ok=True)

# Columns of the label table, in output order
label_columns = ['db', 'record', 'label', 'sample', 'signal_id', 'train_label', 'file_name']

# Rows are collected in a plain list and turned into a DataFrame once at the
# end (DataFrame.append is not available in current pandas and is slow in a loop)
label_rows = list()

# Next free sample number for every (db, record, waveform) combination.
# sample['sample'] cannot be used for the file name when it is not unique
# within a record/waveform: two samples sharing a number would be written to
# the same .npy file, and the label table would then reference a waveform
# that belongs to a different row.
next_sample_id = dict()

# Loop through samples
for sample in samples:

    # Get a sample number that is unique within this db/record/waveform
    key = (sample['db'], sample['record'], sample['waveform'])
    sample_id = next_sample_id.get(key, 0)
    next_sample_id[key] = sample_id + 1

    # Set file name
    file_name = '{}_{}_signal{}_sample{}.npy'.format(sample['db'], sample['record'], sample['waveform'], sample_id)

    # Get labels ('A' = atrial fibrillation, 'O' = every other kept rhythm)
    label_rows.append({'db': sample['db'], 'record': sample['record'], 'label': sample['label'],
                       'sample': sample_id, 'signal_id': sample['waveform'],
                       'train_label': 'A' if sample['label'] == 'AFIB' else 'O',
                       'file_name': file_name})

    # Save waveform as .npy
    np.save(os.path.join(waveform_path, file_name), sample['data'])

# Build label table
labels = pd.DataFrame(label_rows, columns=label_columns)

# Save labels
labels.to_csv(os.path.join(label_path, 'labels.csv'), index=False)