In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.layers import Conv1D
import wfdb                            # Package for loading the ecg and annotation
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
import warnings
warnings.filterwarnings("ignore") 
import random
from keras.layers import Bidirectional, LSTM
# Fix the seed of Python's built-in RNG so repeated runs draw the same numbers.
random.seed(42)

# Names of the records to load; each name is appended to `path` to locate its files.
record_list = (
    "100 101 102 103 104 105 106 107 "
    "108 109 111 112 113 114 115 116 "
    "117 118 119 121 122 123 124 200 "
    "201 202 203 205 207 208 209 210 "
    "212 213 214 215 217 219 220 221 "
    "222 223 228 230 231 232 233 234"
).split()

# Directory prefix of the record files; `data` is a second name for the same prefix.
path = data = "data/"

ModuleNotFoundError: No module named 'keras'

In [2]:
# Count how often every annotation symbol occurs in each record.
#
# One small frame is built per record and all of them are concatenated once at
# the end: growing a DataFrame with pd.concat inside the loop re-copies the
# accumulated rows on every pass and relies on how pandas treats the initial
# empty frame.
per_record_counts = []

# Reading all .atr files
# (the loop variable is deliberately not called `str`, which would shadow the
# builtin for the rest of the notebook session)
for record_name in record_list:
    # Generating filepath for this record's .atr file
    filepath = path + record_name
    # Loading the annotation object
    annotation = wfdb.rdann(filepath, 'atr')
    # Distinct symbols of this record and how many times each one occurs
    values, counts = np.unique(annotation.symbol, return_counts=True)
    # One row per distinct symbol, tagged with the record it came from
    per_record_counts.append(
        pd.DataFrame({'symbol': values,
                      'Counts': counts,
                      'Patient Number': [record_name] * len(counts)})
    )

# Row-wise concatenation keeps each record's own 0..k-1 index, as before.
# An empty record_list still yields an empty DataFrame.
symbols_df = pd.concat(per_record_counts, axis=0) if per_record_counts else pd.DataFrame()

In [3]:
# Annotation symbols grouped by how this notebook treats them.
# `nonbeat` symbols receive no beat category below.
nonbeat = [
    '[', '!', ']', 'x', '(', ')', 'p', 't', 'u', '`',
    "'", '^', '|', '~', '+', 's', 'T', '*', 'D', '=', '"', '@', 'Q', '?',
]

# Symbols labelled as category 1.
abnormal = ['L', 'R', 'V', '/', 'A', 'f', 'F', 'j', 'a', 'E', 'J', 'e', 'S']

# Symbol labelled as category 0.
normal = ['N']

# Label every row: -1 by default, 0 for the 'N' symbol, 1 for abnormal symbols.
is_normal_symbol = symbols_df['symbol'] == 'N'
is_abnormal_symbol = symbols_df['symbol'].isin(abnormal)

symbols_df['category'] = -1
symbols_df.loc[is_normal_symbol, 'category'] = 0
symbols_df.loc[is_abnormal_symbol, 'category'] = 1


In [4]:
# Show the per-record symbol counts together with the category assigned to each symbol.
symbols_df

Unnamed: 0,symbol,Counts,Patient Number,category
0,+,1,100,-1
1,A,33,100,1
2,N,2239,100,0
3,V,1,100,1
0,+,1,101,-1
...,...,...,...,...
0,+,3,234,-1
1,J,50,234,1
2,N,2700,234,0
3,V,3,234,1


In [5]:
# One complete recording consists of the signal data and its annotations. Each annotation file can be divided
# into two parts: the symbols representing the peaks (N, A, etc.) and the locations of those symbols in the
# ECG data, which come from the annotation object's `sample` attribute.

def get_ecg(filename):
    """Load one record's signal together with its beat annotations.

    Parameters
    ----------
    filename : str
        Record path without extension, as passed to ``wfdb.rdrecord`` and
        ``wfdb.rdann``.

    Returns
    -------
    signal
        ``p_signal`` of the record, read with ``channels=[0]``.
    annotation_symbols
        ``symbol`` attribute of the "atr" annotation.
    annotation_symbol_location
        ``sample`` attribute of the "atr" annotation, i.e. where each symbol
        sits in the signal.
    """
    signal = wfdb.rdrecord(filename, channels=[0]).p_signal

    # Read the annotation file once and take both attributes from the same
    # object, instead of loading and parsing the file a second time.
    annotation = wfdb.rdann(filename, "atr")
    annotation_symbols = annotation.symbol
    annotation_symbol_location = annotation.sample

    return signal, annotation_symbols, annotation_symbol_location


In [39]:
def build_XY(p_signal, df_ann, num_cols):
    """Cut a fixed-width window out of the signal around every annotated beat.

    Parameters
    ----------
    p_signal : 1-D sequence
        Signal samples of one record.
    df_ann : pandas.DataFrame
        One row per beat with columns ``atr_sample`` (position in
        ``p_signal``) and ``atr_sym`` (annotation symbol).
    num_cols : int
        Window width in samples. The window is centred on the beat:
        ``num_cols // 2`` samples before it and the rest from it onwards.

    Returns
    -------
    X : numpy.ndarray, shape (kept_beats, num_cols)
        One window per kept beat.
    Y : numpy.ndarray, shape (kept_beats, 1)
        1 where the symbol is in the module-level ``abnormal`` list, else 0.
    sym : list
        Original symbol of every kept beat.

    Beats whose window would run past either end of the signal are dropped.
    """
    # The window is derived from num_cols rather than from the notebook-level
    # `num_sec` / `fs` variables. Those are only defined in a later cell, and
    # reading them here made the result depend on hidden state: a caller asking
    # for a different window than the globals describe got zero rows back.
    samples_before = num_cols // 2
    samples_after = num_cols - samples_before

    num_rows = len(df_ann)

    X = np.zeros((num_rows, num_cols))
    Y = np.zeros((num_rows, 1))
    sym = []

    # number of rows actually filled so far
    kept = 0

    for atr_sample, atr_sym in zip(df_ann.atr_sample.values, df_ann.atr_sym.values):
        left = max(0, atr_sample - samples_before)
        right = min(len(p_signal), atr_sample + samples_after)
        window = p_signal[left:right]

        # A shorter window means it was clipped at a signal edge: skip the beat.
        if len(window) == num_cols:
            X[kept] = window
            Y[kept] = int(atr_sym in abnormal)
            sym.append(atr_sym)
            kept += 1

    # Trim the rows reserved for beats that were skipped.
    return X[:kept, :], Y[:kept, :], sym

In [42]:
def make_dataset(pts, num_sec, fs):
    """Build the beat-window dataset for a list of records, ignoring non-beats.

    Parameters
    ----------
    pts : list of str
        Record names; each one is appended to the module-level ``data`` prefix
        to form the record path.
    num_sec : int
        Number of seconds to include before and after the beat.
    fs : int
        Frequency of the signal (samples per second).

    Returns
    -------
    X_all : numpy.ndarray, shape (nbeats, 2 * num_sec * fs)
        Signal window of every kept beat.
    Y_all : numpy.ndarray, shape (nbeats, 1)
        Binary flag: 1 if the beat symbol is abnormal, else 0.
    sym_all : list
        Annotation symbol of every kept beat (length nbeats).
    """
    num_cols = 2 * num_sec * fs

    # Only these annotation symbols count as beats; everything else is dropped.
    beat_symbols = abnormal + ['N']

    # Per-record pieces are collected and joined once at the end; appending to
    # a growing array inside the loop would re-copy the whole dataset for
    # every record. The zero-row seeds keep shape and dtype right when `pts`
    # is empty.
    X_parts = [np.zeros((0, num_cols))]
    Y_parts = [np.zeros((0, 1))]
    sym_all = []

    for pt in pts:
        file = data + pt

        p_signal, atr_sym, atr_sample = get_ecg(file)

        # grab the first signal
        p_signal = p_signal[:, 0]

        # make df to exclude the nonbeats
        df_ann = pd.DataFrame({'atr_sym': atr_sym, 'atr_sample': atr_sample})
        df_ann = df_ann.loc[df_ann.atr_sym.isin(beat_symbols)]

        X, Y, sym = build_XY(p_signal, df_ann, num_cols)
        X_parts.append(X)
        Y_parts.append(Y)
        sym_all.extend(sym)

    X_all = np.concatenate(X_parts, axis=0)
    Y_all = np.concatenate(Y_parts, axis=0)

    return X_all, Y_all, sym_all


[2230]
(array([[-0.34 , -0.335, -0.33 , ..., -0.36 , -0.35 , -0.34 ],
       [-0.39 , -0.395, -0.41 , ..., -0.34 , -0.335, -0.33 ],
       [-0.4  , -0.41 , -0.425, ..., -0.33 , -0.33 , -0.34 ],
       ...,
       [-0.3  , -0.315, -0.325, ..., -0.375, -0.365, -0.36 ],
       [-0.325, -0.34 , -0.345, ..., -0.365, -0.365, -0.345],
       [-0.32 , -0.31 , -0.315, ..., -0.5  , -0.485, -0.49 ]]), array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]]), ['N', 'N', 'N', 'A', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'

In [19]:
# Seconds of signal to keep before and after each beat (make_dataset's `num_sec`).
num_sec = 3
# Passed to make_dataset as `fs`; presumably the sampling rate in Hz — TODO confirm.
fs = 360

# Build the beat windows, binary labels and symbols for every record in record_list.
X_all, Y_all, sym_all = make_dataset(record_list, num_sec, fs)

In [20]:
# List the distinct annotation symbols present in the assembled dataset.
print(np.unique(sym_all))

['/' 'A' 'E' 'F' 'J' 'L' 'N' 'R' 'S' 'V' 'a' 'e' 'f' 'j']
