### Imports

In [None]:
import pandas
import numpy
import wfdb
import ast

### Paths

In [None]:
# Directory holding the dataset files. File names are appended to this string
# with `+` (e.g. path + 'ptbxl_database.csv'), so the trailing slash is required.
path = '/home/rishikant/Documents/Datasets/PTB-XL/'

### Loading and converting raw annotations

In [None]:
# Read the record-level annotation table, keyed by ECG id
Y = pandas.read_csv(path + 'ptbxl_database.csv', index_col='ecg_id')
# The scp_codes column is stored as text; parse each entry into a Python literal
Y['scp_codes'] = Y['scp_codes'].apply(ast.literal_eval)

### Loading raw signal data

In [None]:
# The lead to load: 0 selects all leads (default behaviour), 1-12 selects a single lead
lead = 1

# Fail loudly on an invalid selection. Merely printing a message would leave X
# undefined (or stale from an earlier run) and break the cells that follow.
if not 0 <= lead <= 12:
    raise ValueError(f"Invalid lead {lead!r}: expected 0 (all leads) or 1-12")

# Iterate over the file names themselves rather than looking them up by
# position + 1, so gaps in the ecg_id index cannot cause a KeyError or a
# mismatched record.
if lead == 0:
    # All leads: one signal array per record, stacked along a new first axis
    X = numpy.array([wfdb.rdsamp(path + f)[0] for f in Y.filename_hr])
else:
    # Single lead: one column per record, i.e. shape (samples, records).
    # The number of samples is taken from the data instead of being hard-coded.
    X = numpy.stack(
        [wfdb.rdsamp(path + f)[0][:, lead - 1] for f in Y.filename_hr],
        axis=1,
    )

### Loading diagnostic statements

In [None]:
# Read the SCP statement table and keep only the rows flagged as diagnostic
agg_df = (
    pandas.read_csv(path + 'scp_statements.csv', index_col=0)
    .loc[lambda table: table['diagnostic'] == 1]
)

### Aggregating diagnostic information as superclass

In [None]:
def aggregate_diagnostic(y_dic, statements=None):
    """Return the unique diagnostic classes for one record's SCP codes.

    Parameters
    ----------
    y_dic : dict
        SCP codes of a single record. Only the keys are used.
    statements : pandas.DataFrame, optional
        Lookup table indexed by SCP code with a ``diagnostic_class`` column.
        Defaults to the module-level ``agg_df``.

    Returns
    -------
    list
        The distinct ``diagnostic_class`` values of every code found in the
        lookup table, in sorted order. Codes missing from the table are
        ignored; the list is empty when no code matches.
    """
    if statements is None:
        statements = agg_df
    classes = {
        statements.loc[code].diagnostic_class
        for code in y_dic
        if code in statements.index
    }
    # Set iteration order for strings varies between interpreter runs, so sort
    # to make the output reproducible. key=str keeps mixed value types sortable.
    return sorted(classes, key=str)

# Add a column holding, for every record, the result of aggregate_diagnostic on its SCP codes
Y['diagnostic_superclass'] = Y['scp_codes'].apply(aggregate_diagnostic)

### Saving the data
- Waveforms (a numpy array) are saved to `waveforms.csv`
- Annotations (the `diagnostic_superclass` column of the pandas dataframe) are saved to `annotations.csv`

In [None]:
# Waveforms
# numpy.savetxt only accepts 1-D or 2-D input. An array with more dimensions
# (as produced when all leads are loaded) is collapsed to one row per entry of
# its first axis; 1-D and 2-D arrays are written unchanged.
numpy.savetxt(
    'waveforms.csv',
    X if X.ndim <= 2 else X.reshape(X.shape[0], -1),
    delimiter=',',
)

# Annotations
Y['diagnostic_superclass'].to_csv('annotations.csv')