# Preprocessing of ECG-Fragment Dataset

In [1]:
import pandas as pd
import numpy as np
import wfdb
from pathlib import Path
import zipfile
import json
import os
import shutil
import torch 

In [2]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

# -------------------- DEFINE CUSTOM TEMPLATE -------------------- #
# "draft" only overrides the figure margins and places a horizontal legend
# just above the plot area, right-aligned; it is layered on top of
# "plotly_white" through the default template string below.
pio.templates['draft'] = go.layout.Template(
    layout={
        'margin': {'l': 50, 'r': 50, 'b': 50, 't': 50},
        'legend': {
            'orientation': "h",
            'yanchor': "bottom",
            'y': 1.02,
            'xanchor': "right",
            'x': 1,
        },
    }
)
pio.templates.default = "plotly_white+draft"

### Open records

In [3]:
data_path = Path("../dataset/ecg-fragment-1.0.0 (340hz)")
records = {}

# Read the dataset records
# ----------------------------------------------
# Every non-empty line of RECORDS is a record path; the part before the first
# '/' is used as the class key, and the paths are grouped per class in the
# order in which they are listed.
with open(data_path / "RECORDS") as f:
    for line in f:
        record_path = line.strip()

        # A blank (or whitespace-only) line would otherwise create a bogus ''
        # class whose "record" cannot be read later on.
        if not record_path:
            continue

        key = record_path.split('/')[0]
        records.setdefault(key, []).append(record_path)


# Split the dataset into train, val, and test
# Note: The class balance is maintained
# ----------------------------------------------
# Each *_data list receives one array per class and each *_label list one
# list of class names per class (same order as `records`).
# NOTE(review): records are split in listing order, without shuffling —
# confirm that this is intended.
train_data, train_label = [], []
val_data, val_label = [], []
test_data, test_label = [], []

# split data by class
for key, class_records in records.items():

    # read the class dir (first element returned by rdsamp is the signal array)
    class_data = np.stack([wfdb.rdsamp(data_path / record)[0] for record in class_records])

    # 80/20 split into (train + val) / test, then 80/20 again into train / val
    size_train_val, size_test = round(len(class_data)*.8), round(len(class_data)*.2)
    size_train, size_val = round(size_train_val*.8), round(size_train_val*.2)

    # Consecutive, non-overlapping slices. The offsets are computed from the
    # sizes themselves, so a class that is too small to get any validation
    # sample yields an empty slice instead of failing on `range_val[-1]`.
    val_end = size_train + size_val
    class_splits = (
        (train_data, train_label, class_data[:size_train]),
        (val_data, val_label, class_data[size_train:val_end]),
        (test_data, test_label, class_data[val_end:val_end + size_test]),
    )

    for split_data, split_label, chunk in class_splits:
        split_data.append(chunk)
        split_label.append([key]*len(chunk))


In [4]:
# Staging directory for the exported files; created if it does not exist yet.
temp_dir = Path("ecg-fragment_360hz")
temp_dir.mkdir(parents=True, exist_ok=True)

def encode_labels(labels: list[str]):
    """Convert class names into integer codes.

    Codes are assigned by ``pd.factorize`` in order of first appearance, so
    the mapping depends on the order of ``labels``.

    Args:
        labels: one class name per sample.

    Returns:
        A tuple ``(coded_labels, uniques)``. ``coded_labels`` is an integer
        array of shape ``(len(labels), 1)``; ``uniques`` holds the distinct
        class names, with ``uniques[code]`` being the name encoded as ``code``.
    """
    label_values = pd.Series(labels, name='label').values
    codes, uniques = pd.factorize(label_values)

    # column vector: one row per sample
    return codes.reshape(-1, 1), uniques

# The second element returned by rdsamp is the dict of header fields; the
# header of the first '1_Dangerous_VFL_VF' record is used as the metadata
# for the whole exported dataset.
metadata = wfdb.rdsamp(data_path / records['1_Dangerous_VFL_VF'][0])[1]
metadata.update({'task': 'Multiclass classification'})

# displayed as the cell output
metadata

{'fs': 250,
 'sig_len': 721,
 'n_sig': 1,
 'base_date': None,
 'base_time': None,
 'units': ['mV'],
 'sig_name': ['col 1'],
 'comments': [],
 'task': 'Multiclass classification'}

### Split into train, val and test

In [6]:
from scipy.stats import zscore

**Note:** 

According to the dataset description on [PhysioNet](https://physionet.org/content/ecg-fragment-high-risk-label/1.0.0/#files-panel), the sampling rate of the signals is 360 Hz; the `.hea` header files, however, report 250 Hz. We therefore assume that the `.hea` information is incorrect and use 360 Hz.

Also, each signal has 721 samples; to obtain exactly 2 s at 360 Hz, we crop the records to 720 samples by dropping the last sample.

In [5]:
# Override the header values: 360 Hz sampling rate and 720 samples per record
# (the last of the 721 samples is dropped when the tensors are built).
metadata.update({'fs': 360, 'sig_len': 720})

In [7]:
def _prepare_split(split_data, split_label):
    """Build the sample and label tensors of one split.

    Args:
        split_data: list with one array per class, laid out as
            [batch, signal, channel].
        split_label: list with one list of class names per class, aligned
            with ``split_data``.

    Returns:
        ``(samples, labels, uniques)``: the z-scored samples as a float32
        tensor laid out as [batch, channel, signal] without the last time
        sample, the integer label tensor returned by ``encode_labels`` and
        the class names indexed by label code.
    """
    samples = np.concatenate(split_data, dtype=np.float32)    # concat all classes
    samples = np.moveaxis(samples, -1, 1)                     # [batch, signal, channel] -> [batch, channel, signal]
    samples = zscore(samples, axis=2)                         # normalization of the data
    samples = torch.from_numpy(samples[:, :, :-1])            # remove last sample

    coded_labels, uniques = encode_labels(sum(split_label, []))
    return samples, torch.from_numpy(coded_labels), uniques


# [Train dataset]
train_samples, train_labels, train_uniques = _prepare_split(train_data, train_label)

# [validation dataset]
val_samples, val_labels, val_uniques = _prepare_split(val_data, val_label)

# [Test dataset]
test_samples, test_labels, test_uniques = _prepare_split(test_data, test_label)

# Labels are encoded independently per split, in order of first appearance.
# If a split has no sample of some class, its codes silently shift with
# respect to the other splits, so fail loudly instead of saving such data.
if not (list(train_uniques) == list(val_uniques) == list(test_uniques)):
    raise ValueError(
        "Label codes differ between splits: "
        f"train={list(train_uniques)}, val={list(val_uniques)}, test={list(test_uniques)}"
    )

# Labels' code
metadata['labels_code'] = val_uniques.tolist()
metadata['num_classes'] = len(val_uniques)

### Save as torch file (.pt)

In [9]:
# One .pt file per split: a dict with the sample tensor and the label tensor
# (labels are squeezed to drop their singleton column).
for file_name, samples, labels in (
    ('train.pt', train_samples, train_labels),
    ('val.pt', val_samples, val_labels),
    ('test.pt', test_samples, test_labels),
):
    torch.save({'samples': samples, 'labels': labels.squeeze()}, temp_dir / file_name)

# The metadata goes next to the tensors as JSON.
(temp_dir / "metadata.json").write_text(json.dumps(metadata))

In [10]:
# Pack the staging directory into "<dir name>.zip" in the working directory,
# then remove the directory. The context manager closes the archive even if
# writing a member fails.
with zipfile.ZipFile(temp_dir.name + ".zip", 'w') as zf:
    for dirname, _subdirs, files in os.walk(temp_dir):
        for filename in files:
            # Resolve each file against the directory it was found in (not the
            # root), and store it under its path relative to the root.
            file_path = Path(dirname) / filename
            zf.write(file_path, file_path.relative_to(temp_dir).as_posix())

shutil.rmtree(temp_dir)

In [21]:
# Plot one raw test record per class, one subplot row per class.
# NOTE: every class needs more than `sample_id` test records.
sample_id = 1
num_classes = len(test_data)
fig = make_subplots(rows=num_classes, cols=1, shared_xaxes=True, shared_yaxes=True, vertical_spacing=0.025)

# Traces and rows are derived from the data, so the figure stays consistent
# with `rows=len(test_data)` whatever the number of classes is.
fig.add_traces(
    [
        go.Scatter(y=class_samples[sample_id, :, 0], name=class_labels[0])
        for class_samples, class_labels in zip(test_data, test_label)
    ],
    rows=list(range(1, num_classes + 1)),
    cols=1,
)

fig.update_layout(height=800, width=1000)

fig.show()

### Upsample to 500Hz

In [11]:
import torch.nn as nn

In [12]:
# Location of the exported dataset read by this section.
dataset_path = Path("../dataset/ecg-fragment_360hz")

metadata = json.loads((dataset_path / "metadata.json").read_text())

# Each .pt file holds a dict with 'samples' and 'labels'.
train_data, val_data, test_data = (
    torch.load(dataset_path / file_name)
    for file_name in ('train.pt', 'val.pt', 'test.pt')
)

train_samples, val_samples, test_samples = (
    split['samples'] for split in (train_data, val_data, test_data)
)

In [18]:
new_hz = 500

# Signal duration in seconds, derived from the stored metadata.
duration = (metadata['sig_len']/metadata['fs'])
# round() rather than int(): int() truncates, so a product that falls just
# below a whole number because of floating-point error would lose one sample.
new_size = round(duration*new_hz)


def _upsample(samples):
    """Linearly interpolate `samples` along the last axis to `new_size` points."""
    return nn.functional.interpolate(samples, new_size, mode='linear').float()


# Always resample from the tensors loaded from disk, so that re-running this
# cell (e.g. with another `new_hz`) never resamples already-resampled data.
train_samples = _upsample(train_data['samples'])
val_samples = _upsample(val_data['samples'])
test_samples = _upsample(test_data['samples'])

metadata['fs'] = new_hz
metadata['sig_len'] = train_samples.shape[-1]

In [19]:
# Sanity check: shape after resampling, shape as loaded from disk, and the duration in seconds.
train_samples.shape, train_data['samples'].shape, duration

(torch.Size([651, 1, 1000]), torch.Size([651, 1, 720]), 2.0)

In [28]:
# Overlay one training record before and after resampling.
sample_idx = 456
fig = go.Figure()

# Sample k is taken at k/fs seconds, so the N samples cover [0, duration) with
# a spacing of duration/N. The end point is excluded: including it would
# stretch the spacing to duration/(N-1).
t_original = np.linspace(0, duration, train_data['samples'].shape[-1], endpoint=False)
t_resampled = np.linspace(0, duration, train_samples.shape[-1], endpoint=False)

fig.add_traces([
    go.Scatter(x=t_original, y=train_data['samples'][sample_idx,0], name='original'),
    go.Scatter(x=t_resampled, y=train_samples[sample_idx,0], name='resampled'),
])
fig.update_layout(xaxis_title="Time (s)", yaxis_title="Amplitude (z-score)", width=800, height=300)
fig.show()

In [21]:
# Staging directory for the resampled files; created if it does not exist yet.
temp_dir = Path("ecg-fragment_500hz")
temp_dir.mkdir(parents=True, exist_ok=True)

# One .pt file per split: the resampled samples together with the labels
# that were loaded alongside the original samples.
for file_name, samples, loaded_split in (
    ('train.pt', train_samples, train_data),
    ('val.pt', val_samples, val_data),
    ('test.pt', test_samples, test_data),
):
    torch.save({'samples': samples, 'labels': loaded_split['labels']}, temp_dir / file_name)

# The updated metadata goes next to the tensors as JSON.
(temp_dir / "metadata.json").write_text(json.dumps(metadata))

In [22]:
# Pack the staging directory into "<dir name>.zip" in the working directory,
# then remove the directory. The context manager closes the archive even if
# writing a member fails.
with zipfile.ZipFile(temp_dir.name + ".zip", 'w') as zf:
    for dirname, _subdirs, files in os.walk(temp_dir):
        for filename in files:
            # Resolve each file against the directory it was found in (not the
            # root), and store it under its path relative to the root.
            file_path = Path(dirname) / filename
            zf.write(file_path, file_path.relative_to(temp_dir).as_posix())

shutil.rmtree(temp_dir)