In [None]:
# https://chatgpt.com/c/685ea7d6-8738-8004-b57a-44b44511f2f1

In [1]:
import wfdb
import numpy as np
import matplotlib.pyplot as plt
import os

In [12]:
import wfdb

record_path = '../data/files/I01'

# Load the record (wfdb reads the .hea and finds .mat automatically)
record = wfdb.rdrecord(record_path)

print("Signal shape:", record.p_signal.shape)
print("Signal names:", record.sig_name)


Signal shape: (462600, 12)
Signal names: ['I', 'II', 'III', 'AVR', 'AVL', 'AVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6']


In [13]:
annotation = wfdb.rdann(record_path, 'atr')

In [14]:
# Show basic info
print("Signal shape:", record.p_signal.shape)
print("Signal names:", record.sig_name)
print("Number of annotations:", len(annotation.sample))
print("First 5 annotation positions (samples):", annotation.sample[:5])
print("First 5 annotation symbols:", annotation.symbol[:5])

Signal shape: (462600, 12)
Signal names: ['I', 'II', 'III', 'AVR', 'AVL', 'AVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6']
Number of annotations: 2757
First 5 annotation positions (samples): [114 277 442 608 710]
First 5 annotation symbols: ['N', 'N', 'N', 'N', 'V']


In [15]:
import numpy as np

signal = record.p_signal
labels = annotation.symbol
positions = annotation.sample

fs = int(record.fs)  # sampling frequency (e.g., 257 Hz)
window_size = fs     # 1-second window (can be 2×fs for 2s)

X = []
y = []

for idx, pos in enumerate(positions):
    start = pos - window_size // 2
    end = pos + window_size // 2

    # Skip beats too close to the signal edges
    if start < 0 or end > len(signal):
        continue

    # Extract window (12-lead)
    window = signal[start:end, :]
    X.append(window)
    y.append(labels[idx])

X = np.array(X)  # shape: (n_beats, window_size, 12)
y = np.array(y)  # shape: (n_beats,)


In [19]:
X.shape

(2756, 256, 12)

In [22]:
sample = X[0]

In [27]:
sample.shape

(256, 12)

In [31]:
sample[:,0]

array([3.61764706, 3.67973856, 3.67647059, 3.72875817, 3.7254902 ,
       3.67647059, 3.66993464, 3.69607843, 3.68300654, 3.66339869,
       3.65686275, 3.67973856, 3.69281046, 3.7254902 , 3.73202614,
       3.7745098 , 3.78431373, 3.77777778, 3.80392157, 3.81045752,
       3.80392157, 3.85294118, 3.85947712, 3.88562092, 3.94117647,
       3.97058824, 3.94444444, 3.89869281, 3.89869281, 3.90196078,
       3.90196078, 3.89215686, 3.8627451 , 3.80392157, 3.76797386,
       3.76470588, 3.75490196, 3.73529412, 3.72222222, 3.71895425,
       3.70261438, 3.74836601, 3.74183007, 3.67973856, 3.66993464,
       3.69607843, 3.66993464, 3.68954248, 3.69934641, 3.66993464,
       3.67647059, 3.69281046, 3.66339869, 3.65359477, 3.66666667,
       3.67320261, 3.68627451, 3.68954248, 3.68300654, 3.6503268 ,
       3.62418301, 3.62745098, 3.66339869, 3.71568627, 3.69281046,
       3.6372549 , 3.66666667, 3.67320261, 3.66339869, 3.65359477,
       3.6372549 , 3.64052288, 3.66993464, 3.64052288, 3.62091

In [34]:
y[0]

np.str_('N')

In [37]:
y[0:10]

array(['N', 'N', 'N', 'V', 'N', 'N', 'N', 'N', 'N', 'N'], dtype='<U1')

In [38]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print("Classes:", label_encoder.classes_)


Classes: ['N' 'V']


In [42]:
y_encoded[:10]

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0])

In [None]:
# Normalize each window across the time dimension (z-score per lead)
X = (X - X.mean(axis=1, keepdims=True)) / (X.std(axis=1, keepdims=True) + 1e-8)


In [None]:
'''
From your data (likely INCART or St. Petersburg), the sampling frequency is: 257
This means:

257 samples = 1 second of data

So window_size = fs gives you a 1-second window

window_size = 2 * fs → 2-second window, etc.
'''

In [44]:
# all files
import os
import wfdb
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Config
data_dir = '../data/files'
fs = 257  # Sampling frequency — adjust if yours differs
window_size = fs  # 1 second
X_all = []
y_all = []

# List all .hea files
hea_files = [f for f in os.listdir(data_dir) if f.endswith('.hea')]

# Process each record
for hea_file in tqdm(hea_files, desc="Processing records"):
    record_name = os.path.splitext(hea_file)[0]
    record_path = os.path.join(data_dir, record_name)

    try:
        record = wfdb.rdrecord(record_path)
        annotation = wfdb.rdann(record_path, 'atr')
    except Exception as e:
        print(f"Skipping {record_name} due to error: {e}")
        continue

    signal = record.p_signal
    labels = annotation.symbol
    positions = annotation.sample

    for idx, pos in enumerate(positions):
        start = pos - window_size // 2
        end = pos + window_size // 2

        # Skip if near edges
        if start < 0 or end > len(signal):
            continue

        # Extract window
        window = signal[start:end, :]
        X_all.append(window)
        y_all.append(labels[idx])

# Convert to arrays
X_all = np.array(X_all)
y_all = np.array(y_all)

print(f"Final dataset: X shape = {X_all.shape}, y shape = {y_all.shape}")


Processing records: 100%|██████████| 75/75 [00:05<00:00, 14.75it/s]


Final dataset: X shape = (175824, 256, 12), y shape = (175824,)


In [46]:
# Remove '+' symbols (non-beat markers)
valid_mask = y_all != '+'
X_clean = X_all[valid_mask]
y_clean = y_all[valid_mask]


In [47]:
# Encode labels to integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_all)

# Normalize (z-score) each sample per lead
X_all = (X_all - X_all.mean(axis=1, keepdims=True)) / (X_all.std(axis=1, keepdims=True) + 1e-8)

print("Classes:", label_encoder.classes_)


Classes: ['+' 'A' 'B' 'F' 'N' 'Q' 'R' 'S' 'V' 'j' 'n']
