# 01 ‚Äì PTB-XL Data Loading and Preprocessing

This notebook loads and explores the PTB-XL dataset, verifies metadata,
and prepares standardised numpy arrays for model training.

**Official split rule**
- Train ‚Üí folds 1 ‚Äì 8  
- Validation ‚Üí fold 9  
- Test ‚Üí fold 10


In [1]:
# ======================================================
# 00. Environment and GPU Check
# ======================================================

import os
import sys
import torch

print("‚úÖ Environment setup check:")
print(f"Python version : {sys.version.split()[0]}")
print(f"Working folder : {os.getcwd()}")

if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(0)
    print(f"üß† GPU detected : {device_name}")
    print(f"CUDA version   : {torch.version.cuda}")
    print(f"PyTorch build  : {torch.__version__}")
else:
    print("‚ö†Ô∏è No GPU detected ‚Äî running on CPU.")

print("=" * 60)


‚úÖ Environment setup check:
Python version : 3.12.8
Working folder : d:\IIT\L6\FYP\chagas-ecg-detection\notebooks
üß† GPU detected : NVIDIA GeForce RTX 3050 6GB Laptop GPU
CUDA version   : 11.8
PyTorch build  : 2.7.1+cu118


In [2]:
# ======================================================
# 01. Load and Verify PTB-XL Metadata
# ======================================================

import pandas as pd
import numpy as np
import wfdb

pd.set_option('display.max_columns', None)

# --- Dataset path ---
DATA_PATH = r"D:\IIT\L6\FYP\chagas-ecg-detection\data\raw\ptbxl"
META_FILE = os.path.join(DATA_PATH, "ptbxl_database.csv")

# --- Verify file existence ---
if not os.path.exists(META_FILE):
    raise FileNotFoundError(f"‚ùå Metadata file not found: {META_FILE}")

# --- Load metadata ---
df = pd.read_csv(META_FILE)

print("=" * 70)
print("‚úÖ PTB-XL metadata loaded successfully")
print(f"File path : {META_FILE}")
print(f"Shape     : {df.shape[0]} rows √ó {df.shape[1]} columns")
print("Columns   :", ", ".join(df.columns))
print("=" * 70)

display(df.head())


‚úÖ PTB-XL metadata loaded successfully
File path : D:\IIT\L6\FYP\chagas-ecg-detection\data\raw\ptbxl\ptbxl_database.csv
Shape     : 21799 rows √ó 28 columns
Columns   : ecg_id, patient_id, age, sex, height, weight, nurse, site, device, recording_date, report, scp_codes, heart_axis, infarction_stadium1, infarction_stadium2, validated_by, second_opinion, initial_autogenerated_report, validated_by_human, baseline_drift, static_noise, burst_noise, electrodes_problems, extra_beats, pacemaker, strat_fold, filename_lr, filename_hr


Unnamed: 0,ecg_id,patient_id,age,sex,height,weight,nurse,site,device,recording_date,report,scp_codes,heart_axis,infarction_stadium1,infarction_stadium2,validated_by,second_opinion,initial_autogenerated_report,validated_by_human,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr
0,1,15709.0,56.0,1,,63.0,2.0,0.0,CS-12 E,1984-11-09 09:17:34,sinusrhythmus periphere niederspannung,"{'NORM': 100.0, 'LVOLT': 0.0, 'SR': 0.0}",,,,,False,False,True,,", I-V1,",,,,,3,records100/00000/00001_lr,records500/00000/00001_hr
1,2,13243.0,19.0,0,,70.0,2.0,0.0,CS-12 E,1984-11-14 12:55:37,sinusbradykardie sonst normales ekg,"{'NORM': 80.0, 'SBRAD': 0.0}",,,,,False,False,True,,,,,,,2,records100/00000/00002_lr,records500/00000/00002_hr
2,3,20372.0,37.0,1,,69.0,2.0,0.0,CS-12 E,1984-11-15 12:49:10,sinusrhythmus normales ekg,"{'NORM': 100.0, 'SR': 0.0}",,,,,False,False,True,,,,,,,5,records100/00000/00003_lr,records500/00000/00003_hr
3,4,17014.0,24.0,0,,82.0,2.0,0.0,CS-12 E,1984-11-15 13:44:57,sinusrhythmus normales ekg,"{'NORM': 100.0, 'SR': 0.0}",,,,,False,False,True,", II,III,AVF",,,,,,3,records100/00000/00004_lr,records500/00000/00004_hr
4,5,17448.0,19.0,1,,70.0,2.0,0.0,CS-12 E,1984-11-17 10:43:15,sinusrhythmus normales ekg,"{'NORM': 100.0, 'SR': 0.0}",,,,,False,False,True,", III,AVR,AVF",,,,,,4,records100/00000/00005_lr,records500/00000/00005_hr


In [3]:
# ======================================================
# 02. Preprocess PTB-XL Records ‚Üí Normalized Arrays
# ======================================================

import ast
import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import multiprocessing

# --- Output directory ---
PROCESSED_DIR = os.path.join(
    r"D:\IIT\L6\FYP\chagas-ecg-detection\data\processed", "ptbxl"
)
os.makedirs(PROCESSED_DIR, exist_ok=True)
print(f"üìÅ Processed data will be saved to: {PROCESSED_DIR}")

# --- Load SCP mapping (diagnostic codes ‚Üí superclass) ---
SCP_FILE = os.path.join(DATA_PATH, "scp_statements.csv")
scp_df = pd.read_csv(SCP_FILE, index_col=0)
scp_df = scp_df[scp_df['diagnostic'] == 1]
scp_map = scp_df['diagnostic_class'].to_dict()
print(f"‚úÖ Loaded SCP mapping for {len(scp_map)} diagnostic codes.")

# --- Worker: read + normalize + label one record ---
def process_record(idx):
    try:
        rel = df.loc[idx, "filename_lr"]
        full = os.path.join(DATA_PATH, rel)
        record = wfdb.rdrecord(full)
        signal = record.p_signal.T  # (12, samples)

        # Normalize per-lead to [-1, 1]
        smin, smax = signal.min(1, keepdims=True), signal.max(1, keepdims=True)
        denom = np.where((smax - smin) == 0, 1.0, (smax - smin))
        signal_norm = 2 * (signal - smin) / denom - 1

        # Map SCP codes ‚Üí superclass label
        scp_dict = ast.literal_eval(df.loc[idx, "scp_codes"])
        relevant = {k: v for k, v in scp_dict.items() if k in scp_map}
        if not relevant:
            label = "UNKNOWN"
        else:
            superclasses = [scp_map[k] for k in relevant.keys()]
            label = max(set(superclasses), key=superclasses.count)

        return signal_norm.astype(np.float32), label, df.loc[idx, "strat_fold"]
    except Exception as e:
        return None, None, None

# --- Parallel processing ---
print("‚öôÔ∏è  Parallel processing all ECG signals ...")
num_workers = multiprocessing.cpu_count()
print(f"üß© Using {num_workers} CPU threads")

signals, labels, folds = [], [], []
with ThreadPoolExecutor(max_workers=num_workers) as executor:
    futures = {executor.submit(process_record, i): i for i in range(len(df))}
    for fut in tqdm.tqdm(as_completed(futures), total=len(df)):
        sig, lbl, fld = fut.result()
        if sig is not None:
            signals.append(sig)
            labels.append(lbl)
            folds.append(fld)

print(f"‚úÖ Finished processing {len(signals)} ECGs successfully.")


üìÅ Processed data will be saved to: D:\IIT\L6\FYP\chagas-ecg-detection\data\processed\ptbxl
‚úÖ Loaded SCP mapping for 44 diagnostic codes.
‚öôÔ∏è  Parallel processing all ECG signals ...
üß© Using 16 CPU threads


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 21799/21799 [02:31<00:00, 143.54it/s]

‚úÖ Finished processing 21799 ECGs successfully.





In [4]:
# ======================================================
# 03. Split and Save Preprocessed Arrays
# ======================================================

signals = np.array(signals, dtype=object)
labels  = np.array(labels)
folds   = np.array(folds)

# --- PTB-XL official folds ---
train_mask = np.isin(folds, list(range(1, 9)))  # folds 1‚Äì8
val_mask   = folds == 9
test_mask  = folds == 10

np.save(os.path.join(PROCESSED_DIR, "train_signals.npy"), signals[train_mask])
np.save(os.path.join(PROCESSED_DIR, "val_signals.npy"),   signals[val_mask])
np.save(os.path.join(PROCESSED_DIR, "test_signals.npy"),  signals[test_mask])
np.save(os.path.join(PROCESSED_DIR, "train_labels.npy"),  labels[train_mask])
np.save(os.path.join(PROCESSED_DIR, "val_labels.npy"),    labels[val_mask])
np.save(os.path.join(PROCESSED_DIR, "test_labels.npy"),   labels[test_mask])

print("\n‚úÖ Saved processed NumPy arrays:")
for f in os.listdir(PROCESSED_DIR):
    print("   ", f)



‚úÖ Saved processed NumPy arrays:
    test_labels.npy
    test_signals.npy
    train_labels.npy
    train_signals.npy
    val_labels.npy
    val_signals.npy


In [5]:
# ======================================================
# 04. Verify Processed PTB-XL Arrays
# ======================================================

import collections

expected = [
    "train_signals.npy", "val_signals.npy", "test_signals.npy",
    "train_labels.npy",  "val_labels.npy",  "test_labels.npy"
]

for split in ["train", "val", "test"]:
    x = np.load(os.path.join(PROCESSED_DIR, f"{split}_signals.npy"), allow_pickle=True)
    y = np.load(os.path.join(PROCESSED_DIR, f"{split}_labels.npy"), allow_pickle=True)

    print(f"\nüì¶ {split.upper()} SET")
    print(f"   Signals: {len(x)} | Labels: {len(y)}")
    if len(x) == len(y):
        print("   ‚úÖ Integrity OK")
    else:
        print("   ‚ùå Mismatch detected!")

    print(f"   Example shape : {x[0].shape}")
    print(f"   Example label : {y[0]}")



üì¶ TRAIN SET
   Signals: 17418 | Labels: 17418
   ‚úÖ Integrity OK
   Example shape : (12, 1000)
   Example label : NORM

üì¶ VAL SET
   Signals: 2183 | Labels: 2183
   ‚úÖ Integrity OK
   Example shape : (12, 1000)
   Example label : MI

üì¶ TEST SET
   Signals: 2198 | Labels: 2198
   ‚úÖ Integrity OK
   Example shape : (12, 1000)
   Example label : NORM


In [6]:
# ============================================================
# 05. Convert Large Arrays into Per-Record .npy Files
# ============================================================
from pathlib import Path
import tqdm

DATA_DIR = Path(PROCESSED_DIR)
OUT_DIR  = DATA_DIR / "records_split"
OUT_DIR.mkdir(parents=True, exist_ok=True)

for split in ["train", "val", "test"]:
    print(f"\nüß© Converting {split} split ...")
    x = np.load(DATA_DIR / f"{split}_signals.npy", allow_pickle=True)
    y = np.load(DATA_DIR / f"{split}_labels.npy", allow_pickle=True)

    split_dir = OUT_DIR / split
    split_dir.mkdir(exist_ok=True)

    for i in tqdm.trange(len(x)):
        np.save(split_dir / f"{i:05d}.npy", x[i].astype(np.float32))

    np.save(split_dir / "labels.npy", y)

print("\n‚úÖ Conversion complete ‚Äî You can now delete *_signals.npy to free space.")



üß© Converting train split ...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 17418/17418 [00:48<00:00, 356.49it/s]



üß© Converting val split ...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2183/2183 [00:04<00:00, 455.69it/s]



üß© Converting test split ...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2198/2198 [00:04<00:00, 501.37it/s]


‚úÖ Conversion complete ‚Äî You can now delete *_signals.npy to free space.





In [7]:
# ============================================================
# 06. Verify per-record split folder
# ============================================================
ROOT = Path(PROCESSED_DIR) / "records_split"
for split in ["train", "val", "test"]:
    split_dir = ROOT / split
    labels = np.load(split_dir / "labels.npy", allow_pickle=True)
    files = sorted([f for f in split_dir.glob("*.npy") if f.name != "labels.npy"])
    print(f"{split.upper():>5}: {len(files)} signals | {len(labels)} labels | example {files[0].name}")


TRAIN: 17418 signals | 17418 labels | example 00000.npy
  VAL: 2183 signals | 2183 labels | example 00000.npy
 TEST: 2198 signals | 2198 labels | example 00000.npy
