# Build train/test HDF5 datasets
Merge **two** source HDF5 files – one with normal‑sinus‑rhythm (NSR) recordings, one with atrial‑fibrillation (AF) recordings – into two new self‑contained files:

* `train_ds.h5` – all groups from subjects assigned to the *train* split.
* `test_ds.h5`  – all groups from the held‑out subjects.

Splitting is done **by subject ID** so that no subject appears in both splits.

In [None]:
# !pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl (11.2 MB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.6.1 threadpoolctl-3.6.0


In [21]:
import h5py, pandas as pd, numpy as np, random, os
from tqdm import tqdm
import os

In [22]:

# # ---- configuration ----------------------------------------------------
# folder_of_interest = "length_full"
# NSR_PATH   = f"downloaded_files/{folder_of_interest}/mimic_non_af_data.h5"
# AF_PATH    = f"downloaded_files/{folder_of_interest}/mimic_af_data.h5"



# OUT_TRAIN  =  f"downloaded_files/{folder_of_interest}/train_ds.h5"
# OUT_TEST   =  f"downloaded_files/{folder_of_interest}/test_ds.h5"

# TRAIN_RATIO = 0.8               # 80 % of subjects for training
# RNG_SEED    = 42


## Helper · collect all groups from a file

In [23]:

# def collect_groups(path: str, label: int):
#     """Return list of dicts with file, group_id, subj_id, label."""
#     rows=[]
#     with h5py.File(path, 'r') as h5:
#         for gid in h5:
#             g = h5[gid]
#             subj = str(g.attrs.get('subj_id', gid.split('-')[0]))
#             rows.append(dict(file=path, group_id=gid,
#                              subj_id=subj, label=label))
#     return rows


## Step 1 – collect metadata

In [24]:
# rows = collect_groups(NSR_PATH, 0) + collect_groups(AF_PATH, 1)
# meta = pd.DataFrame(rows)
# print('Total groups:', len(meta))
# display(meta.head())


## Step 2 – subject‑level split

In [25]:

# random.seed(RNG_SEED)
# subjects = meta.subj_id.unique().tolist()
# random.shuffle(subjects)

# cut = int(len(subjects) * TRAIN_RATIO)
# train_subj = set(subjects[:cut])

# meta['split'] = np.where(meta.subj_id.isin(train_subj), 'train', 'test')
# print(meta.split.value_counts())


## Helper · copy selected groups

In [26]:

# def copy_split(df_split, out_path):
#     if os.path.exists(out_path):
#         os.remove(out_path)
#     with h5py.File(out_path, 'w') as h5_out:
#         for _, row in tqdm(df_split.iterrows(), total=len(df_split),
#                            desc=f'→ {out_path}'):
#             with h5py.File(row.file, 'r') as h5_in:
#                 src = h5_in[row.group_id]
#                 h5_in.copy(src, h5_out, name=row.group_id, without_attrs=False)
#                 h5_out[row.group_id].attrs['label'] = row.label
#     print(f'Wrote {len(df_split)} groups to {out_path}')


## Step 3 – write new HDF5 files

In [27]:
import h5py

# ── your existing split lists ───────────────────────────────────────────────
# e.g. you might have done something like:
# all_ids = sorted(list(h5py.File('mimic_af_data.h5','r').keys()) +
#                  list(h5py.File('mimic_non_af_data.h5','r').keys()))
# train_ids, test_ids = train_test_split(all_ids, test_size=0.2, random_state=42)
# train_ids = [...]   # <-- fill in your train record IDs
# test_ids  = [...]   # <-- fill in your test record IDs

# # ── source files ────────────────────────────────────────────────────────────
# src_files = [
#     'mimic_af_data.h5',
#     'mimic_non_af_data.h5'
# ]

def build_split_h5(src_paths, out_path, selected_ids):
    """
    Copy the top-level groups named in selected_ids from each source HDF5
    file into a new HDF5 file at out_path.

    For every selected group, the group's attributes and each dataset that is
    a direct child of the group (data plus attributes) are copied. Children
    that are not datasets (e.g. nested groups) are skipped. Non-empty array
    datasets are written gzip-compressed with automatically chosen chunks;
    scalar and empty datasets are written without any filter options.

    Args:
        src_paths: iterable of source HDF5 file paths, read in order.
        out_path: path of the HDF5 file to create (opened in 'w' mode, so an
            existing file is truncated).
        selected_ids: iterable of top-level group names to copy.

    Raises:
        ValueError: if a selected group name occurs in more than one source
            file, since both copies cannot share one name in the output.
    """
    # Build the lookup once: membership on a list is a linear scan per group.
    wanted = set(selected_ids)

    with h5py.File(out_path, 'w') as dst:
        for src_path in src_paths:
            with h5py.File(src_path, 'r') as src:
                for gid, grp in src.items():
                    if gid not in wanted:
                        continue
                    if gid in dst:
                        raise ValueError(
                            f"group {gid!r} from {src_path!r} already exists "
                            f"in {out_path!r}"
                        )
                    # replicate group
                    dst_grp = dst.create_group(gid)
                    # copy group attrs
                    for k, v in grp.attrs.items():
                        dst_grp.attrs[k] = v
                    # copy every dataset that is a direct child of the group
                    for ds_name, ds in grp.items():
                        if not isinstance(ds, h5py.Dataset):
                            continue
                        data = ds[()]  # read full array into memory
                        # h5py refuses chunk/compression options on scalar
                        # and empty datasets, so only request them when the
                        # dataset actually holds an array with elements.
                        storage = {}
                        if ds.shape and ds.size:
                            storage = dict(compression='gzip', chunks=True)
                        dst_ds = dst_grp.create_dataset(
                            ds_name,
                            data=data,
                            # keep the stored dtype instead of re-inferring it
                            # from the in-memory array (matters e.g. for
                            # variable-length strings read back as objects)
                            dtype=ds.dtype,
                            **storage
                        )
                        # copy dataset attributes
                        for ak, av in ds.attrs.items():
                            dst_ds.attrs[ak] = av

# # ── run it ──────────────────────────────────────────────────────────────────
# build_split_h5(src_files, 'train_ds.h5', train_ids)
# build_split_h5(src_files, 'test_ds.h5',  test_ids)

# print("✔️  Generated train_ds.h5 & test_ds.h5 with preserved structure")

In [28]:

# copy_split(meta[meta.split=='train'], OUT_TRAIN)
# copy_split(meta[meta.split=='test'], OUT_TEST)


In [29]:
import os
import random
import h5py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# ── your existing configuration ───────────────────────────────
# Sub-folder of downloaded_files/ that holds both the inputs and the outputs.
folder_of_interest = "length_full"
# Source files: non-AF (NSR) recordings and AF recordings respectively.
NSR_PATH   = f"downloaded_files/{folder_of_interest}/mimic_non_af_data.h5"
AF_PATH    = f"downloaded_files/{folder_of_interest}/mimic_af_data.h5"

# Destination files, written next to the sources.
OUT_TRAIN  = f"downloaded_files/{folder_of_interest}/train_ds.h5"
OUT_TEST   = f"downloaded_files/{folder_of_interest}/test_ds.h5"

# Fraction of distinct subjects (not of groups) assigned to the train split.
TRAIN_RATIO = 0.8
# Seed for the `random` module so the subject shuffle is reproducible.
RNG_SEED    = 42

# ── helper to collect all groups with labels ────────────────────
def collect_groups(path: str, label: int):
    """Describe every top-level group of the HDF5 file at ``path``.

    Args:
        path: HDF5 file to scan (opened read-only).
        label: Class label recorded for every group found in this file.

    Returns:
        A list with one dict per top-level group, holding the keys ``file``
        (the given path), ``group_id`` (the group name), ``subj_id`` (the
        subject ID as a string) and ``label``.

    The subject ID comes from the group's ``subj_id`` attribute when present;
    otherwise it is the part of the group name before the first ``-``.
    """
    rows = []
    with h5py.File(path, 'r') as h5:
        for gid in h5:
            subj = h5[gid].attrs.get('subj_id', gid.split('-')[0])
            # Fixed-length string attributes are returned as bytes; str() on
            # bytes would yield "b'...'" and never match the ID derived from
            # a group name, letting one subject end up in both splits.
            if isinstance(subj, bytes):
                subj = subj.decode('utf-8', errors='replace')
            rows.append(dict(file=path, group_id=gid, subj_id=str(subj), label=label))
    return rows

# ── build a DataFrame and split randomly by subject (not stratified by label) ──
# One row per group; groups from the NSR file get label 0, from the AF file label 1.
rows = [*collect_groups(NSR_PATH, 0), *collect_groups(AF_PATH, 1)]
meta = pd.DataFrame(rows)

# Shuffle the distinct subject IDs reproducibly, then take the leading
# TRAIN_RATIO share of them as the training subjects.
random.seed(RNG_SEED)
subjects = meta['subj_id'].unique().tolist()
random.shuffle(subjects)
cut = int(TRAIN_RATIO * len(subjects))
train_subj = set(subjects[:cut])

# Every group inherits the split of its subject, so no subject spans both splits.
meta['split'] = np.where(
    meta['subj_id'].isin(train_subj),
    'train',
    'test',
)
print(meta['split'].value_counts())

# ── redefine build_split_h5, this time copying groups with h5py's native copy ──
def build_split_h5(src_paths, out_path, selected_ids, labels=None):
    """Write the groups named in ``selected_ids`` from all sources to ``out_path``.

    Each selected top-level group is copied with h5py's native ``copy``
    (attributes included) and then gets an integer ``label`` attribute.
    The label is taken from the source group's ``af_status`` attribute, else
    from its ``label`` attribute, else from ``labels``, else it is 0.

    Args:
        src_paths: Iterable of source HDF5 file paths, read in order.
        out_path: Destination HDF5 file; deleted first if it already exists.
        selected_ids: Iterable of top-level group names to copy.
        labels: Optional mapping of group name -> label, consulted only when
            the source group carries neither ``af_status`` nor ``label``.
            When omitted, such groups are labelled 0.

    Raises:
        ValueError: If a selected group name occurs in more than one source
            file, since both copies cannot share one name in the output.
    """
    # Build the lookup once: membership on a list is a linear scan per group.
    wanted = set(selected_ids)
    fallback = {} if labels is None else labels

    if os.path.exists(out_path):
        os.remove(out_path)
    with h5py.File(out_path, 'w') as dst:
        for src_path in src_paths:
            with h5py.File(src_path, 'r') as src:
                for gid in src:
                    if gid not in wanted:
                        continue
                    if gid in dst:
                        raise ValueError(
                            f"group {gid!r} from {src_path!r} already exists "
                            f"in {out_path!r}"
                        )
                    grp = src[gid]
                    # copy entire group (datasets + attrs)
                    src.copy(grp, dst, name=gid, without_attrs=False)
                    # then ensure our 'label' attr is set/overwritten; the
                    # caller-supplied label replaces the silent default of 0
                    # when the source group carries no label of its own
                    if 'af_status' in grp.attrs:
                        raw_label = grp.attrs['af_status']
                    elif 'label' in grp.attrs:
                        raw_label = grp.attrs['label']
                    else:
                        raw_label = fallback.get(gid, 0)
                    dst[gid].attrs['label'] = int(raw_label)

# ── extract group IDs per split and write out ────────────────────
train_ids = meta.loc[meta.split=='train', 'group_id'].tolist()
test_ids  = meta.loc[meta.split=='test',  'group_id'].tolist()

# Labels derived from the source file of each group; used only for groups
# that carry no 'af_status' / 'label' attribute of their own.
group_labels = dict(zip(meta['group_id'], meta['label']))

build_split_h5([NSR_PATH, AF_PATH], OUT_TRAIN, train_ids, labels=group_labels)
build_split_h5([NSR_PATH, AF_PATH], OUT_TEST,  test_ids,  labels=group_labels)

print("✔️  train_ds.h5 and test_ds.h5 created.")

split
train    28
test      7
Name: count, dtype: int64
✔️  train_ds.h5 and test_ds.h5 created.
