In [8]:
from typing import List, NamedTuple
import numpy as np
import yaml
import io
import zipfile

# NOTE: `frame` here refers to a vector of hand pose (joint) angles.

# Window length: number of consecutive EMG samples stored with each frame
# (the first axis of every `emg` array).
W = 64

class HandEmgTuple(NamedTuple):
    """One sample: a hand pose frame paired with a window of EMG data."""

    # Hand pose angles for this sample.
    frame: np.ndarray  # (20,), float32 expected
    # W consecutive EMG samples across C channels.
    emg: np.ndarray  # (W, C), float32 expected


class HandEmgRecording(NamedTuple):
    """An ordered list of (frame, emg) samples plus one trailing pose frame."""

    # Samples in the order they are serialized.
    couples: List[HandEmgTuple]
    # Pose frame serialized after the last sample.
    sigma: np.ndarray  # (20,), float32 single final frame


class RecordingsWriter:
    """
    A context manager for writing recordings to a ZIP archive in a proprietary binary format.

    Each recording is saved as a separate member (named "1.rec", "2.rec", etc) inside the
    archive. The number of EMG channels (C) is the same for every recording in an archive
    and is stored once, in a "_metadata.yml" member (``{"C": <int>}``), which is written
    just before the first recording. The ".rec" members carry no header; each one is the
    plain concatenation of float32 values:

      [ [<20 x float32: frame>, <W x C float32: emg>], [...], ... <20 x float32: sigma frame> ]

    EMG windows are serialized in C (row-major) order. All values are emitted with
    ``ndarray.tobytes()``, i.e. in the arrays' native in-memory byte order.
    """

    # Number of pose angles in every frame (and in the trailing sigma frame).
    _FRAME_LEN = 20

    def __init__(self, filename: str):
        """
        Args:
            filename: Path of the ZIP archive to create. The file is only opened
                      (and truncated) when the context manager is entered.
        """
        self.filename = filename
        self.archive = None
        self.recording_index = 1
        self.C = None  # To store the number of EMG channels

    def __enter__(self):
        # Mode "w" starts a brand-new archive, so the per-archive state has to restart as
        # well. Without this, a re-entered writer would never write "_metadata.yml" into
        # the new archive and would continue numbering from the previous run.
        self.recording_index = 1
        self.C = None
        self.archive = zipfile.ZipFile(
            self.filename,
            mode="w",
            compression=zipfile.ZIP_DEFLATED,
            compresslevel=9,
        )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.archive is not None:
            try:
                self.archive.close()
            finally:
                # Drop the closed handle so a late add() fails with the explicit
                # "Archive is not open" error instead of an obscure zipfile one.
                self.archive = None

    @staticmethod
    def _check_array(label: str, array: np.ndarray, shape: tuple) -> None:
        """Raise ValueError unless `array` is float32 with exactly the given shape."""
        if array.dtype != np.float32:
            raise ValueError(f"{label} dtype must be float32, got {array.dtype}")
        if array.shape != shape:
            raise ValueError(f"{label} shape must be {shape}, got {array.shape}")

    def add(self, recording: HandEmgRecording):
        """
        Add a single recording to the ZIP archive.

        The whole recording is validated and serialized in memory first; the archive
        (and the writer's state) is only touched once everything has checked out.

        Args:
            recording: A HandEmgRecording. Each sample in `couples` is stored as its frame
                       (20 float32 values) followed by its emg (W x C float32 values), and
                       `sigma` (20 float32 values) is appended after the last sample. The
                       number of EMG channels (C) is determined from the first sample and
                       must match every other sample and every previously added recording.

        Raises:
            RuntimeError: If the archive is not open.
            ValueError: If the recording has no samples, if any array has the wrong dtype
                        or shape, or if C differs from earlier recordings.
        """
        if self.archive is None:
            raise RuntimeError("Archive is not open. Use 'with' statement to open it.")

        if not recording.couples:
            raise ValueError(
                "Recording has no samples; cannot determine the number of EMG channels."
            )

        # Determine the number of EMG channels (C) from the first sample.
        first_emg = recording.couples[0].emg
        if first_emg.ndim != 2:
            raise ValueError(f"EMG shape must be ({W}, C), got {first_emg.shape}")
        C = first_emg.shape[1]
        if self.C is not None and self.C != C:
            raise ValueError("Inconsistent number of EMG channels across recordings.")

        frame_shape = (self._FRAME_LEN,)
        emg_shape = (W, C)
        bio = io.BytesIO()

        # Write each sample: frame (20 float32 values) then emg (W * C float32 values).
        for tup in recording.couples:
            self._check_array("Frame", tup.frame, frame_shape)
            self._check_array("EMG", tup.emg, emg_shape)

            bio.write(tup.frame.tobytes())
            # tobytes() already emits C-order bytes, so no flatten() copy is needed.
            bio.write(tup.emg.tobytes())

        # Write the final frame (sigma) as well. It must obey the same layout as a
        # regular frame, otherwise the fixed-size record structure is broken.
        self._check_array("Sigma", recording.sigma, frame_shape)
        bio.write(recording.sigma.tobytes())

        if self.C is None:
            # First valid recording of this archive: persist C as metadata.
            self.C = C
            self.archive.writestr("_metadata.yml", yaml.dump({"C": C}))

        # Save the recording in the archive under a sequential filename like "1.rec".
        self.archive.writestr(f"{self.recording_index}.rec", bio.getvalue())
        self.recording_index += 1

In [9]:
from tqdm import tqdm
import h5py


# Segments to export, one tuple per segment:
#   (recording number, start sample index, end sample index)
# The recording number selects the source HDF5 file; the segment is cut into
# whole W-sample windows beginning at the start index, and any remainder before
# the end index that does not fill a whole window is dropped.
slices = [
    (1, 23000, 38000),
    (1, 45000, 60000),
    (1, 100000, 120000),
    (2, 1000, 16000),
    (2, 16500, 50000),
    (2, 64000, 118000),
    (2, 120000, 128000),
    (3, 1100, 16000),
    (3, 51000, 62966),
    (4, 1600, 11900),
    (4, 20000, 32400),
    (4, 101000, 123667),
    (5, 12000, 35000),
    (5, 39000, 50000),
    (5, 68000, 111000),
    (5, 120000, 130000),
    (6, 1000, 16000),
    (6, 31000, 105000),
    (6, 107000, 132000),
    (6, 135000, 156754),
    (7, 1000, 49000),
    (7, 50000, 85000),
    (7, 86000, 135706),
    (8, 0, 133000),
    (8, 135000, 148923),
    (10, 0, 35000),
    (10, 101000, 166715),
    (11, 10000, 35000),
    (11, 55000, 71000),
    (12, 0, 9000),
    (12, 11000, 30000),
    (12, 35000, 39000),
]

# Directory containing the source HDF5 recordings.
# NOTE(review): absolute, machine-specific path — adjust (or make configurable)
# before running this notebook on another machine.
base_path = "C:/Users/shich/emg2pose_data"


# Destination ZIP archive; a relative path, resolved against the current working directory.
filepath = "../dataset.zip"
if __name__ == "__main__":
    print("Archiving...")
    with RecordingsWriter(filepath) as writer:
        # Each entry is (recording number, start sample index, end sample index).
        for rec_id, start, end in tqdm(slices):
            with h5py.File(
                f"{base_path}/2022-04-07-1649318400-8125c-cv-emg-pose-train@2-recording-{rec_id}_left.hdf5",
                "r",
            ) as f:
                timeseries = f["emg2pose"]["timeseries"]  # type: ignore
                joint_angles = timeseries["joint_angles"]  # type: ignore
                emg = timeseries["emg"]  # type: ignore

                # Number of whole W-sample windows that fit in [start, end).
                # Deliberately NOT called `slices`: rebinding that name here would
                # replace the module-level list with an int and break any later use.
                n_windows = (end - start) // W
                # Index just past the last whole window; its frame is stored as sigma.
                real_end = start + n_windows * W

                # One sample per window: the pose frame at the window's first index
                # paired with the W EMG samples starting at that index.
                couples = [
                    HandEmgTuple(frame=joint_angles[lo], emg=emg[lo : lo + W])  # type: ignore
                    for lo in range(start, real_end, W)
                ]
                recording = HandEmgRecording(couples=couples, sigma=joint_angles[real_end])  # type: ignore

                writer.add(recording)

Archiving...


100%|██████████| 32/32 [00:02<00:00, 13.34it/s]
