In [None]:
# Try xarray to explore the HDF5 file structure since h5py is not available
import xarray as xr

h5_file_path = "/Volumes/KINGSTON/o2a_plumes/a0004_air_stationarySource_isokineticNearbedRelease_10cm_s.h5"

# Attempt to open the HDF5 file as a dataset and print its summary.
# The dataset is used as a context manager so the underlying file handle is
# released as soon as the summary has been printed, instead of staying open
# for the lifetime of the kernel.
try:
    # engine=None lets xarray auto-detect
    with xr.open_dataset(h5_file_path, engine=None) as ds:
        print(ds)
except Exception as e:
    # Best-effort probe: report the failure and let the notebook continue.
    print(f"Could not open file with xarray: {e}")

In [None]:
# Inspect the structure of the HDF5 file using h5py
import h5py

h5_file_path = "/Volumes/KINGSTON/o2a_plumes/a0004_air_stationarySource_isokineticNearbedRelease_10cm_s.h5"


def print_h5_structure(name, obj):
    """Print a one-line description of an HDF5 node (callback for ``visititems``).

    Datasets are reported with their shape and dtype. Groups are reported by
    name, followed by one indented line per attribute stored on the group.
    Any other kind of object is ignored.

    Always returns ``None`` so the traversal visits every node.
    """
    if isinstance(obj, h5py.Dataset):
        print(f"[DATASET] {name} shape={obj.shape}, dtype={obj.dtype}")
        return
    if not isinstance(obj, h5py.Group):
        return
    print(f"[GROUP] {name}")
    for attr_name, attr_value in obj.attrs.items():
        print(f"    attr: {attr_name} = {attr_value}")


# Walk every group and dataset below the file root and describe each one.
with h5py.File(h5_file_path, "r") as f:
    f.visititems(print_h5_structure)

In [None]:
# Inspect frameRate and durationPerDataset to determine sampling for 10s of data
import h5py

h5_file_path = "/Volumes/KINGSTON/o2a_plumes/a0004_air_stationarySource_isokineticNearbedRelease_10cm_s.h5"

with h5py.File(h5_file_path, "r") as f:
    frame_rate = f["Attributes/imagingParameters/frameRate"][:]
    duration = f["Attributes/imagingParameters/durationPerDataset"][:]
    print("Frame rate (Hz):", frame_rate)
    print("Duration per dataset (s):", duration)
    # Confirm that n_frames = frame_rate * duration matches what we saw (3600).
    # The [0, 0] indexing assumes both values are stored as 2-D arrays.
    # Round before converting: int() truncates toward zero, so a product that
    # lands just below a whole number because of floating-point error would
    # otherwise be reported one frame short.
    n_frames = int(round(frame_rate[0, 0] * duration[0, 0]))
    print("Computed number of frames:", n_frames)

In [None]:
# Read the first ~10 seconds (150 frames at 15 Hz) from 'Plume Data/dataset_001' efficiently
import h5py
import numpy as np

h5_file_path = "/Volumes/KINGSTON/o2a_plumes/a0004_air_stationarySource_isokineticNearbedRelease_10cm_s.h5"
frames_to_read = 150  # 15 Hz * 10 seconds

# Slice the leading frames straight from the on-disk dataset (efficient slice);
# the result is an in-memory array, so the file can be closed before reporting.
with h5py.File(h5_file_path, "r") as f:
    partial_data = f["Plume Data/dataset_001"][:frames_to_read]

print(f"Shape of extracted segment: {partial_data.shape}")
print(f"dtype: {partial_data.dtype}")

print("Sample statistics for this segment:")
for label, reducer in (
    ("Min:", np.min),
    ("Max:", np.max),
    ("Mean:", np.mean),
    ("Std:", np.std),
):
    print(label, reducer(partial_data))

# Sanity check: top-left 5x5 corner of the first time step.
print("Sample from first frame:", partial_data[0, :5, :5])

In [None]:
# Save 10s segment (150 frames), including metadata, as a new HDF5 file in plug-and-play-demo/assets
import os

import h5py

h5_file_path = "/Volumes/KINGSTON/o2a_plumes/a0004_air_stationarySource_isokineticNearbedRelease_10cm_s.h5"
dest_file_path = "plug-and-play-demo/assets/demo_10s.h5"
frames_to_read = 150  # 10 seconds at 15 Hz

# Make sure the destination folder exists before h5py tries to create the file.
os.makedirs(os.path.dirname(dest_file_path), exist_ok=True)

with h5py.File(h5_file_path, "r") as src, h5py.File(dest_file_path, "w") as dst:
    # Copy Attributes group recursively.
    # NOTE(review): this metadata is copied verbatim, so fields such as
    # imagingParameters/durationPerDataset still describe the full-length
    # recording, not the truncated segment written below.
    src.copy("Attributes", dst)
    # Copy README group recursively (optional, but often helpful)
    src.copy("README", dst)
    # Copy xGrid and yGrid into a fresh "Plume Data" group
    plume_group = dst.create_group("Plume Data")
    src.copy("Plume Data/xGrid", plume_group)
    src.copy("Plume Data/yGrid", plume_group)
    # Copy 10s segment from one plume dataset (e.g., dataset_001)
    segment = src["Plume Data/dataset_001"][:frames_to_read]
    plume_group.create_dataset("dataset_001", data=segment, compression="gzip")
    # Slicing clamps silently when the source holds fewer frames than requested,
    # so report what was actually written rather than what was asked for.
    frames_written = segment.shape[0]

print(f"Wrote {frames_written} frames and metadata to {dest_file_path}")

In [None]:
# Extract original publication and DOI info from HDF5 metadata to build a sidecar provenance file
import h5py

source_file = "/Volumes/KINGSTON/o2a_plumes/a0004_air_stationarySource_isokineticNearbedRelease_10cm_s.h5"

# Field names below come from the earlier structure printout; adjust them if
# the file uses different names.
with h5py.File(source_file, "r") as f:
    datagroup_metadata = f["Attributes/datagroupMetadata"]
    # Each field is read in full and unpacked to a single Python value.
    citation = datagroup_metadata["citation"][:].item()
    doi = datagroup_metadata["DOI"][:].item()

print("Original publication citation:", citation)
print("Original publication DOI:", doi)

# The public release publication is supplied by user.
# The citation originally ended with a truncated "https://doi.org/" link; the
# DOI of this eLife article (10.7554/eLife.37815) is appended so that the URL
# actually resolves.
release_citation = (
    "Efrén Álvarez-Salvado, Angela M Licata, Erin G Connor, Margaret K McHugh, "
    "Benjamin MN King, Nicholas Stavropoulos, Jonathan D Victor, John P Crimaldi, "
    "Katherine I Nagel (2018) Elementary sensory-motor transformations underlying olfactory navigation "
    "in walking fruit-flies eLife 7:e37815\n\nhttps://doi.org/10.7554/eLife.37815"
)
print("Public release publication for this data segment:")
print(release_citation)

In [None]:
# Write sidecar JSON containing provenance and release information
import json

def _as_text(value):
    """Return ``value`` decoded to ``str`` if it is ``bytes``, otherwise unchanged."""
    return value.decode() if isinstance(value, bytes) else value


# Provenance record for the demo segment. `citation`, `doi` and
# `release_citation` come from the preceding cell.
sidecar = {
    "source_file": "a0004_air_stationarySource_isokineticNearbedRelease_10cm_s.h5",
    "original_publication_citation": _as_text(citation),
    "original_publication_DOI": _as_text(doi),
    "public_release_citation": release_citation.strip(),
    # DOI of the eLife public-release article cited above (previously left as
    # the empty placeholder "https://doi.org/").
    "public_release_DOI": "10.7554/eLife.37815",
}

sidecar_path = "plug-and-play-demo/assets/demo_10s_provenance.json"
with open(sidecar_path, "w") as f:
    json.dump(sidecar, f, indent=2)

print(f"Wrote provenance sidecar JSON to {sidecar_path}")
print(json.dumps(sidecar, indent=2))

In [None]:
# Attempt to extract all available provenance and relevant metadata from the HDF5 file
import h5py

h5_path = "/Volumes/KINGSTON/o2a_plumes/a0004_air_stationarySource_isokineticNearbedRelease_10cm_s.h5"


def _unpack_h5_value(dataset):
    """Read a whole HDF5 dataset and reduce it to a plain value where possible.

    Single-element results are unpacked to a scalar, and byte strings are
    decoded to ``str`` (undecodable bytes are replaced). Multi-element arrays
    are returned unchanged.
    """
    # [()] reads the full dataset and, unlike [:], also works on scalar datasets.
    value = dataset[()]
    if getattr(value, "size", None) == 1:
        value = value.item()
    if isinstance(value, bytes):
        value = value.decode(errors="replace")
    return value


def _collect_datasets(group, prefix, out):
    """Store every dataset directly inside ``group`` in ``out`` as ``prefix/name``.

    Child groups are skipped here; callers descend into them explicitly.
    """
    for name, node in group.items():
        if isinstance(node, h5py.Dataset):
            out[f"{prefix}/{name}"] = _unpack_h5_value(node)


provenance = {}

with h5py.File(h5_path, "r") as f:
    _collect_datasets(
        f["Attributes/datagroupMetadata"], "datagroupMetadata", provenance
    )
    # Scan flowConditions, odorRelease, imagingParameters
    for major in ["flowConditions", "odorRelease", "imagingParameters"]:
        group = f[f"Attributes/{major}"]
        _collect_datasets(group, major, provenance)
        # scan 2nd-level for odorRelease/odorant, flowConditions/workingFluid
        if major in ["odorRelease", "flowConditions"]:
            for sub, node in group.items():
                if isinstance(node, h5py.Group):
                    _collect_datasets(node, f"{major}/{sub}", provenance)

# show all provenance fields found
yaml_like = "\n".join(f"{k}: {v}" for k, v in provenance.items())
print(yaml_like)

In [None]:
# Write sidecar JSON containing complete provenance and release information (updated with supplied original pub info)
import json
import numpy as np


def ensure_json_serializable(obj):
    """Recursively convert ``obj`` into something ``json.dump`` can encode.

    Conversions applied:

    * ``np.ndarray`` -> nested lists, with every element converted in turn
      (so arrays of byte strings become lists of ``str``).
    * NumPy booleans, integers and floats of any width -> ``bool`` / ``int`` /
      ``float``.
    * ``bytes`` -> ``str`` (undecodable bytes are replaced).
    * ``dict`` values and ``list`` / ``tuple`` items are converted recursively;
      tuples come back as lists, which is how JSON would encode them anyway.

    Anything else is returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        # tolist() can still yield bytes (string arrays) or nested containers,
        # so its output is run through the converter again.
        return ensure_json_serializable(obj.tolist())
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, bytes):
        return obj.decode(errors="replace")
    if isinstance(obj, dict):
        return {k: ensure_json_serializable(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [ensure_json_serializable(v) for v in obj]
    return obj


def _provenance_fields(prefix, names):
    """Map each name to ``provenance["<prefix>/<name>"]`` (``None`` when absent)."""
    return {name: provenance.get(f"{prefix}/{name}") for name in names}


# Regroup the flat "<group>/<field>" provenance entries into a nested record.
extra_provenance = {
    **_provenance_fields(
        "datagroupMetadata",
        [
            "dataGroupIdentifier",
            "dataOriginationDate",
            "dataOriginator",
            "h5originationDate",
            "h5originator",
            "principalInvestigator",
            "fundingSources",
        ],
    ),
    "flow_conditions": {
        **_provenance_fields("flowConditions", ["flowFacilityDims", "flowSpeed"]),
        "workingFluid": _provenance_fields(
            "flowConditions/workingFluid",
            ["Type", "Density", "kinematicViscosity"],
        ),
    },
    "odor_release": {
        **_provenance_fields(
            "odorRelease",
            [
                "Conditions",
                "releaseHeight",
                "releaseSpeed",
                "sourceLengthscale",
                "sourceLocation",
            ],
        ),
        "odorant": _provenance_fields(
            "odorRelease/odorant", ["Type", "molecularDiffusivity"]
        ),
    },
    "imaging_parameters": _provenance_fields(
        "imagingParameters",
        [
            "durationPerDataset",
            "fieldOfView",
            "frameRate",
            "imageMagnification",
            "noiseFloor",
            "numberOfDatasets",
            "timeResolution",
        ],
    ),
}

sidecar_path = "plug-and-play-demo/assets/demo_10s_provenance.json"

# Final provenance record: both publications plus the metadata harvested from
# the source file, converted so that json can encode it.
sidecar_better = dict(
    source_file="a0004_air_stationarySource_isokineticNearbedRelease_10cm_s.h5",
    original_publication_citation="Connor, E. G., McHugh, M. K., & Crimaldi, J. P. (2018). Quantification of airborne odor plumes using planar laser-induced fluorescence. Experiments in Fluids, 59(9), 137. Springer.",
    original_publication_DOI="10.1007/s00348-018-2591-3",
    original_provenance_metadata=ensure_json_serializable(extra_provenance),
    public_release_citation="Álvarez-Salvado, E., Licata, A. M., Connor, E. G., McHugh, M. K., King, B. M. N., Stavropoulos, N., Victor, J. D., Crimaldi, J. P., & Nagel, K. I. (2018). Elementary sensory-motor transformations underlying olfactory navigation in walking fruit-flies. eLife, 7, e37815. https://doi.org/10.7554/eLife.37815",
    public_release_DOI="10.7554/eLife.37815",
    notes="All possible provenance included here. Original and public release publications both cited.",
)

# Overwrites the sidecar file at the same path.
with open(sidecar_path, "w") as f:
    json.dump(sidecar_better, f, indent=2)

print(f"Updated and improved provenance sidecar saved to {sidecar_path}")
# Echo the record so it can be checked in the notebook output.
print(json.dumps(sidecar_better, indent=2))