In [2]:
from pathlib import Path
import pandas as pd
from natsort import natsorted
import tifffile
import numpy as np
import h5py
from cellplot_package.cellplot.segmentation import rand_col_seg
import matplotlib.pyplot as plt

dataset_base = Path('/home/simon_g/isilon_images_mnt/10_MetaSystems/MetaSystemsData/_simon/data/MCI_data')
dataset_name = 'CODEX_DLBCL2'
dataset_path = dataset_base / dataset_name

sample_ids = natsorted([f.name.split('_')[0] for f in (dataset_path / 'images').glob('*.tif')])
marker_names = pd.read_csv(dataset_path / 'marker_names.csv', index_col=0).Label.to_list()
annotations = pd.read_csv(dataset_path / 'annotations.csv')

# Create output H5 file
output_h5_folder = dataset_base / f'h5_files/{dataset_name}'
output_h5_folder.mkdir(exist_ok=True)
output_h5_path = output_h5_folder / f'{dataset_name}.h5'

with h5py.File(output_h5_path, 'w') as h5f:
    
    # Save marker names as variable-length strings
    dt = h5py.special_dtype(vlen=str)
    marker_ds = h5f.create_dataset('marker_names', (len(marker_names),), dtype=dt)
    marker_ds[:] = marker_names
    
    # Create annotations group
    coords_group = h5f.create_group('coords')
    coords_group.create_dataset('DIM1', data=annotations.DIM1.values)
    coords_group.create_dataset('DIM2', data=annotations.DIM2.values)
    sample_id_strings = annotations.SampleID.astype(str).tolist()
    coords_group.create_dataset('sample_id', data=sample_id_strings, dtype=dt)
    
    annotation_strings = annotations.annotation.astype(str).tolist()
    print(annotation_strings[:100])
    h5f.create_dataset('annotation', data=annotation_strings, dtype=dt)
    
    # Save unique sample IDs
    unique_sample_id_strings = annotations.SampleID.unique().astype(str).tolist()
    h5f.create_dataset('sample_ids', data=unique_sample_id_strings, dtype=dt)
    
    # Create top-level 'data' group
    data_group = h5f.create_group('data')
    
    # Process each sample inside the 'data' group
    for sample_id in sample_ids:
        print(f"Processing {sample_id}...")
        
        # Read multi-channel image
        mc_image_path = dataset_path / f'images/{sample_id}_image.tif'
        mc_image = tifffile.imread(mc_image_path)
        
        # Read masks
        masks_image_path = dataset_path / f'masks/{sample_id}_masks.tif'
        masks_image = tifffile.imread(masks_image_path).astype(np.uint32)
        
        print(f"  Image shape: {mc_image.shape}, Mask shape: {masks_image.shape}")
        
        # Create sample group inside 'data' group
        sample_group = data_group.create_group(sample_id)
        
        # Save image with compression (level 3)
        sample_group.create_dataset(
            'image', 
            data=mc_image, 
            compression='gzip', 
            compression_opts=3,
            chunks=(64, 64, 1)
        )
        
        # Save mask
        sample_group.create_dataset(
            'masks', 
            data=masks_image, 
            compression='gzip', 
            compression_opts=3,
            chunks=(64, 64)
        )

print(f"\nDone! Saved to: {output_h5_path}")

['CD4T', 'na', 'B', 'B', 'DC', 'TTOX', 'TPR', 'B', 'CD4T', 'B', 'TTOX_exh', 'Stromal cells', 'B', 'B', 'CD4T', 'NK', 'Treg', 'B', 'B', 'B', 'B', 'B', 'na', 'Macro', 'Macro', 'B', 'Macro', 'B', 'Macro', 'B', 'Granulo', 'Stromal cells', 'Macro', 'TTOXNaive', 'B', 'CD4T', 'CD4TNaive', 'Macro', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'TTOX', 'TTOX', 'na', 'B', 'TTOXNaive', 'TTOX_exh', 'TTOXNaive', 'B', 'CD4T', 'B', 'CD4TNaive', 'TTOX', 'B', 'B', 'B', 'TTOX', 'na', 'B', 'na', 'B', 'Macro', 'B', 'B', 'CD4T', 'Macro', 'B', 'B', 'Macro', 'B', 'DC', 'B', 'B', 'B', 'Treg', 'B', 'B', 'B', 'B', 'B', 'B', 'Macro', 'B', 'B', 'FDC', 'B', 'B', 'B', 'TTOX', 'B', 'DC', 'B', 'B', 'FDC']
Processing LN0251-006...
  Image shape: (3750, 4998, 52), Mask shape: (3750, 4998)
Processing LN0251-007...
  Image shape: (3750, 4998, 52), Mask shape: (3750, 4998)
Processing LN0265-002...
  Image shape: (3750, 4998, 52), Mask shape: (3750, 4998)
Processing LN0265-003...
  Image shape: (3750, 4998, 52), Mask shape:

In [7]:
with h5py.File(output_h5_path, 'r') as f:
    print(f['sample_ids'][:].astype(str))
    print(f.keys())
    
    curr_id = f['sample_ids'][:].astype(str)[0]
    
    print(f['data'][curr_id].keys())
    print(f['data'][curr_id]['image'].shape)
    print(f['data'][curr_id]['masks'].shape)
    
    print(f['coords']['DIM1'][:])
    print(f['coords']['DIM2'][:])
    print(f['coords']['sample_id'][:].astype(str))
    
    print(f['annotation'][:].astype(str))

['191-4reg007' '191-3reg004' '191-3reg001' '191-4reg006']
<KeysViewHDF5 ['annotation', 'coords', 'data', 'marker_names', 'sample_ids']>
<KeysViewHDF5 ['image', 'masks']>
(3750, 4998, 52)
(3750, 4998)
[   2    5    3 ... 3742 3742 3742]
[1390 1457 1476 ... 3212 3794 4087]
['191-4reg007' '191-4reg007' '191-4reg007' ... '191-4reg006' '191-4reg006'
 '191-4reg006']
['CD4T' 'B' 'DC' ... 'TPR' 'TTOX' 'B']
