In [1]:
import h5py
import numpy as np

  from ._conv import register_converters as _register_converters


In [2]:
 
data = h5py.File('ds4.h5')
data.create_dataset('images', 
                    shape=(100, 256, 256, 3), 
                    maxshape=(None, 256, 256, None),
                    dtype=float)
 
data.create_dataset('image_names', 
                    shape=(100,),
                    maxshape=(None,),
                    dtype=h5py.special_dtype(vlen=str))

<HDF5 dataset "image_names": shape (100,), type "|O">

In [3]:
data['image_names'][:2] = ['some_randome_name.jpg', 'some_randome_name2.jpg']
data['images'][:2] = np.ones((2, 256, 256, 3))
 
# list available datasets in this file
for ds in data:
    print(data[ds])
 

<HDF5 dataset "image_names": shape (100,), type "|O">
<HDF5 dataset "images": shape (100, 256, 256, 3), type "<f8">


In [4]:
print(data['image_names'][0])
 
images = data['image_names']
images.resize(200, axis=0)
 
print(data['image_names'])

some_randome_name.jpg
<HDF5 dataset "image_names": shape (200,), type "|O">


The most basic structure in an h5py file is a dataset, we need to specify the desired shapes during creation. This step should be pretty self-explanatory. There is a small catch though, if you don’t specify the max shape argument you will not be able to resize the dataset in the future. Due to some performance optimizations, this is ok if you are 100% sure you will not need to extend it.

In our case, we set the max shape for axis=0 and axis=3 to None, this means that we will be able to resize this dataset without any limitations along that axis. So basically adding more images and/or adding more channels to each image.

Lets create a simple class that will implement the indexer with buffering:

In [5]:
import h5py
import numpy as np
import timeit
import time
 
 
class ImageIndexer(object):
    def __init__(self, db_path, fixed_image_shape=(512, 512), buffer_size=200, num_of_images=100):
        self.db = h5py.File(db_path, mode='w')
        self.buffer_size = buffer_size
        self.num_of_images = num_of_images
        self.fixed_image_shape = fixed_image_shape
        self.image_vector_db = None
        self.image_id_db = None
        #         self.db_index = None
        self.idxs = {"index": 0}
 
        self.image_vector_buffer = []
        self.image_id_buffer = []
 
    #         self.db_index_buffer = []
 
    def __enter__(self):
        print("indexing {} images".format(self.num_of_images))
        self.t0 = time.time()
        return self
 
    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.image_id_buffer:
            print("writing last buffers")
            print(len(self.image_id_buffer))
 
            self._write_buffer(self.image_id_db, self.image_id_buffer)
            self._write_buffer(self.image_vector_db, self.image_vector_buffer)
 
        print("closing h5 db")
        self.db.close()
        print("indexing took {0}".format(time.time() - self.t0))
 
    @property
    def image_vector_size(self):
        if self.fixed_image_shape:
            return self.fixed_image_shape[0] * self.fixed_image_shape[1]
        else:
            return None
 
    def create_datasets(self):
 
        IMG_ROWS, IMG_COLS, CHANN = self.fixed_image_shape
 
        self.image_id_db = self.db.create_dataset(
            "image_ids",
            (self.num_of_images,),
            maxshape=None,
            dtype=h5py.special_dtype(vlen=str)
 
        )
 
        self.image_vector_db = self.db.create_dataset(
            "images",
            shape=(self.num_of_images,IMG_ROWS, IMG_COLS, CHANN),
            dtype="float"
        )
 
    def add(self, image_name, image_vector):
        self.image_id_buffer.append(image_name)
        self.image_vector_buffer.append(image_vector)
 
        if None in (self.image_vector_db, self.image_id_db):
            self.create_datasets()
 
        if len(self.image_id_buffer) >= self.buffer_size:
            self._write_buffer(self.image_id_db, self.image_id_buffer)
            self._write_buffer(self.image_vector_db, self.image_vector_buffer)
 
            # increment index
            self.idxs['index'] += len(self.image_vector_buffer)
 
            # clean buffers
            self._clean_buffers()
 
    def _write_buffer(self, dataset, buf):
        print("Writing buffer {}".format(dataset))
        start = self.idxs['index']
        end = len(buf)
        dataset[start:start + end] = buf
 
    def _clean_buffers(self):
        self.image_id_buffer = []
        self.image_vector_buffer = []

Usage

Since we implemented enter and exit methods we can use this class as a context manager, this will come handy to close the file at the end and write the last buffers at context-exit.

Example usage:

In [6]:
with ImageIndexer('my_dataset.h5', 
                  fixed_image_shape=(512, 512, 3), 
                  buffer_size=200, 
                  num_of_images=200) as imageindexer:
 
    for my_image in list_of_images:
        image_array = read_image(my_image)
        imageindexer.add(my_image, image_array)
 
 

indexing 200 images
closing h5 db
indexing took 0.0006651878356933594


NameError: name 'list_of_images' is not defined