Source https://www.pythonforthelab.com/blog/how-to-use-hdf5-files-in-python/

**Basic saving and reading the data**

In [1]:
import h5py
import numpy as np

# Generate 1000 samples from the standard normal distribution.
arr = np.random.randn(1000)

# Open the file for writing; the context manager closes it when the block ends.
with h5py.File('random.hdf5', 'w') as f:
    # Create a dataset named "default" and fill it with the contents of arr.
    dset = f.create_dataset("default", data=arr)

  from ._conv import register_converters as _register_converters


In [3]:
# Open the file read-only and inspect the "default" dataset while it is open.
with h5py.File('random.hdf5', 'r') as f:
    data = f['default']
    # Smallest value, largest value, then the first 15 elements.
    for summary in (min(data), max(data), data[:15]):
        print(summary)
    # Iterating the file yields the names of its top-level members.
    for key in f:
        print(key)


-3.1155833593559734
3.657201953104082
[-0.61642994 -0.50906164  1.64592415  0.56725858  0.06411687  0.65907973
  1.2436992   0.20323535  0.88101301 -0.03446799  0.50091855 -0.19534449
 -1.4705829  -0.96633255  0.13376327]
default


In [6]:
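# This does not work: `data` only refers to the dataset inside the file, so it
# can no longer be read once the file has been closed.
# f = h5py.File('random.hdf5', 'r')
# data = f['default']
# f.close()
# print(data[1])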

In [5]:
# Slicing with [:] reads the whole dataset into an in-memory NumPy array, so
# `data` stays usable after the file is closed. The context manager guarantees
# the file is closed even if the read raises.
with h5py.File('random.hdf5', 'r') as f:
    data = f['default'][:]

print(data[10])

0.5009185459355459


**Selective reading from HDF5**

In [8]:
with h5py.File('random.hdf5', 'r') as f:
   # data_set refers to the dataset stored in the file, not to its values.
   data_set = f['default']
   # Slicing reads only the first 10 elements into memory.
   data = data_set[:10]

# data was read while the file was open, so it can be used after the block.
print(data[1])
# data_set still points into the closed file, so reading from it fails:
#print(data_set[1]) # error

-0.5090616409706761


In [9]:

# Two independent arrays of 10000 standard-normal samples each.
arr1 = np.random.randn(10000)
arr2 = np.random.randn(10000)

# Store both arrays in the same file as separate datasets.
with h5py.File('complex_read.hdf5', 'w') as f:
    f.create_dataset('array_1', data=arr1)
    f.create_dataset('array_2', data=arr2)

In [15]:
# with h5py.File('complex_read.hdf5', 'r') as f:
#     d1 = f['array_1']
#     d2 = f['array_2']

#     data = d2[d1>0] # won't work: d1 is an h5py dataset, not a NumPy array

In [13]:
with h5py.File('complex_read.hdf5', 'r') as f:
    d1 = f['array_1']
    d2 = f['array_2']

    # d1[:] reads array_1 into memory so that the comparison produces a boolean
    # mask; the mask then selects the matching elements of d2.
    data = d2[d1[:]>0]
# NOTE(review): the message mentions a for loop, but this cell uses a boolean
# mask; the text looks copied from the loop-based variant.
print('The length of data with a for loop: {}'.format(len(data)))

The length of data with a for loop: 4882


In [12]:
# Same selection as the masked read, but element by element.
with h5py.File('complex_read.hdf5', 'r') as f:
    d1, d2 = f['array_1'], f['array_2']

    data = []
    # Walk array_1 and keep the element of array_2 at every index where the
    # array_1 value is positive.
    for i, value in enumerate(d1):
        if value > 0:
            data.append(d2[i])

print('The length of data with a for loop: {}'.format(len(data)))

The length of data with a for loop: 4882


**Selective writing to HDF5**

In [16]:
# Only 100 random values are generated; the dataset below has room for 1000.
arr = np.random.randn(100)

with h5py.File('random.hdf5', 'w') as f:
   # Create the dataset from a shape alone, without passing any data.
   dset = f.create_dataset("default", (1000,))
   # Write ten values (arr[50:60]) into positions 10 to 19 of the dataset.
   dset[10:20] = arr[50:60]

In [18]:
arr = np.random.randn(1000)

with h5py.File('random.hdf5', 'w') as f:
   dset = f.create_dataset("default", (1000,))
   # Deliberately wrong: this only rebinds the name dset to the NumPy array and
   # writes nothing to the file. Slice assignment (dset[:] = arr) is what
   # would store the values in the dataset.
   dset = arr # wrong
   # Prints the NumPy array, not the contents of the HDF5 dataset.
   print(dset)

[ 4.07396209e-01  6.31643058e-01 -6.13761108e-01  1.08416731e+00
 -7.11668459e-01  2.54749148e-01  1.83865260e+00  7.74822744e-01
 -7.68735208e-01 -1.10934281e+00 -1.47762630e+00 -5.55610100e-01
  3.17339507e-02 -7.69294869e-01  1.78294925e+00 -1.57473234e+00
 -9.41925180e-01  2.08802876e+00  1.04287200e+00 -4.10421024e-01
 -1.96290813e+00 -4.64331019e-01  3.74820404e-01  5.44994939e-01
 -9.91450073e-01 -6.41072896e-01 -1.49945534e+00 -6.91575424e-01
 -1.38145757e-01 -2.88435138e-02  4.56048471e-01  8.39027790e-01
  1.69118038e+00 -2.35105212e+00  8.98509875e-01 -1.29156044e+00
 -1.82945796e+00  7.90489476e-01  1.22316018e+00 -4.87344320e-01
 -9.27412600e-01 -1.38977012e-01  1.04517895e-01 -1.93829491e+00
  7.12960402e-01  1.32260819e+00 -2.04726678e-02  4.70621581e-01
 -9.29891703e-01  8.54203166e-01  9.99007400e-01 -1.00200014e+00
  1.54859960e+00 -1.06088152e-01 -8.66807403e-02 -5.00529634e-02
  1.46207189e+00 -8.44777521e-01  7.94372985e-01  1.83292850e+00
  2.19205366e+00  3.83501

In [21]:
with h5py.File('random.hdf5', 'w') as f:
    # Two-dimensional dataset with 500 rows and 1024 columns.
    dset = f.create_dataset('default', (500, 1024))
    # Set a single element (row 1, column 2).
    dset[1,2] = 1
    # Assign one scalar to a rectangular region of the dataset.
    dset[200:500, 500:1024] = 123
    # dset[:] reads the whole dataset back for printing.
    print(dset[:])



[[  0.   0.   0. ...   0.   0.   0.]
 [  0.   0.   1. ...   0.   0.   0.]
 [  0.   0.   0. ...   0.   0.   0.]
 ...
 [  0.   0.   0. ... 123. 123. 123.]
 [  0.   0.   0. ... 123. 123. 123.]
 [  0.   0.   0. ... 123. 123. 123.]]


**Specifying data types to optimize space**

In [22]:
with h5py.File('several_datasets.hdf5', 'w') as f:
   # Three 10-element datasets that differ only in dtype: 1-byte integers,
   # 8-byte integers and 16-byte complex numbers.
   dset_int_1 = f.create_dataset('integers', (10, ), dtype='i1')
   dset_int_8 = f.create_dataset('integers8', (10, ), dtype='i8')
   dset_complex = f.create_dataset('complex', (10, ), dtype='c16')

   # 1200 does not fit in a 1-byte integer.
   # NOTE(review): whether this wraps around or raises presumably depends on
   # the installed NumPy/h5py versions - confirm.
   dset_int_1[0] = 1200
   # A float assigned to an integer dataset: the fractional part cannot be stored.
   dset_int_8[0] = 1200.1
   dset_complex[0] = 3 + 4j

In [23]:
arr = np.random.randn(100000)

# Write the same 100000 values three times, changing only the dtype of the
# dataset, so that the resulting file sizes can be compared. Context managers
# replace the manual open/close pairs so each file is closed even if a write
# fails.
with h5py.File('integer_1.hdf5', 'w') as f:
    d = f.create_dataset('dataset', (100000,), dtype='i1')
    d[:] = arr

with h5py.File('integer_8.hdf5', 'w') as f:
    d = f.create_dataset('dataset', (100000,), dtype='i8')
    d[:] = arr

with h5py.File('float.hdf5', 'w') as f:
    d = f.create_dataset('dataset', (100000,), dtype='f16')
    d[:] = arr


File 	Size (b)

integer_1 	102144

integer_8 	802144

float 	1602144

**Compressing data**

In [25]:
import h5py
import numpy as np

arr = np.random.randn(100000)

# Output file -> dtype of its dataset, in the order the files are written.
compressed_targets = (
    ('integer_1_compr.hdf5', 'i1'),
    ('integer_8_compr.hdf5', 'i8'),
    ('float_compr.hdf5', 'f16'),
)

# Every file gets the same data, stored with gzip at compression level 9.
for filename, dtype in compressed_targets:
    with h5py.File(filename, 'w') as f:
        d = f.create_dataset('dataset', (100000,), dtype=dtype, compression="gzip", compression_opts=9)
        d[:] = arr

We chose gzip because it is supported on all platforms. The parameter compression_opts sets the level of compression. The higher the level, the less space the data takes, but the longer the processor has to work. The default level is 4. We can see the differences in our files based on the level of compression:

Type 	No Compression 	Compression 9 	Compression 4

integer_1 	102144 	28016 	30463

integer_8 	802144 	43329 	57971

float 	1602144 	1469580 	1469868

**Resizing Data**

In [26]:
with h5py.File('resize_dataset.hdf5', 'w') as f:
    # Start with 100 elements; maxshape caps later growth at 500 elements.
    d = f.create_dataset('dataset', (100, ),  maxshape=(500, ))
    d[:100] = np.random.randn(100)
    # Grow the dataset to 200 elements, then fill the newly added half.
    d.resize((200,))
    d[100:200] = np.random.randn(100)

# Reopen the file read-only and print the last element of each batch.
with h5py.File('resize_dataset.hdf5', 'r') as f:
    dset = f['dataset']
    print(dset[99])
    print(dset[199])

-0.18844663
-1.2919675


First, you create a dataset to store 100 values and set a maximum size of up to 500 values. After you stored the first batch of values, you can expand the dataset to store the following 100. You can repeat the procedure up to a dataset with 500 values. 

In [27]:
# Mode 'a' opens the existing file for modification instead of starting over.
with h5py.File('resize_dataset.hdf5', 'a') as f:
    dset = f['dataset']
    # Grow to 300 elements, still within the maxshape of 500 set at creation.
    dset.resize((300,))
    # Overwrite the first 200 values with zeros and fill the new last 100.
    dset[:200] = 0
    dset[200:300] = np.random.randn(100)

# Reopen read-only and print one element from each of the three ranges.
with h5py.File('resize_dataset.hdf5', 'r') as f:
    dset = f['dataset']
    print(dset[99])
    print(dset[199])
    print(dset[299])

0.0
0.0
-0.70444626


In [34]:
# with h5py.File('movie_dataset.hdf5', 'w') as f:
#    d = f.create_dataset('dataset', (1024, 1024, 1),  maxshape=(1024, 1024, None ))
#    d[:,:,0] = first_frame
#    d.resize((1024,1024,2))
#    d[:,:,1] = second_frame


The dataset holds square images of 1024x1024 pixels, while the third dimension gives us the stacking in time. We assume that the images don't change in shape, but we would like to stack one after the other without establishing a limit. This is why we set the third dimension's maxshape to None.

**Save data in chunks**

In [30]:
with h5py.File('chunked_dataset.hdf5', 'w') as f:
    # Store the 1000x1000 dataset in chunks of 100x100 elements.
    dset = f.create_dataset("chunked", (1000, 1000), chunks=(100, 100))

The command means that all the data in dset[0:100,0:100] will be stored together. It is also true for dset[200:300, 200:300], dset[100:200, 400:500], etc. According to h5py, there are some performance implications while using chunks:

    Chunking has performance implications. It is recommended to keep the total size of your chunks between 10 KiB and 1 MiB, larger for larger datasets. Also keep in mind that when any element in a chunk is accessed, the entire chunk is read from disk.

There is also the possibility of enabling auto-chunking, that will take care of selecting the best size automatically. Auto-chunking is enabled by default if you use compression or maxshape. You enable it explicitly by doing:

In [33]:
with h5py.File('chunked_dataset.hdf5', 'w') as f:
    # chunks=True asks h5py to choose the chunk shape itself.
    dset = f.create_dataset("chunked", (1000, 1000), chunks=True)
    # Show the chunk shape that was selected.
    print(dset.chunks)

(63, 125)


**Organizing data with groups**

In [36]:
arr = np.random.randn(1000)

with h5py.File('groups.hdf5', 'w') as f:
    # A group at the top level of the file, and a second group nested in it.
    g = f.create_group('Base_Group')
    gg = g.create_group('Sub_Group')

    # The same array is stored under the name 'default' in both groups.
    d = g.create_dataset('default', data=arr)
    dd = gg.create_dataset('default', data=arr)

with h5py.File('groups.hdf5', 'r') as f:
   # Datasets inside groups are addressed with slash-separated paths.
   d = f['Base_Group/default']
   dd = f['Base_Group/Sub_Group/default']
   print(d[1])
   print(dd[1])

0.0576639435258907
0.0576639435258907


In [37]:
with h5py.File('groups.hdf5', 'r') as f:
    # Iterating the file yields the names of its top-level members only.
    for k in f:
        print(k)

Base_Group


However, when you have nested groups, you will also need to start nesting for-loops. There is a better way of iterating through the tree, but it is a bit more involved. We need to use the visit() method, like this:

In [38]:
def get_all(name):
    """Print the name of a visited object.

    Intended as a callback for ``visit``. It always returns None, so it never
    asks the traversal to stop early.
    """
    print(name)
    return None

with h5py.File('groups.hdf5', 'r') as f:
   # visit calls get_all with the name of every group and dataset in the file.
   f.visit(get_all)

Base_Group
Base_Group/Sub_Group
Base_Group/Sub_Group/default
Base_Group/default


Notice that we define a function get_all that takes one argument, name. The visit method takes a function like get_all as its argument. visit goes through each element and keeps iterating for as long as the function returns None. For example, if we are looking for an element called Sub_Group, we have to change get_all:

In [39]:
def get_all(name):
    """Return ``name`` if it contains 'Sub_Group', otherwise None.

    Used as a ``visit`` callback: returning a value other than None stops the
    traversal, and that value becomes the result of ``visit``.
    """
    return name if 'Sub_Group' in name else None

with h5py.File('groups.hdf5', 'r') as f:
    # visit stops at the first name for which get_all returns a value and
    # hands that value back.
    g = f.visit(get_all)
    print(g)

Base_Group/Sub_Group


While the visit method is iterating through the elements, as soon as the function returns something that is not None, it stops and returns the value that get_all generated. Since we are looking for Sub_Group, we make get_all return the name of the group when it finds Sub_Group as part of the name that it is analyzing. Bear in mind that g is a string; if you want to actually get the group, you should do:

In [40]:
with h5py.File('groups.hdf5', 'r') as f:
   # visit returns the matching name as a string...
   g_name = f.visit(get_all)
   # ...which is then used as a key to fetch the object itself.
   # NOTE(review): g_name is None when nothing matches, so this lookup assumes
   # a match exists.
   group = f[g_name]

And you can work as explained earlier with groups. A second approach is to use a method called visititems that takes a function with two arguments: name and object. We can do:

In [41]:
def get_objects(name, obj):
    """Return ``obj`` if ``name`` contains 'Sub_Group', otherwise None.

    Used as a ``visititems`` callback, which passes both the name of each
    element and the element itself.
    """
    if 'Sub_Group' not in name:
        return None
    return obj

with h5py.File('groups.hdf5', 'r') as f:
   # visititems returns the first object for which get_objects returns a value.
   group = f.visititems(get_objects)
   # The returned object is used directly as a group, with no lookup by name.
   data = group['default']
   print('First data element: {}'.format(data[0]))

First data element: -0.7320548885697936


The main difference when using visititems is that we have access not only to the name of the object being analyzed but also to the object itself. You can see that the function returns the object and not the name. This pattern allows you to achieve more complex filtering. For example, you may be interested in the groups that are empty, or that have a specific type of dataset in them.

**Storing metadata**

HDF5 also lets you store metadata next to the data itself. Groups and datasets expose an attrs property to which you can attach small pieces of information, such as the date, the user, or the operating system on which the data was generated, as in the example below.

In [42]:
import time
import numpy as np
import h5py
import os

arr = np.random.randn(1000)

with h5py.File('groups.hdf5', 'w') as f:
    g = f.create_group('Base_Group')
    d = g.create_dataset('default', data=arr)

    # Attributes are set like dictionary entries on a group or a dataset.
    g.attrs['Date'] = time.time()
    g.attrs['User'] = 'Me'
    d.attrs['OS'] = os.name

    # Print the attributes of the group first, then those of the dataset.
    for owner in (g, d):
        for key, value in owner.attrs.items():
            print('{} => {}'.format(key, value))

Date => 1560235504.3818154
User => Me
OS => posix


In the code above you can see that attrs behaves like a dictionary. In principle, you shouldn't use attributes to store data; keep them as small as you can. However, you are not limited to single values: you can also store arrays. If you happen to have metadata stored in a dictionary and you want to add it automatically to the attributes, you can use update:

In [43]:
with h5py.File('groups.hdf5', 'w') as f:
   g = f.create_group('Base_Group')
   d = g.create_dataset('default', data=arr)

   # Metadata collected in an ordinary dictionary.
   metadata = {'Date': time.time(),
      'User': 'Me',
      'OS': os.name,}

   # Copy every key/value pair into the attributes in one call. Here they are
   # attached to the file object f, not to the group or the dataset.
   f.attrs.update(metadata)

   for m in f.attrs.keys():
      print('{} => {}'.format(m, f.attrs[m]))

Date => 1560235655.6074827
User => Me
OS => posix


Remember that the data types that hdf5 supports are limited. For example, dictionaries are not supported. If you want to add a dictionary to an hdf5 file you will need to serialize it. In Python, you can serialize a dictionary in different ways. In the example below, we are going to do it with JSON because it is very popular in different fields, but you are free to use whatever you like, including pickle.

In [44]:
import json

with h5py.File('groups_dict.hdf5', 'w') as f:
    g = f.create_group('Base_Group')
    d = g.create_dataset('default', data=arr)

    metadata = {'Date': time.time(),
                'User': 'Me',
                'OS': os.name,}

    # Serialize the dictionary to a JSON string and store that string as a
    # dataset named 'metadata' inside the group.
    m = g.create_dataset('metadata', data=json.dumps(metadata))