# h5py introduction

Author: Julian Lißner<br>
For questions and feedback please write a mail to: [lissner@mib.uni-stuttgart.de](mailto:lissner@mib.uni-stuttgart.de)

In [7]:
import numpy as np
import h5py
import sys
from datetime import date, datetime
sys.path.append( 'submodules' )
import read_h5 as read
from general_functions import file_size, tic, toc

In [8]:
import os
os.system( 'rm *.h5')

1

## Creating hdf5 files 
- files are opened with permissions
- `w` always creates a new file
- `a` opens an existing file, creates if does not exist
- `r` opens an existing file, can not write
- `r+` opens an existing file, can write

In [10]:
h5file = h5py.File( 'nonexistant_file.h5', 'r+' )
#h5file.close()

In [11]:
h5file = h5py.File( 'nonexistant_file.h5', 'a' )
#subfolder = h5file.create_group( 'arbitrary_name' )
#subfolder.create_dataset( 'dset_0', data=np.arange( 5), compression='gzip', dtype='u1' )

read.display_all_data( h5file)
h5file.close()


#### Datasets and metadata in root folder ####

#### Datasets stored in each folder ####

content of folder: subfolder
<HDF5 dataset "data": shape (69, 420), type "<f4">

#### Metadata stored in each folder ####


In [13]:
h5file = h5py.File( 'nonexistant_file.h5', 'w' )
read.display_all_data( h5file)

my_data = h5file.create_dataset( 'subfolder/data', shape=(69,420), compression='gzip' )
h5file.close()


#### Datasets and metadata in root folder ####

#### Datasets stored in each folder ####

#### Metadata stored in each folder ####


In [14]:
filecheck = h5py.File( 'nonexistant_file.h5', 'r' )
filecheck.create_dataset( 'some_data', data=np.arange(5) )

ValueError: Unable to create dataset (no write intent on file)

In [None]:
print( filecheck['subfolder/data'][:])
filecheck.close()

- files need to  be open to access
- permissions govern access type
- if not careful files will be overwritten by `h5py`
----------------
----------------


## Writing data
- datasets can be written without allocating memory
- `h5py` sets pointers directly pointing on the hard drive
- slight speedloss on partial writing
- every data must have metadata
- datatype specifies required storage of dataset

In [15]:
precompute_results = lambda x, y=3: x*y + 1
partial_results = lambda n: np.arange( n) * np.random.randint( low=1, high=10)
h5file.close()

In [16]:
my_result = precompute_results( x=np.arange( 100000) )
h5file = h5py.File( 'my_file.h5', 'w')
result_group = h5file.create_group( 'results' )
dataset = result_group.create_dataset( 'precomputed_results', data=my_result, compression='gzip', dtype=np.int16 )

print( 'size of file with only int dataset' )
print( file_size( 'my_file.h5' ) )

result_group.create_dataset( 'big_results', data=my_result, compression='gzip', dtype=np.float64 )
print( 'size of file with also float dataset' )
print( file_size( 'my_file.h5' ) )

size of file with only int dataset
96.00 bytes
size of file with also float dataset
144.36 KiB


In [17]:
result_group.attrs.update( dict( container_for='results', creation_code='explanation video' ) )

In [18]:
metadata = {'author':'lissner', 'date(yymmdd)':date.today().strftime("%YY:%MM%DD"),
            'time':datetime.now().strftime("%H:%M:%S"), 'input parameter':'np.arange(1000)'}
dataset.attrs.update( metadata) 

In [19]:
def generic_metadata( author='lissner'):
    now = datetime.now().strftime("%H:%M:%S")
    today = date.today().strftime("%YY/%MM/%DD")
    return { 'author':author, 'time':now, 'date(yymmdd)':today }

#### partial write

In [20]:
data_pointer = result_group.create_dataset( 'allocated_data', shape=(69,420), compression='gzip', dtype=np.int32 )
data_pointer[:, 0] = 3.14141414
print( data_pointer[:5,0] )

[3 3 3 3 3]


In [27]:
print( 'type of my data pointer:', type( data_pointer) )
data_pointer = np.random.rand( 69, 420)
print( 'type of my data pointer:', type( data_pointer) )

type of my data pointer: <class 'numpy.ndarray'>
type of my data pointer: <class 'numpy.ndarray'>


In [21]:
data_pointer = result_group[ 'allocated_data' ]
print( 'type of my data pointer:', type( data_pointer) )

type of my data pointer: <class 'h5py._hl.dataset.Dataset'>


In [22]:
n = data_pointer.shape[0] 
tic( 'looping {} times'.format( data_pointer.shape[-1] )) 
for i in range( 1, data_pointer.shape[-1] ):
    data_pointer[ :, i ] = partial_results( n )    
toc( 'looping {} times'.format( data_pointer.shape[-1] )) 

Initializing timer for this tag: looping 420 times
looping 420 times -> elapsed time: 0.0670


In [23]:
tic( 'writing data directly') 
data_pointer[:] = np.random.randint( low=0, high=9001, size=(69,420)) 
toc( 'writing data directly') 

Initializing timer for this tag: writing data directly
writing data directly -> elapsed time: 0.0020


In [22]:
data_pointer.attrs.update( generic_metadata() )
data_pointer.attrs.update( dict( input_parameter=n) )
h5file.close()

#### Displaying elements of the file

In [24]:
h5file = h5py.File( 'my_file.h5', 'r') 
h5file.visit( print) 

results
results/allocated_data
results/big_results
results/precomputed_results


In [25]:
h5file.visititems( print) 

results <HDF5 group "/results" (3 members)>
results/allocated_data <HDF5 dataset "allocated_data": shape (69, 420), type "<i4">
results/big_results <HDF5 dataset "big_results": shape (100000,), type "<f8">
results/precomputed_results <HDF5 dataset "precomputed_results": shape (100000,), type "<i2">


In [26]:
print( 'metadata for the subfolder "results":')
for key, value in h5file['results'].attrs.items():
    print( '{}: {}'.format( key, value) )

metadata for the subfolder "results":
container_for: results
creation_code: explanation video


In [27]:
h5file.close() 

-----------------
-----------------

### Deleting datasets
- not entire storage recovererd
- can be done in python
- handle to dataset required

In [28]:
h5file = h5py.File( 'my_file.h5', 'r+' )
print( 'some values of the dataset', h5file[ 'results/big_results'][:5])
print( 'file of size with the dataset' )
print( file_size( 'my_file.h5' ) )

del h5file[ 'results/big_results' ]

try:
    print( 'some values of the dataset', h5file[ 'results/big_results'][:5])
except:
    print( "!!! Can't acces dataset, does not exist")

print( 'file of size without the dataset' ) 
print( file_size( 'my_file.h5' )  )
h5file.close()

some values of the dataset [ 1.  4.  7. 10. 13.]
file of size with the dataset
144.36 KiB
!!! Can't acces dataset, does not exist
file of size without the dataset
144.36 KiB


## bugs bugs bugs, features!

In [29]:
h5file = h5py.File( 'tmpfile.h5', 'w')
my_data = h5file.create_dataset( 'subfolder/random_data', data=np.arange(10) )

In [30]:
h5file = 'x'
print( my_data[5] )

5


In [31]:
same_file = h5py.File( 'tmpfile.h5', 'w')

OSError: Unable to create file (unable to truncate a file which is already open)

In [None]:
h5file.close()

In [32]:
same_file = h5py.File( 'tmpfile.h5', 'a')
my_data = same_file['subfolder/random_data']
my_data[0] = 9
print( np.array( my_data) )
same_file.close()

[9 1 2 3 4 5 6 7 8 9]


In [33]:
h5file = h5py.File( 'example_file.h5', 'w')
data = h5file.create_dataset( 'subfolder/subsubfolder/data', shape=(10,10), dtype='u1')
data[0,0] = -1
print( data[0,0])

255


In [34]:
h5file.close()
data[-1,-1] = 255

ValueError: Not a dataset (not a dataset)

In [35]:
h5file = h5py.File( 'example_file.h5', 'r+')
h5file['subfolder'].create_dataset( 'dset_0', data=np.arange(5) )

<HDF5 dataset "dset_0": shape (5,), type "<i4">

In [36]:
subfolder = h5file['subfolder']
subfolder['dset_0'][2] = 999
print( h5file['subfolder/dset_0'] )
print( h5file['subfolder/dset_0'][:] )
print( np.array( h5file['subfolder/dset_0'] ) )

<HDF5 dataset "dset_0": shape (5,), type "<i4">
[  0   1 999   3   4]
[  0   1 999   3   4]


In [37]:
data = subfolder['dset_0']
data = np.arange(5) * (-1)
print( 'local variable', data )
print( 'dataset', subfolder['dset_0'][:] ) 

local variable [ 0 -1 -2 -3 -4]
dataset [  0   1 999   3   4]


In [38]:
h5file.create_group( 'testgroup').attrs.update( dict( new='custom entry') )
print( h5file[ 'testgroup'].attrs['new'] )


custom entry


In [39]:
h5file.close()

In [40]:
os.system( 'rm *.h5')

1