#  `h5py` Notes
- Resource: https://docs.h5py.org/en/stable/index.html
- Resource: Course-571740000 `Data Processing for Engineers and Scientists` 
- Install it first by running `conda install h5py` or `pip install h5py` in a terminal
- --Wen

In [77]:
import numpy as np
import matplotlib.pyplot as plt
import h5py
import os
import datetime

## Creating/opening hdf5 files
`hdf5` files are created/opened with the same command: `<file_object> = h5py.File(<path_to_file>, <permission_mode>, <...>)`.
- The set permission mode determines how `h5py` accesses the file. Valid modes are:
    - `r`: read only (file must exist already) $\qquad\rightarrow $ this is the default setting
    - `r+`: read/write (file must exist already)
    - `w`: create file (overwrite if file exists)
    - `w-` / `x`: create file (fails if file exists)
    - `a`: read/write if the file exists already, otherwise creates the file

In [78]:
# Build the path to the file inside the 'data' folder
filename = os.path.join('data', 'my_first_file.h5')

# h5py.File does not create missing parent directories, so make sure the
# 'data' folder exists before the file is created.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Create the HDF5 file in 'w' mode (an already existing file is overwritten).
# The handle is left open on purpose: the following cells write into it,
# and it is closed explicitly with h5file.close() later on.
h5file = h5py.File(filename, 'w')

## Creating a first data set

In [79]:
# Reserve storage in the open file for an integer array of shape (128, 3)
# and register it under the name `first_dataset`.
my_data = h5file.create_dataset('first_dataset', shape=(128, 3), dtype='i')

# Overwrite every element of `my_data` with one
my_data[...] = np.ones(my_data.shape, dtype='i')

# Create `second_dataset` directly from existing data: a (20, 20) array of
# zeros is passed through the `data` keyword and stored as integers.
my_data_2 = h5file.create_dataset(
    'second_dataset',
    data=np.zeros((20, 20)),
    dtype='i',
)

## Bringing in some structure using groups and metadata

In [80]:
# Add a new group called `first_group` to the open HDF5 file
first_group = h5file.create_group('first_group')

# Collect the metadata: author, time of creation and the h5py version in use
now_stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
metadata_dict = {
    'created_by': 'Your Name',
    'creation_date': now_stamp,
    'h5py_version': h5py.__version__,
}

# Attach every entry to the group as an attribute
for key, value in metadata_dict.items():
    first_group.attrs[key] = value

### Writing data directly to groups
- There are two main approaches to creating a dataset inside a specific group:
    - `<group_reference>.create_dataset(..)`
        - `create_dataset()` is called on the reference of the group.
        - This makes sense if you already have a reference around.
    - `h5file[ <group_name_in_the_file> ].create_dataset(..)`
        - `h5file[ <group_name_in_the_file> ]` gets the reference to create the dataset in.
        - Here, the group is accessed by name in the file.

In [81]:
# Draw 219 random numbers
random_array = np.random.rand(219)

# Store `random_array` as 'third_dataset' inside 'first_group',
# using the group reference that is already at hand
my_data_3 = first_group.create_dataset('third_dataset', data=random_array)

# Describe the dataset with a few attributes
created_at = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
metadata = {
    'created_by': 'Wen',
    'creation_date': created_at,
    'h5py_version': h5py.__version__,
}
my_data_3.attrs.update(metadata)

# Read one attribute back to confirm that it was stored
print(my_data_3.attrs['created_by'])

Wen


## Closing `hdf5` files

In [83]:
# Close the file explicitly. Objects obtained from it before (groups,
# datasets) can no longer be used once the file is closed.
h5file.close()

# Alternative: open the file in a `with` statement, which closes it
# automatically when the block is left.
# Only reading happens here, so the file is opened read-only ('r');
# 'a' would allow changes and would even create the file if it were missing.
with h5py.File("data/my_first_file.h5", "r") as h5file:
    # Access 'first_group' and, inside it, the dataset 'third_dataset'
    my_group = h5file['first_group']
    my_data_3 = my_group['third_dataset']

    # Print the value of the attribute 'created_by'
    print(my_data_3.attrs['created_by'])

Wen


### if I don't close files...

In [None]:
def get_dataset():
    """Return `first_dataset` from 'my_first_file.h5' without closing the file.

    The file is opened read-only and deliberately not closed: this example
    shows what happens when only the returned dataset still refers to it.
    """
    handle = h5py.File('./data/my_first_file.h5', 'r')
    # Look up the dataset created earlier (reading data is covered in detail later)
    dataset = handle['first_dataset']
    return dataset

some_data = get_dataset()

# The local name `h5file` inside get_dataset() is out of scope by now, but the
# file remains open because the dataset referenced by `some_data` belongs to it.

# Delete the last remaining reference to the dataset
del some_data

# With no reference left, the file will be closed

### Display content

In [84]:
# The file is only inspected here, so it is opened read-only ('r'):
# unlike 'a', this can neither modify the file nor create it by accident.
with h5py.File("data/my_first_file.h5", "r") as h5file:
    # Display keys at the top level of the HDF5 file
    print("Keys in the HDF5 file:")
    print(h5file.keys())

    # Access 'first_group'
    my_group = h5file['first_group']

    # Display keys within 'first_group' (groups offer the same keys() interface)
    print("\nKeys in 'first_group':")
    print(my_group.keys())


Keys in the HDF5 file:
<KeysViewHDF5 ['first_dataset', 'first_group', 'second_dataset']>

Keys in 'first_group':
<KeysViewHDF5 ['third_dataset']>


In [85]:
def printPath(path: str) -> None:
    """Print a single object path on its own line.

    Used as the callback handed to `visit`, which calls it once with the
    path of every object it walks over.

    Parameters
    ----------
    path : str
        Path of the visited object.
    """
    print(path)

# Walk through the whole file and print the path of every group and dataset.
# Nothing is written, so the file is opened read-only ('r') instead of 'a'.
with h5py.File("data/my_first_file.h5", "r") as h5file:
    h5file.visit(printPath)

first_dataset
first_group
first_group/third_dataset
second_dataset


### Check whether a certain path exists in an hdf5-file

In [86]:
with h5py.File('data/my_first_file.h5', 'r') as h5file:
    # The `in` operator tests whether a path exists in the file;
    # nested paths are written with '/' as separator.
    condition_1 = "first_dataset" in h5file
    condition_2 = "first_group/third_dataset" in h5file
    condition_3 = "wrong_path/weird_dataset" in h5file

    # Report the three results
    print("Does 'first_dataset' exist?\t\t ----> {}".format(condition_1))
    print("Does 'first_group/third_dataset' exist?\t ----> {}".format(condition_2))
    print("Does 'wrong_path/weird_dataset' exist?\t ----> {}".format(condition_3))

Does 'first_dataset' exist?		 ----> True
Does 'first_group/third_dataset' exist?	 ----> True
Does 'wrong_path/weird_dataset' exist?	 ----> False


## Accessing `hdf5`-files
- To load data from a file as a reference we use `<variable_reference> = <file_object>['<path>']`.
- To load data into memory simply use slicing, i.e. `<variable_in_memory> = <file_object>['<path>'][:]` would load the entire dataset into memory. Of course this also works for parts of the dataset that are sliced. To load a dataset consisting of a scalar into memory use `<file_object>['<path>'][()]`.
- Read/write operations are generally slower when accessing data directly from the hard drive. However, the loss of computational speed is, in most cases, negligible.

In [88]:
# Open the provided HDF5 file in read-only mode
with h5py.File("data/images.h5", "r") as h5file:
    # Display keys at the top level of the HDF5 file
    print("Keys in the HDF5 file:")
    print(h5file.keys())

    # Use visit method to print the entire structure of the file
    print("\nFile structure using visit:")
    h5file.visit(printPath)


Keys in the HDF5 file:
<KeysViewHDF5 ['image_data']>

File structure using visit:
image_data
image_data/dset_0
image_data/dset_1
image_data/dset_10
image_data/dset_11
image_data/dset_2
image_data/dset_3
image_data/dset_4
image_data/dset_5
image_data/dset_6
image_data/dset_7
image_data/dset_8
image_data/dset_9


In [90]:
with h5py.File("data/images.h5", "r") as h5file:

    # Slicing with [:] reads dset_0 into memory
    dset_0 = h5file['image_data/dset_0'][:]

    # Without slicing, dset_1 (same group as dset_0) is only a reference
    # to the data in the file
    dset_1 = h5file['image_data/dset_1']

    # While the file is open, both can be used
    print('mean value of dset_0:', np.mean(dset_0))
    print('mean value of dset_1:', np.mean(dset_1))

print('\n################ file closed ################\n')

# The in-memory copy is still usable after the file has been closed ...
print('mean value of dset_0:', np.mean(dset_0))

# ... whereas the reference is no longer available.
try:
    print('mean value of dset_1:', np.mean(dset_1))
except Exception:
    # `except Exception` instead of a bare `except:` so that
    # KeyboardInterrupt and SystemExit are not swallowed here.
    print('could not read dset_1!')


mean value of dset_0: 0.32746875
mean value of dset_1: 0.32349375

################ file closed ################

mean value of dset_0: 0.32746875
could not read dset_1!


## More advanced datasets with compound datatypes I

In [93]:
# Fields of one record:
#   'material'    - a 32-bit integer
#   'location'    - three 64-bit floats
#   'temperature' - a 64-bit float
sample_fields = [
    ('material', np.int32),
    ('location', np.float64, (3,)),
    ('temperature', np.float64),
]
compound_datatype = np.dtype(sample_fields)

# Two example records, given as (material, location, temperature)
samples_data = np.array(
    [
        (1, [0.1, 0.2, 0.3], 25.5),
        (2, [1.0, 2.0, 3.0], 30.0),
    ],
    dtype=compound_datatype,
)

# Write the records to a new HDF5 file and read them straight back
with h5py.File('data/samples.h5', 'w') as h5file:
    # The dataset takes over the compound datatype of `samples_data`
    samples_dataset = h5file.create_dataset('samples', data=samples_data)

    # Load the stored records into memory while the file is still open
    retrieved_data = samples_dataset[:]

# The copy in memory can be printed after the file has been closed
print(retrieved_data)

[(1, [0.1, 0.2, 0.3], 25.5) (2, [1. , 2. , 3. ], 30. )]


## More advanced datasets with compound datatypes II

In [98]:
# Define the compound datatype: every record consists of an integer
# 'material', a 3-component float 'location' and a float 'temperature'
compound_datatype = np.dtype([
    ('material', np.int32),
    ('location', np.float64, (3,)),
    ('temperature', np.float64)
])

# Provided samples; entry i of each list belongs to record i
materials = [5, 1]
locations = [[0.5, 15.2, 70.5], [100.2, 5.4, 9.0]]
temperatures = [77.4, 64.2]

# Mode 'a' keeps whatever the file already contains. For that reason the
# groups and datasets are obtained with require_group()/require_dataset(),
# which return the existing object or create it if it is missing. With
# create_group()/create_dataset() a second run of this cell would fail
# because the names already exist in the file.
with h5py.File('data/more_complex_file.h5', 'a') as h5file:
    # Variant 1: write complete records, one index at a time
    group_1 = h5file.require_group('samples_1')
    dset_1 = group_1.require_dataset('data', shape=(2,), dtype=compound_datatype)
    dset_1[0] = (materials[0], locations[0], temperatures[0])
    dset_1[1] = (materials[1], locations[1], temperatures[1])

    # Variant 2: write field by field, all records at once
    group_2 = h5file.require_group('samples_2')
    dset_2 = group_2.require_dataset('data', shape=(2,), dtype=compound_datatype)
    dset_2['material'] = materials
    dset_2['location'] = locations
    dset_2['temperature'] = temperatures

    # Variant 3: look the dataset up by its path in the file instead of
    # using the reference, then write complete records by index
    group_3 = h5file.require_group('samples_3')
    dset_3 = group_3.require_dataset('data', shape=(2,), dtype=compound_datatype)
    h5file['samples_3']['data'][0] = (materials[0], locations[0], temperatures[0])
    h5file['samples_3']['data'][1] = (materials[1], locations[1], temperatures[1])

    # Read and display data from group_1
    print("Data from samples_1:")
    print(h5file['samples_1']['data'][:])

    # Read and display data from group_2
    print("\nData from samples_2:")
    print(h5file['samples_2']['data'][:])

    # Read and display data from group_3
    print("\nData from samples_3:")
    print(h5file['samples_3']['data'][:])


Data from samples_1:
[(5, [  0.5,  15.2,  70.5], 77.4) (1, [100.2,   5.4,   9. ], 64.2)]

Data from samples_2:
[(5, [  0.5,  15.2,  70.5], 77.4) (1, [100.2,   5.4,   9. ], 64.2)]

Data from samples_3:
[(5, [  0.5,  15.2,  70.5], 77.4) (1, [100.2,   5.4,   9. ], 64.2)]
