## Playing with HDF5 files

### Import necessary libraries

In [1]:
import h5py
import numpy as np

### Create a file

In [2]:
# Open 'file.h5' for writing; mode 'w' truncates the file if it already exists.
file = h5py.File('file.h5', mode='w')

### Create dataset
A dataset holds the raw data together with some metadata (attributes) describing that data.

In [3]:
# Allocate a 4x6 matrix of big-endian 32-bit integers named "/dset".
dataset = file.create_dataset(
    "/dset",
    shape=(4, 6),
    dtype=h5py.h5t.STD_I32BE,
)

# Attach a free-text description to the dataset as an attribute.
dataset.attrs["Features"] = "Some features on the root group"

### Display some information

In [4]:
# Each entry pairs a human-readable label with the dataset attribute to show.
info_fields = [
    ("Dataset dataspace is", "shape"),
    ("Dataset Numpy datatype is", "dtype"),
    ("Dataset name is", "name"),
    ("Dataset is a member of the group", "parent"),
    ("Dataset was created in the file", "file"),
]

# Look each attribute up right before printing, in the listed order.
for label, attr_name in info_fields:
    print (label, getattr(dataset, attr_name))

('Dataset dataspace is', (4, 6))
('Dataset Numpy datatype is', dtype('>i4'))
('Dataset name is', u'/dset')
('Dataset is a member of the group', <HDF5 group "/" (1 members)>)
('Dataset was created in the file', <HDF5 file "file.h5" (mode r+)>)


### Populate data

In [5]:
# Build a 4x6 float matrix holding 1..24 in row-major order.
# arange + reshape produces the same array as filling np.zeros((4, 6))
# element by element, without a nested Python loop or chained indexing.
data = np.arange(1, 25, dtype=np.float64).reshape(4, 6)

print(data)

[[  1.   2.   3.   4.   5.   6.]
 [  7.   8.   9.  10.  11.  12.]
 [ 13.  14.  15.  16.  17.  18.]
 [ 19.  20.  21.  22.  23.  24.]]


### Save data on dataset

In [6]:
# Write the whole matrix into the dataset; the float values are stored using
# the dataset's own integer type.
dataset[:, :] = data

### Create another group
Think of groups as directories: they organise datasets (and other groups) inside the file.

In [7]:
# Add a group called "Labels" to the file.
group = file.create_group(name="Labels")

### Create a dataset on a group

In [8]:
# Create a 2x3 little-endian 16-bit integer dataset named "Y" inside the group.
dataset2 = group.create_dataset("Y", (2, 3), dtype=h5py.h5t.STD_I16LE)

# Build the label matrix from a single literal instead of assigning each
# element separately. dtype=float keeps the array identical to the one that
# was previously built on top of np.zeros((2, 3)).
Y = np.array([[0, 0, 1],
              [0, 1, 0]], dtype=float)

print(Y)

# Put the data on the new dataset; it is stored using the dataset's integer type.
dataset2[...] = Y

[[ 0.  0.  1.]
 [ 0.  1.  0.]]


### Close file and save data

In [9]:
# Flush pending writes first, then close the file so it can be inspected
# from outside the notebook.
file.flush()
file.close()

### Inspecting HDF5 file from the command line

In [10]:
# Shell escape: print the structure and contents of file.h5 with the h5dump tool.
!h5dump file.h5

HDF5 "file.h5" {
GROUP "/" {
   GROUP "Labels" {
      DATASET "Y" {
         DATATYPE  H5T_STD_I16LE
         DATASPACE  SIMPLE { ( 2, 3 ) / ( 2, 3 ) }
         DATA {
         (0,0): 0, 0, 1,
         (1,0): 0, 1, 0
         }
      }
   }
   DATASET "dset" {
      DATATYPE  H5T_STD_I32BE
      DATASPACE  SIMPLE { ( 4, 6 ) / ( 4, 6 ) }
      DATA {
      (0,0): 1, 2, 3, 4, 5, 6,
      (1,0): 7, 8, 9, 10, 11, 12,
      (2,0): 13, 14, 15, 16, 17, 18,
      (3,0): 19, 20, 21, 22, 23, 24
      }
      ATTRIBUTE "Features" {
         DATATYPE  H5T_STRING {
            STRSIZE H5T_VARIABLE;
            STRPAD H5T_STR_NULLTERM;
            CSET H5T_CSET_ASCII;
            CTYPE H5T_C_S1;
         }
         DATASPACE  SCALAR
         DATA {
         (0): "Some features on the root group"
         }
      }
   }
}
}


## Open a file
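This file was generated in MATLAB with the functions `h5create` and `h5write`, from two matrices X[2x3] and Y[4x1]. MATLAB stores arrays in column-major order, so the dimensions appear reversed in the HDF5 file (X as 3x2 and Y as 1x4).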

In [17]:
# Shell escape: print the structure and contents of for_leo.h5 with the h5dump tool.
!h5dump for_leo.h5

HDF5 "for_leo.h5" {
GROUP "/" {
   DATASET "X" {
      DATATYPE  H5T_IEEE_F64LE
      DATASPACE  SIMPLE { ( 3, 2 ) / ( 3, 2 ) }
      DATA {
      (0,0): 1, 4,
      (1,0): 2, 5,
      (2,0): 3, 6
      }
   }
   DATASET "Y" {
      DATATYPE  H5T_IEEE_F64LE
      DATASPACE  SIMPLE { ( 1, 4 ) / ( 1, 4 ) }
      DATA {
      (0,0): 5, 6, 7, 8
      }
   }
}
}


In [11]:
# Open the existing 'for_leo.h5' in read/write mode ('r+').
file = h5py.File('for_leo.h5', mode='r+')

### Open the dataset X on the root group

In [16]:
# Look up the dataset stored at the path '/X'. Note that this rebinds the name
# `dataset`, which earlier referred to '/dset' in file.h5.
dataset = file['/X']

### Read dataset

In [20]:
# Read the entire dataset into memory and show what came back.
X = dataset[...]
print(type(X))
print(X)

# The dataset is 3x2 on disk, so flip the axes to recover the 2x3 layout.
X_t = X.T
print('Transposing to keep same format')
print(X_t)

<type 'numpy.ndarray'>
[[ 1.  4.]
 [ 2.  5.]
 [ 3.  6.]]
Transposing to keep same format
[[ 1.  2.  3.]
 [ 4.  5.  6.]]
