In [1]:
import h5py
import idx2numpy as idx2np #to read the idx files
import glob
import os
import numpy as np
import timeit

In [2]:
#glob returns matches in arbitrary order, so the lists are sorted to make indexing deterministic
#(assumes the standard MNIST names, where the '*-images-*' file sorts before the '*-labels-*' file)
train_files    = sorted(glob.glob('MNIST_idx/train-*')) #first elem is train images path and second elem is train labels path
test_files     = sorted(glob.glob('MNIST_idx/t10k-*')) #first elem is test images path and second elem is test labels path
save_file_path = 'MNIST_hdf5/' #output directory for the generated .h5 files
timeit_number  = 1000 #number of repetitions used when timing the HDF5 write

In [3]:
#creates the output directory (and any missing parent directories) if it does not exist yet
try:
    os.makedirs(save_file_path)
    print("Directory is created successfully!")
except FileExistsError:
    #raised when the path is already present on disk
    print("Directory already exists!")

Directory already exists!


In [4]:
#MEMORY INTENSIVE!
#both calls load the complete image set and label set into memory as np arrays
if len(train_files) < 2:
    raise FileNotFoundError(
        "Expected the training image and label IDX files, found: %r" % (train_files,))

#glob returns matches in arbitrary order; sorting puts the images file before the
#labels file (assumes the standard 'train-images-*' / 'train-labels-*' file names)
train_images_path, train_labels_path = sorted(train_files)[:2]
load_training_images = idx2np.convert_from_file(train_images_path)
load_training_labels = idx2np.convert_from_file(train_labels_path)

#### HDF5 (Hierarchical Data Format) consists of two types of objects:

#### 1) Datasets ; 2) Groups
Datasets are multidimensional arrays, and groups are containers that hold datasets **and/or** other groups. Within a dataset, the dimensions and the element type of the array have to be uniform.

In [5]:
def single_hdf5(image, filename, label):
    '''Stores a single image and its label in one HDF5 file.

       The file is written to save_file_path as '<filename>.h5'. Opening it in
       'w' mode creates it, replacing any existing file with the same name.

       Parameters
       ----------
       image    : numpy array 
       filename : name of the file, without the '.h5' extension
       label    : label of the training image
    '''
    target_path = os.path.join(save_file_path, filename + '.h5')

    #the context manager closes the file even if writing one of the datasets fails
    with h5py.File(target_path, 'w') as file:
        #since an HDF5 file can contain more than one dataset, we store the image array and the label array in one file.
        #both datasets use the HDF5 type STD_U8BE (unsigned 8-bit integer)
        file.create_dataset("image", np.shape(image), h5py.h5t.STD_U8BE, data=image)
        file.create_dataset("meta", np.shape(label), h5py.h5t.STD_U8BE, data=label)
    

#### When saving the images as .png files to disk, a function that stores a single file has to be called repeatedly until all the images are stored. In HDF5, however, we can store them all at once.

In [6]:
def many_hdf5(images=load_training_images, labels=load_training_labels):
    '''Stores multiple images and their labels in a single HDF5 file.

       The file is written to save_file_path and named after the number of
       images, i.e. '<num_images>.h5'. Opening it in 'w' mode creates it,
       replacing any existing file with the same name.

       Note: the default arguments are bound when this definition is executed,
       so the cell has to be re-run to pick up newly loaded arrays.

       Parameters
       ----------
       images : numpy array of multiple images
       labels : corresponding labels 
    '''
    num_images = images.shape[0]
    target_path = os.path.join(save_file_path, str(num_images) + '.h5')

    #the context manager closes the file even if writing one of the datasets fails
    with h5py.File(target_path, 'w') as file:
        #both datasets use the HDF5 type STD_U8BE (unsigned 8-bit integer)
        file.create_dataset("images", np.shape(images), h5py.h5t.STD_U8BE, data=images)
        file.create_dataset("meta", np.shape(labels), h5py.h5t.STD_U8BE, data=labels)

In [7]:
#timeit returns the TOTAL time in seconds for all timeit_number calls of many_hdf5 (called with its default arguments)
time_hdf5 = timeit.timeit(many_hdf5, number=timeit_number)

In [10]:
#divide the total by the number of runs to report the mean time of a single many_hdf5 call
print("The time taken to store the images in hdf5 format is : %g"%(time_hdf5/timeit_number))

The time taken to store the images in hdf5 format is : 0.396472
