<a href="https://colab.research.google.com/github/ShaswataJash/LargeDatasetHandling/blob/master/HDF5_reading_in_C%2B%2B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation of required software packages

In [None]:
!pip install h5py==3.8.0

In [None]:
#Ref: https://docs.h5py.org/en/stable/mpi.html
#check whether the installed HDF5 library was built with parallel (MPI) support, which parallel h5py requires
!h5cc -showconfig

In [None]:
#!pip install hdf5plugin~=2.0
#Installation from source can achieve better performances than pre-built binaries. (refer: http://www.silx.org/doc/hdf5plugin/latest/install.html)
!pip install -vv hdf5plugin --no-binary hdf5plugin
#!pip install hdf5plugin==4.1.1

In [None]:
%%bash
# ref: https://www.silx.org/doc/hdf5plugin/latest/hdf5plugin_EuropeanHUG2022.html
# Ask the installed hdf5plugin package where its filter plugins live, then list that directory.
# The cell magic has to be the very first line of the cell, so the comments come after it.
# NOTE: the exported variable only exists inside this cell's bash process.
HDF5_PLUGIN_PATH="$(python3 -c 'import hdf5plugin; print(hdf5plugin.PLUGINS_PATH)')"
export HDF5_PLUGIN_PATH
echo "HDF5_PLUGIN_PATH=${HDF5_PLUGIN_PATH}"
ls "${HDF5_PLUGIN_PATH}"

In [None]:
!ls -l /usr/lib/x86_64-linux-gnu/hdf5/serial

In [None]:
#needed if c++ is being used for hdf5 access
#-p keeps the cell re-runnable when the directory already exists
!mkdir -p /usr/lib/x86_64-linux-gnu/hdf5/plugins
#resolve the plugin directory from the installed hdf5plugin package instead of hard-coding the Python version in the path
!cp "$(python3 -c 'import hdf5plugin; print(hdf5plugin.PLUGINS_PATH)')"/* /usr/lib/x86_64-linux-gnu/hdf5/plugins

In [None]:
!wget https://support.hdfgroup.org/ftp/HDF5/releases/hdf5-1.12/hdf5-1.12.1/bin/unix/hdf5-1.12.1-Std-ubuntu2010_64.tar.gz

In [None]:
!tar -xzf hdf5-1.12.1-Std-ubuntu2010_64.tar.gz

In [None]:
!hdf/HDF5-1.12.1-Linux.sh

Refer to https://support.hdfgroup.org/HDF5/doc/TechNotes/TechNote-HDF5-ImprovingIOPerformanceCompressedDatasets.pdf to understand what the h5ls and h5dump binaries are used for. In the h5dump output, **specifically note the compression ratio**.

In [None]:
!HDF5-1.12.1-Linux/HDF_Group/HDF5/1.12.1/bin/h5ls -lrv /mnt/train_multi_inputs.h5

In [None]:
!HDF5-1.12.1-Linux/HDF_Group/HDF5/1.12.1/bin/h5dump -H -p -d /train_multi_inputs/block0_values /mnt/train_multi_inputs.h5

# HDF5 reading through C++

In [None]:
!rm -Rf HighFive
!git clone https://github.com/BlueBrain/HighFive.git

In [None]:
#tqdm.cpp is the cloned repository directory, so it must be removed recursively before re-cloning
!rm -Rf tqdm.cpp
!git clone https://github.com/tqdm/tqdm.cpp.git

In [None]:
#Ref: https://pytorch.org/get-started/locally/ [Choose > Stable: Linux: LibTorch: C++: cpu]
#-nc skips the download when the archive is already present, so a re-run does not create a duplicate "*.zip.1"
!wget -nc https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.0.0%2Bcpu.zip
#-o overwrites previously extracted files without prompting
!unzip -o libtorch-cxx11*.zip

In [None]:
#-p keeps the cell re-runnable when the directory already exists
!mkdir -p torch_example_app

In [None]:
%%writefile torch_example_app/CMakeLists.txt

# Build script for the LibTorch demo program example-app.cpp.
# LibTorch is located by find_package(Torch), so its install prefix must be
# passed at configure time, e.g.:
#   cmake -DCMAKE_PREFIX_PATH=/content/libtorch ..
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
project(example-app)

find_package(Torch REQUIRED)

# TorchConfig exports its required compiler flags only as a flag string, so
# they are appended to the global flags as the LibTorch instructions do.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")

add_executable(example-app example-app.cpp)
target_include_directories(example-app PRIVATE ${TORCH_INCLUDE_DIRS})
target_link_libraries(example-app PRIVATE ${TORCH_LIBRARIES})
set_property(TARGET example-app PROPERTY CXX_STANDARD 14)

In [None]:
%%writefile torch_example_app/example-app.cpp

#include <torch/torch.h>
#include <iostream>

int main() {
  const torch::Tensor& tensor1 = torch::rand({5, 3});
  std::cout << "tensor1:\n" << tensor1 << &tensor1 << std::endl;

  const std::tuple<at::Tensor, at::Tensor>& min_res = at::min(tensor1, 0, true);
  std::cout << "min_res of tensor1:\n" << std::get<0>(min_res) << "index of min_res=" << std::get<1>(min_res)  << std::endl;

  const torch::Tensor& tensor2 = torch::rand({1, 3});
  std::cout << "tensor2:\n" << tensor2 << &tensor2 << std::endl;

  const torch::Tensor& min = at::min(std::get<0>(min_res), tensor2);
  std::cout << "min between min_res of tensor1 and tensor2:\n" << min << &min << std::endl;

  torch::Tensor r = torch::rand({1, 3});
  std::cout << "random tensor r:\n" << r << &r << std::endl;
  const torch::Tensor& after_copy = at::copy_out(r, r, min);
  std::cout << "tensor r (after copy from min):\n" << r << &r << std::endl;
  std::cout << "result_r after_copy (after copy from min):\n" << after_copy << &after_copy << std::endl;
  
  return 0;
}

In [None]:
#-p keeps the cell re-runnable when the build directory already exists
!mkdir -p torch_example_app/build
%cd torch_example_app/build
#single-config generators (Makefiles/Ninja) pick the build type at configure time via CMAKE_BUILD_TYPE;
#"--config Release" below only takes effect with multi-config generators
!cmake -DCMAKE_PREFIX_PATH=/content/libtorch -DCMAKE_BUILD_TYPE=Release ..
!cmake --build . --config Release
%cd /content

In [None]:
!./torch_example_app/build/example-app

In [None]:
#-p keeps the cell re-runnable when the directory already exists
!mkdir -p torch_example_along_with_hdf5

In [None]:
%%writefile torch_example_along_with_hdf5/hdf5reading.cpp

#include <torch/torch.h>

#include <highfive/H5File.hpp>
#include <highfive/H5DataSet.hpp>
#include <highfive/H5DataSpace.hpp>
using namespace HighFive;

#include "tqdm/tqdm.h"

#include <algorithm>
#include <iostream>

const std::string FILE_NAME("/mnt/train_multi_inputs.h5");
const int BATCH_SIZE = 4000;
int main(void) {

    try {
        // We open the file as read-only:
        File file(FILE_NAME, File::ReadOnly);
        const DataSet& dataset = file.getDataSet("/train_multi_inputs/block0_values");

        // get the dimension of the dataset
        const std::vector< size_t >& dim = dataset.getDimensions();
        std::cout << "dimension:(" << dim[0] << "," <<  dim[1] << ")" << std::endl; 

        const DataType& dType = dataset.getDataType();
        std::cout << "datatype:" << dType.string() << std::endl; 

        //float *result = new float[BATCH_SIZE * dim[1]];
        auto options = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false);
        torch::Tensor pre_allocated_tensor = torch::zeros({BATCH_SIZE, dim[1]}, options);
        //ref: https://discuss.pytorch.org/t/can-i-get-the-cuda-tensor-pointer-in-the-python-pytorch/141195
        float *result = pre_allocated_tensor.data_ptr<float>();

        torch::Tensor max_t = torch::zeros({1, dim[1]}, options);
        torch::Tensor min_t = torch::zeros({1, dim[1]}, options); 
        bool not_inited = false;
        for (int startingRow : tqdm::range(0, (int)dim[0], BATCH_SIZE)){
            
            dataset.select({startingRow, 0}, {std::min(BATCH_SIZE, (int)(dim[0]) - startingRow), dim[1]}).read(result);
            
            const std::tuple<at::Tensor, at::Tensor>& max_col = at::max(pre_allocated_tensor, 0, true); //max finding along dimension 0
            const std::tuple<at::Tensor, at::Tensor>& min_col = at::min(pre_allocated_tensor, 0, true); //min finding along dimension 0

            //std::cout << "startingRow : " << startingRow << std::endl;
            if (not_inited){
                at::copy_out(max_t, max_t, std::get<0>(max_col));
                at::copy_out(min_t, min_t, std::get<0>(min_col));
            }else{
                const torch::Tensor& local_max = at::max(std::get<0>(max_col), max_t);
                const torch::Tensor& local_min = at::max(std::get<0>(min_col), min_t);
                at::copy_out(max_t, max_t, local_max);
                at::copy_out(min_t, min_t, local_min);
            }
        }

        //delete [] result;

    } catch (Exception& err) {
        // catch and print any HDF5 error
        std::cerr << err.what() << std::endl;
        return -1;
    }

    return 0;  // successfully terminated
}

In [None]:
%%writefile torch_example_along_with_hdf5/CMakeLists.txt
# Build script for hdf5reading.cpp (LibTorch + HDF5 through HighFive + tqdm.cpp).
# Configure with the LibTorch prefix, e.g.:
#   cmake -DCMAKE_PREFIX_PATH=/content/libtorch ..
#
# https://github.com/BlueBrain/HighFive/issues/350 asks why /usr/include/hdf5/serial
# has to be included by hand. Here find_package(HDF5) supplies the HDF5 include
# directories and libraries instead, so no distribution-specific path is hard-coded.
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
project(torch-hdf5-example-app)

find_package(Torch REQUIRED)
find_package(HDF5 REQUIRED COMPONENTS C)

# Header-only dependencies cloned by the notebook; override with -D<VAR>=<path> if they live elsewhere.
set(HIGHFIVE_INCLUDE_DIR "/content/HighFive/include" CACHE PATH "Directory containing the highfive/ headers")
set(TQDM_INCLUDE_DIR "/content/tqdm.cpp/include" CACHE PATH "Directory containing the tqdm/ headers")

# TorchConfig exports its required compiler flags only as a flag string, so
# they are appended to the global flags as the LibTorch instructions do.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")

add_executable(torch-hdf5-example-app hdf5reading.cpp)
target_include_directories(torch-hdf5-example-app PRIVATE
  ${TORCH_INCLUDE_DIRS}
  ${HDF5_INCLUDE_DIRS}
  "${HIGHFIVE_INCLUDE_DIR}"
  "${TQDM_INCLUDE_DIR}"
)
target_link_libraries(torch-hdf5-example-app PRIVATE
  ${TORCH_LIBRARIES}
  ${HDF5_LIBRARIES}
)
set_property(TARGET torch-hdf5-example-app PROPERTY CXX_STANDARD 14)

In [None]:
%cd /content

In [None]:
#-p keeps the cell re-runnable when the build directory already exists
!mkdir -p torch_example_along_with_hdf5/build
%cd torch_example_along_with_hdf5/build
#single-config generators (Makefiles/Ninja) pick the build type at configure time via CMAKE_BUILD_TYPE;
#"--config Release" below only takes effect with multi-config generators
!cmake -DCMAKE_PREFIX_PATH=/content/libtorch -DCMAKE_BUILD_TYPE=Release ..
!cmake --build . --config Release
%cd /content

In [None]:
!pwd

In [None]:
!./torch_example_along_with_hdf5/build/torch-hdf5-example-app