Skip to content

Commit

Permalink
Merge pull request #384 from UCL-CCS/hdf5_decoder
Browse files Browse the repository at this point in the history
added HDF5 decoder
  • Loading branch information
wedeling committed Feb 6, 2023
2 parents ba96e7d + 6ae7990 commit 6dd5ec6
Show file tree
Hide file tree
Showing 8 changed files with 785 additions and 974 deletions.
1 change: 1 addition & 0 deletions easyvvuq/decoders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .simple_csv import SimpleCSV
from .json import JSONDecoder
from .yaml import YAMLDecoder
from .hdf5 import HDF5

__copyright__ = """
Expand Down
109 changes: 109 additions & 0 deletions easyvvuq/decoders/hdf5.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
"""A Decoder for HDF5 format files.
"""

import os
import logging
import h5py
from easyvvuq import OutputType

__copyright__ = """
Copyright 2018 Robin A. Richardson, David W. Wright
This file is part of EasyVVUQ
EasyVVUQ is free software: you can redistribute it and/or modify
it under the terms of the Lesser GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
EasyVVUQ is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Lesser GNU General Public License for more details.
You should have received a copy of the Lesser GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
__license__ = "LGPL"


# Obtain the module logger through the logging manager so it is part of the
# logger hierarchy (it propagates to the root logger and honours whatever
# logging configuration the application sets up). Instantiating
# ``logging.Logger`` directly creates a detached logger with no parent.
logger = logging.getLogger(__name__)


class HDF5:
    """Decoder that extracts selected datasets from an HDF5 output file.

    Parameters
    ----------
    target_filename: str
        Filename of an HDF5 file to decode.
    output_columns: list
        A list of column (dataset) names that will be selected to appear in
        the output.

    Raises
    ------
    RuntimeError
        If `output_columns` is empty.
    """

    def __init__(self, target_filename, output_columns):
        if len(output_columns) == 0:
            msg = "output_columns cannot be empty."
            logger.error(msg)
            raise RuntimeError(msg)
        self.target_filename = target_filename
        self.output_columns = output_columns
        self.output_type = OutputType('sample')

    @staticmethod
    def _get_output_path(run_info=None, outfile=None):
        """Builds the path of the output file inside the run directory.

        The path is the `run_dir` entry of `run_info` joined with `outfile`;
        it is therefore only absolute if `run_dir` itself is absolute.

        Parameters
        ----------
        run_info: dict
            Run info as retrieved from the database. Must contain the key
            `run_dir`.
        outfile: str
            Filename of the file to be parsed.

        Returns
        -------
        str
            Path to the output file in the run directory.

        Raises
        ------
        RuntimeError
            If the run directory does not exist.
        """
        run_path = run_info['run_dir']
        if not os.path.isdir(run_path):
            raise RuntimeError(f"Run directory does not exist: {run_path}")
        return os.path.join(run_path, outfile)

    def parse_sim_output(self, run_info=None):
        """Parses the HDF5 file and converts it to the EasyVVUQ internal
        dictionary based format.

        Each entry of `output_columns` is looked up in the file by name and
        appears as a vector QoI in the output dictionary. The stored data is
        always flattened to a one-dimensional list.

        For example, if the file contains a dataset `a` holding `[1, 3]` and a
        dataset `b` holding `[2, 4]`, and both `a` and `b` are specified as
        `output_columns`, the output will look as follows
        {'a': [1, 3], 'b': [2, 4]}.

        Parameters
        ----------
        run_info: dict
            Information about the run (used to construct the path to the
            HDF5 file that needs decoding). Must contain the key `run_dir`.

        Returns
        -------
        dict
            Mapping of each requested column name to a flat list of values.

        Raises
        ------
        RuntimeError
            If the run directory does not exist or a requested column is not
            found in the HDF5 file.
        """
        # Avoid a mutable default argument; an omitted run_info behaves
        # exactly like the empty dict that used to be the default.
        if run_info is None:
            run_info = {}

        out_path = self._get_output_path(run_info, self.target_filename)
        results = {}

        with h5py.File(out_path, 'r') as h5f:
            for column in self.output_columns:
                try:
                    # TODO: this will always flatten, but HDF5 could handle
                    # 2D or 3D arrays as well. Will probably break the analysis
                    # classes though, but maybe something to incorporate later.
                    results[column] = h5f[column][()].flatten().tolist()
                except KeyError as err:
                    # Chain explicitly so the original lookup failure stays
                    # visible as the cause of the RuntimeError.
                    raise RuntimeError(
                        'column not found in the hdf5 file: {}'.format(column)) from err

        return results
2 changes: 1 addition & 1 deletion easyvvuq/sampling/simplex_stochastic_collocation.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,7 +555,7 @@ def check_LEC_j(self, p_j, v, S_j, n_mc, queue):
----------
p_j : int
The polynomial order of the j-th stencil.
v : array
v : array
The code samples.
S_j : array, shape (N + 1,)
The interpolation stencil of the j-th simplex element, expressed
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@ dill
tqdm
qcg-pilotjob~=0.13.0
qcg-pilotjob-executor-api~=0.13.0
h5py
Binary file added tests/hdf5/test.hdf5
Binary file not shown.
24 changes: 24 additions & 0 deletions tests/test_decoder_hdf5.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from easyvvuq.decoders.hdf5 import HDF5
import os
import numpy as np
import pytest


@pytest.fixture
def decoder():
    """Provide an HDF5 decoder that selects the two test datasets."""
    selected_columns = ['array10', 'array2']
    return HDF5('test.hdf5', output_columns=selected_columns)


def test_hdf5(decoder):
    """Decode the fixture file and compare both datasets with their expected values."""
    run_dir = os.path.join('tests', 'hdf5')
    decoded = decoder.parse_sim_output({'run_dir': run_dir})
    # the two datasets have different lengths but live in the same file
    expected_array10 = np.arange(10).tolist()
    expected_array2 = np.arange(2).tolist()
    assert decoded['array10'] == expected_array10
    assert decoded['array2'] == expected_array2


def test_get_output_path(decoder):
    """Check path construction for an existing run directory and the error for a missing one."""
    existing_dir = os.path.join('tests', 'hdf5')
    expected_path = os.path.join('tests', 'hdf5', 'test.hdf5')
    resolved_path = decoder._get_output_path({'run_dir': existing_dir}, 'test.hdf5')
    assert resolved_path == expected_path

    # a run directory that does not exist must be rejected
    missing_dir = os.path.join('hdf5')
    with pytest.raises(RuntimeError):
        decoder._get_output_path({'run_dir': missing_dir}, 'test.hdf5')

0 comments on commit 6dd5ec6

Please sign in to comment.