Permalink
Browse files

Support unicode attributes at HDF5 level

  • Loading branch information...
avalentino committed Oct 6, 2012
1 parent 33fa52a commit 20886e6d74df172a696c6dba1a69aef34ae6f1c3
Showing with 119 additions and 43 deletions.
  1. +11 −0 RELEASE_NOTES.txt
  2. +16 −3 src/H5ATTR.c
  3. +6 −3 src/H5ATTR.h
  4. +3 −3 tables/definitions.pxd
  5. +65 −26 tables/hdf5Extension.pyx
  6. +9 −4 tables/tableExtension.pyx
  7. +9 −4 tables/utilsExtension.pyx
View
@@ -92,6 +92,17 @@ Many deprecated features and support for obsolete modules has been dropped:
removed
+Other backward incompatible changes
+-----------------------------------
+
+- Unicode attributes are no longer stored in the HDF5 file as pickled strings.
+ They are now saved in the HDF5 file as UTF-8 encoded strings.
+
+ Although this does not introduce any API breakage, the files produced are
+ different (as far as unicode attributes are concerned) from the ones produced
+ by earlier versions of PyTables.
+
+
Bugs fixed
----------
View
@@ -114,7 +114,8 @@ herr_t H5ATTRset_attribute( hid_t obj_id,
herr_t H5ATTRset_attribute_string( hid_t obj_id,
const char *attr_name,
- const char *attr_data )
+ const char *attr_data,
+ int cset )
{
hid_t attr_type;
hid_t attr_size;
@@ -126,6 +127,10 @@ herr_t H5ATTRset_attribute_string( hid_t obj_id,
if ( (attr_type = H5Tcopy( H5T_C_S1 )) < 0 )
goto out;
+ if ( ( ( cset == H5T_CSET_ASCII ) || ( cset == H5T_CSET_UTF8 ) ) &&
+ ( H5Tset_cset( attr_type, cset ) < 0 ) )
+ goto out;
+
attr_size = strlen( attr_data ) + 1; /* extra null term */
if ( H5Tset_size( attr_type, (size_t)attr_size) < 0 )
@@ -237,7 +242,8 @@ herr_t H5ATTRget_attribute( hid_t obj_id,
herr_t H5ATTRget_attribute_string( hid_t obj_id,
const char *attr_name,
- char **data)
+ char **data,
+ int *cset )
{
/* identifiers */
hid_t attr_id;
@@ -253,6 +259,9 @@ herr_t H5ATTRget_attribute_string( hid_t obj_id,
if ( (attr_type = H5Aget_type( attr_id )) < 0 )
goto out;
+ if ( ( cset != NULL ) && ( ( *cset = H5Tget_cset( attr_type ) ) < 0 ) )
+ goto out;
+
is_vlstr = H5Tis_variable_str( attr_type );
if ( is_vlstr == 0 )
{
@@ -310,7 +319,8 @@ herr_t H5ATTRget_attribute_string( hid_t obj_id,
int H5ATTRget_attribute_vlen_string_array( hid_t obj_id,
const char *attr_name,
- char ***data)
+ char ***data,
+ int *cset )
{
/* identifiers */
hid_t attr_id = -1, attr_type = -1, space_id = -1;
@@ -325,6 +335,9 @@ int H5ATTRget_attribute_vlen_string_array( hid_t obj_id,
if ( (attr_type = H5Aget_type( attr_id )) < 0 )
goto out;
+ if ( ( cset != NULL ) && ( ( *cset = H5Tget_cset( attr_type ) ) < 0 ) )
+ goto out;
+
if ( (space_id = H5Aget_space( attr_id )) < 0 )
goto out;
View
@@ -39,7 +39,8 @@ herr_t H5ATTRset_attribute( hid_t obj_id,
herr_t H5ATTRset_attribute_string( hid_t obj_id,
const char *attr_name,
- const char *attr_data );
+ const char *attr_data,
+ int cset);
herr_t H5ATTRget_attribute( hid_t loc_id,
const char *attr_name,
@@ -48,11 +49,13 @@ herr_t H5ATTRget_attribute( hid_t loc_id,
herr_t H5ATTRget_attribute_string( hid_t obj_id,
const char *attr_name,
- char **data);
+ char **data,
+ int *cset );
int H5ATTRget_attribute_vlen_string_array( hid_t obj_id,
const char *attr_name,
- char ***data );
+ char ***data,
+ int *cset );
/*-------------------------------------------------------------------------
*
View
@@ -433,14 +433,14 @@ cdef extern from "H5ATTR.h" nogil:
herr_t H5ATTRget_attribute(hid_t loc_id, char *attr_name,
hid_t type_id, void *data)
herr_t H5ATTRget_attribute_string(hid_t loc_id, char *attr_name,
- char **attr_value)
+ char **attr_value, int *cset)
herr_t H5ATTRget_attribute_vlen_string_array(hid_t loc_id, char *attr_name,
- char ***attr_value)
+ char ***attr_value, int *cset)
herr_t H5ATTRset_attribute(hid_t obj_id, char *attr_name,
hid_t type_id, size_t rank, hsize_t *dims,
char *attr_data )
herr_t H5ATTRset_attribute_string(hid_t loc_id, char *attr_name,
- char *attr_data)
+ char *attr_data, int cset)
herr_t H5ATTRfind_attribute(hid_t loc_id, char *attr_name)
herr_t H5ATTRget_type_ndims(hid_t loc_id, char *attr_name,
hid_t *type_id, H5T_class_t *class_id,
View
@@ -58,17 +58,18 @@ from utilsExtension cimport malloc_dims, get_native_type
# Types, constants, functions, classes & other objects from everywhere
from libc.stdlib cimport malloc, free
-from libc.string cimport strdup
+from libc.string cimport strdup, strlen
from numpy cimport import_array, ndarray
from cpython cimport (PyString_AsString, PyString_FromStringAndSize,
PyString_Check)
+from cpython.unicode cimport PyUnicode_DecodeUTF8
from definitions cimport (const_char, uintptr_t, hid_t, herr_t, hsize_t, hvl_t,
H5S_seloper_t, H5D_FILL_VALUE_UNDEFINED,
H5O_TYPE_UNKNOWN, H5O_TYPE_GROUP, H5O_TYPE_DATASET, H5O_TYPE_NAMED_DATATYPE,
H5L_TYPE_ERROR, H5L_TYPE_HARD, H5L_TYPE_SOFT, H5L_TYPE_EXTERNAL,
- H5T_class_t, H5T_sign_t, H5T_NATIVE_INT,
+ H5T_class_t, H5T_sign_t, H5T_NATIVE_INT, H5T_CSET_ASCII, H5T_CSET_UTF8,
H5F_SCOPE_GLOBAL, H5F_ACC_TRUNC, H5F_ACC_RDONLY, H5F_ACC_RDWR,
H5P_DEFAULT, H5P_FILE_ACCESS, H5P_FILE_CREATE,
H5S_SELECT_SET, H5S_SELECT_AND, H5S_SELECT_NOTB,
@@ -94,6 +95,7 @@ from definitions cimport (const_char, uintptr_t, hid_t, herr_t, hsize_t, hvl_t,
H5_HAVE_WINDOWS_DRIVER, pt_H5Pset_fapl_windows,
H5_HAVE_IMAGE_FILE, pt_H5Pset_file_image, pt_H5Fget_file_image)
+cdef int H5T_CSET_DEFAULT = 16
# Include conversion tables
include "convtypetables.pxi"
@@ -198,22 +200,28 @@ cdef object getshape(int rank, hsize_t *dims):
# Helper function for quickly fetch an attribute string
cdef object get_attribute_string_or_none(node_id, attr_name):
- """Returns a string attribute if it exists in node_id.
+ """Returns a string/unicode attribute if it exists in node_id.
It returns ``None`` in case it don't exists (or there have been problems
reading it).
"""
cdef char *attr_value
+ cdef int cset = H5T_CSET_DEFAULT
cdef object retvalue
attr_value = NULL
retvalue = None # Default value
if H5ATTRfind_attribute(node_id, attr_name):
- ret = H5ATTRget_attribute_string(node_id, attr_name, &attr_value)
+ ret = H5ATTRget_attribute_string(node_id, attr_name, &attr_value, &cset)
if ret < 0:
return None
- retvalue = numpy.string_(attr_value)
+ if cset == H5T_CSET_UTF8:
+ retvalue = PyUnicode_DecodeUTF8(attr_value, strlen(attr_value), NULL)
+ retvalue = numpy.unicode_(retvalue)
+ else:
+ retvalue = numpy.string_(attr_value)
+
# Important to release attr_value, because it has been malloc'ed!
if attr_value:
free(<void *>attr_value)
@@ -613,6 +621,7 @@ cdef class AttributeSet:
cdef hsize_t *dims
cdef ndarray ndv
cdef object byteorder, rabyteorder, baseatom
+ cdef int cset = H5T_CSET_DEFAULT
# The dataset id of the node
dset_id = node._v_objectID
@@ -655,13 +664,14 @@ cdef class AttributeSet:
if (isinstance(value, numpy.ndarray) and
value.dtype.kind == 'U' and
value.shape == ()):
- value = value[()]
- # Convert this object to a null-terminated string
- # (binary pickles are not supported at this moment)
- value = pickle.dumps(value, 0)
- ret = H5ATTRset_attribute_string(dset_id, name, value)
+ value = value[()].encode('utf-8')
+ cset = H5T_CSET_UTF8
+ else:
+ # Convert this object to a null-terminated string
+ # (binary pickles are not supported at this moment)
+ value = pickle.dumps(value, 0)
- return
+ ret = H5ATTRset_attribute_string(dset_id, name, value, cset)
# Get attributes
@@ -682,6 +692,7 @@ cdef class AttributeSet:
cdef ndarray ndvalue
cdef object shape, stype_atom, shape_atom, retvalue
cdef int i, nelements
+ cdef int cset = H5T_CSET_DEFAULT
# The dataset id of the node
dset_id = node._v_objectID
@@ -695,11 +706,15 @@ cdef class AttributeSet:
# Call a fast function for scalar values and typical class types
if (rank == 0 and class_id == H5T_STRING):
- ret = H5ATTRget_attribute_string(dset_id, attrname, &str_value)
+ ret = H5ATTRget_attribute_string(dset_id, attrname, &str_value, &cset)
if ret < 0:
raise HDF5ExtError("Can't read attribute %s in node %s." %
(attrname, self.name))
- retvalue = numpy.string_(str_value)
+ if cset == H5T_CSET_UTF8:
+ retvalue = PyUnicode_DecodeUTF8(str_value, strlen(str_value), NULL)
+ retvalue = numpy.unicode_(retvalue)
+ else:
+ retvalue = numpy.string_(str_value)
# Important to release attr_value, because it has been malloc'ed!
if str_value:
free(str_value)
@@ -734,14 +749,29 @@ cdef class AttributeSet:
except TypeError:
if class_id == H5T_STRING and H5Tis_variable_str(type_id):
nelements = H5ATTRget_attribute_vlen_string_array(dset_id, attrname,
- &str_values)
+ &str_values, &cset)
if nelements < 0:
raise HDF5ExtError("Can't read attribute %s in node %s." %
(attrname, self.name))
- #vl = [<char*>str_values[i] for i in range(nelements)]
- retvalue = numpy.array(
- [<char*>str_values[i] for i in range(nelements)], "O8")
+ # The following generator expressions do not work with Cython 0.15.1
+ if cset == H5T_CSET_UTF8:
+ #retvalue = numpy.fromiter(
+ # PyUnicode_DecodeUTF8(<char*>str_values[i],
+ # strlen(<char*>str_values[i]),
+ # NULL)
+ # for i in range(nelements), "O8")
+ retvalue = numpy.array([
+ PyUnicode_DecodeUTF8(<char*>str_values[i],
+ strlen(<char*>str_values[i]),
+ NULL)
+ for i in range(nelements)], "O8")
+
+ else:
+ #retvalue = numpy.fromiter(
+ # <char*>str_values[i] for i in range(nelements), "O8")
+ retvalue = numpy.array(
+ [<char*>str_values[i] for i in range(nelements)], "O8")
retvalue.shape = shape
# Important to release attr_value, because it has been malloc'ed!
@@ -1102,9 +1132,12 @@ cdef class Array(Leaf):
if self._v_file.params['PYTABLES_SYS_ATTRS']:
# Set the conforming array attributes
- H5ATTRset_attribute_string(self.dataset_id, "CLASS", class_ )
- H5ATTRset_attribute_string(self.dataset_id, "VERSION", version)
- H5ATTRset_attribute_string(self.dataset_id, "TITLE", title)
+ H5ATTRset_attribute_string(self.dataset_id, "CLASS", class_,
+ H5T_CSET_ASCII)
+ H5ATTRset_attribute_string(self.dataset_id, "VERSION", version,
+ H5T_CSET_ASCII)
+ H5ATTRset_attribute_string(self.dataset_id, "TITLE", title,
+ H5T_CSET_ASCII)
# Get the native type (so that it is HDF5 who is the responsible to deal
# with non-native byteorders on-disk)
@@ -1160,9 +1193,12 @@ cdef class Array(Leaf):
if self._v_file.params['PYTABLES_SYS_ATTRS']:
# Set the conforming array attributes
- H5ATTRset_attribute_string(self.dataset_id, "CLASS", class_ )
- H5ATTRset_attribute_string(self.dataset_id, "VERSION", version)
- H5ATTRset_attribute_string(self.dataset_id, "TITLE", title)
+ H5ATTRset_attribute_string(self.dataset_id, "CLASS", class_,
+ H5T_CSET_ASCII)
+ H5ATTRset_attribute_string(self.dataset_id, "VERSION", version,
+ H5T_CSET_ASCII)
+ H5ATTRset_attribute_string(self.dataset_id, "TITLE", title,
+ H5T_CSET_ASCII)
if self.extdim >= 0:
extdim = <ndarray>numpy.array([self.extdim], dtype="int32")
# Attach the EXTDIM attribute in case of enlargeable arrays
@@ -1667,9 +1703,12 @@ cdef class VLArray(Leaf):
if self._v_file.params['PYTABLES_SYS_ATTRS']:
# Set the conforming array attributes
- H5ATTRset_attribute_string(self.dataset_id, "CLASS", class_ )
- H5ATTRset_attribute_string(self.dataset_id, "VERSION", version)
- H5ATTRset_attribute_string(self.dataset_id, "TITLE", title)
+ H5ATTRset_attribute_string(self.dataset_id, "CLASS", class_,
+ H5T_CSET_ASCII)
+ H5ATTRset_attribute_string(self.dataset_id, "VERSION", version,
+ H5T_CSET_ASCII)
+ H5ATTRset_attribute_string(self.dataset_id, "TITLE", title,
+ H5T_CSET_ASCII)
# Get the datatype handles
self.disk_type_id, self.type_id = self._get_type_ids()
View
@@ -51,6 +51,7 @@ from definitions cimport (hid_t, herr_t, hsize_t, htri_t,
H5T_class_t, H5Tget_size, H5Tset_size, H5Tcreate, H5Tcopy, H5Tclose,
H5Tget_nmembers, H5Tget_member_name, H5Tget_member_type, H5Tget_native_type,
H5Tget_member_value, H5Tinsert, H5Tget_class, H5Tget_super, H5Tget_offset,
+ H5T_CSET_ASCII,
H5ATTRset_attribute_string, H5ATTRset_attribute,
get_len_of_range, get_order, set_order, is_complex,
conv_float64_timeval32, truncate_dset)
@@ -190,17 +191,20 @@ cdef class Table(Leaf):
if self._v_file.params['PYTABLES_SYS_ATTRS']:
# Set the conforming table attributes
# Attach the CLASS attribute
- ret = H5ATTRset_attribute_string(self.dataset_id, "CLASS", class_)
+ ret = H5ATTRset_attribute_string(self.dataset_id, "CLASS", class_,
+ H5T_CSET_ASCII)
if ret < 0:
raise HDF5ExtError("Can't set attribute '%s' in table:\n %s." %
("CLASS", self.name))
# Attach the VERSION attribute
- ret = H5ATTRset_attribute_string(self.dataset_id, "VERSION", obversion)
+ ret = H5ATTRset_attribute_string(self.dataset_id, "VERSION", obversion,
+ H5T_CSET_ASCII)
if ret < 0:
raise HDF5ExtError("Can't set attribute '%s' in table:\n %s." %
("VERSION", self.name))
# Attach the TITLE attribute
- ret = H5ATTRset_attribute_string(self.dataset_id, "TITLE", title)
+ ret = H5ATTRset_attribute_string(self.dataset_id, "TITLE", title,
+ H5T_CSET_ASCII)
if ret < 0:
raise HDF5ExtError("Can't set attribute '%s' in table:\n %s." %
("TITLE", self.name))
@@ -216,7 +220,8 @@ cdef class Table(Leaf):
# We write only the first level names
for i, name in enumerate(self.description._v_names):
fieldname = "FIELD_%s_NAME" % i
- ret = H5ATTRset_attribute_string(self.dataset_id, fieldname, name)
+ ret = H5ATTRset_attribute_string(self.dataset_id, fieldname, name,
+ H5T_CSET_ASCII)
if ret < 0:
raise HDF5ExtError("Can't set attribute '%s' in table:\n %s." %
(fieldname, self.name))
Oops, something went wrong.

0 comments on commit 20886e6

Please sign in to comment.