From 9343f449137496e3727a77ca54e0f2b9852c42eb Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Fri, 16 Sep 2022 23:17:11 -0500 Subject: [PATCH] Move `Attr` To Pure Python --- HISTORY.md | 3 + tiledb/__init__.py | 3 +- tiledb/attribute.py | 301 ++++++++++++++++++++++ tiledb/cc/attribute.cc | 67 ++++- tiledb/cc/common.cc | 6 +- tiledb/cc/enum.cc | 30 +-- tiledb/filter.py | 21 +- tiledb/indexing.pyx | 17 +- tiledb/libtiledb.pxd | 11 - tiledb/libtiledb.pyx | 439 ++------------------------------- tiledb/np2buf.py | 106 ++++++++ tiledb/tests/test_attribute.py | 179 ++++++++++++++ tiledb/tests/test_libtiledb.py | 171 ------------- tiledb/util.py | 111 ++++++++- 14 files changed, 808 insertions(+), 657 deletions(-) create mode 100644 tiledb/attribute.py create mode 100644 tiledb/np2buf.py create mode 100644 tiledb/tests/test_attribute.py diff --git a/HISTORY.md b/HISTORY.md index 86b4c6f8f5..d11486a02e 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,8 @@ # In Progress +## Improvements +* Move `Attr` from Cython to pure Python [#1326](https://github.com/TileDB-Inc/TileDB-Py/pull/1326) + ## API Changes * Permit true-ASCII attributes in non-from-pandas dataframes [#1337](https://github.com/TileDB-Inc/TileDB-Py/pull/1337) * Addition of `Array.upgrade_version` to upgrade array to latest version [#1334](https://github.com/TileDB-Inc/TileDB-Py/pull/1334) diff --git a/tiledb/__init__.py b/tiledb/__init__.py index 450f2405b5..e45f1174c2 100644 --- a/tiledb/__init__.py +++ b/tiledb/__init__.py @@ -42,7 +42,6 @@ Config, Dim, Domain, - Attr, ArraySchema, consolidate, object_type, @@ -59,6 +58,8 @@ from .array import DenseArray, SparseArray +from .attribute import Attr + from .filter import ( Filter, FilterList, diff --git a/tiledb/attribute.py b/tiledb/attribute.py new file mode 100644 index 0000000000..5418fe43c4 --- /dev/null +++ b/tiledb/attribute.py @@ -0,0 +1,301 @@ +import io +import numpy as np +import warnings +from typing import Any, Sequence, TYPE_CHECKING, Union + 
+import tiledb.cc as lt +from .np2buf import array_type_ncells +from .ctx import default_ctx +from .filter import FilterList, Filter +from .util import _numpy_dtype, _tiledb_type_is_datetime + +if TYPE_CHECKING: + from .libtiledb import Ctx + + +class Attr(lt.Attribute): + """ + Represents a TileDB attribute. + """ + + def __init__( + self, + name: str = "", + dtype: np.dtype = np.float64, + fill: Any = None, + var: bool = None, + nullable: bool = False, + filters: Union[FilterList, Sequence[Filter]] = None, + ctx: "Ctx" = None, + _lt_obj: lt.Attribute = None, + _capsule: "PyCapsule" = None, + ): + """Class representing a TileDB array attribute. + + :param tiledb.Ctx ctx: A TileDB Context + :param str name: Attribute name, empty if anonymous + :param dtype: Attribute value datatypes + :type dtype: numpy.dtype object or type or string + :param nullable: Attribute is nullable + :type bool: + :param fill: Fill value for unset cells. + :param var: Attribute is variable-length (automatic for byte/string types) + :type dtype: bool + :param filters: List of filters to apply + :type filters: FilterList + :raises TypeError: invalid dtype + :raises: :py:exc:`tiledb.TileDBError` + + """ + self._ctx = ctx or default_ctx() + _cctx = lt.Context(self._ctx, False) + _dtype = None + + if _capsule is not None: + return super().__init__(_cctx, _capsule) + + if _lt_obj is not None: + name = _lt_obj._name + if _lt_obj._tiledb_dtype == lt.DataType.STRING_ASCII: + dtype = "ascii" + elif _lt_obj._tiledb_dtype == lt.DataType.BLOB: + dtype = "blob" + else: + dtype = np.dtype(_numpy_dtype(_lt_obj._tiledb_dtype, _lt_obj._ncell)) + nullable = _lt_obj._nullable + if not nullable: + fill = self._get_fill(_lt_obj._fill, _lt_obj._tiledb_dtype) + var = _lt_obj._var + filters = _lt_obj._filters + + if isinstance(dtype, str) and dtype == "ascii": + tiledb_dtype = lt.DataType.STRING_ASCII + _ncell = lt.TILEDB_VAR_NUM() + if var is None: + var = True + elif isinstance(dtype, str) and dtype == "blob": + 
tiledb_dtype = lt.DataType.BLOB + _ncell = 1 + else: + _dtype = np.dtype(dtype) + tiledb_dtype, _ncell = array_type_ncells(_dtype) + + # ensure that all unicode strings are var-length + if var or (_dtype and _dtype.kind == "U"): + var = True + _ncell = lt.TILEDB_VAR_NUM() + + if _dtype and _dtype.kind == "S": + if var and 0 < _dtype.itemsize: + warnings.warn( + f"Attr given `var=True` but `dtype` `{_dtype}` is fixed; " + "setting `dtype=S0`. Hint: set `var=True` with `dtype=S0`, " + f"or `var=False`with `dtype={_dtype}`", + DeprecationWarning, + ) + _dtype = np.dtype("S0") + + if _dtype.itemsize == 0: + if var == False: + warnings.warn( + f"Attr given `var=False` but `dtype` `S0` is var-length; " + "setting `var=True` and `dtype=S0`. Hint: set `var=False` " + "with `dtype=S0`, or `var=False` with a fixed-width " + "string `dtype=S` where is n>1", + DeprecationWarning, + ) + var = True + _ncell = lt.TILEDB_VAR_NUM() + + var = var or False + + super().__init__(_cctx, name, tiledb_dtype) + + if _ncell: + self._ncell = _ncell + + var = var or False + + if self._ncell == lt.TILEDB_VAR_NUM() and not var: + raise TypeError("dtype is not compatible with var-length attribute") + + if filters is not None: + if isinstance(filters, FilterList): + self._filters = filters + elif isinstance(filters, lt.FilterList): + self._filters = FilterList(_lt_obj=filters) + else: + self._filters = FilterList(filters) + + if fill is not None: + self._fill = np.array([fill], dtype=self.dtype) + + if nullable is not None: + self._nullable = nullable + + def __eq__(self, other): + if not isinstance(other, Attr): + return False + if self.name != other.name or self.dtype != other.dtype: + return False + return True + + def dump(self): + """Dumps a string representation of the Attr object to standard output (stdout)""" + self._dump() + + @property + def dtype(self) -> np.dtype: + """Return numpy dtype object representing the Attr type + + :rtype: numpy.dtype + + """ + return 
np.dtype(_numpy_dtype(self._tiledb_dtype, self._ncell)) + + @property + def name(self) -> str: + """Attribute string name, empty string if the attribute is anonymous + + :rtype: str + :raises: :py:exc:`tiledb.TileDBError` + + """ + internal_name = self._name + # handle __attr names from arrays written with libtiledb < 2 + if internal_name == "__attr": + return "" + return internal_name + + @property + def _internal_name(self): + return self._name + + @property + def isanon(self) -> bool: + """True if attribute is an anonymous attribute + + :rtype: bool + + """ + return self._name == "" or self._name.startswith("__attr") + + @property + def filters(self) -> FilterList: + """FilterList of the TileDB attribute + + :rtype: tiledb.FilterList + :raises: :py:exc:`tiledb.TileDBError` + + """ + return FilterList(_lt_obj=self._filters) + + def _get_fill(self, value, dtype) -> Any: + if dtype in (lt.DataType.CHAR, lt.DataType.BLOB): + return value.tobytes() + + if _tiledb_type_is_datetime(dtype): + return value[0].astype(np.timedelta64) + + return value + + @property + def fill(self) -> Any: + """Fill value for unset cells of this attribute + + :rtype: depends on dtype + :raises: :py:exc:`tiledb.TileDBERror` + """ + return self._get_fill(self._fill, self._tiledb_dtype) + + @property + def isnullable(self) -> bool: + """True if the attribute is nullable + + :rtype: bool + :raises: :py:exc:`tiledb.TileDBError` + + """ + return self._nullable + + @property + def isvar(self) -> bool: + """True if the attribute is variable length + + :rtype: bool + :raises: :py:exc:`tiledb.TileDBError` + + """ + return self._var + + @property + def ncells(self) -> int: + """The number of cells (scalar values) for a given attribute value + + :rtype: int + :raises: :py:exc:`tiledb.TileDBError` + + """ + assert self._ncell != 0 + return int(self._ncell) + + @property + def isascii(self) -> bool: + """True if the attribute is TileDB dtype TILEDB_STRING_ASCII + + :rtype: bool + :raises: 
:py:exc:`tiledb.TileDBError` + + """ + return self._tiledb_dtype == lt.DataType.STRING_ASCII + + def __repr__(self): + filters_str = "" + if self.filters: + filters_str = ", filters=FilterList([" + for f in self.filters: + filters_str += repr(f) + ", " + filters_str += "])" + + if self._tiledb_dtype == lt.DataType.STRING_ASCII: + attr_dtype = "ascii" + elif self._tiledb_dtype == lt.DataType.BLOB: + attr_dtype = "blob" + else: + attr_dtype = self.dtype + + # filters_str must be last with no spaces + return ( + f"""Attr(name={repr(self.name)}, dtype='{attr_dtype!s}', """ + f"""var={self.isvar!s}, nullable={self.isnullable!s}""" + f"""{filters_str})""" + ) + + def _repr_html_(self): + output = io.StringIO() + + output.write("") + output.write("") + output.write("") + output.write("") + output.write("") + output.write("") + output.write("") + output.write("") + output.write(f"{self._repr_html_row_only_()}") + output.write("
NameData TypeIs Var-LenIs NullableFilters
") + + return output.getvalue() + + def _repr_html_row_only_(self): + output = io.StringIO() + + output.write("") + output.write(f"{self.name}") + output.write(f"{'ascii' if self.isascii else self.dtype}") + output.write(f"{self.isvar}") + output.write(f"{self.isnullable}") + output.write(f"{self.filters._repr_html_()}") + output.write("") + + return output.getvalue() diff --git a/tiledb/cc/attribute.cc b/tiledb/cc/attribute.cc index 38742e4c28..95c714912c 100644 --- a/tiledb/cc/attribute.cc +++ b/tiledb/cc/attribute.cc @@ -13,25 +13,68 @@ using namespace tiledb; using namespace tiledbpy::common; namespace py = pybind11; +void set_fill_value(Attribute &attr, py::array value) { + attr.set_fill_value(value.data(), value.nbytes()); +} + +py::array get_fill_value(Attribute &attr) { + const void *value; + uint64_t size; + + attr.get_fill_value(&value, &size); + + auto value_num = attr.cell_val_num(); + auto value_type = tdb_to_np_dtype(attr.type(), value_num); + + if (is_tdb_str(attr.type())) { + value_type = py::dtype("|S1"); + value_num = size; + } + + // record type + if (py::str(value_type.attr("kind")) == py::str("V")) { + value_num = 1; + } + + return py::array(value_type, value_num, value); +} + void init_attribute(py::module &m) { py::class_(m, "Attribute") - .def(py::init(), - py::keep_alive<1, 2>() /* Attribute keeps Context alive */) + .def(py::init(), + py::keep_alive<1, 2>()) + .def( py::init(), - py::keep_alive<1, 2>() /* Attribute keeps Context alive */) + py::keep_alive<1, 2>()) - .def_property_readonly("name", &Attribute::name) - .def_property_readonly("dtype", &Attribute::type) - .def_property("nullable", &Attribute::nullable, &Attribute::set_nullable) - .def_property("ncell", &Attribute::cell_val_num, + .def(py::init(), py::keep_alive<1, 2>()) + + .def("__capsule__", + [](Attribute &attr) { + return py::capsule(attr.ptr().get(), "attr", nullptr); + }) + + .def_property_readonly("_name", &Attribute::name) + + .def_property_readonly("_tiledb_dtype", 
&Attribute::type) + + .def_property("_nullable", &Attribute::nullable, &Attribute::set_nullable) + + .def_property("_ncell", &Attribute::cell_val_num, &Attribute::set_cell_val_num) - // .def_property("fill", &Attribute::get_fill_value, - // &Attribute::set_fill_value) - .def_property_readonly("var", &Attribute::variable_sized) - .def_property("filters", &Attribute::filter_list, + + .def_property_readonly("_var", &Attribute::variable_sized) + + .def_property("_filters", &Attribute::filter_list, &Attribute::set_filter_list) - .def_property_readonly("cell_size", &Attribute::cell_size); + + .def_property_readonly("_cell_size", &Attribute::cell_size) + + .def_property("_fill", get_fill_value, set_fill_value) + + .def("_dump", [](Attribute &attr) { attr.dump(); }); + ; } } // namespace libtiledbcpp diff --git a/tiledb/cc/common.cc b/tiledb/cc/common.cc index 744e096ff4..3f0e086355 100644 --- a/tiledb/cc/common.cc +++ b/tiledb/cc/common.cc @@ -13,9 +13,9 @@ std::unordered_map _tdb_to_np_name_dtype = { {TILEDB_UINT16, "uint16"}, {TILEDB_UINT32, "uint32"}, {TILEDB_UINT64, "uint64"}, - {TILEDB_STRING_ASCII, "|S1"}, - {TILEDB_STRING_UTF8, "|U1"}, - {TILEDB_CHAR, "|S1"}, + {TILEDB_STRING_ASCII, "S"}, + {TILEDB_STRING_UTF8, "U1"}, + {TILEDB_CHAR, "S1"}, {TILEDB_DATETIME_YEAR, "M8[Y]"}, {TILEDB_DATETIME_MONTH, "M8[M]"}, {TILEDB_DATETIME_WEEK, "M8[W]"}, diff --git a/tiledb/cc/enum.cc b/tiledb/cc/enum.cc index 0fad68d34f..9285d88dce 100644 --- a/tiledb/cc/enum.cc +++ b/tiledb/cc/enum.cc @@ -20,21 +20,21 @@ void init_enums(py::module &m) { m.def("TILEDB_OFFSET_SIZE", []() { return TILEDB_OFFSET_SIZE; }); m.def("TILEDB_TIMESTAMP_NOW_MS", []() { return TILEDB_TIMESTAMP_NOW_MS; }); - py::enum_(m, "DataType", py::module_local()) DENUM(INT32) - DENUM(INT64) DENUM(FLOAT32) DENUM(FLOAT64) DENUM(CHAR) DENUM(INT8) - DENUM(UINT8) DENUM(INT16) DENUM(UINT16) DENUM(UINT32) DENUM(UINT64) - DENUM(STRING_ASCII) DENUM(STRING_UTF8) DENUM(STRING_UTF16) DENUM( - STRING_UTF32) DENUM(STRING_UCS2) 
DENUM(STRING_UCS4) DENUM(ANY) - DENUM(DATETIME_YEAR) DENUM(DATETIME_WEEK) DENUM(DATETIME_DAY) - DENUM(DATETIME_HR) DENUM(DATETIME_MIN) DENUM(DATETIME_SEC) - DENUM(DATETIME_MS) DENUM(DATETIME_US) - DENUM(DATETIME_NS) DENUM(DATETIME_PS) - DENUM(DATETIME_FS) DENUM(DATETIME_AS) - DENUM(TIME_HR) DENUM(TIME_MIN) - DENUM(TIME_SEC) DENUM(TIME_MS) - DENUM(TIME_US) DENUM(TIME_NS) - DENUM(TIME_PS) DENUM(TIME_FS) - DENUM(TIME_AS); + py::enum_(m, "DataType", py::module_local()) DENUM( + INT32) DENUM(INT64) DENUM(FLOAT32) DENUM(FLOAT64) DENUM(CHAR) DENUM(INT8) + DENUM(UINT8) DENUM(INT16) DENUM(UINT16) DENUM(UINT32) DENUM(UINT64) DENUM( + BOOL) DENUM(STRING_ASCII) DENUM(STRING_UTF8) DENUM(STRING_UTF16) + DENUM(STRING_UTF32) DENUM(STRING_UCS2) DENUM(STRING_UCS4) DENUM(ANY) + DENUM(DATETIME_YEAR) DENUM(DATETIME_MONTH) DENUM(DATETIME_WEEK) + DENUM(DATETIME_DAY) DENUM(DATETIME_HR) DENUM(DATETIME_MIN) + DENUM(DATETIME_SEC) DENUM(DATETIME_MS) DENUM(DATETIME_US) + DENUM(DATETIME_NS) DENUM(DATETIME_PS) + DENUM(DATETIME_FS) DENUM(DATETIME_AS) + DENUM(TIME_HR) DENUM(TIME_MIN) DENUM(TIME_SEC) + DENUM(TIME_MS) DENUM(TIME_US) + DENUM(TIME_NS) DENUM(TIME_PS) + DENUM(TIME_FS) DENUM(TIME_AS) + DENUM(BLOB); py::enum_(m, "ArrayType") DENUM(DENSE) DENUM(SPARSE); diff --git a/tiledb/filter.py b/tiledb/filter.py index 787acea736..69e20391d4 100644 --- a/tiledb/filter.py +++ b/tiledb/filter.py @@ -1,5 +1,4 @@ import io -import numpy as np from typing import List, overload, Sequence, TYPE_CHECKING, Union import tiledb.cc as lt @@ -14,8 +13,7 @@ class Filter(lt.Filter): def __init__(self, type: lt.FilterOption, ctx: "Ctx" = None): self._ctx = ctx or default_ctx() - - super().__init__(lt.Context(self._ctx.__capsule__(), False), type) + super().__init__(lt.Context(self._ctx, False), type) def __repr__(self) -> str: output = io.StringIO() @@ -84,7 +82,7 @@ def __init__(self, type: lt.FilterType, level: int = -1, ctx: "Ctx" = None): super().__init__(type, self._ctx) self._set_option( - 
lt.Context(self._ctx.__capsule__(), False), + lt.Context(self._ctx, False), lt.FilterOption.COMPRESSION_LEVEL, level, ) @@ -677,16 +675,21 @@ def __init__( filters: Sequence[Filter] = None, chunksize: int = None, ctx: "Ctx" = None, - is_capsule: bool = False, + _lt_obj=None, + _capsule=None, ): self._ctx = ctx or default_ctx() - _cctx = lt.Context(self._ctx.__capsule__(), False) + _cctx = lt.Context(self._ctx, False) - if is_capsule: - super().__init__(_cctx, filters) + if _capsule is not None: + super().__init__(_cctx, _capsule) + elif _lt_obj is not None: + super().__init__(_cctx) + for i in range(_lt_obj._nfilters()): + self._add_filter(_lt_obj._filter(i)) + chunksize = _lt_obj._chunksize else: super().__init__(_cctx) - if filters is not None: filters = list(filters) for f in filters: diff --git a/tiledb/indexing.pyx b/tiledb/indexing.pyx index e578fca913..aaae7509b9 100644 --- a/tiledb/indexing.pyx +++ b/tiledb/indexing.pyx @@ -41,8 +41,8 @@ cdef class DomainIndexer(object): def __getitem__(self, object idx): # implements domain-based indexing: slice by domain coordinates, not 0-based python indexing - cdef ArraySchema schema = self.array.schema - cdef Domain dom = schema.domain + schema = self.array.schema + dom = schema.domain cdef ndim = dom.ndim cdef list attr_names = list() @@ -140,7 +140,6 @@ cdef dict execute_multi_index(Array array, unicode coord_name = (tiledb_coords()).decode('UTF-8') cdef: - Attr attr Py_ssize_t attr_idx bytes battr_name unicode attr_name @@ -257,13 +256,13 @@ cdef dict execute_multi_index(Array array, repeat_query = False break elif query_status == TILEDB_FAILED: - raise TileDBError("Query returned TILEDB_FAILED") + raise lt.TileDBError("Query returned TILEDB_FAILED") elif query_status == TILEDB_INPROGRESS: - raise TileDBError("Query returned TILEDB_INPROGRESS") + raise lt.TileDBError("Query returned TILEDB_INPROGRESS") elif query_status == TILEDB_INCOMPLETE: - raise TileDBError("Query returned TILEDB_INCOMPLETE") + raise 
lt.TileDBError("Query returned TILEDB_INCOMPLETE") else: - raise TileDBError("internal error: unknown query status") + raise lt.TileDBError("internal error: unknown query status") # resize arrays to final bytes-read for attr_idx in range(nattr): @@ -306,7 +305,7 @@ cpdef multi_index(Array array, tuple attr_names, tuple ranges, tiledb_query_free(&query_ptr) _raise_ctx_err(ctx_ptr, rc) - cdef Dim dim = array.schema.domain.dim(0) + dim = array.schema.domain.dim(0) cdef uint32_t c_dim_idx cdef void* start_ptr = NULL cdef void* end_ptr = NULL @@ -331,7 +330,7 @@ cpdef multi_index(Array array, tuple attr_names, tuple ranges, for range_idx in range(len(dim_ranges)): if len(dim_ranges[range_idx]) != 2: - raise TileDBError("internal error: invalid sub-range: ", dim_ranges[range_idx]) + raise lt.TileDBError("internal error: invalid sub-range: ", dim_ranges[range_idx]) start = np.array(dim_ranges[range_idx][0], dtype=dim.dtype) end = np.array(dim_ranges[range_idx][1], dtype=dim.dtype) diff --git a/tiledb/libtiledb.pxd b/tiledb/libtiledb.pxd index 1ea3f8855f..de287350c7 100644 --- a/tiledb/libtiledb.pxd +++ b/tiledb/libtiledb.pxd @@ -1187,17 +1187,6 @@ cdef class ConfigValues(object): cdef class Ctx(object): cdef tiledb_ctx_t* ptr -cdef class Attr(object): - cdef Ctx ctx - cdef tiledb_attribute_t* ptr - - @staticmethod - cdef from_ptr(const tiledb_attribute_t* ptr, Ctx ctx=*) - cdef unicode _get_name(Attr self) - cdef unsigned int _cell_val_num(Attr self) except? 0 - cdef tiledb_datatype_t _get_type(Attr self) except? 
TILEDB_CHAR - - cdef class Dim(object): cdef Ctx ctx cdef tiledb_dimension_t* ptr diff --git a/tiledb/libtiledb.pyx b/tiledb/libtiledb.pyx index 327f69fb18..ed8ff443c1 100644 --- a/tiledb/libtiledb.pyx +++ b/tiledb/libtiledb.pyx @@ -14,6 +14,7 @@ from collections import OrderedDict from collections.abc import Sequence from .ctx import default_ctx +from .attribute import Attr from .filter import FilterList from .vfs import VFS @@ -1456,413 +1457,6 @@ cdef unicode _tiledb_layout_string(tiledb_layout_t order): return tiledb_order_to_string[order] -cdef class Attr(object): - """Class representing a TileDB array attribute. - - :param tiledb.Ctx ctx: A TileDB Context - :param str name: Attribute name, empty if anonymous - :param dtype: Attribute value datatypes - :type dtype: numpy.dtype object or type or string - :param nullable: Attribute is nullable - :type bool: - :param fill: Fill value for unset cells. - :param var: Attribute is variable-length (automatic for byte/string types) - :type dtype: bool - :param filters: List of filters to apply - :type filters: FilterList - :raises TypeError: invalid dtype - :raises: :py:exc:`tiledb.TileDBError` - - """ - - cdef unicode _get_name(Attr self): - cdef const char* c_name = NULL - check_error(self.ctx, - tiledb_attribute_get_name(self.ctx.ptr, self.ptr, &c_name)) - cdef unicode name = c_name.decode('UTF-8', 'strict') - return name - - cdef unsigned int _cell_val_num(Attr self) except? 
0: - cdef unsigned int ncells = 0 - check_error(self.ctx, - tiledb_attribute_get_cell_val_num(self.ctx.ptr, self.ptr, &ncells)) - return ncells - - def __cinit__(self): - self.ptr = NULL - - def __dealloc__(self): - if self.ptr != NULL: - tiledb_attribute_free(&self.ptr) - - def __capsule__(self): - if self.ptr == NULL: - raise TileDBError("internal error: cannot create capsule for uninitialized Attr!") - cdef const char* name = "ctx" - cap = PyCapsule_New((self.ptr), name, NULL) - return cap - - @staticmethod - cdef from_ptr(const tiledb_attribute_t* ptr, Ctx ctx=None): - """Constructs an Attr class instance from a (non-null) tiledb_attribute_t pointer - """ - if not ctx: - ctx = default_ctx() - assert(ptr != NULL) - cdef Attr attr = Attr.__new__(Attr) - attr.ctx = ctx - # need to cast away the const - attr.ptr = ptr - return attr - - def __init__(self, - name=u"", - dtype=np.float64, - fill=None, - var=None, - nullable=False, - filters=None, - Ctx ctx=None): - if not ctx: - ctx = default_ctx() - cdef bytes bname = ustring(name).encode('UTF-8') - cdef const char* name_ptr = PyBytes_AS_STRING(bname) - cdef np.dtype _dtype = None - cdef tiledb_datatype_t tiledb_dtype - cdef uint32_t ncells - - if isinstance(dtype, str) and dtype == "ascii": - tiledb_dtype = TILEDB_STRING_ASCII - ncells = TILEDB_VAR_NUM - if var is None: - var = True - else: - _dtype = np.dtype(dtype) - tiledb_dtype, ncells = array_type_ncells(_dtype) - - # ensure that all unicode strings are var-length - if var or _dtype.kind == 'U': - var = True - ncells = TILEDB_VAR_NUM - - if _dtype and _dtype.kind == 'S': - if var and 0 < _dtype.itemsize: - warnings.warn( - f"Attr given `var=True` but `dtype` `{_dtype}` is fixed; " - "setting `dtype=S0`. 
Hint: set `var=True` with `dtype=S0`, " - f"or `var=False`with `dtype={_dtype}`", - DeprecationWarning, - ) - _dtype = np.dtype("S0") - - if _dtype.itemsize == 0: - if var == False: - warnings.warn( - f"Attr given `var=False` but `dtype` `S0` is var-length; " - "setting `var=True` and `dtype=S0`. Hint: set `var=False` " - "with `dtype=S0`, or `var=False` with a fixed-width " - "string `dtype=S` where is n>1", - DeprecationWarning, - ) - - var = True - ncells = TILEDB_VAR_NUM - - var = var or False - - # variable-length cell type - if ncells == TILEDB_VAR_NUM and not var: - raise TypeError("dtype is not compatible with var-length attribute") - - if filters is not None: - if not isinstance(filters, FilterList): - try: - filters = iter(filters) - except: - raise TypeError("filters argument must be a tiledb.FilterList or iterable of Filters") - else: - # we want this to raise a specific error if construction fails - filters = FilterList(filters, ctx=ctx) - filter_list = filters - - # alloc attribute object and set cell num / compressor - cdef tiledb_attribute_t* attr_ptr = NULL - cdef int rc = TILEDB_OK - rc = tiledb_attribute_alloc(ctx.ptr, name_ptr, tiledb_dtype, &attr_ptr) - if rc != TILEDB_OK: - _raise_ctx_err(ctx.ptr, rc) - rc = tiledb_attribute_set_cell_val_num(ctx.ptr, attr_ptr, ncells) - if rc != TILEDB_OK: - tiledb_attribute_free(&attr_ptr) - _raise_ctx_err(ctx.ptr, rc) - - if nullable: - rc = tiledb_attribute_set_nullable(ctx.ptr, attr_ptr, 1) - if rc != TILEDB_OK: - tiledb_attribute_free(&attr_ptr) - _raise_ctx_err(ctx.ptr, rc) - - cdef tiledb_filter_list_t* filter_list_ptr = NULL - if filters is not None: - filter_list_ptr = PyCapsule_GetPointer( - filter_list.__capsule__(), "fl") - rc = tiledb_attribute_set_filter_list(ctx.ptr, attr_ptr, filter_list_ptr) - if rc != TILEDB_OK: - tiledb_attribute_free(&attr_ptr) - _raise_ctx_err(ctx.ptr, rc) - - cdef void* fill_ptr - cdef uint64_t fill_nbytes - if fill is not None: - fill_array = np.array(fill, dtype=dtype) 
- fill_nbytes = fill_array.nbytes - fill_ptr = np.PyArray_DATA(fill_array) - rc = tiledb_attribute_set_fill_value(ctx.ptr, - attr_ptr, - fill_ptr, - fill_nbytes) - if rc != TILEDB_OK: - tiledb_attribute_free(&attr_ptr) - _raise_ctx_err(ctx.ptr, rc) - - self.ctx = ctx - self.ptr = attr_ptr - - def __eq__(self, other): - if not isinstance(other, Attr): - return False - if (self.name != other.name or - self.dtype != other.dtype): - return False - return True - - cdef tiledb_datatype_t _get_type(Attr self) except? TILEDB_CHAR: - cdef tiledb_datatype_t typ - check_error(self.ctx, - tiledb_attribute_get_type(self.ctx.ptr, self.ptr, &typ)) - return typ - - def dump(self): - """Dumps a string representation of the Attr object to standard output (stdout)""" - check_error(self.ctx, - tiledb_attribute_dump(self.ctx.ptr, self.ptr, stdout)) - print('\n') - return - - @property - def dtype(self): - """Return numpy dtype object representing the Attr type - - :rtype: numpy.dtype - - """ - cdef tiledb_datatype_t typ - check_error(self.ctx, - tiledb_attribute_get_type(self.ctx.ptr, self.ptr, &typ)) - cdef uint32_t ncells = 0 - check_error(self.ctx, - tiledb_attribute_get_cell_val_num(self.ctx.ptr, self.ptr, &ncells)) - - return np.dtype(_numpy_dtype(typ, ncells)) - - @property - def name(self): - """Attribute string name, empty string if the attribute is anonymous - - :rtype: str - :raises: :py:exc:`tiledb.TileDBError` - - """ - internal_name = self._get_name() - # handle __attr names from arrays written with libtiledb < 2 - if internal_name == "__attr": - return u"" - return internal_name - - @property - def _internal_name(self): - return self._get_name() - - @property - def isanon(self): - """True if attribute is an anonymous attribute - - :rtype: bool - - """ - cdef unicode name = self._get_name() - return name == u"" or name.startswith(u"__attr") - - @property - def compressor(self): - """String label of the attributes compressor and compressor level - - :rtype: tuple(str, int) 
- :raises: :py:exc:`tiledb.TileDBError` - - """ - # do we want to reimplement this on top of new API? - pass - - @property - def filters(self): - """FilterList of the TileDB attribute - - :rtype: tiledb.FilterList - :raises: :py:exc:`tiledb.TileDBError` - - """ - cdef tiledb_filter_list_t* filter_list_ptr = NULL - cdef int rc = TILEDB_OK - check_error(self.ctx, - tiledb_attribute_get_filter_list(self.ctx.ptr, self.ptr, &filter_list_ptr)) - - return FilterList(PyCapsule_New(filter_list_ptr, "fl", NULL), - is_capsule=True, ctx=self.ctx) - - @property - def fill(self): - """Fill value for unset cells of this attribute - - :rtype: depends on dtype - :raises: :py:exc:`tiledb.TileDBERror` - """ - cdef const uint8_t* value_ptr = NULL - cdef uint64_t size - check_error(self.ctx, - tiledb_attribute_get_fill_value( - self.ctx.ptr, self.ptr, &value_ptr, &size)) - - if value_ptr == NULL: - return None - - if size == 0: - raise TileDBError("Unexpected zero-length non-null fill value") - - cdef np.npy_intp shape[1] - shape[0] = 1 - cdef tiledb_datatype_t tiledb_type = self._get_type() - cdef int typeid = _numpy_typeid(tiledb_type) - assert(typeid != np.NPY_NOTYPE) - cdef np.ndarray fill_array - - if np.issubdtype(self.dtype, np.bytes_): - return (value_ptr)[:size] - elif np.issubdtype(self.dtype, np.unicode_): - return (value_ptr)[:size].decode('utf-8') - else: - fill_array = np.empty(1, dtype=self.dtype) - memcpy(np.PyArray_DATA(fill_array), value_ptr, size) - - if _tiledb_type_is_datetime(tiledb_type): - # Coerce to np.int64 - fill_array.dtype = np.int64 - datetime_dtype = _tiledb_type_to_datetime(tiledb_type).dtype - date_unit = np.datetime_data(datetime_dtype)[0] - tmp_val = None - if fill_array[0] == 0: - # undefined should span the whole dimension domain - tmp_val = int(self.shape[0]) - else: - tmp_val = int(fill_array[0]) - return np.timedelta64(tmp_val, date_unit) - - return fill_array - - @property - def isnullable(self): - """True if the attribute is nullable - - 
:rtype: bool - :raises: :py:exc:`tiledb.TileDBError` - - """ - cdef uint8_t nullable = 0 - cdef int rc = TILEDB_OK - check_error( - self.ctx, - tiledb_attribute_get_nullable(self.ctx.ptr, self.ptr, &nullable)) - - return nullable - - @property - def isvar(self): - """True if the attribute is variable length - - :rtype: bool - :raises: :py:exc:`tiledb.TileDBError` - - """ - cdef unsigned int ncells = self._cell_val_num() - return ncells == TILEDB_VAR_NUM - - @property - def ncells(self): - """The number of cells (scalar values) for a given attribute value - - :rtype: int - :raises: :py:exc:`tiledb.TileDBError` - - """ - cdef unsigned int ncells = self._cell_val_num() - assert (ncells != 0) - return int(ncells) - - @property - def isascii(self): - """True if the attribute is TileDB dtype TILEDB_STRING_ASCII - - :rtype: bool - :raises: :py:exc:`tiledb.TileDBError` - - """ - return self._get_type() == TILEDB_STRING_ASCII - - def __repr__(self): - filters_str = "" - if self.filters: - filters_str = ", filters=FilterList([" - for f in self.filters: - filters_str += repr(f) + ", " - filters_str += "])" - - attr_dtype = "ascii" if self.isascii else self.dtype - - # filters_str must be last with no spaces - return (f"""Attr(name={repr(self.name)}, dtype='{attr_dtype!s}', """ - f"""var={self.isvar!s}, nullable={self.isnullable!s}""" - f"""{filters_str})""") - - def _repr_html_(self): - output = io.StringIO() - - output.write("") - output.write("") - output.write("") - output.write("") - output.write("") - output.write("") - output.write("") - output.write("") - output.write(f"{self._repr_html_row_only_()}") - output.write("
NameData TypeIs Var-LenIs NullableFilters
") - - return output.getvalue() - - def _repr_html_row_only_(self): - output = io.StringIO() - - output.write("") - output.write(f"{self.name}") - output.write(f"{'ascii' if self.isascii else self.dtype}") - output.write(f"{self.isvar}") - output.write(f"{self.isnullable}") - output.write(f"{self.filters._repr_html_()}") - output.write("") - - return output.getvalue() - - cdef class Dim(object): """Class representing a dimension of a TileDB Array. @@ -2114,8 +1708,7 @@ cdef class Dim(object): check_error(self.ctx, tiledb_dimension_get_filter_list(self.ctx.ptr, self.ptr, &filter_list_ptr)) - return FilterList(PyCapsule_New(filter_list_ptr, "fl", NULL), - is_capsule=True, ctx=self.ctx) + return FilterList(self.ctx, _capsule=PyCapsule_New(filter_list_ptr, "fl", NULL)) cdef unsigned int _cell_val_num(Dim self) except? 0: cdef unsigned int ncells = 0 @@ -2815,12 +2408,13 @@ cdef class ArraySchema(object): _raise_ctx_err(ctx.ptr, rc) cdef tiledb_attribute_t* attr_ptr = NULL - cdef Attr attribute + # cdef Attr attribute for attr in attrs: if not isinstance(attr, Attr): raise TypeError("Cannot create schema with non-Attr value for 'attrs' argument") attribute = attr - attr_ptr = attribute.ptr + attr_ptr = PyCapsule_GetPointer( + attr.__capsule__(), "attr") rc = tiledb_array_schema_add_attribute(ctx.ptr, schema_ptr, attr_ptr) if rc != TILEDB_OK: tiledb_array_schema_free(&schema_ptr) @@ -3042,9 +2636,8 @@ cdef class ArraySchema(object): check_error(self.ctx, tiledb_array_schema_get_offsets_filter_list( self.ctx.ptr, self.ptr, &filter_list_ptr)) - return FilterList( - PyCapsule_New(filter_list_ptr, "fl", NULL), - is_capsule=True, ctx=self.ctx) + return FilterList(self.ctx, + _capsule=PyCapsule_New(filter_list_ptr, "fl", NULL)) @property def coords_filters(self): @@ -3057,9 +2650,8 @@ cdef class ArraySchema(object): check_error(self.ctx, tiledb_array_schema_get_coords_filter_list( self.ctx.ptr, self.ptr, &filter_list_ptr)) - return FilterList( - PyCapsule_New(filter_list_ptr, 
"fl", NULL), - is_capsule=True, ctx=self.ctx) + return FilterList(self.ctx, + _capsule=PyCapsule_New(filter_list_ptr, "fl", NULL)) @coords_filters.setter def coords_filters(self, value): @@ -3080,9 +2672,8 @@ cdef class ArraySchema(object): check_error(self.ctx, tiledb_array_schema_get_validity_filter_list( self.ctx.ptr, self.ptr, &validity_list_ptr)) - return FilterList( - PyCapsule_New(validity_list_ptr, "fl", NULL), - is_capsule=True, ctx=self.ctx) + return FilterList(self.ctx, + _capsule=PyCapsule_New(validity_list_ptr, "fl", NULL)) @property def domain(self): @@ -3167,14 +2758,14 @@ cdef class ArraySchema(object): check_error(self.ctx, tiledb_array_schema_get_attribute_from_name( self.ctx.ptr, self.ptr, bname, &attr_ptr)) - return Attr.from_ptr(attr_ptr, self.ctx) + return Attr(self.ctx, _capsule=PyCapsule_New(attr_ptr, "attr", NULL)) cdef _attr_idx(self, int idx): cdef tiledb_attribute_t* attr_ptr = NULL check_error(self.ctx, tiledb_array_schema_get_attribute_from_index( self.ctx.ptr, self.ptr, idx, &attr_ptr)) - return Attr.from_ptr(attr_ptr, ctx=self.ctx) + return Attr(self.ctx, _capsule=PyCapsule_New(attr_ptr, "attr", NULL)) def attr(self, object key not None): """Returns an Attr instance given an int index or string label @@ -4660,7 +4251,6 @@ cdef class DenseArrayImpl(Array): cdef tuple idx = replace_ellipsis(domain.ndim, index_as_tuple(selection)) idx,_drop = replace_scalars_slice(domain, idx) cdef object subarray = index_domain_subarray(self, domain, idx) - cdef Attr attr cdef list attributes = list() cdef list values = list() @@ -4783,7 +4373,7 @@ cdef class DenseArrayImpl(Array): cdef tiledb_array_t* array_ptr = self.ptr # attr name - cdef Attr attr = self.schema.attr(0) + attr = self.schema.attr(0) cdef bytes battr_name = attr._internal_name.encode('UTF-8') cdef const char* attr_name_ptr = PyBytes_AS_STRING(battr_name) @@ -4894,7 +4484,6 @@ cdef class DenseArrayImpl(Array): cdef tiledb_ctx_t* ctx_ptr = self.ctx.ptr cdef tiledb_array_t* array_ptr = 
self.ptr - cdef Attr attr cdef unicode attr_name if name is None and self.schema.nattr != 1: raise ValueError( diff --git a/tiledb/np2buf.py b/tiledb/np2buf.py new file mode 100644 index 0000000000..d1e7de367c --- /dev/null +++ b/tiledb/np2buf.py @@ -0,0 +1,106 @@ +import tiledb.cc as lt + +from collections import deque +import numpy as np + +_dtype_to_tiledb = { + "int32": lt.DataType.INT32, + "int64": lt.DataType.INT64, + "float32": lt.DataType.FLOAT32, + "float64": lt.DataType.FLOAT64, + "int8": lt.DataType.INT8, + "uint8": lt.DataType.UINT8, + "int16": lt.DataType.INT16, + "uint16": lt.DataType.UINT16, + "uint32": lt.DataType.UINT32, + "uint64": lt.DataType.UINT64, + "complex64": lt.DataType.FLOAT32, + "complex128": lt.DataType.FLOAT64, + "datetime64[Y]": lt.DataType.DATETIME_YEAR, + "datetime64[M]": lt.DataType.DATETIME_MONTH, + "datetime64[W]": lt.DataType.DATETIME_WEEK, + "datetime64[D]": lt.DataType.DATETIME_DAY, + "datetime64[h]": lt.DataType.DATETIME_HR, + "datetime64[m]": lt.DataType.DATETIME_MIN, + "datetime64[s]": lt.DataType.DATETIME_SEC, + "datetime64[ms]": lt.DataType.DATETIME_MS, + "datetime64[us]": lt.DataType.DATETIME_US, + "datetime64[ns]": lt.DataType.DATETIME_NS, + "datetime64[ps]": lt.DataType.DATETIME_PS, + "datetime64[fs]": lt.DataType.DATETIME_FS, + "datetime64[as]": lt.DataType.DATETIME_AS, + "timedelta64[h]": lt.DataType.TIME_HR, + "timedelta64[m]": lt.DataType.TIME_MIN, + "timedelta64[s]": lt.DataType.TIME_SEC, + "timedelta64[ms]": lt.DataType.TIME_MS, + "timedelta64[us]": lt.DataType.TIME_US, + "timedelta64[ns]": lt.DataType.TIME_NS, + "timedelta64[ps]": lt.DataType.TIME_PS, + "timedelta64[fs]": lt.DataType.TIME_FS, + "timedelta64[as]": lt.DataType.TIME_AS, + "bool": lt.DataType.BOOL, +} + + +def dtype_to_tiledb(dtype): + if dtype.name not in _dtype_to_tiledb: + raise TypeError(f"data type {dtype!r} not understood") + return _dtype_to_tiledb[dtype.name] + + +def array_type_ncells(dtype): + """ + Returns the TILEDB_{TYPE} and ncells
corresponding to a given numpy dtype + """ + checked_dtype = np.dtype(dtype) + + # - flexible datatypes of unknown size have an itemsize of 0 (str, bytes, etc.) + # - unicode and string types are always stored as VAR because we don't want to + # store the pad (numpy pads to max length for 'S' and 'U' dtypes) + + if np.issubdtype(checked_dtype, np.bytes_): + tdb_type = lt.DataType.CHAR + if checked_dtype.itemsize == 0: + ncells = lt.TILEDB_VAR_NUM() + else: + ncells = checked_dtype.itemsize + + elif np.issubdtype(checked_dtype, np.unicode_): + np_unicode_size = np.dtype("U1").itemsize + + # TODO depending on np_unicode_size, tdb_type may be UTF16 or UTF32 + tdb_type = lt.DataType.STRING_UTF8 + + if checked_dtype.itemsize == 0: + ncells = lt.TILEDB_VAR_NUM() + else: + ncells = checked_dtype.itemsize // np_unicode_size + + elif np.issubdtype(checked_dtype, np.complexfloating): + # handle complex dtypes + tdb_type = dtype_to_tiledb(checked_dtype) + ncells = 2 + + elif checked_dtype.kind == "V": + # handles n fixed-size record dtypes + if checked_dtype.shape != (): + raise TypeError("nested sub-array numpy dtypes are not supported") + # check that types are the same + # TODO: make sure this is not too slow for large record types + deq = deque(checked_dtype.fields.values()) + typ0, _ = deq.popleft() + nfields = 1 + for (typ, _) in deq: + nfields += 1 + if typ != typ0: + raise TypeError("heterogenous record numpy dtypes are not supported") + + tdb_type = dtype_to_tiledb(typ0) + ncells = len(checked_dtype.fields.values()) + + else: + # scalar cell type + tdb_type = dtype_to_tiledb(checked_dtype) + ncells = 1 + + return tdb_type, ncells diff --git a/tiledb/tests/test_attribute.py b/tiledb/tests/test_attribute.py new file mode 100644 index 0000000000..58ecaa6a9d --- /dev/null +++ b/tiledb/tests/test_attribute.py @@ -0,0 +1,179 @@ +import xml.etree.ElementTree + +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +import tiledb +from 
tiledb.tests.common import assert_captured, DiskTestCase, has_pandas + + +class AttributeTest(DiskTestCase): + def test_minimal_attribute(self): + attr = tiledb.Attr() + self.assertTrue(attr.isanon) + self.assertEqual(attr.name, "") + self.assertEqual(attr.dtype, np.float_) + # self.assertEqual(attr.compressor, (None, -1)) + self.assertFalse(attr.isvar) + self.assertFalse(attr.isnullable) + + try: + assert xml.etree.ElementTree.fromstring(attr._repr_html_()) is not None + except: + pytest.fail(f"Could not parse attr._repr_html_(). Saw {attr._repr_html_()}") + + def test_attribute(self, capfd): + attr = tiledb.Attr("foo") + + attr.dump() + assert_captured(capfd, "Name: foo") + + assert attr.name == "foo" + assert attr.dtype == np.float64, "default attribute type is float64" + # compressor, level = attr.compressor + # self.assertEqual(compressor, None, "default to no compression") + # self.assertEqual(level, -1, "default compression level when none is specified") + + @pytest.mark.parametrize( + "dtype, fill", + [ + (np.dtype(bytes), b"abc"), + # (str, "defg"), + (np.float32, np.float32(0.4023573667780681)), + (np.float64, np.float64(0.0560602549760851)), + (np.dtype("M8[ns]"), np.timedelta64(11, "ns")), + (np.dtype([("f0", " + # compressor, level = attr.compressor + # self.assertEqual(compressor, "zstd") + # self.assertEqual(level, 10) + + def test_ncell_attribute(self): + dtype = np.dtype([("", np.int32), ("", np.int32), ("", np.int32)]) + attr = tiledb.Attr("foo", dtype=dtype) + + self.assertEqual(attr.dtype, dtype) + self.assertEqual(attr.ncells, 3) + + # dtype subarrays not supported + with self.assertRaises(TypeError): + tiledb.Attr("foo", dtype=np.dtype((np.int32, 2))) + + # mixed type record arrays not supported + with self.assertRaises(TypeError): + tiledb.Attr("foo", dtype=np.dtype([("", np.float32), ("", np.int32)])) + + def test_ncell_bytes_attribute(self): + dtype = np.dtype((np.bytes_, 10)) + attr = tiledb.Attr("foo", dtype=dtype) + + 
self.assertEqual(attr.dtype, dtype) + self.assertEqual(attr.ncells, 10) + + def test_bytes_var_attribute(self): + with pytest.warns(DeprecationWarning, match="Attr given `var=True` but"): + attr = tiledb.Attr("foo", var=True, dtype="S1") + self.assertEqual(attr.dtype, np.dtype("S")) + self.assertTrue(attr.isvar) + + with pytest.warns(DeprecationWarning, match="Attr given `var=False` but"): + attr = tiledb.Attr("foo", var=False, dtype="S") + self.assertEqual(attr.dtype, np.dtype("S")) + self.assertTrue(attr.isvar) + + attr = tiledb.Attr("foo", var=True, dtype="S") + self.assertEqual(attr.dtype, np.dtype("S")) + self.assertTrue(attr.isvar) + + attr = tiledb.Attr("foo", var=False, dtype="S1") + self.assertEqual(attr.dtype, np.dtype("S1")) + self.assertFalse(attr.isvar) + + attr = tiledb.Attr("foo", dtype="S1") + self.assertEqual(attr.dtype, np.dtype("S1")) + self.assertFalse(attr.isvar) + + attr = tiledb.Attr("foo", dtype="S") + self.assertEqual(attr.dtype, np.dtype("S")) + self.assertTrue(attr.isvar) + + def test_nullable_attribute(self): + attr = tiledb.Attr("nullable", nullable=True, dtype=np.int32) + self.assertEqual(attr.dtype, np.dtype(np.int32)) + self.assertTrue(attr.isnullable) + + def test_datetime_attribute(self): + attr = tiledb.Attr("foo", dtype=np.datetime64("", "D")) + assert attr.dtype == np.dtype(np.datetime64("", "D")) + assert attr.dtype != np.dtype(np.datetime64("", "Y")) + assert attr.dtype != np.dtype(np.datetime64) + + @pytest.mark.parametrize("sparse", [True, False]) + def test_ascii_attribute(self, sparse, capfd): + path = self.path("test_ascii") + dom = tiledb.Domain( + tiledb.Dim(name="d", domain=(1, 4), tile=1, dtype=np.uint32) + ) + + with pytest.raises(TypeError) as exc_info: + tiledb.Attr(name="A", dtype="ascii", var=False) + assert ( + str(exc_info.value) == "dtype is not compatible with var-length attribute" + ) + + attrs = [tiledb.Attr(name="A", dtype="ascii")] + + schema = tiledb.ArraySchema(domain=dom, attrs=attrs, sparse=sparse) + 
tiledb.Array.create(path, schema) + + ascii_data = ["a", "b", "c", "ABC"] + unicode_data = ["±", "×", "÷", "√"] + + with tiledb.open(path, "w") as A: + if sparse: + with self.assertRaises(tiledb.TileDBError): + A[np.arange(1, 5)] = unicode_data + A[np.arange(1, 5)] = ascii_data + else: + with self.assertRaises(tiledb.TileDBError): + A[:] = unicode_data + A[:] = ascii_data + + with tiledb.open(path, "r") as A: + assert A.schema.nattr == 1 + A.schema.dump() + assert_captured(capfd, "Type: STRING_ASCII") + assert A.schema.attr("A").isvar + assert A.schema.attr("A").dtype == np.bytes_ + assert A.schema.attr("A").isascii + assert_array_equal(A[:]["A"], np.asarray(ascii_data, dtype=np.bytes_)) diff --git a/tiledb/tests/test_libtiledb.py b/tiledb/tests/test_libtiledb.py index 98fa5a06ae..5a0cf1abc4 100644 --- a/tiledb/tests/test_libtiledb.py +++ b/tiledb/tests/test_libtiledb.py @@ -390,177 +390,6 @@ def test_ascii_domain(self, capfd): A[ascii_coords] = data -class AttributeTest(DiskTestCase): - def test_minimal_attribute(self): - attr = tiledb.Attr() - self.assertTrue(attr.isanon) - self.assertEqual(attr.name, "") - self.assertEqual(attr.dtype, np.float_) - # self.assertEqual(attr.compressor, (None, -1)) - self.assertFalse(attr.isvar) - self.assertFalse(attr.isnullable) - - try: - assert xml.etree.ElementTree.fromstring(attr._repr_html_()) is not None - except: - pytest.fail(f"Could not parse attr._repr_html_(). 
Saw {attr._repr_html_()}") - - def test_attribute(self, capfd): - attr = tiledb.Attr("foo") - - attr.dump() - assert_captured(capfd, "Name: foo") - - assert attr.name == "foo" - assert attr.dtype == np.float64, "default attribute type is float64" - # compressor, level = attr.compressor - # self.assertEqual(compressor, None, "default to no compression") - # self.assertEqual(level, -1, "default compression level when none is specified") - - @pytest.mark.parametrize( - "dtype, fill", - [ - (np.dtype(bytes), b"abc"), - # (str, "defg"), - (np.float32, np.float32(0.4023573667780681)), - (np.float64, np.float64(0.0560602549760851)), - (np.dtype("M8[ns]"), np.timedelta64(11, "ns")), - (np.dtype([("f0", " - # compressor, level = attr.compressor - # self.assertEqual(compressor, "zstd") - # self.assertEqual(level, 10) - - def test_ncell_attribute(self): - dtype = np.dtype([("", np.int32), ("", np.int32), ("", np.int32)]) - attr = tiledb.Attr("foo", dtype=dtype) - - self.assertEqual(attr.dtype, dtype) - self.assertEqual(attr.ncells, 3) - - # dtype subarrays not supported - with self.assertRaises(TypeError): - tiledb.Attr("foo", dtype=np.dtype((np.int32, 2))) - - # mixed type record arrays not supported - with self.assertRaises(TypeError): - tiledb.Attr("foo", dtype=np.dtype([("", np.float32), ("", np.int32)])) - - def test_ncell_bytes_attribute(self): - dtype = np.dtype((np.bytes_, 10)) - attr = tiledb.Attr("foo", dtype=dtype) - - self.assertEqual(attr.dtype, dtype) - self.assertEqual(attr.ncells, 10) - - def test_bytes_var_attribute(self): - with pytest.warns(DeprecationWarning, match="Attr given `var=True` but"): - attr = tiledb.Attr("foo", var=True, dtype="S1") - self.assertEqual(attr.dtype, np.dtype("S")) - self.assertTrue(attr.isvar) - - with pytest.warns(DeprecationWarning, match="Attr given `var=False` but"): - attr = tiledb.Attr("foo", var=False, dtype="S") - self.assertEqual(attr.dtype, np.dtype("S")) - self.assertTrue(attr.isvar) - - attr = tiledb.Attr("foo", 
var=True, dtype="S") - self.assertEqual(attr.dtype, np.dtype("S")) - self.assertTrue(attr.isvar) - - attr = tiledb.Attr("foo", var=False, dtype="S1") - self.assertEqual(attr.dtype, np.dtype("S1")) - self.assertFalse(attr.isvar) - - attr = tiledb.Attr("foo", dtype="S1") - self.assertEqual(attr.dtype, np.dtype("S1")) - self.assertFalse(attr.isvar) - - attr = tiledb.Attr("foo", dtype="S") - self.assertEqual(attr.dtype, np.dtype("S")) - self.assertTrue(attr.isvar) - - def test_nullable_attribute(self): - attr = tiledb.Attr("nullable", nullable=True, dtype=np.int32) - self.assertEqual(attr.dtype, np.dtype(np.int32)) - self.assertTrue(attr.isnullable) - - def test_datetime_attribute(self): - attr = tiledb.Attr("foo", dtype=np.datetime64("", "D")) - assert attr.dtype == np.dtype(np.datetime64("", "D")) - assert attr.dtype != np.dtype(np.datetime64("", "Y")) - assert attr.dtype != np.dtype(np.datetime64) - - @pytest.mark.parametrize("sparse", [True, False]) - def test_ascii_attribute(self, sparse, capfd): - path = self.path("test_ascii") - dom = tiledb.Domain( - tiledb.Dim(name="d", domain=(1, 4), tile=1, dtype=np.uint32) - ) - - with pytest.raises(TypeError) as exc_info: - tiledb.Attr(name="A", dtype="ascii", var=False) - assert ( - str(exc_info.value) == "dtype is not compatible with var-length attribute" - ) - - attrs = [tiledb.Attr(name="A", dtype="ascii")] - - schema = tiledb.ArraySchema(domain=dom, attrs=attrs, sparse=sparse) - tiledb.Array.create(path, schema) - - ascii_data = ["a", "b", "c", "ABC"] - unicode_data = ["±", "×", "÷", "√"] - - with tiledb.open(path, "w") as A: - if sparse: - with self.assertRaises(tiledb.TileDBError): - A[np.arange(1, 5)] = unicode_data - A[np.arange(1, 5)] = ascii_data - else: - with self.assertRaises(tiledb.TileDBError): - A[:] = unicode_data - A[:] = ascii_data - - with tiledb.open(path, "r") as A: - assert A.schema.nattr == 1 - A.schema.dump() - assert_captured(capfd, "Type: STRING_ASCII") - assert A.schema.attr("A").isvar - assert 
A.schema.attr("A").dtype == np.bytes_ - assert A.schema.attr("A").isascii - assert_array_equal(A[:]["A"], np.asarray(ascii_data, dtype=np.bytes_)) - - class ArraySchemaTest(DiskTestCase): def test_schema_basic(self): dom = tiledb.Domain( diff --git a/tiledb/util.py b/tiledb/util.py index f9643fcde2..c6098c430d 100644 --- a/tiledb/util.py +++ b/tiledb/util.py @@ -1,6 +1,7 @@ import tiledb +import tiledb.cc as lt + import numpy as np -from typing import Iterable from tiledb.dataframe_ import ColumnInfo @@ -35,3 +36,111 @@ def _sparse_schema_from_dict(input_attrs, input_dims): def schema_from_dict(attrs, dims): return _sparse_schema_from_dict(attrs, dims) + + +# Conversion from TileDB dtype to Numpy dtype +_tiledb_dtype_to_numpy_dtype_convert = { + lt.DataType.INT32: np.int32, + lt.DataType.UINT32: np.uint32, + lt.DataType.INT64: np.int64, + lt.DataType.UINT64: np.uint64, + lt.DataType.FLOAT32: np.float32, + lt.DataType.FLOAT64: np.float64, + lt.DataType.INT8: np.int8, + lt.DataType.UINT8: np.uint8, + lt.DataType.INT16: np.int16, + lt.DataType.UINT16: np.uint16, + lt.DataType.CHAR: np.dtype("S1"), + lt.DataType.STRING_ASCII: np.dtype("S"), + lt.DataType.STRING_UTF8: np.dtype("U1"), + lt.DataType.BLOB: np.byte, + lt.DataType.BOOL: np.bool_, +} + +# Conversion from TileDB dtype to Numpy datetime +_tiledb_dtype_to_datetime_convert = { + lt.DataType.DATETIME_YEAR: np.datetime64("", "Y"), + lt.DataType.DATETIME_MONTH: np.datetime64("", "M"), + lt.DataType.DATETIME_WEEK: np.datetime64("", "W"), + lt.DataType.DATETIME_DAY: np.datetime64("", "D"), + lt.DataType.DATETIME_HR: np.datetime64("", "h"), + lt.DataType.DATETIME_MIN: np.datetime64("", "m"), + lt.DataType.DATETIME_SEC: np.datetime64("", "s"), + lt.DataType.DATETIME_MS: np.datetime64("", "ms"), + lt.DataType.DATETIME_US: np.datetime64("", "us"), + lt.DataType.DATETIME_NS: np.datetime64("", "ns"), + lt.DataType.DATETIME_PS: np.datetime64("", "ps"), + lt.DataType.DATETIME_FS: np.datetime64("", "fs"), + 
lt.DataType.DATETIME_AS: np.datetime64("", "as"), +} + + +def _tiledb_type_is_datetime(tiledb_type): + """Returns True if the tiledb type is a datetime type""" + return tiledb_type in ( + lt.DataType.DATETIME_YEAR, + lt.DataType.DATETIME_MONTH, + lt.DataType.DATETIME_WEEK, + lt.DataType.DATETIME_DAY, + lt.DataType.DATETIME_HR, + lt.DataType.DATETIME_MIN, + lt.DataType.DATETIME_SEC, + lt.DataType.DATETIME_MS, + lt.DataType.DATETIME_US, + lt.DataType.DATETIME_NS, + lt.DataType.DATETIME_PS, + lt.DataType.DATETIME_FS, + lt.DataType.DATETIME_AS, + ) + + +def _tiledb_type_to_datetime(tiledb_type): + """ + Return a datetime64 with appropriate unit for the given + tiledb_datatype_t enum value + """ + tdb_type = _tiledb_dtype_to_datetime_convert.get(tiledb_type, None) + if tdb_type is None: + raise TypeError("tiledb type is not a datetime {0!r}".format(tiledb_type)) + return tdb_type + + +def _numpy_dtype(tiledb_dtype, cell_size=1): + """Return a numpy type given a tiledb_datatype_t enum value.""" + cell_val_num = cell_size + + if tiledb_dtype == lt.DataType.BLOB: + return np.bytes_ + + elif cell_val_num == 1: + if tiledb_dtype in _tiledb_dtype_to_numpy_dtype_convert: + return _tiledb_dtype_to_numpy_dtype_convert[tiledb_dtype] + elif _tiledb_type_is_datetime(tiledb_dtype): + return _tiledb_type_to_datetime(tiledb_dtype) + + elif cell_val_num == 2 and tiledb_dtype == lt.DataType.FLOAT32: + return np.complex64 + + elif cell_val_num == 2 and tiledb_dtype == lt.DataType.FLOAT64: + return np.complex128 + + elif tiledb_dtype in (lt.DataType.CHAR, lt.DataType.STRING_UTF8): + if tiledb_dtype == lt.DataType.CHAR: + dtype_str = "|S" + elif tiledb_dtype == lt.DataType.STRING_UTF8: + dtype_str = "|U" + if cell_val_num != lt.TILEDB_VAR_NUM(): + dtype_str += str(cell_val_num) + return np.dtype(dtype_str) + + elif cell_val_num == lt.TILEDB_VAR_NUM(): + base_dtype = _numpy_dtype(tiledb_dtype, cell_size=1) + return base_dtype + + elif cell_val_num > 1: + # construct anonymous record dtype +
base_dtype = _numpy_dtype(tiledb_dtype, cell_size=1) + rec = np.dtype([("", base_dtype)] * cell_val_num) + return rec + + raise TypeError("tiledb datatype not understood")