Skip to content

Commit

Permalink
Accept constant memoryviews in HashTable.lookup (pandas-dev#21688)
Browse files Browse the repository at this point in the history
  • Loading branch information
xhochy authored and victor committed Sep 30, 2018
1 parent 5b132d0 commit 2fae688
Show file tree
Hide file tree
Showing 21 changed files with 83 additions and 84 deletions.
2 changes: 1 addition & 1 deletion ci/appveyor-27.yaml
Expand Up @@ -24,7 +24,7 @@ dependencies:
- xlsxwriter
- xlwt
# universal
- cython
- cython>=0.28.2
- pytest
- pytest-xdist
- moto
2 changes: 1 addition & 1 deletion ci/appveyor-36.yaml
Expand Up @@ -22,6 +22,6 @@ dependencies:
- xlsxwriter
- xlwt
# universal
- cython
- cython>=0.28.2
- pytest
- pytest-xdist
2 changes: 1 addition & 1 deletion ci/circle-27-compat.yaml
Expand Up @@ -4,7 +4,7 @@ channels:
- conda-forge
dependencies:
- bottleneck=1.0.0
- cython=0.24
- cython=0.28.2
- jinja2=2.8
- numexpr=2.4.4 # we test that we correctly don't use an unsupported numexpr
- numpy=1.9.2
Expand Down
2 changes: 1 addition & 1 deletion ci/circle-35-ascii.yaml
Expand Up @@ -2,7 +2,7 @@ name: pandas
channels:
- defaults
dependencies:
- cython
- cython>=0.28.2
- nomkl
- numpy
- python-dateutil
Expand Down
2 changes: 1 addition & 1 deletion ci/circle-36-locale.yaml
Expand Up @@ -4,7 +4,7 @@ channels:
- conda-forge
dependencies:
- beautifulsoup4
- cython
- cython>=0.28.2
- html5lib
- ipython
- jinja2
Expand Down
2 changes: 1 addition & 1 deletion ci/circle-36-locale_slow.yaml
Expand Up @@ -4,7 +4,7 @@ channels:
- conda-forge
dependencies:
- beautifulsoup4
- cython
- cython>=0.28.2
- gcsfs
- html5lib
- ipython
Expand Down
2 changes: 1 addition & 1 deletion ci/environment-dev.yaml
Expand Up @@ -3,7 +3,7 @@ channels:
- defaults
- conda-forge
dependencies:
- Cython
- Cython>=0.28.2
- NumPy
- flake8
- moto
Expand Down
2 changes: 1 addition & 1 deletion ci/travis-27-locale.yaml
Expand Up @@ -4,7 +4,7 @@ channels:
- conda-forge
dependencies:
- bottleneck=1.0.0
- cython=0.24
- cython=0.28.2
- lxml
- matplotlib=1.4.3
- numpy=1.9.2
Expand Down
2 changes: 1 addition & 1 deletion ci/travis-27.yaml
Expand Up @@ -5,7 +5,7 @@ channels:
dependencies:
- beautifulsoup4
- bottleneck
- cython=0.24
- cython=0.28.2
- fastparquet
- feather-format
- flake8=3.4.1
Expand Down
2 changes: 1 addition & 1 deletion ci/travis-35-osx.yaml
Expand Up @@ -4,7 +4,7 @@ channels:
dependencies:
- beautifulsoup4
- bottleneck
- cython
- cython>=0.28.2
- html5lib
- jinja2
- lxml
Expand Down
2 changes: 1 addition & 1 deletion ci/travis-36-doc.yaml
Expand Up @@ -6,7 +6,7 @@ channels:
dependencies:
- beautifulsoup4
- bottleneck
- cython
- cython>=0.28.2
- fastparquet
- feather-format
- html5lib
Expand Down
2 changes: 1 addition & 1 deletion ci/travis-36-numpydev.yaml
Expand Up @@ -4,7 +4,7 @@ channels:
dependencies:
- python=3.6*
- pytz
- Cython
- Cython>=0.28.2
# universal
- pytest
- pytest-xdist
Expand Down
2 changes: 1 addition & 1 deletion ci/travis-36-slow.yaml
Expand Up @@ -4,7 +4,7 @@ channels:
- conda-forge
dependencies:
- beautifulsoup4
- cython
- cython>=0.28.2
- html5lib
- lxml
- matplotlib
Expand Down
2 changes: 1 addition & 1 deletion ci/travis-36.yaml
Expand Up @@ -4,7 +4,7 @@ channels:
- conda-forge
dependencies:
- beautifulsoup4
- cython
- cython>=0.28.2
- dask
- fastparquet
- feather-format
Expand Down
2 changes: 1 addition & 1 deletion ci/travis-37.yaml
Expand Up @@ -5,7 +5,7 @@ channels:
- c3i_test
dependencies:
- python=3.7
- cython
- cython>=0.28.2
- numpy
- python-dateutil
- nomkl
Expand Down
2 changes: 1 addition & 1 deletion doc/source/install.rst
Expand Up @@ -253,7 +253,7 @@ Optional Dependencies
~~~~~~~~~~~~~~~~~~~~~

* `Cython <http://www.cython.org>`__: Only necessary to build development
version. Version 0.24 or higher.
version. Version 0.28.2 or higher.
* `SciPy <http://www.scipy.org>`__: miscellaneous statistical functions, Version 0.14.0 or higher
* `xarray <http://xarray.pydata.org>`__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended.
* `PyTables <http://www.pytables.org>`__: necessary for HDF5-based storage. Version 3.0.0 or higher required, Version 3.2.1 or higher highly recommended.
Expand Down
5 changes: 5 additions & 0 deletions doc/source/whatsnew/v0.24.0.txt
Expand Up @@ -410,12 +410,17 @@ Reshaping
-
-

Build Changes
^^^^^^^^^^^^^

- Building pandas for development now requires ``cython >= 0.28.2`` (:issue:`21688`)
-

Other
^^^^^

- :meth: `~pandas.io.formats.style.Styler.background_gradient` now takes a ``text_color_threshold`` parameter to automatically lighten the text color based on the luminance of the background color. This improves readability with dark background colors without the need to limit the background colormap range. (:issue:`21258`)
- Require at least 0.28.2 version of ``cython`` to support read-only memoryviews (:issue:`21688`)
-
-
-
108 changes: 44 additions & 64 deletions pandas/_libs/hashtable_class_helper.pxi.in
Expand Up @@ -120,7 +120,7 @@ cdef class {{name}}Vector:

append_data_{{dtype}}(self.data, x)

cdef extend(self, {{arg}}[:] x):
cdef extend(self, const {{arg}}[:] x):
for i in range(len(x)):
self.append(x[i])

Expand Down Expand Up @@ -253,56 +253,10 @@ dtypes = [('Float64', 'float64', True, 'nan'),
('UInt64', 'uint64', False, 0),
('Int64', 'int64', False, 'iNaT')]

def get_dispatch(dtypes):
for (name, dtype, float_group, default_na_value) in dtypes:
unique_template = """\
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
{dtype}_t val
khiter_t k
bint seen_na = 0
{name}Vector uniques = {name}Vector()
{name}VectorData *ud

ud = uniques.data

with nogil:
for i in range(n):
val = values[i]
IF {float_group}:
if val == val:
k = kh_get_{dtype}(self.table, val)
if k == self.table.n_buckets:
kh_put_{dtype}(self.table, val, &ret)
if needs_resize(ud):
with gil:
uniques.resize()
append_data_{dtype}(ud, val)
elif not seen_na:
seen_na = 1
if needs_resize(ud):
with gil:
uniques.resize()
append_data_{dtype}(ud, NAN)
ELSE:
k = kh_get_{dtype}(self.table, val)
if k == self.table.n_buckets:
kh_put_{dtype}(self.table, val, &ret)
if needs_resize(ud):
with gil:
uniques.resize()
append_data_{dtype}(ud, val)
return uniques.to_array()
"""

unique_template = unique_template.format(name=name, dtype=dtype, float_group=float_group)

yield (name, dtype, float_group, default_na_value, unique_template)
}}


{{for name, dtype, float_group, default_na_value, unique_template in get_dispatch(dtypes)}}
{{for name, dtype, float_group, default_na_value in dtypes}}

cdef class {{name}}HashTable(HashTable):

Expand Down Expand Up @@ -351,7 +305,7 @@ cdef class {{name}}HashTable(HashTable):
raise KeyError(key)

@cython.boundscheck(False)
def map(self, {{dtype}}_t[:] keys, int64_t[:] values):
def map(self, const {{dtype}}_t[:] keys, const int64_t[:] values):
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
Expand Down Expand Up @@ -379,7 +333,7 @@ cdef class {{name}}HashTable(HashTable):
self.table.vals[k] = i

@cython.boundscheck(False)
def lookup(self, {{dtype}}_t[:] values):
def lookup(self, const {{dtype}}_t[:] values):
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
Expand All @@ -404,7 +358,7 @@ cdef class {{name}}HashTable(HashTable):
return uniques.to_array(), labels

@cython.boundscheck(False)
def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques,
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior, Py_ssize_t na_sentinel,
object na_value=None):
cdef:
Expand Down Expand Up @@ -461,7 +415,7 @@ cdef class {{name}}HashTable(HashTable):
return np.asarray(labels)

@cython.boundscheck(False)
def get_labels_groupby(self, {{dtype}}_t[:] values):
def get_labels_groupby(self, const {{dtype}}_t[:] values):
cdef:
Py_ssize_t i, n = len(values)
int64_t[:] labels
Expand Down Expand Up @@ -506,20 +460,46 @@ cdef class {{name}}HashTable(HashTable):
return np.asarray(labels), arr_uniques

@cython.boundscheck(False)
def unique(self, ndarray[{{dtype}}_t, ndim=1] values):
if values.flags.writeable:
# If the value is writeable (mutable) then use memview
return self.unique_memview(values)
def unique(self, const {{dtype}}_t[:] values):
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
{{dtype}}_t val
khiter_t k
bint seen_na = 0
{{name}}Vector uniques = {{name}}Vector()
{{name}}VectorData *ud

# We cannot use the memoryview version on readonly-buffers due to
# a limitation of Cython's typed memoryviews. Instead we can use
# the slightly slower Cython ndarray type directly.
# see https://github.com/cython/cython/issues/1605
{{unique_template}}
ud = uniques.data

@cython.boundscheck(False)
def unique_memview(self, {{dtype}}_t[:] values):
{{unique_template}}
with nogil:
for i in range(n):
val = values[i]
{{if float_group}}
if val == val:
k = kh_get_{{dtype}}(self.table, val)
if k == self.table.n_buckets:
kh_put_{{dtype}}(self.table, val, &ret)
if needs_resize(ud):
with gil:
uniques.resize()
append_data_{{dtype}}(ud, val)
elif not seen_na:
seen_na = 1
if needs_resize(ud):
with gil:
uniques.resize()
append_data_{{dtype}}(ud, NAN)
{{else}}
k = kh_get_{{dtype}}(self.table, val)
if k == self.table.n_buckets:
kh_put_{{dtype}}(self.table, val, &ret)
if needs_resize(ud):
with gil:
uniques.resize()
append_data_{{dtype}}(ud, val)
{{endif}}
return uniques.to_array()

{{endfor}}

Expand Down
8 changes: 8 additions & 0 deletions pandas/conftest.py
Expand Up @@ -138,6 +138,14 @@ def compression_only(request):
return request.param


@pytest.fixture(params=[True, False])
def writable(request):
"""
Fixture that an array is writable
"""
return request.param


@pytest.fixture(scope='module')
def datetime_tz_utc():
from datetime import timezone
Expand Down
12 changes: 9 additions & 3 deletions pandas/tests/test_algos.py
Expand Up @@ -1077,15 +1077,19 @@ class TestGroupVarFloat32(GroupVarTestMixin):

class TestHashTable(object):

def test_lookup_nan(self):
def test_lookup_nan(self, writable):
xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
# GH 21688 ensure we can deal with readonly memory views
xs.setflags(write=writable)
m = ht.Float64HashTable()
m.map_locations(xs)
tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs),
dtype=np.int64))

def test_lookup_overflow(self):
def test_lookup_overflow(self, writable):
xs = np.array([1, 2, 2**63], dtype=np.uint64)
# GH 21688 ensure we can deal with readonly memory views
xs.setflags(write=writable)
m = ht.UInt64HashTable()
m.map_locations(xs)
tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs),
Expand All @@ -1096,12 +1100,14 @@ def test_get_unique(self):
exp = np.array([1, 2, 2**63], dtype=np.uint64)
tm.assert_numpy_array_equal(s.unique(), exp)

def test_vector_resize(self):
def test_vector_resize(self, writable):
# Test for memory errors after internal vector
# reallocations (pull request #7157)

def _test_vector_resize(htable, uniques, dtype, nvals, safely_resizes):
vals = np.array(np.random.randn(1000), dtype=dtype)
# GH 21688 ensure we can deal with readonly memory views
vals.setflags(write=writable)
# get_labels may append to uniques
htable.get_labels(vals[:nvals], uniques, 0, -1)
# to_array() set an external_view_exists flag on uniques.
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -32,7 +32,7 @@ def is_platform_mac():
return sys.platform == 'darwin'


min_cython_ver = '0.24'
min_cython_ver = '0.28.2'
try:
import Cython
ver = Cython.__version__
Expand Down

0 comments on commit 2fae688

Please sign in to comment.