Skip to content

Commit

Permalink
COMPAT/TEST test, fix for unsafe Vector.resize(), which allows refche… (
Browse files Browse the repository at this point in the history
pandas-dev#16258)

* COMPAT/TEST test, fix for unsafe Vector.resize(), which allows refcheck=False

* COMPAT/TEST improve error msg, document test as per review

* COMPAT/TEST unify interfaces as per review
  • Loading branch information
mattip authored and jreback committed May 11, 2017
1 parent b1ff291 commit fdc2185
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 24 deletions.
1 change: 1 addition & 0 deletions pandas/_libs/hashtable.pxd
Expand Up @@ -52,6 +52,7 @@ cdef struct Int64VectorData:
cdef class Int64Vector:
cdef Int64VectorData *data
cdef ndarray ao
cdef bint external_view_exists

cdef resize(self)
cpdef to_array(self)
Expand Down
13 changes: 13 additions & 0 deletions pandas/_libs/hashtable.pyx
Expand Up @@ -64,6 +64,10 @@ cdef class Factorizer:
>>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
array([ 0, 1, 20])
"""
if self.uniques.external_view_exists:
uniques = ObjectVector()
uniques.extend(self.uniques.to_array())
self.uniques = uniques
labels = self.table.get_labels(values, self.uniques,
self.count, na_sentinel, check_null)
mask = (labels == na_sentinel)
Expand Down Expand Up @@ -99,6 +103,15 @@ cdef class Int64Factorizer:

def factorize(self, int64_t[:] values, sort=False,
na_sentinel=-1, check_null=True):
"""
Factorize values with nans replaced by na_sentinel
>>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
array([ 0, 1, 20])
"""
if self.uniques.external_view_exists:
uniques = Int64Vector()
uniques.extend(self.uniques.to_array())
self.uniques = uniques
labels = self.table.get_labels(values, self.uniques,
self.count, na_sentinel,
check_null)
Expand Down
46 changes: 39 additions & 7 deletions pandas/_libs/hashtable_class_helper.pxi.in
Expand Up @@ -71,6 +71,7 @@ cdef class {{name}}Vector:

{{if dtype != 'int64'}}
cdef:
bint external_view_exists
{{name}}VectorData *data
ndarray ao
{{endif}}
Expand All @@ -80,14 +81,15 @@ cdef class {{name}}Vector:
sizeof({{name}}VectorData))
if not self.data:
raise MemoryError()
self.external_view_exists = False
self.data.n = 0
self.data.m = _INIT_VEC_CAP
self.ao = np.empty(self.data.m, dtype={{idtype}})
self.data.data = <{{arg}}*> self.ao.data

cdef resize(self):
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
self.ao.resize(self.data.m)
self.ao.resize(self.data.m, refcheck=False)
self.data.data = <{{arg}}*> self.ao.data

def __dealloc__(self):
Expand All @@ -99,13 +101,20 @@ cdef class {{name}}Vector:
return self.data.n

cpdef to_array(self):
self.ao.resize(self.data.n)
self.data.m = self.data.n
if self.data.m != self.data.n:
if self.external_view_exists:
# should never happen
raise ValueError("should have raised on append()")
self.ao.resize(self.data.n, refcheck=False)
self.data.m = self.data.n
self.external_view_exists = True
return self.ao

cdef inline void append(self, {{arg}} x):

if needs_resize(self.data):
if self.external_view_exists:
raise ValueError("external reference but Vector.resize() needed")
self.resize()

append_data_{{dtype}}(self.data, x)
Expand All @@ -120,15 +129,19 @@ cdef class StringVector:

cdef:
StringVectorData *data
bint external_view_exists

def __cinit__(self):
self.data = <StringVectorData *>PyMem_Malloc(
sizeof(StringVectorData))
if not self.data:
raise MemoryError()
self.external_view_exists = False
self.data.n = 0
self.data.m = _INIT_VEC_CAP
self.data.data = <char **> malloc(self.data.m * sizeof(char *))
if not self.data.data:
raise MemoryError()

cdef resize(self):
cdef:
Expand All @@ -138,9 +151,10 @@ cdef class StringVector:
m = self.data.m
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)

# TODO: can resize?
orig_data = self.data.data
self.data.data = <char **> malloc(self.data.m * sizeof(char *))
if not self.data.data:
raise MemoryError()
for i in range(m):
self.data.data[i] = orig_data[i]

Expand All @@ -164,6 +178,7 @@ cdef class StringVector:
for i in range(self.data.n):
val = self.data.data[i]
ao[i] = val
self.external_view_exists = True
self.data.m = self.data.n
return ao

Expand All @@ -174,15 +189,20 @@ cdef class StringVector:

append_data_string(self.data, x)

cdef extend(self, ndarray[:] x):
for i in range(len(x)):
self.append(x[i])

cdef class ObjectVector:

cdef:
PyObject **data
size_t n, m
ndarray ao
bint external_view_exists

def __cinit__(self):
self.external_view_exists = False
self.n = 0
self.m = _INIT_VEC_CAP
self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
Expand All @@ -193,19 +213,28 @@ cdef class ObjectVector:

cdef inline append(self, object o):
if self.n == self.m:
if self.external_view_exists:
raise ValueError("external reference but Vector.resize() needed")
self.m = max(self.m * 2, _INIT_VEC_CAP)
self.ao.resize(self.m)
self.ao.resize(self.m, refcheck=False)
self.data = <PyObject**> self.ao.data

Py_INCREF(o)
self.data[self.n] = <PyObject*> o
self.n += 1

def to_array(self):
self.ao.resize(self.n)
self.m = self.n
if self.m != self.n:
if self.external_view_exists:
raise ValueError("should have raised on append()")
self.ao.resize(self.n, refcheck=False)
self.m = self.n
self.external_view_exists = True
return self.ao

cdef extend(self, ndarray[:] x):
for i in range(len(x)):
self.append(x[i])

#----------------------------------------------------------------------
# HashTable
Expand Down Expand Up @@ -362,6 +391,9 @@ cdef class {{name}}HashTable(HashTable):

if needs_resize(ud):
with gil:
if uniques.external_view_exists:
raise ValueError("external reference to uniques held, "
"but Vector.resize() needed")
uniques.resize()
append_data_{{dtype}}(ud, val)
labels[i] = count
Expand Down
44 changes: 27 additions & 17 deletions pandas/tests/test_algos.py
Expand Up @@ -14,7 +14,7 @@

from pandas import compat
from pandas._libs import (groupby as libgroupby, algos as libalgos,
hashtable)
hashtable as ht)
from pandas._libs.hashtable import unique_label_indices
from pandas.compat import lrange, range
import pandas.core.algorithms as algos
Expand Down Expand Up @@ -259,7 +259,7 @@ def test_factorize_nan(self):
# rizer.factorize should not raise an exception if na_sentinel indexes
# outside of reverse_indexer
key = np.array([1, 2, 1, np.nan], dtype='O')
rizer = hashtable.Factorizer(len(key))
rizer = ht.Factorizer(len(key))
for na_sentinel in (-1, 20):
ids = rizer.factorize(key, sort=True, na_sentinel=na_sentinel)
expected = np.array([0, 1, 0, na_sentinel], dtype='int32')
Expand Down Expand Up @@ -1049,14 +1049,14 @@ class TestHashTable(object):

def test_lookup_nan(self):
xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
m = hashtable.Float64HashTable()
m = ht.Float64HashTable()
m.map_locations(xs)
tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs),
dtype=np.int64))

def test_lookup_overflow(self):
xs = np.array([1, 2, 2**63], dtype=np.uint64)
m = hashtable.UInt64HashTable()
m = ht.UInt64HashTable()
m.map_locations(xs)
tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs),
dtype=np.int64))
Expand All @@ -1070,25 +1070,35 @@ def test_vector_resize(self):
# Test for memory errors after internal vector
# reallocations (pull request #7157)

def _test_vector_resize(htable, uniques, dtype, nvals):
def _test_vector_resize(htable, uniques, dtype, nvals, safely_resizes):
vals = np.array(np.random.randn(1000), dtype=dtype)
# get_labels appends to the vector
# get_labels may append to uniques
htable.get_labels(vals[:nvals], uniques, 0, -1)
# to_array resizes the vector
uniques.to_array()
htable.get_labels(vals, uniques, 0, -1)
# to_array() set an external_view_exists flag on uniques.
tmp = uniques.to_array()
oldshape = tmp.shape
# subsequent get_labels() calls can no longer append to it
# (for all but StringHashTables + ObjectVector)
if safely_resizes:
htable.get_labels(vals, uniques, 0, -1)
else:
with pytest.raises(ValueError) as excinfo:
htable.get_labels(vals, uniques, 0, -1)
assert str(excinfo.value).startswith('external reference')
uniques.to_array() # should not raise here
assert tmp.shape == oldshape

test_cases = [
(hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'),
(hashtable.StringHashTable, hashtable.ObjectVector, 'object'),
(hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'),
(hashtable.Int64HashTable, hashtable.Int64Vector, 'int64'),
(hashtable.UInt64HashTable, hashtable.UInt64Vector, 'uint64')]
(ht.PyObjectHashTable, ht.ObjectVector, 'object', False),
(ht.StringHashTable, ht.ObjectVector, 'object', True),
(ht.Float64HashTable, ht.Float64Vector, 'float64', False),
(ht.Int64HashTable, ht.Int64Vector, 'int64', False),
(ht.UInt64HashTable, ht.UInt64Vector, 'uint64', False)]

for (tbl, vect, dtype) in test_cases:
for (tbl, vect, dtype, safely_resizes) in test_cases:
# resizing to empty is a special case
_test_vector_resize(tbl(), vect(), dtype, 0)
_test_vector_resize(tbl(), vect(), dtype, 10)
_test_vector_resize(tbl(), vect(), dtype, 0, safely_resizes)
_test_vector_resize(tbl(), vect(), dtype, 10, safely_resizes)


def test_quantile():
Expand Down

0 comments on commit fdc2185

Please sign in to comment.