Skip to content

Commit

Permalink
PERF: fix getitem unique_check / initialization issue
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback committed Dec 21, 2016
1 parent 4c3d4d4 commit dc32b39
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 34 deletions.
7 changes: 7 additions & 0 deletions asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ class Iteration(object):
def setup(self):
self.df = DataFrame(randn(10000, 1000))
self.df2 = DataFrame(np.random.randn(50000, 10))
self.df3 = pd.DataFrame(np.random.randn(1000,5000),
columns=['C'+str(c) for c in range(5000)])

def f(self):
if hasattr(self.df, '_item_cache'):
Expand All @@ -85,6 +87,11 @@ def time_iteritems(self):
def time_iteritems_cached(self):
self.g()

def time_iteritems_indexing(self):
df = self.df3
for col in df:
df[col]

def time_itertuples(self):
for row in self.df2.itertuples():
pass
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.19.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Performance Improvements

- Improved performance of ``.replace()`` (:issue:`12745`)
- Improved performance of ``PeriodIndex`` (:issue:`14822`)
- Performance regression in indexing with getitem (:issue:`14930`)
- Improved performance ``Series`` creation with a datetime index and dictionary data (:issue:`14894`)

.. _whatsnew_0192.enhancements.other:
Expand Down
66 changes: 32 additions & 34 deletions pandas/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -82,20 +82,13 @@ cdef class IndexEngine:

cdef:
bint unique, monotonic_inc, monotonic_dec
bint initialized, monotonic_check, unique_check
bint need_monotonic_check, need_unique_check

def __init__(self, vgetter, n):
self.vgetter = vgetter

self.over_size_threshold = n >= _SIZE_CUTOFF

self.initialized = 0
self.monotonic_check = 0
self.unique_check = 0

self.unique = 0
self.monotonic_inc = 0
self.monotonic_dec = 0
self.clear_mapping()

def __contains__(self, object val):
self._ensure_mapping_populated()
Expand Down Expand Up @@ -213,24 +206,28 @@ cdef class IndexEngine:
property is_unique:

def __get__(self):
if not self.initialized:
self.initialize()
if self.need_unique_check:
self._do_unique_check()

self.unique_check = 1
return self.unique == 1

cdef inline _do_unique_check(self):

# this de-facto the same
self._ensure_mapping_populated()

property is_monotonic_increasing:

def __get__(self):
if not self.monotonic_check:
if self.need_monotonic_check:
self._do_monotonic_check()

return self.monotonic_inc == 1

property is_monotonic_decreasing:

def __get__(self):
if not self.monotonic_check:
if self.need_monotonic_check:
self._do_monotonic_check()

return self.monotonic_dec == 1
Expand All @@ -246,13 +243,12 @@ cdef class IndexEngine:
self.monotonic_dec = 0
is_unique = 0

self.monotonic_check = 1
self.need_monotonic_check = 0

# we can only be sure of uniqueness if is_unique=1
if is_unique:
self.initialized = 1
self.unique = 1
self.unique_check = 1
self.need_unique_check = 0

cdef _get_index_values(self):
return self.vgetter()
Expand All @@ -266,30 +262,32 @@ cdef class IndexEngine:
cdef _check_type(self, object val):
hash(val)

property is_mapping_populated:

def __get__(self):
return self.mapping is not None

cdef inline _ensure_mapping_populated(self):
# need to reset if we have previously
# set the initialized from monotonic checks
if self.unique_check:
self.initialized = 0
if not self.initialized:
self.initialize()

cdef initialize(self):
values = self._get_index_values()
# this populates the mapping
# if its not already populated
# also satisfies the need_unique_check

self.mapping = self._make_hash_table(len(values))
self.mapping.map_locations(values)
if not self.is_mapping_populated:

if len(self.mapping) == len(values):
self.unique = 1
values = self._get_index_values()

self.mapping = self._make_hash_table(len(values))
self.mapping.map_locations(values)

if len(self.mapping) == len(values):
self.unique = 1

self.initialized = 1
self.need_unique_check = 0

def clear_mapping(self):
self.mapping = None
self.initialized = 0
self.monotonic_check = 0
self.unique_check = 0
self.need_monotonic_check = 1
self.need_unique_check = 1

self.unique = 0
self.monotonic_inc = 0
Expand Down

0 comments on commit dc32b39

Please sign in to comment.