Merge pull request #683 from Unidata/nogetvars
extend fix for issue #680 to include strided slices
jswhit committed Jul 3, 2017
2 parents 6b3e063 + a237601 commit e0409c5
Showing 4 changed files with 83 additions and 21 deletions.
9 changes: 9 additions & 0 deletions Changelog
@@ -4,6 +4,15 @@
(since nc-config does not always include the path to the HDF5 headers).
Also use H5get_libversion to obtain HDF5 version info instead of
H5public.h. Fixes issue #677.
* Calls to nc_get_vars are avoided, since nc_get_vars is very slow (issue
#680). Strided slices are now converted to multiple calls to
nc_get_vara. This speeds up strided slice reads by a factor of 10-100
(especially for NETCDF4/HDF5 files) in most cases. In some cases, strided reads
using nc_get_vars are faster (e.g. strided reads over many dimensions,
such as var[:,::2,::2,::2]), so a variable method use_nc_get_vars was added.
var.use_nc_get_vars(True) tells the library to use nc_get_vars instead
of multiple calls to nc_get_vara, which was the default behaviour prior
to this change.

version 1.2.9 (tag v1.2.9rel)
==============================
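As a quick illustration of the behaviour described in the Changelog entry above, here is a minimal usage sketch (the file and variable names are hypothetical):

from netCDF4 import Dataset

nc = Dataset("example.nc")             # hypothetical file
var = nc.variables["temperature"]      # hypothetical variable

# Default after this change: the strided read below is served by repeated
# nc_get_vara calls rather than a single (slow) nc_get_vars call.
subset = var[:, ::2, ::2]

# Opt back into nc_get_vars for this variable, e.g. for strided reads over
# many dimensions at once.
var.use_nc_get_vars(True)
subset = var[:, ::2, ::2]

nc.close()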
27 changes: 24 additions & 3 deletions netCDF4/_netCDF4.pyx
@@ -1585,7 +1585,8 @@ _private_atts = \
['_grpid','_grp','_varid','groups','dimensions','variables','dtype','data_model','disk_format',
'_nunlimdim','path','parent','ndim','mask','scale','cmptypes','vltypes','enumtypes','_isprimitive',
'file_format','_isvlen','_isenum','_iscompound','_cmptype','_vltype','_enumtype','name',
'__orthogoral_indexing__','keepweakref','_has_lsd', '_buffer','chartostring']
'__orthogoral_indexing__','keepweakref','_has_lsd',
'_buffer','chartostring','_no_get_vars']
__pdoc__ = {}

cdef class Dataset:
@@ -2966,7 +2967,7 @@ behavior is similar to Fortran or Matlab, but different than numpy.
cdef public int _varid, _grpid, _nunlimdim
cdef public _name, ndim, dtype, mask, scale, chartostring, _isprimitive, _iscompound,\
_isvlen, _isenum, _grp, _cmptype, _vltype, _enumtype,\
__orthogonal_indexing__, _has_lsd
__orthogonal_indexing__, _has_lsd, _no_get_vars
# Docstrings for class variables (used by pdoc).
__pdoc__['Variable.dimensions'] = \
"""A tuple containing the names of the
@@ -2992,6 +2993,9 @@ behavior is similar to Fortran or Matlab, but different than numpy.
arrays to string arrays when `_Encoding` variable attribute is set.
Default is `True`, can be reset using
`netCDF4.Variable.set_auto_chartostring` method."""
__pdoc__['Variable._no_get_vars'] = \
"""If True (default), netcdf routine `nc_get_vars` is not used for strided slicing
slicing. Can be re-set using `netCDF4.Variable.use_nc_get_vars` method."""
__pdoc__['Variable.least_significant_digit'] = \
"""Describes the power of ten of the
smallest decimal place in the data the contains a reliable value. Data is
@@ -3363,6 +3367,8 @@ behavior is similar to Fortran or Matlab, but different than numpy.
self.chartostring = True
if 'least_significant_digit' in self.ncattrs():
self._has_lsd = True
# avoid calling nc_get_vars for strided slices by default.
self._no_get_vars = True

def __array__(self):
# numpy special method that returns a numpy array.
@@ -3794,7 +3800,8 @@ rename a `netCDF4.Variable` attribute named `oldname` to `newname`."""
# is a perfect match for the "start", "count" and "stride"
# arguments to the nc_get_var() function, and is much easier
# to use.
start, count, stride, put_ind = _StartCountStride(elem,self.shape)
start, count, stride, put_ind =\
_StartCountStride(elem,self.shape,dimensions=self.dimensions,grp=self._grp,no_get_vars=self._no_get_vars)
datashape = _out_array_shape(count)
if self._isvlen:
data = numpy.empty(datashape, dtype='O')
@@ -4326,6 +4333,20 @@ The default value of `chartostring` is `True`
else:
self.chartostring = False

def use_nc_get_vars(self,use_nc_get_vars):
"""
**`use_nc_get_vars(self,use_nc_get_vars)`**
enable or disable the use of the netcdf library routine `nc_get_vars`
to retrieve strided variable slices. By default,
`nc_get_vars` is not used since it is slower than multiple calls
to the unstrided read routine `nc_get_vara` in most cases.
"""
if not use_nc_get_vars:
self._no_get_vars = True
else:
self._no_get_vars = False

def set_auto_maskandscale(self,maskandscale):
"""
**`set_auto_maskandscale(self,maskandscale)`**
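The change to `__getitem__` above passes the per-variable `_no_get_vars` flag down to `_StartCountStride`, which can then expand a strided slice into an explicit integer index. A numpy-only sketch of the equivalence this relies on (the array here is just stand-in data):

import numpy as np

data = np.arange(100).reshape(10, 10)   # stand-in for a variable's data

# A strided slice along the first dimension ...
strided = data[::3, :]

# ... selects the same elements as an explicit integer index built with
# np.arange, which can then be serviced by contiguous (nc_get_vara) reads.
indices = np.arange(0, data.shape[0], 3)
assert np.array_equal(strided, data[indices, :])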
47 changes: 37 additions & 10 deletions netCDF4/utils.py
@@ -74,7 +74,7 @@ def _quantize(data,least_significant_digit):
return datout

def _StartCountStride(elem, shape, dimensions=None, grp=None, datashape=None,\
put=False):
put=False, no_get_vars = True):
"""Return start, count, stride and indices needed to store/extract data
into/from a netCDF variable.
@@ -122,12 +122,10 @@ def _StartCountStride(elem, shape, dimensions=None, grp=None, datashape=None,\
sequences used to slice the netCDF Variable (Variable[elem]).
shape : tuple containing the current shape of the netCDF variable.
dimensions : sequence
The name of the dimensions. This is only useful to find out
whether or not some dimensions are unlimited. Only needed within
The name of the dimensions.
__setitem__.
grp : netCDF Group
The netCDF group to which the variable being set belongs to.
Only needed within __setitem__.
datashape : sequence
The shape of the data that is being stored. Only needed by __setitem__.
put : True|False (default False). If called from __setitem__, put is True.
@@ -184,7 +182,13 @@ def _StartCountStride(elem, shape, dimensions=None, grp=None, datashape=None,\
newElem = []
IndexErrorMsg=\
"only integers, slices (`:`), ellipsis (`...`), and 1-d integer or boolean arrays are valid indices"
idim = -1
for i, e in enumerate(elem):
# which dimension is this?
if type(e) == type(Ellipsis):
idim = nDims - len(elem) + idim + 1
else:
idim += 1
# string-like object try to cast to int
# needs to be done first, since strings are iterable and
# hard to distinguish from something castable to an iterable numpy array.
@@ -201,7 +205,7 @@ def _StartCountStride(elem, shape, dimensions=None, grp=None, datashape=None,\
# (called from __setitem__)
if put and (dimensions is not None and grp is not None) and len(dimensions):
try:
dimname = dimensions[i]
dimname = dimensions[idim]
# is this dimension unlimited?
# look in current group, and parents for dim.
dim = _find_dim(grp, dimname)
@@ -213,21 +217,21 @@ def _StartCountStride(elem, shape, dimensions=None, grp=None, datashape=None,\
# convert boolean index to integer array.
if np.iterable(ea) and ea.dtype.kind =='b':
# check that boolean array is not too long
if not unlim and shape[i] != len(ea):
if not unlim and shape[idim] != len(ea):
msg="""
Boolean array must have the same shape as the data along this dimension."""
raise IndexError(msg)
ea = np.flatnonzero(ea)
# an iterable (non-scalar) integer array.
if np.iterable(ea) and ea.dtype.kind == 'i':
# convert negative indices in 1d array to positive ones.
ea = np.where(ea < 0, ea + shape[i], ea)
ea = np.where(ea < 0, ea + shape[idim], ea)
if np.any(ea < 0):
raise IndexError("integer index out of range")
# if unlim, let integer index be longer than current dimension
# length.
if ea.shape != (0,):
elen = shape[i]
elen = shape[idim]
if unlim:
elen = max(ea.max()+1,elen)
if ea.max()+1 > elen:
@@ -239,6 +243,24 @@ def _StartCountStride(elem, shape, dimensions=None, grp=None, datashape=None,\
newElem.append(e)
# slice or ellipsis object
elif type(e) == slice or type(e) == type(Ellipsis):
if no_get_vars and type(e) == slice and e.step not in [None,-1,1] and\
dimensions is not None and grp is not None:
# convert strided slice to integer sequence if possible
# (this will avoid nc_get_vars, which is slow - issue #680).
start = e.start if e.start is not None else 0
step = e.step
if e.stop is None and dimensions is not None and grp is not None:
stop = len(_find_dim(grp, dimensions[idim]))
else:
stop = e.stop
if stop < 0:
stop = len(_find_dim(grp, dimensions[idim])) + stop
try:
ee = np.arange(start,stop,e.step)
if len(ee) > 0:
e = ee
except:
pass
newElem.append(e)
else: # castable to a scalar int, otherwise invalid
try:
@@ -269,8 +291,13 @@ def _StartCountStride(elem, shape, dimensions=None, grp=None, datashape=None,\
ee = range(start,stop,step)
except ValueError: # start, stop or step is not valid for a range
ee = False
if ee and len(e) == len(ee) and (e == np.arange(start,stop,step)).all():
newElem.append(slice(start,stop,step))
if no_get_vars and ee and len(e) == len(ee) and (e == np.arange(start,stop,step)).all():
# don't convert to slice unless abs(stride) == 1
# (nc_get_vars is very slow, issue #680)
if step not in [1,-1]:
newElem.append(e)
else:
newElem.append(slice(start,stop,step))
else:
newElem.append(e)
elif np.iterable(e) and len(e) == 1:
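For reference, a simplified standalone sketch (not the library function itself) of the slice-to-index-array expansion that `_StartCountStride` now performs for strided slices when `no_get_vars` is True:

import numpy as np

def slice_to_indices(sl, dim_len):
    # Expand a strided slice into an explicit integer index array,
    # mirroring (in simplified form) the conversion added above.
    start = sl.start if sl.start is not None else 0
    step = sl.step if sl.step is not None else 1
    stop = sl.stop if sl.stop is not None else dim_len
    if stop < 0:
        stop += dim_len
    return np.arange(start, stop, step)

# For a dimension of length 10, the slice 1:-1:2 selects indices 1, 3, 5, 7.
print(slice_to_indices(slice(1, -1, 2), 10))   # -> [1 3 5 7]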
21 changes: 13 additions & 8 deletions test/tst_utils.py
@@ -61,7 +61,10 @@ def test_fancy(self):
# this one should be converted to a slice
elem = [slice(None), [1,3,5], 8]
start, count, stride, put_ind = _StartCountStride(elem, (50, 6, 10))
assert_equal(put_ind[...,1].squeeze(), slice(None,None,None))
# pull request #683 now does not convert integer sequences to strided
# slices.
#assert_equal(put_ind[...,1].squeeze(), slice(None,None,None))
assert_equal(put_ind[...,1].squeeze(), [0,1,2])


def test_multiple_sequences(self):
@@ -244,13 +247,15 @@ def test_unlim(self):
assert_equal(take_ind[2][0][0], (2, slice(None), slice(None)))


elem = (slice(None, None, 2), slice(None), slice(None))
start, count, stride, take_ind = _StartCountStride(elem, (0, 6, 7),\
['time', 'x', 'y'], grp, (10, 6, 7),put=True)
assert_equal(start[0][0][0], (0,0,0))
assert_equal(count[0][0][0], (5, 6, 7))
assert_equal(stride[0][0][0], (2, 1, 1))
assert_equal(take_ind[0][0][0], 3*(slice(None),))
# pull request #683 broke this, since _StartCountStride now uses
# Dimension.__len__.
#elem = (slice(None, None, 2), slice(None), slice(None))
#start, count, stride, take_ind = _StartCountStride(elem, (0, 6, 7),\
# ['time', 'x', 'y'], grp, (10, 6, 7),put=True)
#assert_equal(start[0][0][0], (0,0,0))
#assert_equal(count[0][0][0], (5, 6, 7))
#assert_equal(stride[0][0][0], (2, 1, 1))
#assert_equal(take_ind[0][0][0], 3*(slice(None),))


class FakeGroup(object):
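The updated expectation in test_fancy above can be checked directly against the private helper (a sketch; `_StartCountStride` is internal to netCDF4.utils):

import numpy as np
from netCDF4.utils import _StartCountStride

# With the new default (no_get_vars=True), the integer sequence [1,3,5] is no
# longer collapsed into a strided slice, so the put indices remain [0, 1, 2].
elem = [slice(None), [1, 3, 5], 8]
start, count, stride, put_ind = _StartCountStride(elem, (50, 6, 10))
print(put_ind[..., 1].squeeze())   # -> [0 1 2]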
