Skip to content
Closed

Dask pp #2318

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@ env:
- TEST_TARGET=default
- TEST_TARGET=default TEST_MINIMAL=true
- TEST_TARGET=coding
- TEST_TARGET=example
- TEST_TARGET=doctest

git:
depth: 10000
Expand Down Expand Up @@ -107,7 +105,7 @@ install:

script:
- if [[ $TEST_TARGET == 'default' ]]; then
python -m iris.tests.runner --default-tests --system-tests --print-failed-images;
python -m unittest discover -v lib/iris/tests/integration/temp_dask;
fi
- if [[ $TEST_TARGET == 'example' ]]; then
python -m iris.tests.runner --example-tests --print-failed-images;
Expand Down
1 change: 1 addition & 0 deletions conda-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ numpy
pyke
udunits2
cf_units
dask

# Iris build dependencies
setuptools
Expand Down
79 changes: 79 additions & 0 deletions lib/iris/_lazy_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# (C) British Crown Copyright 2017, Met Office
#
# This file is part of Iris.
#
# Iris is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the
# Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Iris is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Iris. If not, see <http://www.gnu.org/licenses/>.
"""
Routines for lazy data handling.
To avoid replicating implementation-dependent test and conversion code.
"""
from __future__ import (absolute_import, division, print_function)
from six.moves import (filter, input, map, range, zip) # noqa

import dask.array as da


def is_lazy_data(data):
    """
    Return whether the argument is an Iris 'lazy' data array.

    At present this simply means a Dask array, which we detect by the
    presence of a "compute" attribute (duck typing, so any object
    exposing 'compute' counts as lazy).
    """
    # EAFP equivalent of hasattr(data, 'compute').
    try:
        data.compute
    except AttributeError:
        return False
    return True


def as_concrete_data(data):
    """
    Return the actual content of the argument, as a numpy array.

    A lazy array (anything exposing a 'compute' method -- see
    is_lazy_data) is realised and the realised data returned; any other
    argument is returned unchanged.
    """
    if not hasattr(data, 'compute'):
        # Already concrete: pass the original object straight through.
        return data
    # Realise the lazy content.
    return data.compute()


# A magic value, borrowed from biggus
# Default chunk size (in array elements) handed to dask.array.from_array
# when wrapping a real array: 8 * 1024 * 1024 * 2 = 16 Mi elements.
# NOTE(review): the optimal chunking depends on the operation eventually
# performed, which is unknown at wrapping time -- revisit once usage
# patterns are clearer.
_MAX_CHUNK_SIZE = 8 * 1024 * 1024 * 2
Copy link
Member

@bjlittle bjlittle Jan 20, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pp-mo Difficult to know what this should default to when it's unknown (at this point) what type of operation is going to be performed, as the choice of chunking should really be aligned with the expected operation/use in order to be optimal (from what I understand)

Copy link
Member Author

@pp-mo pp-mo Jan 20, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well this is obviously a preliminary.
I propose to "just not worry" about this for now !

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed



def as_lazy_data(data):
    """
    Return a lazy equivalent of the argument, as a lazy array.

    An existing dask array is returned unchanged.  Anything else is
    wrapped with dask.array.from_array, which assumes the underlying
    object has numpy-array-like properties.
    """
    # NOTE: some doubt remains over which forms of indexing are valid on
    # the wrapped object.  Call an integer, slice, ellipsis or new-axis
    # object a "simple" index, and a list, tuple or array of integers a
    # "compound" one (although a length-1 tuple, list or array might
    # count as "simple"?).  With at most one compound index, all
    # interpretations should deliver the same result; with more than one
    # there is potential for trouble.
    # NOTE#2: cube indexing processes the indices, which may also be
    # relevant.
    if is_lazy_data(data):
        return data
    return da.from_array(data, chunks=_MAX_CHUNK_SIZE)
39 changes: 21 additions & 18 deletions lib/iris/cube.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# (C) British Crown Copyright 2010 - 2016, Met Office
# (C) British Crown Copyright 2010 - 2017, Met Office
#
# This file is part of Iris.
#
Expand Down Expand Up @@ -46,6 +46,7 @@
import iris.coords
import iris._concatenate
import iris._constraints
from iris._lazy_data import is_lazy_data, as_lazy_data, as_concrete_data
import iris._merge
import iris.exceptions
import iris.util
Expand Down Expand Up @@ -713,7 +714,7 @@ def __init__(self, data, standard_name=None, long_name=None,
if isinstance(data, six.string_types):
raise TypeError('Invalid data type: {!r}.'.format(data))

if not isinstance(data, (biggus.Array, ma.MaskedArray)):
if not is_lazy_data(data):
data = np.asarray(data)
self._my_data = data

Expand Down Expand Up @@ -1606,32 +1607,33 @@ def ndim(self):

def lazy_data(self, array=None):
"""
Return a :class:`biggus.Array` representing the
multi-dimensional data of the Cube, and optionally provide a
new array of values.
Return a lazy array representing the Cube data.

Optionally, provide a new lazy array to assign as the cube data.
This must also be a lazy array, according to
:meth:`iris._lazy_data.is_lazy_data`.

Accessing this method will never cause the data to be loaded.
Similarly, calling methods on, or indexing, the returned Array
will not cause the Cube to have loaded data.

If the data have already been loaded for the Cube, the returned
Array will be a :class:`biggus.NumpyArrayAdapter` which wraps
the numpy array from `self.data`.
Array will be a lazy array wrapper, generated by a call to
:meth:`iris._lazy_data.as_lazy_data`.

Kwargs:

* array (:class:`biggus.Array` or None):
* array (lazy array or None):
When this is not None it sets the multi-dimensional data of
the cube to the given value.

Returns:
A :class:`biggus.Array` representing the multi-dimensional
data of the Cube.
A lazy array, representing the Cube data array.

"""
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pp-mo Should we care about updating the doc-string at this point ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I might as well...

if array is not None:
if not isinstance(array, biggus.Array):
raise TypeError('new values must be a biggus.Array')
if not is_lazy_data(array):
raise TypeError('new values must be a lazy array')
if self.shape != array.shape:
# The _ONLY_ data reshape permitted is converting a
# 0-dimensional array into a 1-dimensional array of
Expand All @@ -1643,8 +1645,8 @@ def lazy_data(self, array=None):
self._my_data = array
else:
array = self._my_data
if not isinstance(array, biggus.Array):
array = biggus.NumpyArrayAdapter(array)
if not is_lazy_data(array):
array = as_lazy_data(array)
return array

@property
Expand Down Expand Up @@ -1681,9 +1683,9 @@ def data(self):

"""
data = self._my_data
if not isinstance(data, np.ndarray):
if is_lazy_data(data):
try:
data = data.masked_array()
data = as_concrete_data(data)
except MemoryError:
msg = "Failed to create the cube's data as there was not" \
" enough memory available.\n" \
Expand All @@ -1694,7 +1696,8 @@ def data(self):
msg = msg.format(self.shape, data.dtype)
raise MemoryError(msg)
# Unmask the array only if it is filled.
if isinstance(data, np.ndarray) and ma.count_masked(data) == 0:
if (isinstance(data, np.ma.masked_array) and
ma.count_masked(data) == 0):
Copy link
Member

@bjlittle bjlittle Jan 20, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pp-mo It's not possible to put a masked array into a dask.array.Array (well, certainly one that does have its mask set) ... so is this really still valid in the new dask world? Or am I missing something here ...

Are you imagining that a user has a non-masked masked array wrapped up in a dask.array.Array ?

Copy link
Member Author

@pp-mo pp-mo Jan 20, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At the moment I'm just totally ignoring the masked issue.
This code is effectively a dead branch with the latest changes, as as_concrete_data will never return a masked result.
But this will need fixing later, so I left it in.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As discussed at length 😄

data = data.data
# data may be a numeric type, so ensure an np.ndarray is returned
self._my_data = np.asanyarray(data)
Expand All @@ -1715,7 +1718,7 @@ def data(self, value):
self._my_data = data

def has_lazy_data(self):
return isinstance(self._my_data, biggus.Array)
return is_lazy_data(self._my_data)

@property
def dim_coords(self):
Expand Down
25 changes: 10 additions & 15 deletions lib/iris/fileformats/pp.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# (C) British Crown Copyright 2010 - 2016, Met Office
# (C) British Crown Copyright 2010 - 2017, Met Office
#
# This file is part of Iris.
#
Expand Down Expand Up @@ -33,7 +33,6 @@
import struct
import warnings

import biggus
import cf_units
import numpy as np
import numpy.ma as ma
Expand All @@ -44,6 +43,7 @@
import iris.fileformats.rules
import iris.fileformats.pp_rules
import iris.coord_systems
from iris._lazy_data import is_lazy_data, as_concrete_data, as_lazy_data

try:
import mo_pack
Expand Down Expand Up @@ -1286,11 +1286,10 @@ def data(self):

"""
# Cache the real data on first use
if isinstance(self._data, biggus.Array):
data = self._data.masked_array()
if ma.count_masked(data) == 0:
data = data.data
self._data = data
# N.B. this throws away the original lazy object.
if is_lazy_data(self._data):
# Get the data as a numpy array.
self._data = as_concrete_data(self._data)
return self._data

@data.setter
Expand Down Expand Up @@ -1642,12 +1641,8 @@ def __eq__(self, other):
for attr in self.__slots__:
attrs = [hasattr(self, attr), hasattr(other, attr)]
if all(attrs):
self_attr = getattr(self, attr)
other_attr = getattr(other, attr)
if isinstance(self_attr, biggus.NumpyArrayAdapter):
self_attr = self_attr.concrete
if isinstance(other_attr, biggus.NumpyArrayAdapter):
other_attr = other_attr.concrete
self_attr = as_concrete_data(getattr(self, attr))
other_attr = as_concrete_data(getattr(other, attr))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm intrigued you have to concrete each attr to compare it - can dask do lazy object comparisons? Is such a thing even possible??

Copy link
Member Author

@pp-mo pp-mo Jan 19, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It certainly is possible : (self_attr == other_attr) would be a lazy object, i.e. it hasn't looked at the data yet.
(and you can further process that, index it, etc, all still being lazy).

So, I think we really want this code to be explicit that it is realising the content here.
In fact, I fully expected that you would always be required to call 'compute' + nothing else will do it (as with biggus),
however it seems that applying np.all() will realise it anyway (and so will do the compare).
In my view that is not actually nice, and may even be a bug, as it's not documented anywhere -- see comment on #2308

Copy link
Member

@bjlittle bjlittle Jan 20, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@dkillick Yup, I can confirm that dask maintains the laziness when comparing lazy objects; it just generates another lazy dask graph, which you need to realize to get the answer. So the following gives a lazy result:

result = getattr(self, attr) == getattr(other, attr)

And the answer is realized with, for example:

>>> result.compute()
array([ True,  True,  True,  True,  True,  True,  True,  True,  True,  True], dtype=bool)

Or indeed, as @pp-mo suggests, the lazy dask result is made concrete by np.all:

>>> np.all(result)
True

... which is all good to know, and pretty darn cool!

if not np.all(self_attr == other_attr):
result = False
break
Expand Down Expand Up @@ -1866,7 +1861,7 @@ def _interpret_fields(fields):
def _create_field_data(field, data_shape, land_mask):
"""
Modifies a field's ``_data`` attribute either by:
* converting DeferredArrayBytes into a biggus array,
* converting DeferredArrayBytes into a lazy array,
* converting LoadedArrayBytes into an actual numpy array.

"""
Expand All @@ -1887,7 +1882,7 @@ def _create_field_data(field, data_shape, land_mask):
field.raw_lbpack,
field.boundary_packing,
field.bmdi, land_mask)
field._data = biggus.NumpyArrayAdapter(proxy)
field._data = as_lazy_data(proxy)


def _field_gen(filename, read_data_bytes, little_ended=False):
Expand Down
26 changes: 26 additions & 0 deletions lib/iris/tests/integration/temp_dask/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# (C) British Crown Copyright 2017, Met Office
#
# This file is part of Iris.
#
# Iris is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the
# Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Iris is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Iris. If not, see <http://www.gnu.org/licenses/>.
"""
Temporary integration tests, specific to replacement of biggus with dask.

Note: some content here may eventually move into main tests.
Keep it here for now, so we can easily test all dask code with :
python -m unittest discover -v lib/iris/tests/integration/temp_dask

"""
from __future__ import (absolute_import, division, print_function)
from six.moves import (filter, input, map, range, zip) # noqa
79 changes: 79 additions & 0 deletions lib/iris/tests/integration/temp_dask/test_lazy_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# (C) British Crown Copyright 2017, Met Office
#
# This file is part of Iris.
#
# Iris is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the
# Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Iris is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Iris. If not, see <http://www.gnu.org/licenses/>.
"""
Test lazy data utility functions.

Note: really belongs in "tests/unit/lazy_data".

"""
from __future__ import (absolute_import, division, print_function)
from six.moves import (filter, input, map, range, zip) # noqa

# Import iris.tests first so that some things can be initialised before
# importing anything else.
import iris.tests as tests


import numpy as np
import dask.array as da


from iris._lazy_data import is_lazy_data, as_lazy_data, as_concrete_data


class MixinLazyTestData(object):
    """Mixin providing matched real (numpy) and lazy (dask) test arrays."""

    def setUp(self):
        # A plain numpy array, for the non-lazy test cases.
        self.real_array = np.arange(2 * 3 * 4).reshape(2, 3, 4)
        # A distinct numpy array, plus a dask wrapper of it, for lazy cases.
        self.lazy_values = np.arange(2 * 5 * 3).reshape(2, 5, 3)
        self.lazy_array = da.from_array(self.lazy_values, 1e6)


class Test_is_lazy_data(MixinLazyTestData, tests.IrisTest):
    def test_real(self):
        # A plain numpy array is not recognised as lazy.
        self.assertFalse(is_lazy_data(self.real_array))

    def test_lazy(self):
        # A dask array is recognised as lazy.
        self.assertTrue(is_lazy_data(self.lazy_array))


class Test_as_lazy_data(MixinLazyTestData, tests.IrisTest):
    def test_lazy(self):
        # An existing lazy array passes through as the identical object.
        wrapped = as_lazy_data(self.lazy_array)
        self.assertTrue(is_lazy_data(wrapped))
        self.assertIs(wrapped, self.lazy_array)

    def test_real(self):
        # A real array is wrapped into a lazy one with equivalent content.
        wrapped = as_lazy_data(self.real_array)
        self.assertTrue(is_lazy_data(wrapped))
        self.assertArrayAllClose(as_concrete_data(wrapped), self.real_array)


class Test_as_concrete_data(MixinLazyTestData, tests.IrisTest):
    def test_lazy(self):
        # A lazy array is realised into its (non-lazy) concrete values.
        realised = as_concrete_data(self.lazy_array)
        self.assertFalse(is_lazy_data(realised))
        self.assertArrayAllClose(realised, self.lazy_values)

    def test_real(self):
        # A real array passes through as the identical object.
        realised = as_concrete_data(self.real_array)
        self.assertFalse(is_lazy_data(realised))
        self.assertIs(realised, self.real_array)


if __name__ == '__main__':
    # Support running this test module directly.
    tests.main()
Loading