From 7cf6bdef637ec5871c6e297822a7153f15e07e2e Mon Sep 17 00:00:00 2001 From: Corinne Bosley Date: Wed, 24 May 2017 14:58:40 +0100 Subject: [PATCH 01/15] first draft of first section --- .../iris/src/userguide/real_and_lazy_data.rst | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 docs/iris/src/userguide/real_and_lazy_data.rst diff --git a/docs/iris/src/userguide/real_and_lazy_data.rst b/docs/iris/src/userguide/real_and_lazy_data.rst new file mode 100644 index 0000000000..28add51bdc --- /dev/null +++ b/docs/iris/src/userguide/real_and_lazy_data.rst @@ -0,0 +1,76 @@ +.. _real_and_lazy_data: + +================== +Real and Lazy Data +================== + +What is Real and Lazy Data? +--------------------------- + +Every Iris cube contains an n-dimensional data array, which could be real or +lazy. + +Real data is contained in an array which has a shape, a data type, some other +useful information and many data points, each of which use up a small +allocation of memory. This generally takes the form of a numpy array. + +Lazy data is contained in a conceptual array which retains the information +about its real counterpart but has no actual data points, so its memory +allocation is much smaller. This will be in the form of a dask array. + +Arrays in Iris can be converted flexibly(?) between their real and lazy states, +although there are some limits to this process. The advantage of using lazy +data is that it has a small memory footprint, so certain operations +(such as...?) can be much faster. However, in order to perform other +operations (such as calculations on actual data values) the real data must be +realized. + +* When a cube is loaded, are the data arrays always lazy to begin with? * + +You can check whether the data array on your cube is lazy using the Iris +function 'has_lazy_data'. 
For example: + +>>> import iris +>>> filename = iris.sample_data_path('uk_hires.pp') +>>> cube = iris.load_cube(filename, 'air_potential_temperature') +>>> cube.has_lazy_data() +True +>>> _ = cube.data +>>> cube.has_lazy_data() +False + +Assigning the data array to a variable causes the real data to be realized, at +which point the array ceases to be lazy. Any action which requires the use of +actual data values (such as cube maths) will have this effect, although data +realization is always deferred until the last possible moment: + +>>> my_cube = iris.load_cube(iris.sample_data_path('rotated_pole.nc')) +>>> my_cube.has_lazy_data() +True +>>> my_cube += 5 +>>> my_cube.has_lazy_data() +True +>>> my_cube.data +>>> my_cube.has_lazy_data() +False + +Core data refers to the current state of the cube's data, be it real or +lazy. This can be used if you wish to refer to the data array but are +indifferent to its current state. If the cube's data is lazy, it will not be +realized when you reference the core data attribute (?): + +>>> cube = iris.load_cube(iris.sample_data_path('air_temp.pp')) +>>> cube.has_lazy_data() +True +>>> the_data = cube.core_data +>>> cube.has_lazy_data() +True +>>> real_data = cube.data +>>> cube.has_lazy_data() +False + + +Changing a Cube's Data +---------------------- + + From 473e8ea1620de64fc3dbdb948536bbea7b67b694 Mon Sep 17 00:00:00 2001 From: Corinne Bosley Date: Wed, 24 May 2017 16:04:28 +0100 Subject: [PATCH 02/15] first draft of second section --- .../iris/src/userguide/real_and_lazy_data.rst | 131 ++++++++++++++---- 1 file changed, 105 insertions(+), 26 deletions(-) diff --git a/docs/iris/src/userguide/real_and_lazy_data.rst b/docs/iris/src/userguide/real_and_lazy_data.rst index 28add51bdc..7e1b2e9007 100644 --- a/docs/iris/src/userguide/real_and_lazy_data.rst +++ b/docs/iris/src/userguide/real_and_lazy_data.rst @@ -30,47 +30,126 @@ realized. 
You can check whether the data array on your cube is lazy using the Iris function 'has_lazy_data'. For example: ->>> import iris ->>> filename = iris.sample_data_path('uk_hires.pp') ->>> cube = iris.load_cube(filename, 'air_potential_temperature') ->>> cube.has_lazy_data() -True ->>> _ = cube.data ->>> cube.has_lazy_data() -False +.. doctest:: + + >>> import iris + >>> cube = iris.load_cube(filename, 'air_temp.pp') + >>> cube.has_lazy_data() + True + >>> _ = cube.data + >>> cube.has_lazy_data() + False Assigning the data array to a variable causes the real data to be realized, at which point the array ceases to be lazy. Any action which requires the use of actual data values (such as cube maths) will have this effect, although data realization is always deferred until the last possible moment: ->>> my_cube = iris.load_cube(iris.sample_data_path('rotated_pole.nc')) ->>> my_cube.has_lazy_data() -True ->>> my_cube += 5 ->>> my_cube.has_lazy_data() -True ->>> my_cube.data ->>> my_cube.has_lazy_data() -False +.. doctest:: + + >>> cube = iris.load_cube(iris.sample_data_path('air_temp.pp')) + >>> cube.has_lazy_data() + True + >>> cube += 5 + >>> cube.has_lazy_data() + True + >>> cube.data + >>> cube.has_lazy_data() + False + +You can also convert realized data back into a lazy array: + +.. doctest:: + + >>> cube.has_lazy_data() + False + >>> cube.data = cube.lazy_data() + >>> cube.has_lazy_data() + True + + Core data refers to the current state of the cube's data, be it real or lazy. This can be used if you wish to refer to the data array but are indifferent to its current state. If the cube's data is lazy, it will not be realized when you reference the core data attribute (?): ->>> cube = iris.load_cube(iris.sample_data_path('air_temp.pp')) ->>> cube.has_lazy_data() -True ->>> the_data = cube.core_data ->>> cube.has_lazy_data() -True ->>> real_data = cube.data ->>> cube.has_lazy_data() -False +.. 
doctest:: + + >>> cube = iris.load_cube(iris.sample_data_path('air_temp.pp')) + >>> cube.has_lazy_data() + True + >>> the_data = cube.core_data + >>> cube.has_lazy_data() + True + >>> real_data = cube.data + >>> cube.has_lazy_data() + False Changing a Cube's Data ---------------------- +There are several methods of modifying a cube's data array, each one subtly +different from the others. + +Maths +^^^^^ + +You can use :ref:`cube maths ` to make in-place modifications to +each point in a cube's existing data array. Provided you do not directly +reference the cube's data, the array will remain lazy: + +.. doctest:: + + >>> cube = iris.load_cube(iris.sample_data_path('air_temp.pp')) + >>> cube.has_lazy_data() + True + >>> cube *= 10 + >>> cube.has_lazy_data() + True + +Copy +^^^^ + +You can copy a cube and assign a completely new data array to the copy. All the +original cube's metadata will be the same as the new cube's metadata. However, +the new cube's data array will not be lazy if you replace it with a real array: + +.. doctest:: + + >>> import numpy as np + >>> data = np.zeros((73, 96)) + >>> new_cube = cube.copy(data=data) + >>> new_cube.has_lazy_data() + False + +Replace +^^^^^^^ + +This does essentially the same thing as `cube.copy()`, except that it provides +a safe method of doing so for the specific edge case of a lazy masked integer +array: + +.. doctest:: + + >>> values = np.zeros((73, 96), dtype=int) + >>> data =np.ma.masked_values(values, 0) + >>> print(data) + [[-- -- -- ..., -- -- --] + [-- -- -- ..., -- -- --] + [-- -- -- ..., -- -- --] + ..., + [-- -- -- ..., -- -- --] + [-- -- -- ..., -- -- --] + [-- -- -- ..., -- -- --]] + >>> new_cube = cube.copy(data=data) + >>> new_cube.has_lazy_data() + False + >>> new_cube.data = new_cube.lazy_data() + >>> new_cube.has_lazy_data() + True + +This method is necessary as dask is currently unable to handle masked arrays. +Please refer to the Whitepaper for further details. 
From df96ea0e9b29bd82f4354bf07150332eeb331cdb Mon Sep 17 00:00:00 2001 From: Corinne Bosley Date: Thu, 25 May 2017 11:05:39 +0100 Subject: [PATCH 03/15] another section added --- .../iris/src/userguide/real_and_lazy_data.rst | 53 ++++++++++++++++++- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/docs/iris/src/userguide/real_and_lazy_data.rst b/docs/iris/src/userguide/real_and_lazy_data.rst index 7e1b2e9007..d5afd827be 100644 --- a/docs/iris/src/userguide/real_and_lazy_data.rst +++ b/docs/iris/src/userguide/real_and_lazy_data.rst @@ -16,7 +16,7 @@ allocation of memory. This generally takes the form of a numpy array. Lazy data is contained in a conceptual array which retains the information about its real counterpart but has no actual data points, so its memory -allocation is much smaller. This will be in the form of a dask array. +allocation is much smaller. This will be in the form of a Dask array. Arrays in Iris can be converted flexibly(?) between their real and lazy states, although there are some limits to this process. The advantage of using lazy @@ -150,6 +150,55 @@ array: >>> new_cube.has_lazy_data() True -This method is necessary as dask is currently unable to handle masked arrays. +This method is necessary as Dask is currently unable to handle masked arrays. Please refer to the Whitepaper for further details. + +Dask Processing Options +----------------------- + +As well as Dask offering the benefit of a smaller memory footprint through the +handling of lazy arrays, it can significantly speed up performance by allowing +Iris to use multiprocessing. + +There are some default values which are set by Dask and passed through to Iris. +If you wish to change these options, you can override them globally or using a +context manager. 
+ +Here are some examples of the options that you may wish to change: + +You can set the number of threads on which to work like this: + + >>> from multiprocessing.pool import ThreadPool + >>> with dask.set_options(pool=ThreadPool(4)): + ... x.compute() + +Multiple threads work well with heavy computation. + + +You can change the default option between threaded scheduler and +multiprocessing scheduler, for example: + + >>> with dask.set_options(get=dask.multiprocessing.get): + ... x.sum().compute() + +Multiprocessing works well with strings, lists or custom Dask objects. + + +You can choose to run all processes in serial (which is currently the Iris +default): + + >>> dask.set_options(get=dask.get) + +This option is particularly good for debugging scripts. + + +Further Reading +--------------- + + + +Stuff still to add (?): +- Links to dask docs and distributed docs + + From 2be161be909f153c1f9a36fef33c1af17e5ddd67 Mon Sep 17 00:00:00 2001 From: Corinne Bosley Date: Thu, 25 May 2017 15:28:55 +0100 Subject: [PATCH 04/15] first draft of new page of user guide --- .../iris/src/userguide/real_and_lazy_data.rst | 25 +++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/docs/iris/src/userguide/real_and_lazy_data.rst b/docs/iris/src/userguide/real_and_lazy_data.rst index d5afd827be..6a0af1606c 100644 --- a/docs/iris/src/userguide/real_and_lazy_data.rst +++ b/docs/iris/src/userguide/real_and_lazy_data.rst @@ -169,6 +169,9 @@ Here are some examples of the options that you may wish to change: You can set the number of threads on which to work like this: +.. doctest:: + + >>> import dask >>> from multiprocessing.pool import ThreadPool >>> with dask.set_options(pool=ThreadPool(4)): ... x.compute() @@ -179,6 +182,8 @@ Multiple threads work well with heavy computation. You can change the default option between threaded scheduler and multiprocessing scheduler, for example: +.. doctest:: + >>> with dask.set_options(get=dask.multiprocessing.get): ... 
x.sum().compute() @@ -188,6 +193,8 @@ Multiprocessing works well with strings, lists or custom Dask objects. You can choose to run all processes in serial (which is currently the Iris default): +.. doctest:: + >>> dask.set_options(get=dask.get) This option is particularly good for debugging scripts. @@ -196,9 +203,23 @@ This option is particularly good for debugging scripts. Further Reading --------------- +Dask offers much more fine control than is described in this user guide, +although a good understanding of the package would be required to properly +utilize it. + +For example, it is possible to write callback functions to customize processing +options, of which there are many more than we have outlined. Also, you may +wish to use some of the available Dask functionality regarding deferred +operations for your own scripts and objects. + +For more information about these tools, how they work and what you can do with +them, please visit the following package documentation pages: + +.. _Dask: http://dask.pydata.org/en/latest/ +.. _Dask.distributed: http://distributed.readthedocs.io/en/latest/ +`Dask`_ +`Dask.distributed`_ -Stuff still to add (?): -- Links to dask docs and distributed docs From 1414f28b4cf563bd801bcefe8ce12eb32f79c146 Mon Sep 17 00:00:00 2001 From: Corinne Bosley Date: Fri, 26 May 2017 10:39:37 +0100 Subject: [PATCH 05/15] first revision of user guide --- .../iris/src/userguide/real_and_lazy_data.rst | 77 ++++++++++++------- 1 file changed, 49 insertions(+), 28 deletions(-) diff --git a/docs/iris/src/userguide/real_and_lazy_data.rst b/docs/iris/src/userguide/real_and_lazy_data.rst index 6a0af1606c..cef6b88020 100644 --- a/docs/iris/src/userguide/real_and_lazy_data.rst +++ b/docs/iris/src/userguide/real_and_lazy_data.rst @@ -7,25 +7,30 @@ Real and Lazy Data What is Real and Lazy Data? 
--------------------------- -Every Iris cube contains an n-dimensional data array, which could be real or +Every Iris cube contains an n-dimensional data array, which could be real or lazy. -Real data is contained in an array which has a shape, a data type, some other -useful information and many data points, each of which use up a small -allocation of memory. This generally takes the form of a numpy array. +Real data is contained in a NumPy array which has a shape, a data type, some +other useful information and many data points, each of which use up a small +allocation of memory. Lazy data is contained in a conceptual array which retains the information about its real counterpart but has no actual data points, so its memory allocation is much smaller. This will be in the form of a Dask array. -Arrays in Iris can be converted flexibly(?) between their real and lazy states, -although there are some limits to this process. The advantage of using lazy -data is that it has a small memory footprint, so certain operations -(such as...?) can be much faster. However, in order to perform other -operations (such as calculations on actual data values) the real data must be -realized. +Arrays in Iris can be converted between their real and lazy states, +although there are some limits to this process (*explain this better - what +limits?*). -* When a cube is loaded, are the data arrays always lazy to begin with? * +The advantage of using lazy data is that it has a small memory footprint which +enables the user to load and manipulate datasets that you would otherwise not +be able to fit into memory. + +However, in order to execute certain operations (such as calculations on actual +data values) the real data must be realized. Using Dask, the operation will +be deferred until you request the result, at which point it will be executed +using Dask's parallel processing schedulers. The combination of these two +behaviours can offer a significant performance boost. 
You can check whether the data array on your cube is lazy using the Iris function 'has_lazy_data'. For example: @@ -36,14 +41,20 @@ function 'has_lazy_data'. For example: >>> cube = iris.load_cube(filename, 'air_temp.pp') >>> cube.has_lazy_data() True - >>> _ = cube.data + >>> cube.data >>> cube.has_lazy_data() False -Assigning the data array to a variable causes the real data to be realized, at -which point the array ceases to be lazy. Any action which requires the use of -actual data values (such as cube maths) will have this effect, although data -realization is always deferred until the last possible moment: +When does my Data Become Real? +------------------------------ + +If the data on your cube is in its lazy state, it will only become real if you +'touch' the data. This means any way of directly accessing the data, such as +assigning it to a variable or simply using 'cube.data' as in the example above. + +Any action which requires the use of actual data values (such as cube maths) +will also cause the data to be loaded into memory, although data realization +is always deferred until the result is requested: .. doctest:: @@ -67,8 +78,6 @@ You can also convert realized data back into a lazy array: >>> cube.has_lazy_data() True - - Core data refers to the current state of the cube's data, be it real or lazy. This can be used if you wish to refer to the data array but are indifferent to its current state. If the cube's data is lazy, it will not be @@ -79,7 +88,7 @@ realized when you reference the core data attribute (?): >>> cube = iris.load_cube(iris.sample_data_path('air_temp.pp')) >>> cube.has_lazy_data() True - >>> the_data = cube.core_data + >>> the_data = cube.core_data() >>> cube.has_lazy_data() True >>> real_data = cube.data @@ -154,18 +163,30 @@ This method is necessary as Dask is currently unable to handle masked arrays. Please refer to the Whitepaper for further details. 
+Coordinate Arrays +----------------- + +Cubes possess coordinate arrays as well as data arrays, so these also benefit +from Dask's functionality, although there are some distinctions between how +the different coordinate types are treated. + +Auxiliary coordinates can now contain lazy arrays, so they will adhere to the +same rules and behaviour as the data arrays. Dimension coordinates, however, +undergo monotonicity checks which cause the arrays to be realized upon +construction, so they can only contain real arrays. + + Dask Processing Options ----------------------- +Dask applies some default values to certain aspects of the parallel processing +that it offers with Iris. It is possible to change these values and override +the defaults by using 'dask.set_options(option)' in your script. -As well as Dask offering the benefit of a smaller memory footprint through the -handling of lazy arrays, it can significantly speed up performance by allowing -Iris to use multiprocessing. - -There are some default values which are set by Dask and passed through to Iris. -If you wish to change these options, you can override them globally or using a -context manager. +You can use this as a global variable if you wish to use your chosen option for +the full length of the script, or you can use it with a context manager to +control the span of the option. -Here are some examples of the options that you may wish to change: +Here are some examples of the options that you may wish to change. You can set the number of threads on which to work like this: @@ -195,7 +216,7 @@ default): .. doctest:: - >>> dask.set_options(get=dask.get) + >>> dask.set_options(get=dask.async.get_sync) This option is particularly good for debugging scripts. 
From aff57395465f9f3e1ddca7796d0638c0839f1acf Mon Sep 17 00:00:00 2001 From: Peter Killick Date: Tue, 30 May 2017 12:46:40 +0100 Subject: [PATCH 06/15] Titles --- docs/iris/src/userguide/real_and_lazy_data.rst | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/iris/src/userguide/real_and_lazy_data.rst b/docs/iris/src/userguide/real_and_lazy_data.rst index cef6b88020..abceff246d 100644 --- a/docs/iris/src/userguide/real_and_lazy_data.rst +++ b/docs/iris/src/userguide/real_and_lazy_data.rst @@ -4,7 +4,7 @@ Real and Lazy Data ================== -What is Real and Lazy Data? +What is real and lazy data? --------------------------- Every Iris cube contains an n-dimensional data array, which could be real or @@ -45,7 +45,7 @@ function 'has_lazy_data'. For example: >>> cube.has_lazy_data() False -When does my Data Become Real? +When does my data become real? ------------------------------ If the data on your cube is in its lazy state, it will only become real if you @@ -96,7 +96,7 @@ realized when you reference the core data attribute (?): False -Changing a Cube's Data +Changing a cube's data ---------------------- There are several methods of modifying a cube's data array, each one subtly @@ -163,8 +163,8 @@ This method is necessary as Dask is currently unable to handle masked arrays. Please refer to the Whitepaper for further details. -Coordinate Arrays ------------------ +Coordinates +----------- Cubes possess coordinate arrays as well as data arrays, so these also benefit from Dask's functionality, although there are some distinctions between how @@ -176,8 +176,9 @@ undergo monotonicity checks which cause the arrays to be realized upon construction, so they can only contain real arrays. -Dask Processing Options +Dask processing options ----------------------- + Dask applies some default values to certain aspects of the parallel processing that it offers with Iris. 
It is possible to change these values and override the defaults by using 'dask.set_options(option)' in your script. @@ -221,7 +222,7 @@ default): This option is particularly good for debugging scripts. -Further Reading +Further reading --------------- Dask offers much more fine control than is described in this user guide, From 8ae778523e05fb28bba02ec5980b0c42d62c26e5 Mon Sep 17 00:00:00 2001 From: Peter Killick Date: Tue, 30 May 2017 12:50:17 +0100 Subject: [PATCH 07/15] Code blocks --- .../iris/src/userguide/real_and_lazy_data.rst | 49 +++++++------------ 1 file changed, 18 insertions(+), 31 deletions(-) diff --git a/docs/iris/src/userguide/real_and_lazy_data.rst b/docs/iris/src/userguide/real_and_lazy_data.rst index abceff246d..63e9cf25c6 100644 --- a/docs/iris/src/userguide/real_and_lazy_data.rst +++ b/docs/iris/src/userguide/real_and_lazy_data.rst @@ -1,5 +1,13 @@ .. _real_and_lazy_data: + +.. testsetup:: * + + import dask.array as da + import iris + import numpy as np + + ================== Real and Lazy Data ================== @@ -33,11 +41,8 @@ using Dask's parallel processing schedulers. The combination of these two behaviours can offer a significant performance boost. You can check whether the data array on your cube is lazy using the Iris -function 'has_lazy_data'. For example: +function 'has_lazy_data'. For example:: -.. doctest:: - - >>> import iris >>> cube = iris.load_cube(filename, 'air_temp.pp') >>> cube.has_lazy_data() True @@ -54,9 +59,7 @@ assigning it to a variable or simply using 'cube.data' as in the example above. Any action which requires the use of actual data values (such as cube maths) will also cause the data to be loaded into memory, although data realization -is always deferred until the result is requested: - -.. 
doctest:: +is always deferred until the result is requested:: >>> cube = iris.load_cube(iris.sample_data_path('air_temp.pp')) >>> cube.has_lazy_data() @@ -68,9 +71,7 @@ is always deferred until the result is requested: >>> cube.has_lazy_data() False -You can also convert realized data back into a lazy array: - -.. doctest:: +You can also convert realized data back into a lazy array:: >>> cube.has_lazy_data() False @@ -81,9 +82,7 @@ You can also convert realized data back into a lazy array: Core data refers to the current state of the cube's data, be it real or lazy. This can be used if you wish to refer to the data array but are indifferent to its current state. If the cube's data is lazy, it will not be -realized when you reference the core data attribute (?): - -.. doctest:: +realized when you reference the core data attribute (?):: >>> cube = iris.load_cube(iris.sample_data_path('air_temp.pp')) >>> cube.has_lazy_data() @@ -107,9 +106,7 @@ Maths You can use :ref:`cube maths ` to make in-place modifications to each point in a cube's existing data array. Provided you do not directly -reference the cube's data, the array will remain lazy: - -.. doctest:: +reference the cube's data, the array will remain lazy:: >>> cube = iris.load_cube(iris.sample_data_path('air_temp.pp')) >>> cube.has_lazy_data() @@ -123,9 +120,7 @@ Copy You can copy a cube and assign a completely new data array to the copy. All the original cube's metadata will be the same as the new cube's metadata. However, -the new cube's data array will not be lazy if you replace it with a real array: - -.. doctest:: +the new cube's data array will not be lazy if you replace it with a real array:: >>> import numpy as np >>> data = np.zeros((73, 96)) @@ -138,9 +133,7 @@ Replace This does essentially the same thing as `cube.copy()`, except that it provides a safe method of doing so for the specific edge case of a lazy masked integer -array: - -.. 
doctest:: +array:: >>> values = np.zeros((73, 96), dtype=int) >>> data =np.ma.masked_values(values, 0) @@ -189,9 +182,7 @@ control the span of the option. Here are some examples of the options that you may wish to change. -You can set the number of threads on which to work like this: - -.. doctest:: +You can set the number of threads on which to work like this:: >>> import dask >>> from multiprocessing.pool import ThreadPool @@ -202,9 +193,7 @@ Multiple threads work well with heavy computation. You can change the default option between threaded scheduler and -multiprocessing scheduler, for example: - -.. doctest:: +multiprocessing scheduler, for example:: >>> with dask.set_options(get=dask.multiprocessing.get): ... x.sum().compute() @@ -213,9 +202,7 @@ Multiprocessing works well with strings, lists or custom Dask objects. You can choose to run all processes in serial (which is currently the Iris -default): - -.. doctest:: +default):: >>> dask.set_options(get=dask.async.get_sync) From 0c35413d689990d481f631c9c6f4ba968dbb82bb Mon Sep 17 00:00:00 2001 From: Peter Killick Date: Wed, 31 May 2017 11:47:29 +0100 Subject: [PATCH 08/15] A significant rewrite --- .../iris/src/userguide/real_and_lazy_data.rst | 281 ++++++++---------- 1 file changed, 129 insertions(+), 152 deletions(-) diff --git a/docs/iris/src/userguide/real_and_lazy_data.rst b/docs/iris/src/userguide/real_and_lazy_data.rst index 63e9cf25c6..9069447256 100644 --- a/docs/iris/src/userguide/real_and_lazy_data.rst +++ b/docs/iris/src/userguide/real_and_lazy_data.rst @@ -12,36 +12,40 @@ Real and Lazy Data ================== -What is real and lazy data? ---------------------------- +We have seen in the :doc:`user_guide_introduction` section of the user guide that +Iris cubes contain data and metadata about a phenomenon. The data attribute of a +cube contains the actual numerical values recorded for the phenomenon the cube describes. 
+The data element of a cube is always an array, but the array may be either +"real" or "lazy". -Every Iris cube contains an n-dimensional data array, which could be real or -lazy. +In this section of the user guide we will look specifically at the concepts of +real and lazy data as they apply to the cube and other data structures in Iris. -Real data is contained in a NumPy array which has a shape, a data type, some -other useful information and many data points, each of which use up a small -allocation of memory. -Lazy data is contained in a conceptual array which retains the information -about its real counterpart but has no actual data points, so its memory -allocation is much smaller. This will be in the form of a Dask array. +What is real and lazy data? +--------------------------- -Arrays in Iris can be converted between their real and lazy states, -although there are some limits to this process (*explain this better - what -limits?*). +In Iris, we use the term **real data** to describe data arrays that are loaded +into memory. Real data is typically provided as a +`NumPy array <https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.html>`_, +which has a shape and data type that are used to describe the array's data points. +Each data point takes up a small amount of memory, which means large NumPy arrays can +take up a large amount of memory. -The advantage of using lazy data is that it has a small memory footprint which -enables the user to load and manipulate datasets that you would otherwise not -be able to fit into memory. +Conversely, we use the term **lazy data** to describe data that is not loaded into memory. +(This is sometimes also referred to as **deferred data**.) +In Iris, lazy data is provided as a +`dask array <http://dask.pydata.org/en/latest/array.html>`_. +A dask array also has a shape and data type +but typically the dask array's data points are not loaded into memory. +Instead the data points are stored on disk and only loaded into memory in +small chunks when absolutely necessary. 
-However, in order to execute certain operations (such as calculations on actual -data values) the real data must be realized. Using Dask, the operation will -be deferred until you request the result, at which point it will be executed -using Dask's parallel processing schedulers. The combination of these two -behaviours can offer a significant performance boost. +The primary advantage of using lazy data is that it enables the loading and manipulating +of datasets that would otherwise not fit into memory. -You can check whether the data array on your cube is lazy using the Iris -function 'has_lazy_data'. For example:: +You can check whether a cube has real data or lazy data by using the method +:meth:`~iris.cube.Cube.has_lazy_data`. For example:: >>> cube = iris.load_cube(filename, 'air_temp.pp') >>> cube.has_lazy_data() @@ -53,13 +57,12 @@ function 'has_lazy_data'. For example:: When does my data become real? ------------------------------ -If the data on your cube is in its lazy state, it will only become real if you -'touch' the data. This means any way of directly accessing the data, such as -assigning it to a variable or simply using 'cube.data' as in the example above. - -Any action which requires the use of actual data values (such as cube maths) -will also cause the data to be loaded into memory, although data realization -is always deferred until the result is requested:: +Most operations on data arrays can be run equivalently on both real and lazy data. +If the data array is real then the operation will be run on the data array +immediately with the results becoming available as soon as processing is completed. +If the data array is lazy then the operation will be deferred until you request +the result (such as when you call ``cube.data``). 
In this case the data array will +remain lazy:: >>> cube = iris.load_cube(iris.sample_data_path('air_temp.pp')) >>> cube.has_lazy_data() @@ -67,168 +70,142 @@ is always deferred until the result is requested:: >>> cube += 5 >>> cube.has_lazy_data() True - >>> cube.data - >>> cube.has_lazy_data() - False -You can also convert realized data back into a lazy array:: +This is referred to as **lazy evaluation**. - >>> cube.has_lazy_data() - False - >>> cube.data = cube.lazy_data() - >>> cube.has_lazy_data() - True +Certain operations, including regridding and plotting, can only be run on real data. +Calling such operations on lazy data will automatically realise your lazy data. + +You can also realise (and so load into memory) your cube's lazy data if you 'touch' the data. +This means directly accessing the data by calling ``cube.data``, as in the previous example. + +Core data +^^^^^^^^^ -Core data refers to the current state of the cube's data, be it real or -lazy. This can be used if you wish to refer to the data array but are -indifferent to its current state. If the cube's data is lazy, it will not be -realized when you reference the core data attribute (?):: +Cubes have the concept of "core data". This returns the cube's data in its +current state. If a cube has lazy data, calling the cube's +:meth:`~iris.cube.Cube.core_data` method will return the cube's lazy dask array. +If the cube has real data, calling the cube's +:meth:`~iris.cube.Cube.core_data` method will return the cube's real NumPy array. + +Calling the cube's :meth:`~iris.cube.Cube.core_data` method will not change the +state of the cube's data. Thus, if the cube's data is lazy then calling +:meth:`~iris.cube.Cube.core_data` will return the cube's lazy data and not +realise it. 
+ +For example:: >>> cube = iris.load_cube(iris.sample_data_path('air_temp.pp')) >>> cube.has_lazy_data() True >>> the_data = cube.core_data() + >>> type(the_data) + <class 'dask.array.core.Array'> + >>> cube.has_lazy_data() True - >>> real_data = cube.data + >>> cube.data + >>> the_data = cube.core_data() + >>> type(the_data) + <type 'numpy.ndarray'> + >>> cube.has_lazy_data() False -Changing a cube's data ----------------------- +Coordinates +----------- -There are several methods of modifying a cube's data array, each one subtly -different from the others. +In the same way that Iris cubes contain a data array, Iris coordinates contain +points and bounds arrays. Coordinate points and bounds arrays can also be real or lazy: -Maths -^^^^^ + * A :class:`~iris.coords.DimCoord` will only ever have **real** points and bounds + arrays because of monotonicity checks that realise lazy arrays. + * An :class:`~iris.coords.AuxCoord` can have **real or lazy** points and bounds. + * An :class:`~iris.aux_factory.AuxCoordFactory` (or derived coordinate) + can have **real or lazy** points and bounds. If all of the + :class:`~iris.coords.AuxCoord` instances that the coordinate is derived from have + real points and bounds then the derived coordinate will also have real points + and bounds, otherwise the derived coordinate will have lazy points and bounds. -You can use :ref:`cube maths ` to make in-place modifications to -each point in a cube's existing data array. Provided you do not directly -reference the cube's data, the array will remain lazy:: +Iris cubes and coordinates have very similar interfaces, which extends to accessing +coordinates' lazy points and bounds: - >>> cube = iris.load_cube(iris.sample_data_path('air_temp.pp')) - >>> cube.has_lazy_data() +.. 
doctest:: + + >>> cube = iris.load_cube(iris.sample_data_path('hybrid_height.nc')) + >>> dim_coord = cube.coord('model_level_number') + >>> print dim_coord.has_lazy_points() + False + >>> print dim_coord.has_bounds() + False + >>> print dim_coord.has_lazy_bounds() + False + >>> aux_coord = cube.coord('sigma') + >>> print aux_coord.has_lazy_points() True - >>> cube *= 10 - >>> cube.has_lazy_data() + >>> print aux_coord.has_bounds() True - -Copy -^^^^ - -You can copy a cube and assign a completely new data array to the copy. All the -original cube's metadata will be the same as the new cube's metadata. However, -the new cube's data array will not be lazy if you replace it with a real array:: - - >>> import numpy as np - >>> data = np.zeros((73, 96)) - >>> new_cube = cube.copy(data=data) - >>> new_cube.has_lazy_data() + >>> print aux_coord.has_lazy_bounds() False - -Replace -^^^^^^^ - -This does essentially the same thing as `cube.copy()`, except that it provides -a safe method of doing so for the specific edge case of a lazy masked integer -array:: - - >>> values = np.zeros((73, 96), dtype=int) - >>> data =np.ma.masked_values(values, 0) - >>> print(data) - [[-- -- -- ..., -- -- --] - [-- -- -- ..., -- -- --] - [-- -- -- ..., -- -- --] - ..., - [-- -- -- ..., -- -- --] - [-- -- -- ..., -- -- --] - [-- -- -- ..., -- -- --]] - >>> new_cube = cube.copy(data=data) - >>> new_cube.has_lazy_data() + >>> points = aux_coord.points + >>> print aux_coord.has_lazy_points() False - >>> new_cube.data = new_cube.lazy_data() - >>> new_cube.has_lazy_data() + >>> print derived_coord.has_lazy_points() True + >>> print derived_coord.has_bounds() + True + >>> print derived_coord.has_lazy_bounds() + False -This method is necessary as Dask is currently unable to handle masked arrays. -Please refer to the Whitepaper for further details. 
- - -Coordinates ------------ - -Cubes possess coordinate arrays as well as data arrays, so these also benefit -from Dask's functionality, although there are some distinctions between how -the different coordinate types are treated. +.. note:: + Printing a lazy :class:`~iris.coords.AuxCoord` will realise its points and bounds arrays! -Auxiliary coordinates can now contain lazy arrays, so they will adhere to the -same rules and behaviour as the data arrays. Dimension coordinates, however, -undergo monotonicity checks which cause the arrays to be realized upon -construction, so they can only contain real arrays. +Derived coordinates (also called aux factories) . Dask processing options ----------------------- -Dask applies some default values to certain aspects of the parallel processing -that it offers with Iris. It is possible to change these values and override -the defaults by using 'dask.set_options(option)' in your script. +As stated earlier in this user guide section, Iris uses dask to provide +lazy data arrays for both Iris cubes and coordinates. Iris also uses dask +functionality for processing deferred operations on lazy arrays. -You can use this as a global variable if you wish to use your chosen option for -the full length of the script, or you can use it with a context manager to -control the span of the option. +There are a wide range of dask processing options that can be adjusted to +control how dask processes deferred operations on lazy arrays. You can make use +of these dask processing options to control how lazy arrays within Iris are +processed as well. -Here are some examples of the options that you may wish to change. +Iris by default applies a single dask processing option. This specifies that +all dask processing in Iris should be run in serial (that is, without any +parallel processing enabled). 
-You can set the number of threads on which to work like this:: - - >>> import dask - >>> from multiprocessing.pool import ThreadPool - >>> with dask.set_options(pool=ThreadPool(4)): - ... x.compute() +The dask processing option applied by Iris can be overridden by manually setting +dask processing options for either or both of: -Multiple threads work well with heavy computation. + * the number of parallel workers to use, + * the scheduler to use. +This must be done **before** importing Iris. For example, to specify that dask +processing within Iris should use four workers in a thread pool:: -You can change the default option between threaded scheduler and -multiprocessing scheduler, for example:: - - >>> with dask.set_options(get=dask.multiprocessing.get): - ... x.sum().compute() - -Multiprocessing works well with strings, lists or custom Dask objects. - - -You can choose to run all processes in serial (which is currently the Iris -default):: + >>> from multiprocessing.pool import ThreadPool + >>> import dask + >>> dask.set_options(get=dask.threaded.get, pool=ThreadPool(4)) + + >>> import iris + >>> # Iris processing here... - >>> dask.set_options(get=dask.async.get_sync) +.. note:: + These dask processing options will last for the lifetime of the Python session + and must be re-applied in other or subsequent sessions. -This option is particularly good for debugging scripts. +See the +`dask documentation <http://dask.pydata.org/en/latest/scheduler-overview.html>`_ +for more information on setting dask processing options. Further reading --------------- -Dask offers much more fine control than is described in this user guide, -although a good understanding of the package would be required to properly -utilize it. - -For example, it is possible to write callback functions to customize processing -options, of which there are many more than we have outlined. Also, you may -wish to use some of the available Dask functionality regarding deferred -operations for your own scripts and objects.
- -For more information about these tools, how they work and what you can do with -them, please visit the following package documentation pages: - -.. _Dask: http://dask.pydata.org/en/latest/ -.. _Dask.distributed: http://distributed.readthedocs.io/en/latest/ - -`Dask`_ -`Dask.distributed`_ - - - +This section of the user guide has been designed to give a quick overview of the +key concepts of real and lazy data within Iris. If you require more detail, we +have produced a much more detailed whitepaper on the subject. From 48c9a89ea616eabbef63d94e78bef6a69cd420e5 Mon Sep 17 00:00:00 2001 From: Peter Killick Date: Thu, 1 Jun 2017 09:42:28 +0100 Subject: [PATCH 09/15] Review actions rewrites --- .../iris/src/userguide/real_and_lazy_data.rst | 57 +++++++++---------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/docs/iris/src/userguide/real_and_lazy_data.rst b/docs/iris/src/userguide/real_and_lazy_data.rst index 9069447256..d0af39aeda 100644 --- a/docs/iris/src/userguide/real_and_lazy_data.rst +++ b/docs/iris/src/userguide/real_and_lazy_data.rst @@ -13,10 +13,8 @@ Real and Lazy Data ================== We have seen in the :doc:`user_guide_introduction` section of the user guide that -Iris cubes contain data and metadata about a phenomenon. The data attribute of a -cube contains the actual numerical values recorded for the phenomenon the cube describes. -The data element of a cube is always an array, but the array may be either -"real" or "lazy". +Iris cubes contain data and metadata about a phenomenon. The data element of a cube +is always an array, but the array may be either "real" or "lazy". In this section of the user guide we will look specifically at the concepts of real and lazy data as they apply to the cube and other data structures in Iris. @@ -59,10 +57,9 @@ When does my data become real? Most operations on data arrays can be run equivalently on both real and lazy data. 
If the data array is real then the operation will be run on the data array -immediately with the results becoming available as soon as processing is completed. -If the data array is lazy then the operation will be deferred until you request -the result (such as when you call ``cube.data``). In this case the data array will -remain lazy:: +immediately. The results of the operation will be available as soon as processing is completed. +If the data array is lazy then the operation will be deferred and the data array will +remain lazy until you request the result (such as when you call ``cube.data``):: >>> cube = iris.load_cube(iris.sample_data_path('air_temp.pp')) >>> cube.has_lazy_data() @@ -71,27 +68,27 @@ remain lazy:: >>> cube.has_lazy_data() True -This is referred to as **lazy evaluation**. +The process by which the operation is deferred until the result is requested is +referred to as **lazy evaluation**. Certain operations, including regridding and plotting, can only be run on real data. Calling such operations on lazy data will automatically realise your lazy data. You can also realise (and so load into memory) your cube's lazy if you 'touch' the data. -This means directly accessing the data by calling ``cube.data``, as in the previous example. +To 'touch' the data means directly accessing the data by calling ``cube.data``, +as in the previous example. Core data ^^^^^^^^^ Cubes have the concept of "core data". This returns the cube's data in its -current state. If a cube has lazy data, calling the cube's -:meth:`~iris.cube.Cube.core_data` method will return the cube's lazy dask array. -If the cube has real data, calling the cube's -:meth:`~iris.cube.Cube.core_data` method will return the cube's real NumPy array. +current state: -Calling the cube's :meth:`~iris.cube.Cube.core_data` method will not change the -state of the cube's data. 
Thus, if the cube's data is lazy then calling -:meth:`~iris.cube.Cube.core_data` will return the cube's lazy data and not -realise it. + * If a cube has lazy data, calling the cube's :meth:`~iris.cube.Cube.core_data` method + will return the cube's lazy dask array. Calling the cube's + :meth:`~iris.cube.Cube.core_data` method **will not realise** the cube's data. + * If a cube has real data, calling the cube's :meth:`~iris.cube.Cube.core_data` method + will return the cube's real NumPy array. For example:: @@ -145,7 +142,7 @@ coordinates' lazy points and bounds: >>> print aux_coord.has_bounds() True >>> print aux_coord.has_lazy_bounds() - False + True >>> points = aux_coord.points >>> print aux_coord.has_lazy_points() False @@ -154,13 +151,11 @@ coordinates' lazy points and bounds: >>> print derived_coord.has_bounds() True >>> print derived_coord.has_lazy_bounds() - False + True .. note:: Printing a lazy :class:`~iris.coords.AuxCoord` will realise its points and bounds arrays! -Derived coordinates (also called aux factories) . - Dask processing options ----------------------- @@ -169,10 +164,11 @@ As stated earlier in this user guide section, Iris uses dask to provide lazy data arrays for both Iris cubes and coordinates. Iris also uses dask functionality for processing deferred operations on lazy arrays. -There are a wide range of dask processing options that can be adjusted to -control how dask processes deferred operations on lazy arrays. You can make use -of these dask processing options to control how lazy arrays within Iris are -processed as well. +Dask provides processing options to control how deferred operations on lazy arrays +are computed. This is provided via the ``dask.set_options`` interface. +We can make use of this functionality in Iris. This means we can +control how dask arrays in Iris are processed, for example giving us power to +run Iris processing in parallel. Iris by default applies a single dask processing option. 
This specifies that all dask processing in Iris should be run in serial (that is, without any @@ -198,7 +194,7 @@ processing within Iris should use four workers in a thread pool:: These dask processing options will last for the lifetime of the Python session and must be re-applied in other or subsequent sessions. -See the +Other dask processing options are also available. See the `dask documentation <http://dask.pydata.org/en/latest/scheduler-overview.html>`_ for more information on setting dask processing options. @@ -206,6 +202,7 @@ for more information on setting dask processing options. Further reading --------------- -This section of the user guide has been designed to give a quick overview of the -key concepts of real and lazy data within Iris. If you require more detail, we -have produced a much more detailed whitepaper on the subject. +This section of the user guide is intended as a quick overview of the +key concepts of real and lazy data within Iris. For more detail and more in-depth +discussions on the concepts introduced here and related concepts, see the related +whitepaper. From ef5851fe224c6e6d95b5e50c701cf03158c629f6 Mon Sep 17 00:00:00 2001 From: Peter Killick Date: Fri, 2 Jun 2017 09:12:57 +0100 Subject: [PATCH 10/15] Improve wording --- .../iris/src/userguide/real_and_lazy_data.rst | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/docs/iris/src/userguide/real_and_lazy_data.rst b/docs/iris/src/userguide/real_and_lazy_data.rst index d0af39aeda..05291984cf 100644 --- a/docs/iris/src/userguide/real_and_lazy_data.rst +++ b/docs/iris/src/userguide/real_and_lazy_data.rst @@ -55,6 +55,10 @@ You can check whether a cube has real data or lazy data by using the method When does my data become real? ------------------------------ +When you load a dataset using Iris the data array will almost always initially be +a lazy array. This section details some operations that will realise lazy data +as well as some operations that will maintain lazy data.
+ Most operations on data arrays can be run equivalently on both real and lazy data. If the data array is real then the operation will be run on the data array immediately. The results of the operation will be available as soon as processing is completed. @@ -74,7 +78,7 @@ referred to as **lazy evaluation**. Certain operations, including regridding and plotting, can only be run on real data. Calling such operations on lazy data will automatically realise your lazy data. -You can also realise (and so load into memory) your cube's lazy if you 'touch' the data. +You can also realise (and so load into memory) your cube's lazy data if you 'touch' the data. To 'touch' the data means directly accessing the data by calling ``cube.data``, as in the previous example. @@ -115,10 +119,10 @@ In the same way that Iris cubes contain a data array, Iris coordinates contain points and bounds arrays. Coordinate points and bounds arrays can also be real or lazy: * A :class:`~iris.coords.DimCoord` will only ever have **real** points and bounds - arrays because of monotonicity checks that realise lazy arrays. + arrays because of monotonicity checks that realise lazy arrays. * An :class:`~iris.coords.AuxCoord` can have **real or lazy** points and bounds. - * An :class:`~iris.aux_factory.AuxCoordFactory` (or derived coordinate) - can have **real or lazy** points and bounds. If all of the + * An :class:`~iris.aux_factory.AuxCoordFactory` (or derived coordinate) + can have **real or lazy** points and bounds. If all of the :class:`~iris.coords.AuxCoord` instances that the coordinate is derived from have real points and bounds then the derived coordinate will also have real points and bounds, otherwise the derived coordinate will have lazy points and bounds. 
@@ -186,7 +190,7 @@ processing within Iris should use four workers in a thread pool:: >>> from multiprocessing.pool import ThreadPool >>> import dask >>> dask.set_options(get=dask.threaded.get, pool=ThreadPool(4)) - + >>> import iris >>> # Iris processing here... @@ -202,7 +206,6 @@ for more information on setting dask processing options. Further reading --------------- -This section of the user guide is intended as a quick overview of the -key concepts of real and lazy data within Iris. For more detail and more in-depth -discussions on the concepts introduced here and related concepts, see the related -whitepaper. +This section of the Iris user guide provides a quick overview of real and lazy +data within Iris. For more details on these and related concepts, +see the whitepaper on lazy data. From abf687cf52ee5f8f38a9bbc739abdad2fe81b8fb Mon Sep 17 00:00:00 2001 From: Corinne Bosley Date: Fri, 2 Jun 2017 09:47:26 +0100 Subject: [PATCH 11/15] adjustments to docstrings --- docs/iris/src/userguide/real_and_lazy_data.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/iris/src/userguide/real_and_lazy_data.rst b/docs/iris/src/userguide/real_and_lazy_data.rst index 05291984cf..cce32012a1 100644 --- a/docs/iris/src/userguide/real_and_lazy_data.rst +++ b/docs/iris/src/userguide/real_and_lazy_data.rst @@ -99,11 +99,13 @@ For example:: >>> cube = iris.load_cube(iris.sample_data_path('air_temp.pp')) >>> cube.has_lazy_data() True + >>> the_data = cube.core_data() >>> type(the_data) >>> cube.has_lazy_data() True + >>> cube.data >>> the_data = cube.core_data() >>> type(the_data) @@ -133,6 +135,7 @@ coordinates' lazy points and bounds: .. 
doctest:: >>> cube = iris.load_cube(iris.sample_data_path('hybrid_height.nc')) + >>> dim_coord = cube.coord('model_level_number') >>> print dim_coord.has_lazy_points() False @@ -140,6 +143,7 @@ coordinates' lazy points and bounds: False >>> print dim_coord.has_lazy_bounds() False + >>> aux_coord = cube.coord('sigma') >>> print aux_coord.has_lazy_points() True @@ -150,6 +154,8 @@ coordinates' lazy points and bounds: >>> points = aux_coord.points >>> print aux_coord.has_lazy_points() False + + >>> derived_coord = cube.coord('altitude') >>> print derived_coord.has_lazy_points() True >>> print derived_coord.has_bounds() From 623e8b5b44030d17e99712f6f602019e96fd4b80 Mon Sep 17 00:00:00 2001 From: Peter Killick Date: Fri, 2 Jun 2017 12:06:05 +0100 Subject: [PATCH 12/15] Further review actions --- docs/iris/src/userguide/index.rst | 7 +++-- .../iris/src/userguide/real_and_lazy_data.rst | 30 +++++++++++++------ 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/docs/iris/src/userguide/index.rst b/docs/iris/src/userguide/index.rst index eeabb3872c..8c0b24bec3 100644 --- a/docs/iris/src/userguide/index.rst +++ b/docs/iris/src/userguide/index.rst @@ -6,11 +6,11 @@ Iris user guide How to use the user guide --------------------------- -If you are reading this user guide for the first time it is strongly recommended that you read the user guide -fully before experimenting with your own data files. +If you are reading this user guide for the first time it is strongly recommended that you read the user guide +fully before experimenting with your own data files. -Much of the content has supplementary links to the reference documentation; you will not need to follow these +Much of the content has supplementary links to the reference documentation; you will not need to follow these links in order to understand the guide but they may serve as a useful reference for future exploration. .. 
htmlonly:: @@ -30,6 +30,7 @@ User guide table of contents saving_iris_cubes.rst navigating_a_cube.rst subsetting_a_cube.rst + real_and_lazy_data.rst plotting_a_cube.rst interpolation_and_regridding.rst merge_and_concat.rst diff --git a/docs/iris/src/userguide/real_and_lazy_data.rst b/docs/iris/src/userguide/real_and_lazy_data.rst index cce32012a1..fb37da2087 100644 --- a/docs/iris/src/userguide/real_and_lazy_data.rst +++ b/docs/iris/src/userguide/real_and_lazy_data.rst @@ -37,10 +37,12 @@ In Iris, lazy data is provided as a A dask array also has a shape and data type but typically the dask array's data points are not loaded into memory. Instead the data points are stored on disk and only loaded into memory in -small chunks when absolutely necessary. +small chunks when absolutely necessary (see the section :ref:`when_real_data` +for examples of when this might happen). -The primary advantage of using lazy data is that it enables the loading and manipulating -of datasets that would otherwise not fit into memory. +The primary advantage of using lazy data is that it enables +`out-of-core processing <https://en.wikipedia.org/wiki/Out-of-core_algorithm>`_; +that is, the loading and manipulating of datasets that otherwise would not fit into memory. You can check whether a cube has real data or lazy data by using the method :meth:`~iris.cube.Cube.has_lazy_data`. For example:: @@ -48,16 +50,21 @@ You can check whether a cube has real data or lazy data by using the method >>> cube = iris.load_cube(filename, 'air_temp.pp') >>> cube.has_lazy_data() True + # Realise the lazy data. >>> cube.data >>> cube.has_lazy_data() False + +.. _when_real_data: + When does my data become real? ------------------------------ When you load a dataset using Iris the data array will almost always initially be a lazy array. This section details some operations that will realise lazy data -as well as some operations that will maintain lazy data. +as well as some operations that will maintain lazy data.
We use the term **realise** +to mean converting lazy data into real data. Most operations on data arrays can be run equivalently on both real and lazy data. If the data array is real then the operation will be run on the data array @@ -90,7 +97,7 @@ current state: * If a cube has lazy data, calling the cube's :meth:`~iris.cube.Cube.core_data` method will return the cube's lazy dask array. Calling the cube's - :meth:`~iris.cube.Cube.core_data` method **will not realise** the cube's data. + :meth:`~iris.cube.Cube.core_data` method **will never realise** the cube's data. * If a cube has real data, calling the cube's :meth:`~iris.cube.Cube.core_data` method will return the cube's real NumPy array. @@ -106,6 +113,7 @@ For example:: >>> cube.has_lazy_data() True + # Realise the lazy data. >>> cube.data >>> the_data = cube.core_data() >>> type(the_data) @@ -117,16 +125,17 @@ For example:: Coordinates ----------- -In the same way that Iris cubes contain a data array, Iris coordinates contain -points and bounds arrays. Coordinate points and bounds arrays can also be real or lazy: +In the same way that Iris cubes contain a data array, Iris coordinates contain a +points array and an optional bounds array. +Coordinate points and bounds arrays can also be real or lazy: * A :class:`~iris.coords.DimCoord` will only ever have **real** points and bounds arrays because of monotonicity checks that realise lazy arrays. * An :class:`~iris.coords.AuxCoord` can have **real or lazy** points and bounds. * An :class:`~iris.aux_factory.AuxCoordFactory` (or derived coordinate) can have **real or lazy** points and bounds. 
If all of the - :class:`~iris.coords.AuxCoord` instances that the coordinate is derived from have - real points and bounds then the derived coordinate will also have real points + :class:`~iris.coords.AuxCoord` instances used to construct the derived coordinate + have real points and bounds then the derived coordinate will have real points and bounds, otherwise the derived coordinate will have lazy points and bounds. Iris cubes and coordinates have very similar interfaces, which extends to accessing @@ -151,9 +160,12 @@ coordinates' lazy points and bounds: True >>> print aux_coord.has_lazy_bounds() True + # Realise the lazy points. This will **not** realise the lazy bounds. >>> points = aux_coord.points >>> print aux_coord.has_lazy_points() False + >>> print aux_coord.has_lazy_bounds() + True >>> derived_coord = cube.coord('altitude') >>> print derived_coord.has_lazy_points() From 70d4767aed730007e6937254f742823d03b51e81 Mon Sep 17 00:00:00 2001 From: Corinne Bosley Date: Mon, 5 Jun 2017 09:01:32 +0100 Subject: [PATCH 13/15] changed filename to iris sample data path --- docs/iris/src/userguide/real_and_lazy_data.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/iris/src/userguide/real_and_lazy_data.rst b/docs/iris/src/userguide/real_and_lazy_data.rst index fb37da2087..ee806db85b 100644 --- a/docs/iris/src/userguide/real_and_lazy_data.rst +++ b/docs/iris/src/userguide/real_and_lazy_data.rst @@ -47,7 +47,7 @@ that is, the loading and manipulating of datasets that otherwise would not fit i You can check whether a cube has real data or lazy data by using the method :meth:`~iris.cube.Cube.has_lazy_data`. For example:: - >>> cube = iris.load_cube(filename, 'air_temp.pp') + >>> cube = iris.load_cube(iris.sample_data_path('air_temp.pp')) >>> cube.has_lazy_data() True # Realise the lazy data. 
From 61e827a8bc20ac560be3c353d849f471a54356ab Mon Sep 17 00:00:00 2001 From: Corinne Bosley Date: Wed, 7 Jun 2017 11:44:59 +0100 Subject: [PATCH 14/15] added newline before truth check --- docs/iris/src/userguide/real_and_lazy_data.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/iris/src/userguide/real_and_lazy_data.rst b/docs/iris/src/userguide/real_and_lazy_data.rst index ee806db85b..17e6800439 100644 --- a/docs/iris/src/userguide/real_and_lazy_data.rst +++ b/docs/iris/src/userguide/real_and_lazy_data.rst @@ -160,6 +160,7 @@ coordinates' lazy points and bounds: True >>> print aux_coord.has_lazy_bounds() True + # Realise the lazy points. This will **not** realise the lazy bounds. >>> points = aux_coord.points >>> print aux_coord.has_lazy_points() From e0333959a8fe66005031dfb3ba8f1ac310958ffe Mon Sep 17 00:00:00 2001 From: Corinne Bosley Date: Wed, 7 Jun 2017 12:01:37 +0100 Subject: [PATCH 15/15] print statements corrected --- .../iris/src/userguide/real_and_lazy_data.rst | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/iris/src/userguide/real_and_lazy_data.rst b/docs/iris/src/userguide/real_and_lazy_data.rst index 17e6800439..16a4ee2ab6 100644 --- a/docs/iris/src/userguide/real_and_lazy_data.rst +++ b/docs/iris/src/userguide/real_and_lazy_data.rst @@ -146,34 +146,34 @@ coordinates' lazy points and bounds: >>> cube = iris.load_cube(iris.sample_data_path('hybrid_height.nc')) >>> dim_coord = cube.coord('model_level_number') - >>> print dim_coord.has_lazy_points() + >>> print(dim_coord.has_lazy_points()) False - >>> print dim_coord.has_bounds() + >>> print(dim_coord.has_bounds()) False - >>> print dim_coord.has_lazy_bounds() + >>> print(dim_coord.has_lazy_bounds()) False >>> aux_coord = cube.coord('sigma') - >>> print aux_coord.has_lazy_points() + >>> print(aux_coord.has_lazy_points()) True - >>> print aux_coord.has_bounds() + >>> print(aux_coord.has_bounds()) True - >>> print aux_coord.has_lazy_bounds() + >>> 
print(aux_coord.has_lazy_bounds()) True # Realise the lazy points. This will **not** realise the lazy bounds. >>> points = aux_coord.points - >>> print aux_coord.has_lazy_points() + >>> print(aux_coord.has_lazy_points()) False - >>> print aux_coord.has_lazy_bounds() + >>> print(aux_coord.has_lazy_bounds()) True >>> derived_coord = cube.coord('altitude') - >>> print derived_coord.has_lazy_points() + >>> print(derived_coord.has_lazy_points()) True - >>> print derived_coord.has_bounds() + >>> print(derived_coord.has_bounds()) True - >>> print derived_coord.has_lazy_bounds() + >>> print(derived_coord.has_lazy_bounds()) True .. note::