In [1]:
import intake

from access_intake_utils.chunking import validate_chunkspec


In [2]:
# Let's look at the documentation for validate_chunkspec
# (a trailing `?` is IPython's help syntax: it shows the signature and docstring)
validate_chunkspec?

[0;31mSignature:[0m
[0mvalidate_chunkspec[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdataset[0m[0;34m:[0m [0mstr[0m [0;34m|[0m [0mpathlib[0m[0;34m.[0m[0mPath[0m [0;34m|[0m [0mcollections[0m[0;34m.[0m[0mabc[0m[0;34m.[0m[0mIterable[0m[0;34m[[0m[0mstr[0m [0;34m|[0m [0mpathlib[0m[0;34m.[0m[0mPath[0m[0;34m][0m [0;34m|[0m [0mxarray[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mdataset[0m[0;34m.[0m[0mDataset[0m [0;34m|[0m [0mxarray[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mdataarray[0m[0;34m.[0m[0mDataArray[0m [0;34m|[0m [0mintake_esm[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mesm_datastore[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mchunkspec[0m[0;34m:[0m [0mdict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mtyping[0m[0;34m.[0m[0mAny[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvarnames[0m[0;34m:[0m [0mstr[0m [0;34m|[0m [0mlist[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m|[0m [0;32mNone[0m [0;34m=

In [3]:
# Load an esm_datastore from the ACCESS-NRI catalog to see how we can use this.
catalog = intake.cat.access_nri
expt_ds = catalog['01deg_jra55_ryf_Control'].search(
    file_id='ocean_month',
    variable='temp',
)  # Subset down to a single dataset & variable
expt_ds

Unnamed: 0,unique
filename,1
file_id,1
path,14
filename_timestamp,0
frequency,1
start_date,14
end_date,14
variable,56
variable_long_name,52
variable_standard_name,23


In [4]:
# Open the datastore as an xarray object so we can inspect its default chunking.
# `decode_timedelta=False` is forwarded to xarray via `xarray_open_kwargs`.
expt_ds.to_dask(
    xarray_open_kwargs={'decode_timedelta': False},
)

Unnamed: 0,Array,Chunk
Bytes,114.06 GiB,1.76 MiB
Shape,"(42, 75, 2700, 3600)","(1, 19, 135, 180)"
Dask graph,67200 chunks in 29 graph layers,67200 chunks in 29 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 114.06 GiB 1.76 MiB Shape (42, 75, 2700, 3600) (1, 19, 135, 180) Dask graph 67200 chunks in 29 graph layers Data type float32 numpy.ndarray",42  1  3600  2700  75,

Unnamed: 0,Array,Chunk
Bytes,114.06 GiB,1.76 MiB
Shape,"(42, 75, 2700, 3600)","(1, 19, 135, 180)"
Dask graph,67200 chunks in 29 graph layers,67200 chunks in 29 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


## Okay. Let's see what happens if we try to increase our chunk size to some random prime numbers (these are *very unlikely* to be integer multiples of the disk chunking)

In [16]:
# Requested chunk sizes per dimension: deliberately prime, so they will not
# line up with the on-disk chunking.
chunks = dict(
    time=37,
    st_ocean=61,
    yt_ocean=467,
    xt_ocean=277,
)

In [17]:
# Validate the requested chunks for `temp` against the esm_datastore.
validate_chunkspec(dataset=expt_ds, chunkspec=chunks, varnames='temp')

  validate_chunkspec(expt_ds, chunks,varnames = 'temp')


{'time': 37, 'st_ocean': 57, 'yt_ocean': 405, 'xt_ocean': 360}

In [10]:
# Show the path of the first file in the datastore.
expt_ds.df['path'].head(1)

0    /g/data/cj50/access-om2/raw-output/access-om2-01/01deg_jra55_ryf_Control/output976/ocean/ocean_month.nc
Name: path, dtype: object

In [15]:
# Let's look at the chunks on disk with `ncdump -hs` (using head/tail to filter the other variables out)
!ncdump -hs /g/data/cj50/access-om2/raw-output/access-om2-01/01deg_jra55_ryf_Control/output976/ocean/ocean_month.nc   | head -160 | tail -15

	float temp(time, st_ocean, yt_ocean, xt_ocean) ;
		temp:long_name = "Conservative temperature" ;
		temp:units = "K" ;
		temp:valid_range = -10.f, 500.f ;
		temp:missing_value = -1.e+20f ;
		temp:_FillValue = -1.e+20f ;
		temp:cell_methods = "time: mean" ;
		temp:time_avg_info = "average_T1,average_T2,average_DT" ;
		temp:coordinates = "geolon_t geolat_t" ;
		temp:standard_name = "sea_water_conservative_temperature" ;
		temp:_Storage = "chunked" ;
		temp:_ChunkSizes = 1, 19, 135, 180 ;
		temp:_Shuffle = "true" ;
		temp:_DeflateLevel = 5 ;
		temp:_Endianness = "little" ;


So temperature has disk chunks `{'time': 1 , 'st_ocean' : 19, 'yt_ocean' : 135, 'xt_ocean' : 180}`

`validate_chunkspec` has returned `{'time': 37, 'st_ocean': 57, 'yt_ocean': 405, 'xt_ocean': 360}` which is
`{'time': 37 * 1, 'st_ocean': 19 * 3, 'yt_ocean': 135 * 3, 'xt_ocean' : 180 * 2}`, which is the closest we can get to the original chunks, whilst respecting the disk chunking. We can then load our dataset with this:

In [18]:
# Validate the requested chunks, then open the data using the returned chunks.
optimised_chunks = validate_chunkspec(dataset=expt_ds, chunkspec=chunks, varnames='temp')
expt_ds.to_dask(
    xarray_open_kwargs={
        'decode_timedelta': False,
        'chunks': optimised_chunks,
    },
)

  optimised_chunks = validate_chunkspec(expt_ds, chunks,varnames = 'temp')
  ds = xr.open_dataset(url, **xarray_open_kwargs)
  ds = xr.open_dataset(url, **xarray_open_kwargs)


Unnamed: 0,Array,Chunk
Bytes,114.06 GiB,95.11 MiB
Shape,"(42, 75, 2700, 3600)","(3, 57, 405, 360)"
Dask graph,1960 chunks in 29 graph layers,1960 chunks in 29 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 114.06 GiB 95.11 MiB Shape (42, 75, 2700, 3600) (3, 57, 405, 360) Dask graph 1960 chunks in 29 graph layers Data type float32 numpy.ndarray",42  1  3600  2700  75,

Unnamed: 0,Array,Chunk
Bytes,114.06 GiB,95.11 MiB
Shape,"(42, 75, 2700, 3600)","(3, 57, 405, 360)"
Dask graph,1960 chunks in 29 graph layers,1960 chunks in 29 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


# What if I don't want to use intake?

- We can pass a list of files and/or paths:


In [20]:
# Collect every file path recorded in the datastore as a plain list of strings.
paths_as_strings = expt_ds.df['path'].tolist()
paths_as_strings

['/g/data/cj50/access-om2/raw-output/access-om2-01/01deg_jra55_ryf_Control/output976/ocean/ocean_month.nc',
 '/g/data/cj50/access-om2/raw-output/access-om2-01/01deg_jra55_ryf_Control/output977/ocean/ocean_month.nc',
 '/g/data/cj50/access-om2/raw-output/access-om2-01/01deg_jra55_ryf_Control/output978/ocean/ocean_month.nc',
 '/g/data/cj50/access-om2/raw-output/access-om2-01/01deg_jra55_ryf_Control/output979/ocean/ocean_month.nc',
 '/g/data/cj50/access-om2/raw-output/access-om2-01/01deg_jra55_ryf_Control/output980/ocean/ocean_month.nc',
 '/g/data/cj50/access-om2/raw-output/access-om2-01/01deg_jra55_ryf_Control/output981/ocean/ocean_month.nc',
 '/g/data/cj50/access-om2/raw-output/access-om2-01/01deg_jra55_ryf_Control/output982/ocean/ocean_month.nc',
 '/g/data/cj50/access-om2/raw-output/access-om2-01/01deg_jra55_ryf_Control/output983/ocean/ocean_month.nc',
 '/g/data/cj50/access-om2/raw-output/access-om2-01/01deg_jra55_ryf_Control/output984/ocean/ocean_month.nc',
 '/g/data/cj50/access-om2/ra

In [21]:
# The same validation, this time passing a list of path strings.
validate_chunkspec(dataset=paths_as_strings, chunkspec=chunks, varnames='temp')

  validate_chunkspec(paths_as_strings, chunks,varnames = 'temp')


{'time': 37, 'st_ocean': 57, 'yt_ocean': 405, 'xt_ocean': 360}

In [25]:
from pathlib import Path

# Convert each path string into a pathlib.Path object.
paths = list(map(Path, paths_as_strings))
paths

[PosixPath('/g/data/cj50/access-om2/raw-output/access-om2-01/01deg_jra55_ryf_Control/output976/ocean/ocean_month.nc'),
 PosixPath('/g/data/cj50/access-om2/raw-output/access-om2-01/01deg_jra55_ryf_Control/output977/ocean/ocean_month.nc'),
 PosixPath('/g/data/cj50/access-om2/raw-output/access-om2-01/01deg_jra55_ryf_Control/output978/ocean/ocean_month.nc'),
 PosixPath('/g/data/cj50/access-om2/raw-output/access-om2-01/01deg_jra55_ryf_Control/output979/ocean/ocean_month.nc'),
 PosixPath('/g/data/cj50/access-om2/raw-output/access-om2-01/01deg_jra55_ryf_Control/output980/ocean/ocean_month.nc'),
 PosixPath('/g/data/cj50/access-om2/raw-output/access-om2-01/01deg_jra55_ryf_Control/output981/ocean/ocean_month.nc'),
 PosixPath('/g/data/cj50/access-om2/raw-output/access-om2-01/01deg_jra55_ryf_Control/output982/ocean/ocean_month.nc'),
 PosixPath('/g/data/cj50/access-om2/raw-output/access-om2-01/01deg_jra55_ryf_Control/output983/ocean/ocean_month.nc'),
 PosixPath('/g/data/cj50/access-om2/raw-output/a

In [26]:
# The same validation again, this time passing a list of pathlib.Path objects.
validate_chunkspec(dataset=paths, chunkspec=chunks, varnames='temp')

  validate_chunkspec(paths, chunks,varnames = 'temp')


{'time': 37, 'st_ocean': 57, 'yt_ocean': 405, 'xt_ocean': 360}

- Or, in some instances, an xarray dataset - but only if the dataset contains the file handles (this isn't guaranteed, so preferably use the other methods). In the example below, this won't work - but it might for others. This example will be updated as support for xarray datasets becomes more robust.

In [29]:
import xarray as xr

# Open all of the files as a single combined xarray dataset.
ds = xr.open_mfdataset(paths, decode_timedelta=False)

# For this dataset validate_chunkspec is expected to raise a ValueError.
# Catch and print it so the failure is shown as the demonstration without
# aborting a "Restart Kernel -> Run All"; if it succeeds, print the result.
try:
    print(validate_chunkspec(ds, chunks, varnames='temp'))
except ValueError as err:
    print(f"ValueError: {err}")

ValueError:  Dataset/DataArray does contain source attribute describing file path(s). Please provide a dataset with a source attribute, an esm_datastore, or a list of file paths.