In [1]:
import xarray as xr
import numpy as np
import pandas as pd
from flox.xarray import xarray_reduce
import gc

In [2]:
import dask
import distributed

# Configuration overrides that are active only while the client is created.
cluster_settings = {
    "distributed.scheduler.worker-saturation": 1.0,
    "distributed.nanny.pre-spawn-environ.MALLOC_TRIM_THRESHOLD_": 0,
}
with dask.config.set(cluster_settings):
    # Create the distributed client while the overrides are in effect.
    client = distributed.Client()

In [7]:
# Load the "air_temperature" dataset through xarray's tutorial loader.
ds = xr.tutorial.open_dataset("air_temperature")

# Interpolate to 1-minute resolution

In [8]:
# Build a 1-minute time axis running from the first to the last timestamp in `ds`.
first_stamp = ds.time[0].values
last_stamp = ds.time[-1].values
finer_times = pd.date_range(start=first_stamp, end=last_stamp, freq='min')

In [9]:
# Display the generated 1-minute time index.
finer_times

DatetimeIndex(['2013-01-01 00:00:00', '2013-01-01 00:01:00',
               '2013-01-01 00:02:00', '2013-01-01 00:03:00',
               '2013-01-01 00:04:00', '2013-01-01 00:05:00',
               '2013-01-01 00:06:00', '2013-01-01 00:07:00',
               '2013-01-01 00:08:00', '2013-01-01 00:09:00',
               ...
               '2014-12-31 17:51:00', '2014-12-31 17:52:00',
               '2014-12-31 17:53:00', '2014-12-31 17:54:00',
               '2014-12-31 17:55:00', '2014-12-31 17:56:00',
               '2014-12-31 17:57:00', '2014-12-31 17:58:00',
               '2014-12-31 17:59:00', '2014-12-31 18:00:00'],
              dtype='datetime64[ns]', length=1050841, freq='min')

In [10]:
# Linearly interpolate `ds` onto the finer time axis.
finer_ds = ds.interp(coords={'time': finer_times}, method='linear')

In [11]:
# Size of the interpolated dataset, in units of 1e9 bytes.
finer_ds.nbytes / 1e9

11.14732164

# Coarsen to 3-minute resolution

In [12]:
# Group consecutive time steps into windows of three (boundary='trim') and
# replace each window by its mean.
three_step_windows = finer_ds.coarsen(time=3, boundary='trim')
coarsened_ds = three_step_windows.mean()
coarsened_ds

In [15]:
# Size of the coarsened dataset, in units of 1e9 bytes.
coarsened_ds.nbytes / 1e9

3.715770552

In [16]:
# Alias the in-memory coarsened dataset as `data`.
# NOTE(review): `data` is rebound further down to the dataset re-opened from
# disk, so which object `data` refers to depends on which cells have run.
data = coarsened_ds

# Chunk the data with dask

In [20]:
# Split the coarsened dataset into chunks of fixed size along each dimension.
chunk_sizes = {'lon': 27, 'lat': 12, 'time': 10000}
ds_dask = coarsened_ds.chunk(chunk_sizes)
ds_dask

Unnamed: 0,Array,Chunk
Bytes,3.46 GiB,24.72 MiB
Shape,"(350280, 25, 53)","(10000, 12, 27)"
Dask graph,216 chunks in 1 graph layer,216 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 3.46 GiB 24.72 MiB Shape (350280, 25, 53) (10000, 12, 27) Dask graph 216 chunks in 1 graph layer Data type float64 numpy.ndarray",53  25  350280,

Unnamed: 0,Array,Chunk
Bytes,3.46 GiB,24.72 MiB
Shape,"(350280, 25, 53)","(10000, 12, 27)"
Dask graph,216 chunks in 1 graph layer,216 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [21]:
%%time
! rm /Users/moo270/data/test.nc
# Specify chunk sizes in the encoding dictionary
encoding = {'air': {'chunksizes': (10000, 12, 27)}}

ds_dask.to_netcdf('/Users/moo270/data/test.nc',encoding=encoding)

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


CPU times: user 3.37 s, sys: 3.09 s, total: 6.46 s
Wall time: 9.27 s


In [27]:
# Re-open the netCDF file written above; from here on `data` refers to the
# on-disk dataset.
nc_path = '/Users/moo270/data/test.nc'
data = xr.open_mfdataset(nc_path)
data

Unnamed: 0,Array,Chunk
Bytes,3.46 GiB,24.72 MiB
Shape,"(350280, 25, 53)","(10000, 12, 27)"
Dask graph,216 chunks in 2 graph layers,216 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 3.46 GiB 24.72 MiB Shape (350280, 25, 53) (10000, 12, 27) Dask graph 216 chunks in 2 graph layers Data type float64 numpy.ndarray",53  25  350280,

Unnamed: 0,Array,Chunk
Bytes,3.46 GiB,24.72 MiB
Shape,"(350280, 25, 53)","(10000, 12, 27)"
Dask graph,216 chunks in 2 graph layers,216 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [None]:
%%time
result_cohort = xarray_reduce(
    data,
    'time.month',
    func="mean",
    method="cohorts",
).compute()
result_cohort.air.mean(['lat','lon']).plot()

In [None]:
%%time
result_map = xarray_reduce(
    data,
    'time.month',
    func="mean",
    method="map-reduce",
).compute()
result_map.air.mean(['lat','lon']).plot()

In [None]:
%%time
result = data.groupby('time.month').mean('time')
result.air.mean(['lat','lon']).plot()

In [32]:
%%time
method = 'map-reduce'
var_string = list(data.data_vars.keys())[0]
merged_ds = xr.merge([xarray_reduce(data,'time.month',func="nanmean",method=method).rename({var_string:'mean_'+var_string}),
                      xarray_reduce(data,'time.month',func="min",method=method).rename({var_string:'min_'+var_string}),
                      xarray_reduce(data,'time.month',func="max",method=method).rename({var_string:'max_'+var_string}),
                      xarray_reduce(data,'time.month',func="nanstd",method=method).rename({var_string:'std_'+var_string}),
                      xarray_reduce(data,'time.month',func="nanmedian",method='blockwise').rename({var_string:'median_'+var_string})
                                    ])
merged_ds = merged_ds.compute()

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [36]:
%%time
var_string = list(data.data_vars.keys())[0]
merged_ds = xr.merge([data.groupby('time.month').mean(dim='time',skipna=True).rename({var_string:'mean_'+var_string}),
                      data.groupby('time.month').min(dim='time',skipna=True).rename({var_string:'min_'+var_string}),
                      data.groupby('time.month').max(dim='time',skipna=True).rename({var_string:'max_'+var_string}),
                      data.groupby('time.month').std(dim='time',skipna=True).rename({var_string:'std_'+var_string}),
                      data.groupby('time.month').median(dim='time',skipna=True).rename({var_string:'median_'+var_string})
                                    ])
merged_ds = merged_ds.compute()

CPU times: user 3.95 s, sys: 915 ms, total: 4.86 s
Wall time: 11.4 s


In [4]:
# Requires `ds` from the dataset-loading cell to exist in the kernel.
var_string = list(ds.data_vars)[0]
# Keyword arguments forwarded to the flox-backed reductions; the median is
# called without them.
flox_opts = dict(engine='flox', method='cohorts')
stat_options = [
    ('mean', flox_opts),
    ('min', flox_opts),
    ('max', flox_opts),
    ('std', flox_opts),
    ('median', {}),
]
result_DS = xr.merge([
    getattr(ds.groupby('time.month'), stat)(dim='time', **opts).rename({var_string: stat + '_' + var_string})
    for stat, opts in stat_options
])
result_DS

NameError: name 'ds' is not defined

In [None]:
var_string = list(ds.data_vars)[0]
# 0.05 and 0.95 quantiles per calendar month along `time`, with skipna=False.
ds_by_month = ds.groupby('time.month')
quant = ds_by_month.quantile([0.05, 0.95], dim='time', skipna=False)
quant

In [None]:
# Pull the two quantile levels apart and give each its own variable name.
lower_q = quant.isel(quantile=0).reset_coords(drop=True).rename({var_string: 'quantile_05_' + var_string})
upper_q = quant.isel(quantile=1).reset_coords(drop=True).rename({var_string: 'quantile_95_' + var_string})
quant_ds = xr.merge([lower_q, upper_q])
quant_ds

In [None]:
# Add the quantile variables to the monthly summary statistics.
summary_parts = [result_DS, quant_ds]
result_DS = xr.merge(summary_parts)
result_DS