In [1]:
import climtas
import climtas.nci
import xarray

In [2]:
# Start the compute client through climtas' NCI helper. The call is the last
# expression in the cell, so the notebook renders the client summary
# (scheduler address, dashboard link, workers/cores/memory).
# NOTE(review): presumably this is a dask.distributed client sized from the
# current Gadi job's resources - confirm against the climtas.nci docs.
climtas.nci.GadiClient()

0,1
Client  Scheduler: tcp://127.0.0.1:39787  Dashboard: /proxy/8787/status,Cluster  Workers: 1  Cores: 1  Memory: 4.29 GB


In [3]:
# Load the ERA5 2 m temperature field - roughly 1.5 TB before any reduction.
#
# The dask chunks start out matching the on-disk latitude/longitude chunking.
# Along the time axis each dask chunk spans a whole file, which for this
# dataset means one month per chunk.

ds = climtas.nci.data.era5('2T', 'surface')
t2m = ds['t2m']

# Pair each dimension name with the chunk size recorded in the file encoding
file_chunks = {
    dim: size for dim, size in zip(t2m.dims, t2m.encoding['chunksizes'])
}
print("File chunking:", file_chunks)

# Show the dask array summary (total size, chunk shape, task count)
t2m.data

File chunking: {'time': 93, 'latitude': 91, 'longitude': 180}


Unnamed: 0,Array,Chunk
Bytes,1.48 TB,48.75 MB
Shape,"(356472, 721, 1440)","(744, 91, 180)"
Count,62952 Tasks,31232 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.48 TB 48.75 MB Shape (356472, 721, 1440) (744, 91, 180) Count 62952 Tasks 31232 Chunks Type float32 numpy.ndarray",1440  721  356472,

Unnamed: 0,Array,Chunk
Bytes,1.48 TB,48.75 MB
Shape,"(356472, 721, 1440)","(744, 91, 180)"
Count,62952 Tasks,31232 Chunks
Type,float32,numpy.ndarray


In [4]:
# Reduce the hourly data to daily means (max, min etc. work the same way).
# The result is about 62 GB and keeps the same number of chunks as the input.

# blocked_resample groups every 24 time steps *inside* each dask chunk, which
# avoids ending up with one tiny chunk per day.

daily_blocks = climtas.blocked_resample(t2m, time=24)
t2m_daily = daily_blocks.mean()

t2m_daily.data

Unnamed: 0,Array,Chunk
Bytes,61.68 GB,2.03 MB
Shape,"(14853, 721, 1440)","(31, 91, 180)"
Count,94184 Tasks,31232 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 61.68 GB 2.03 MB Shape (14853, 721, 1440) (31, 91, 180) Count 94184 Tasks 31232 Chunks Type float32 numpy.ndarray",1440  721  14853,

Unnamed: 0,Array,Chunk
Bytes,61.68 GB,2.03 MB
Shape,"(14853, 721, 1440)","(31, 91, 180)"
Count,94184 Tasks,31232 Chunks
Type,float32,numpy.ndarray


In [5]:
# Apply a centred 15-day rolling mean to smooth the daily series

smoothing_window = t2m_daily.rolling(time=15, center=True)
t2m_smooth = smoothing_window.mean()

t2m_smooth.data

Unnamed: 0,Array,Chunk
Bytes,61.68 GB,2.03 MB
Shape,"(14853, 721, 1440)","(31, 91, 180)"
Count,782440 Tasks,31232 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 61.68 GB 2.03 MB Shape (14853, 721, 1440) (31, 91, 180) Count 782440 Tasks 31232 Chunks Type float32 numpy.ndarray",1440  721  14853,

Unnamed: 0,Array,Chunk
Bytes,61.68 GB,2.03 MB
Shape,"(14853, 721, 1440)","(31, 91, 180)"
Count,782440 Tasks,31232 Chunks
Type,float32,numpy.ndarray


In [6]:
# Calculate the day-of-year percentile climatology

# We've done the processing so far on a wider time range than we need, so that the
# rolling operation doesn't leave NaN values at our analysis start and end dates.
# Now's the time to select just the dates we need for the climatology.

clim_period = t2m_smooth.sel(time=slice('1980','2018'))

# blocked_groupby does the groupby based on chunks, so you don't end up with a chunk
# for each day

# Reduce each day-of-year group with a percentile rather than a mean: this cell's
# result is named and saved as a percentile climatology, so a plain mean would
# silently store the wrong statistic.
# NOTE(review): 90 is used as the usual heatwave threshold and is assumed to be on
# the 0-100 scale (numpy.percentile convention) - confirm both against the
# intended analysis and the climtas BlockedGroupby.percentile docs.

t2m_percentile = climtas.blocked_groupby(clim_period, time='dayofyear').percentile(90)
t2m_percentile

In [7]:
# At this point we've reduced our 1.5 TB of data to about 1.5 GB, small enough
# to save to a file in a later step.
# Display the underlying dask array summary first, to check the final size,
# chunk shape and task count before anything is written.

t2m_percentile.data

Unnamed: 0,Array,Chunk
Bytes,1.52 GB,2.03 MB
Shape,"(366, 721, 1440)","(31, 91, 180)"
Count,1142376 Tasks,1472 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.52 GB 2.03 MB Shape (366, 721, 1440) (31, 91, 180) Count 1142376 Tasks 1472 Chunks Type float32 numpy.ndarray",1440  721  366,

Unnamed: 0,Array,Chunk
Bytes,1.52 GB,2.03 MB
Shape,"(366, 721, 1440)","(31, 91, 180)"
Count,1142376 Tasks,1472 Chunks
Type,float32,numpy.ndarray


In [None]:
# Write the result to netCDF with the throttled saver, which writes one chunk
# at a time so that memory doesn't fill up while the graph is computed.

# The saver is handed a Dataset, so wrap the array under an explicit variable name
clim_dataset = t2m_percentile.to_dataset(name='t2m_percentile')

climtas.io.to_netcdf_throttled(
    clim_dataset,
    '/g/data/w35/saw562/era5_heatwave_clim.nc',
)

HBox(children=(FloatProgress(value=0.0, max=1472.0), HTML(value='')))

## 