In [None]:
import climtas
import xarray
from glob import glob

In [1]:
import os
import tempfile
import dask.distributed

# Edit as desired
threads_per_worker = 1

try:
    c # Already running - re-running this cell reuses the existing client
except NameError:
    # Size the local cluster from the PBS job allocation. Outside of a PBS job
    # neither variable is set, so fall back to a single CPU and the system
    # temporary directory rather than failing in os.path.join(None, ...)
    n_cpus = int(os.environ.get('PBS_NCPUS', 1))
    scratch_dir = os.environ.get('PBS_JOBFS') or tempfile.gettempdir()

    c = dask.distributed.Client(
        # Always start at least one worker, even if threads_per_worker > n_cpus
        n_workers=max(1, n_cpus // threads_per_worker),
        threads_per_worker=threads_per_worker,
        memory_limit=f'{4*threads_per_worker}gb',
        local_directory=os.path.join(scratch_dir, 'dask-worker-space')
    )
c

0,1
Client  Scheduler: tcp://127.0.0.1:42675  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 1  Memory: 4.00 GB


In [None]:
# Open the dataset - the initial size is 1.5 TB
#
# The latitude and longitude chunking requested here is the same as the chunking
# in the files. Chunking along the time axis defaults to the file size, so one
# month in this case.

era5_paths = glob('/g/data/ub4/era5/netcdf/surface/2T/*/2T_era5_global_*.nc')
era5_paths.sort()

ds = xarray.open_mfdataset(
    era5_paths,
    combine='nested',
    concat_dim='time',
    chunks={'latitude': 91, 'longitude': 180},
)
t2m = ds['t2m']

# Report the on-disk chunk size recorded for each dimension
file_chunking = dict(zip(t2m.dims, t2m.encoding['chunksizes']))
print("File chunking:", file_chunking)

t2m.data

In [None]:
import climtas.blocked

# Convert to daily mean by reducing every block of 24 time steps - max, min etc.
# can be used in place of mean. This reduces the size to 62 GB, with the same
# number of chunks.
t2m_by_day = climtas.blocked.blocked_resample(t2m, time=24)
t2m_daily = t2m_by_day.mean()

t2m_daily.data

In [None]:
# Smooth out the data with a centred 15-step rolling mean.
#
# The chunking along the time dimension is increased first - chunks that are too
# small make the 'Tasks' count increase rapidly.

t2m_rechunked = t2m_daily.chunk({'time': 600})
t2m_smooth = t2m_rechunked.rolling(time=15, center=True).mean()

t2m_smooth.data

In [None]:
# Calculate percentiles
#
# The processing so far was done on a wider time range than the analysis needs,
# so that the rolling operation doesn't start right at the analysis start date.
# Now is the time to select just the dates we need.

t2m_analysis = t2m_smooth.sel(time=slice('1980', '2018'))

# 90th percentile for each day of the year
t2m_percentile = climtas.apply_doy.percentile_doy(t2m_analysis, 90)
t2m_percentile

In [None]:
# Inspect the array backing the percentile climatology before writing it out.
# At this point we've reduced our 1.5 TB of data to 1.5 GB - time to save it to a file

t2m_percentile.data

In [None]:
# Write the climatology to netCDF with the throttled saver, which writes one
# chunk at a time so that memory doesn't get filled up

heatwave_clim = t2m_percentile.to_dataset(name='t2m_percentile')
climtas.io.to_netcdf_throttled(heatwave_clim, '/g/data/w35/saw562/era5_heatwave_clim.nc')

In [None]:
# Read the climatology back out of the file, re-chunked in latitude and longitude

clim_ds = xarray.open_dataset(
    '/g/data/w35/saw562/era5_heatwave_clim.nc',
    chunks={'latitude': 200, 'longitude': 200},
)
threshold = clim_ds['t2m_percentile']
threshold

In [None]:
%matplotlib inline
threshold.sel(latitude=-37.8, longitude=144.9, method='nearest').plot()

In [None]:
# Compare the daily values, grouped by day of year, against the percentile
# threshold read back from file
t2m_daily.groupby('time.dayofyear') > threshold

In [None]:
# Find runs where the daily values stay above the day-of-year threshold,
# using a minimum duration of 3
above_threshold = t2m_daily.groupby('time.dayofyear') > threshold
climtas.event.find_events(above_threshold, min_duration=3)

In [None]:
# Same comparison, with the threshold dimensions explicitly ordered as
# (dayofyear, latitude, longitude)
threshold_ordered = threshold.transpose('dayofyear', 'latitude', 'longitude')
t2m_daily.groupby('time.dayofyear') > threshold_ordered

## 