# make test datasets

Date: 17 April, 2024

Author = {"name": "Thomas Moore", "affiliation": "CSIRO", "email": "thomas.moore@csiro.au", "orcid": "0000-0003-3930-1946"}

### BRAN2020 comprises over 50 TB of `float32` data spread across nearly 9000 `netcdf` file assets in total.

#### required packages

In [1]:
import xarray as xr
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import matplotlib.dates as mdates
import datetime

#### ignore warnings

In [2]:
import warnings

# Install a blanket 'ignore' filter so that warnings raised by later cells are not displayed.
warnings.filterwarnings(action='ignore')

#### start a local Dask client

In [3]:
import dask
import distributed

# Dask configuration overrides; they are applied only for the duration of the
# `with` block below, i.e. while the client is being constructed.
dask_overrides = {
    "distributed.scheduler.worker-saturation": 1.0,
    "distributed.nanny.pre-spawn-environ.MALLOC_TRIM_THRESHOLD_": 0,
    "logging.distributed": "error",
}

# Client() is created with no arguments while the overrides are active.
with dask.config.set(dask_overrides):
    client = distributed.Client()

# coarsen and write test data

In [4]:
%%time
# --- select which variable this run processes (keep exactly one line active) ---
# var_name = 'temp'
var_name = 'mld'
# var_name = 'eta_t'

# Root directory of the source zarr collections and the store name for each variable.
zarr_path = '/scratch/es60/ard/reanalysis/BRAN2020/ARD/'
path_dict = {
    'eta_t': 'BRAN2020-daily-eta_t-v14032024.zarr',
    'mld': 'BRAN2020-daily-mld-v04042024.zarr',
    'temp': 'BRAN2020-daily-temp-v07022024.zarr',
}

# Per-variable lookup tables of depth / longitude / latitude dimension names,
# plus the name of the time dimension. They are defined here but not read
# again within this cell.
depth_dict = {'eta_t': None, 'mld': None, 'temp': 'st_ocean'}
lon_dict = dict.fromkeys(('eta_t', 'mld', 'temp'), 'xt_ocean')
lat_dict = dict.fromkeys(('eta_t', 'mld', 'temp'), 'yt_ocean')
time_dim = 'Time'

# Destination directory and store name for the coarsened copy, and the full
# path of the source collection for the selected variable.
results_path = '/scratch/es60/ard/reanalysis/BRAN2020/ARD/coarsened_tests/'
results_file = f'BRAN2020_{var_name}_COARSENED.zarr'
collection_path = f'{zarr_path}{path_dict[var_name]}'

# Open the BRAN collection using its consolidated metadata.
ds = xr.open_zarr(collection_path, consolidated=True)

# Coarsen for testing: take the mean over windows of 10 points along both
# xt_ocean and yt_ocean, with boundary='trim' handling any leftover points.
coarsen_dims = {'xt_ocean': 10, 'yt_ocean': 10}  # , 'st_ocean': 10}
ds = (
    ds
    .coarsen(coarsen_dims, boundary='trim')
    .mean()
)


CPU times: user 676 ms, sys: 290 ms, total: 966 ms
Wall time: 1.31 s


In [5]:
def remove_zarr_encoding(DS):
    """Reset the ``encoding`` of every variable and coordinate on ``DS``.

    For each name produced by iterating ``DS`` itself, and then for each name
    produced by iterating ``DS.coords``, ``DS[name].encoding`` is replaced with
    a new empty dict.

    Parameters
    ----------
    DS : object
        Container that is iterable over names, supports ``DS[name]`` lookup and
        exposes a ``coords`` iterable (used in this notebook with the dataset
        returned by ``xr.open_zarr``).

    Returns
    -------
    object
        The same ``DS`` object that was passed in; it is modified in place.
    """
    def _clear(names):
        # Assign a fresh dict per entry so no two entries share one encoding object.
        for name in names:
            DS[name].encoding = {}

    _clear(DS)
    _clear(DS.coords)
    return DS

In [6]:
# Rechunk to 1000 steps along Time per chunk, with -1 for yt_ocean and xt_ocean
# (-1 asks for a single chunk along that dimension).
time_block_chunks = {'Time': 1000, 'yt_ocean': -1, 'xt_ocean': -1}
# Clear the encoding carried over from the source store before writing.
ds = remove_zarr_encoding(ds.chunk(time_block_chunks))
# Write the coarsened dataset to results_path + results_file with consolidated metadata.
ds.to_zarr(f'{results_path}{results_file}', consolidated=True)

<xarray.backends.zarr.ZarrStore at 0x14d9eaef3f40>

In [7]:
# Rechunk with -1 for Time and yt_ocean (a single chunk along each) and
# 36 points per chunk along xt_ocean.
full_time_chunks = {'Time': -1, 'yt_ocean': -1, 'xt_ocean': 36}
# Clear the encoding again so the new chunking is what gets written.
ds = remove_zarr_encoding(ds.chunk(full_time_chunks))
# NOTE: results_file is reassigned here; later cells that read it see this new name.
results_file = f'BRAN2020_{var_name}_chunked4time_COARSENED.zarr'
ds.to_zarr(f'{results_path}{results_file}', consolidated=True)

<xarray.backends.zarr.ZarrStore at 0x14d95079e840>

In [8]:
# Re-open the store at results_path + results_file (when the cells are run in
# order, results_file names the chunked4time store) using consolidated metadata.
reload_ds = xr.open_zarr(f'{results_path}{results_file}', consolidated=True)

# $The$ $End$

In [None]:
# Shut down the Dask client that was created near the top of the notebook.
client.shutdown()