# Make zarr stores for 2D BRAN2020 variables (`mld`, `eta_t`)

In [1]:
# Notebook author metadata.
# NOTE: the key "orchid_ID" is kept exactly as spelled here (the value is an orcid.org URL).
Author_dict = dict(
    name="Thomas Moore",
    affiliation="CSIRO",
    email="thomas.moore@csiro.au",
    orchid_ID='https://orcid.org/0000-0003-3930-1946',
)

In [3]:
import intake
import xarray as xr
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

from dask.distributed import Client, LocalCluster
import dask
import datetime
import zarr

from rechunker import rechunk

import gc
import sys
import subprocess
from tabulate import tabulate
import os
import glob
import streamjoy

In [4]:
# Make the project's src/ directory importable, then bring in the helper
# module (as `my_tools`) and the individual helper functions.
sys.path.append('/g/data/es60/users/thomas_moore/code/Climatology-generator-demo/src/')
import bran2020_demo_functions as my_tools
from bran2020_demo_functions import (
    keep_only_selected_vars,
    load_rechunker_config,
    print_chunks,
    rechunk_each_st_ocean,
    remove_zarr_encoding,
    version_table,
    concatinate_st_ocean_zarrs,
)

In [5]:
# Show the package/version table for this environment (project helper; output below)
version_table()

+-----------+-----------+------------+-----------+
| Package   | Version   | Package    | Version   |
| numpy     | 1.26.4    | numba      | 0.59.1    |
+-----------+-----------+------------+-----------+
| xarray    | 2024.3.0  | numbagg    | 0.8.1     |
+-----------+-----------+------------+-----------+
| dask      | 2024.4.0  | flox       | 0.9.6     |
+-----------+-----------+------------+-----------+
| scipy     | 1.12.0    | bottleneck | 1.3.8     |
+-----------+-----------+------------+-----------+


#### start a local Dask client

In [6]:
# Raise the Dask communication timeouts to 90 s for both the initial
# worker connection and subsequent TCP traffic.
dask.config.set({
    'distributed.comm.timeouts.tcp': '90s',
    'distributed.comm.timeouts.connect': '90s',
})

# Local cluster: 28 workers, one thread each.
# memory_limit is not passed (an explicit '8GB' setting is left disabled).
cluster = LocalCluster(threads_per_worker=1, n_workers=28)
client = Client(cluster)

# load `mld` and `eta_t` from `netcdf` and write each to `zarr`

In [9]:
var = 'mld'
# Chunk spec passed to open_mfdataset(chunks=...): one time step per chunk,
# with the full 1500 x 3600 horizontal grid in each chunk (see repr below).
xarray_open_kwargs = dict(Time=1, xt_ocean=3600, yt_ocean=1500)
vars_to_keep = [var, 'Time', 'yt_ocean', 'xt_ocean']
# Each file is passed through keep_only_selected_vars before being combined.
ds_mld = xr.open_mfdataset(
    f'/g/data/gb6/BRAN/BRAN2020/daily/ocean_{var}_*.nc',
    chunks=xarray_open_kwargs,
    parallel=True,
    preprocess=lambda dataset: keep_only_selected_vars(dataset, vars_to_keep=vars_to_keep),
)
ds_mld


Unnamed: 0,Array,Chunk
Bytes,227.76 GiB,20.60 MiB
Shape,"(11322, 1500, 3600)","(1, 1500, 3600)"
Dask graph,11322 chunks in 745 graph layers,11322 chunks in 745 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 227.76 GiB 20.60 MiB Shape (11322, 1500, 3600) (1, 1500, 3600) Dask graph 11322 chunks in 745 graph layers Data type float32 numpy.ndarray",3600  1500  11322,

Unnamed: 0,Array,Chunk
Bytes,227.76 GiB,20.60 MiB
Shape,"(11322, 1500, 3600)","(1, 1500, 3600)"
Dask graph,11322 chunks in 745 graph layers,11322 chunks in 745 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [10]:
# Display the lazily opened mld dataset again (same repr as the previous cell)
ds_mld

Unnamed: 0,Array,Chunk
Bytes,227.76 GiB,20.60 MiB
Shape,"(11322, 1500, 3600)","(1, 1500, 3600)"
Dask graph,11322 chunks in 745 graph layers,11322 chunks in 745 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 227.76 GiB 20.60 MiB Shape (11322, 1500, 3600) (1, 1500, 3600) Dask graph 11322 chunks in 745 graph layers Data type float32 numpy.ndarray",3600  1500  11322,

Unnamed: 0,Array,Chunk
Bytes,227.76 GiB,20.60 MiB
Shape,"(11322, 1500, 3600)","(1, 1500, 3600)"
Dask graph,11322 chunks in 745 graph layers,11322 chunks in 745 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [11]:
%%time
ds_mld.to_zarr('/scratch/es60/ard/reanalysis/BRAN2020/ARD/BRAN2020-mld-chunks.Time1.xt_ocean3600.yt_ocean1500.20052024.zarr',consolidated=True)

CPU times: user 1min 50s, sys: 8.47 s, total: 1min 59s
Wall time: 2min 27s


<xarray.backends.zarr.ZarrStore at 0x14aa000459c0>

In [12]:
var = 'eta_t'
# Chunk spec passed to open_mfdataset(chunks=...): one time step per chunk,
# with the full 1500 x 3600 horizontal grid in each chunk (see repr below).
xarray_open_kwargs = dict(Time=1, xt_ocean=3600, yt_ocean=1500)
vars_to_keep = [var, 'Time', 'yt_ocean', 'xt_ocean']
# Each file is passed through keep_only_selected_vars before being combined.
ds_eta_t = xr.open_mfdataset(
    f'/g/data/gb6/BRAN/BRAN2020/daily/ocean_{var}_*.nc',
    chunks=xarray_open_kwargs,
    parallel=True,
    preprocess=lambda dataset: keep_only_selected_vars(dataset, vars_to_keep=vars_to_keep),
)
ds_eta_t

Unnamed: 0,Array,Chunk
Bytes,227.76 GiB,20.60 MiB
Shape,"(11322, 1500, 3600)","(1, 1500, 3600)"
Dask graph,11322 chunks in 745 graph layers,11322 chunks in 745 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 227.76 GiB 20.60 MiB Shape (11322, 1500, 3600) (1, 1500, 3600) Dask graph 11322 chunks in 745 graph layers Data type float32 numpy.ndarray",3600  1500  11322,

Unnamed: 0,Array,Chunk
Bytes,227.76 GiB,20.60 MiB
Shape,"(11322, 1500, 3600)","(1, 1500, 3600)"
Dask graph,11322 chunks in 745 graph layers,11322 chunks in 745 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [13]:
%%time
ds_eta_t.to_zarr('/scratch/es60/ard/reanalysis/BRAN2020/ARD/BRAN2020-eta_t-chunks.Time1.xt_ocean3600.yt_ocean1500.20052024.zarr',consolidated=True)

CPU times: user 2min 22s, sys: 13.8 s, total: 2min 35s
Wall time: 5min 12s


<xarray.backends.zarr.ZarrStore at 0x14a9f003f2c0>

# rechunk over `all time`

In [15]:
# Re-open the two zarr stores written above (mld, then eta_t) using their
# consolidated metadata.
load_mld = xr.open_zarr(
    '/scratch/es60/ard/reanalysis/BRAN2020/ARD/BRAN2020-mld-chunks.Time1.xt_ocean3600.yt_ocean1500.20052024.zarr',
    consolidated=True,
)
load_eta_t = xr.open_zarr(
    '/scratch/es60/ard/reanalysis/BRAN2020/ARD/BRAN2020-eta_t-chunks.Time1.xt_ocean3600.yt_ocean1500.20052024.zarr',
    consolidated=True,
)

In [16]:
# Inspect the re-opened mld store (one time step per chunk, per the repr below)
load_mld

Unnamed: 0,Array,Chunk
Bytes,227.76 GiB,20.60 MiB
Shape,"(11322, 1500, 3600)","(1, 1500, 3600)"
Dask graph,11322 chunks in 2 graph layers,11322 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 227.76 GiB 20.60 MiB Shape (11322, 1500, 3600) (1, 1500, 3600) Dask graph 11322 chunks in 2 graph layers Data type float32 numpy.ndarray",3600  1500  11322,

Unnamed: 0,Array,Chunk
Bytes,227.76 GiB,20.60 MiB
Shape,"(11322, 1500, 3600)","(1, 1500, 3600)"
Dask graph,11322 chunks in 2 graph layers,11322 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [18]:
# Inspect the re-opened eta_t store (one time step per chunk, per the repr below)
load_eta_t

Unnamed: 0,Array,Chunk
Bytes,227.76 GiB,20.60 MiB
Shape,"(11322, 1500, 3600)","(1, 1500, 3600)"
Dask graph,11322 chunks in 2 graph layers,11322 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 227.76 GiB 20.60 MiB Shape (11322, 1500, 3600) (1, 1500, 3600) Dask graph 11322 chunks in 2 graph layers Data type float32 numpy.ndarray",3600  1500  11322,

Unnamed: 0,Array,Chunk
Bytes,227.76 GiB,20.60 MiB
Shape,"(11322, 1500, 3600)","(1, 1500, 3600)"
Dask graph,11322 chunks in 2 graph layers,11322 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


## chunking settings

In [31]:
# Target chunking: -1 for Time, 30 x 30 tiles in the horizontal.
rechunk_dict = dict(Time=-1, xt_ocean=30, yt_ocean=30)
# Build a "<dim><size>." tag per dimension, used in the output store names.
chunking_string = 'chunks.' + ''.join(f'{dim}{size}.' for dim, size in rechunk_dict.items())
chunking_string

'chunks.Time-1.xt_ocean30.yt_ocean30.'

## zarr encoding cleared

In [32]:
# Pass both datasets through the project helper that removes zarr encoding
# before they are rechunked and rewritten.
load_mld, load_eta_t = remove_zarr_encoding(load_mld), remove_zarr_encoding(load_eta_t)

## now stamp

In [33]:
# Timestamp string "YYYY.MM.DD.HH.MM.SS" taken from the current local time.
now = datetime.datetime.now()
nowstamp_str = f"{now:%Y.%m.%d.%H.%M.%S}"
nowstamp_str

'2024.05.20.11.36.48'

In [35]:
%%time
now = datetime.datetime.now()
nowstamp_str = now.strftime("%Y.%m.%d.%H.%M.%S")
load_mld.chunk(rechunk_dict).to_zarr('/scratch/es60/ard/reanalysis/BRAN2020/ARD/BRAN2020-daily-mld.'+chunking_string+nowstamp_str+'.zarr',consolidated=True)

CPU times: user 4min 43s, sys: 29.5 s, total: 5min 12s
Wall time: 9min 19s


<xarray.backends.zarr.ZarrStore at 0x14a9e6f43f40>

In [36]:
%%time
now = datetime.datetime.now()
nowstamp_str = now.strftime("%Y.%m.%d.%H.%M.%S")
load_eta_t.chunk(rechunk_dict).to_zarr('/scratch/es60/ard/reanalysis/BRAN2020/ARD/BRAN2020-daily-eta_t.'+chunking_string+nowstamp_str+'.zarr',consolidated=True)

CPU times: user 5min 17s, sys: 46.3 s, total: 6min 3s
Wall time: 12min 14s


<xarray.backends.zarr.ZarrStore at 0x14a9ea966a40>

In [None]:
# Build a store file ID from the open-time chunk spec.
# NOTE(review): `datestamp` is not defined in the visible part of this notebook,
# and this overwrites the `chunking_string` built in the chunking-settings cell.
chunking_string = 'chunks_' + ''.join(f'{dim}{size}.' for dim, size in xarray_open_kwargs.items())
ard_rcTime_file_ID = f'BRAN2020-{var}-{chunking_string}{datestamp}.zarr'

In [None]:
# Show the generated store file ID
ard_rcTime_file_ID

In [None]:
%%time
# Write `ds` to the store named by ard_rcTime_file_ID, with consolidated metadata.
# NOTE(review): `ds` and `BRAN2020_ard_path` are not defined above this cell in the
# visible notebook (`ds` is only assigned in the later "netcdf to first zarr" section).
ds.to_zarr(BRAN2020_ard_path+ard_rcTime_file_ID,consolidated=True)

In [None]:
# Touch a marker file on scratch (the name suggests it flags that the rechunk step finished)
!touch /scratch/es60/ard/reanalysis/BRAN2020/ARD/rechunk_done.log

# load the resulting float64 zarr and write to float32

In [None]:
# Open the store at BRAN2020_ard_path+ard_rcTime_file_ID (float64 according to the section heading)
ds_1_51_3600_1500_float64 = xr.open_zarr(BRAN2020_ard_path+ard_rcTime_file_ID,consolidated=True)

In [None]:
# Start the float32 dataset as a deep copy of the float64 one
ds_1_51_3600_1500_float32 = ds_1_51_3600_1500_float64.copy(deep=True)

In [None]:
# Replace the 'salt' variable with a float32 cast of the original 'salt'
ds_1_51_3600_1500_float32['salt'] = ds_1_51_3600_1500_float64.salt.astype('float32')

In [None]:
# Inspect the dataset after the float32 cast
ds_1_51_3600_1500_float32


In [None]:
%%time
# Write the float32 dataset to a new store whose name is the source file ID prefixed with 'float32.'
ds_1_51_3600_1500_float32.to_zarr(BRAN2020_ard_path+'float32.'+ard_rcTime_file_ID,consolidated=True)


In [None]:
# Touch a marker file on scratch (the name suggests it flags that the float32 write finished)
!touch /scratch/es60/ard/reanalysis/BRAN2020/ARD/float32_write_done.log

# load the float32 `ds_1_51_3600_1500`

In [6]:
# Re-open the float32 salt store (v08052024) from the ARD directory.
ds_1_51_3600_1500_float32_reloaded = xr.open_zarr(
    BRAN2020_ard_path + 'float32.BRAN2020-salt-chunks_Time1.st_ocean51.xt_ocean3600.yt_ocean1500.v08052024.zarr',
    consolidated=True,
)
ds_1_51_3600_1500_float32_reloaded

Unnamed: 0,Array,Chunk
Bytes,11.34 TiB,1.03 GiB
Shape,"(11322, 51, 1500, 3600)","(1, 51, 1500, 3600)"
Dask graph,11322 chunks in 2 graph layers,11322 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 11.34 TiB 1.03 GiB Shape (11322, 51, 1500, 3600) (1, 51, 1500, 3600) Dask graph 11322 chunks in 2 graph layers Data type float32 numpy.ndarray",11322  1  3600  1500  51,

Unnamed: 0,Array,Chunk
Bytes,11.34 TiB,1.03 GiB
Shape,"(11322, 51, 1500, 3600)","(1, 51, 1500, 3600)"
Dask graph,11322 chunks in 2 graph layers,11322 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [None]:
# Display the reloaded float32 dataset again
ds_1_51_3600_1500_float32_reloaded

# chunking

In [7]:
# First rechunk target: all 11322 time steps in one chunk, a single depth
# level, and 30 x 30 horizontal tiles.
chunking_1st_dict = dict(Time=11322, st_ocean=1, xt_ocean=30, yt_ocean=30)

In [8]:
# Encode the first rechunk target as "<dim><size>." tags and build the store file ID.
# NOTE(review): `var` and `datestamp` come from earlier kernel state; `datestamp`
# is not assigned in the visible part of this notebook.
chunking_string_1st = 'chunks_' + ''.join(f'{dim}{size}.' for dim, size in chunking_1st_dict.items())
ard_rc1_file_ID = f'BRAN2020-{var}-{chunking_string_1st}{datestamp}.zarr'
ard_rc1_file_ID

'BRAN2020-salt-chunks_Time11322.st_ocean1.xt_ocean30.yt_ocean30.v09052024.zarr'

In [9]:
# Deep-copy the reloaded float32 dataset and apply the first rechunk target
# (lazy: the repr below shows a Dask graph of 306000 chunks).
ds_11322_1_30_30 = (
    ds_1_51_3600_1500_float32_reloaded
    .copy(deep=True)
    .chunk(chunking_1st_dict)
)
ds_11322_1_30_30

Unnamed: 0,Array,Chunk
Bytes,11.34 TiB,38.87 MiB
Shape,"(11322, 51, 1500, 3600)","(11322, 1, 30, 30)"
Dask graph,306000 chunks in 6 graph layers,306000 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 11.34 TiB 38.87 MiB Shape (11322, 51, 1500, 3600) (11322, 1, 30, 30) Dask graph 306000 chunks in 6 graph layers Data type float32 numpy.ndarray",11322  1  3600  1500  51,

Unnamed: 0,Array,Chunk
Bytes,11.34 TiB,38.87 MiB
Shape,"(11322, 51, 1500, 3600)","(11322, 1, 30, 30)"
Dask graph,306000 chunks in 6 graph layers,306000 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [10]:
# Show the full target path for the rechunked store
BRAN2020_ard_path+ard_rc1_file_ID

'/scratch/es60/ard/reanalysis/BRAN2020/ARD/BRAN2020-salt-chunks_Time11322.st_ocean1.xt_ocean30.yt_ocean30.v09052024.zarr'

In [11]:
# Run the project helper that removes zarr encoding before writing with the new chunking
# (presumably so encoding inherited from the source store does not conflict — confirm in the helper)
ds_11322_1_30_30 = remove_zarr_encoding(ds_11322_1_30_30)

In [None]:
%%time
# Write the rechunked dataset to BRAN2020_ard_path+ard_rc1_file_ID with consolidated metadata
ds_11322_1_30_30.to_zarr(BRAN2020_ard_path+ard_rc1_file_ID,consolidated=True)

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


In [None]:
# Touch a marker file on scratch (the name suggests it flags that the 11322-time rechunk finished)
!touch /scratch/es60/ard/reanalysis/BRAN2020/ARD/rechunk_11322_done.log

# netcdf to first zarr

In [None]:
# Chunk spec passed to open_mfdataset(chunks=...) for the daily salt files.
xarray_open_kwargs = dict(Time=1, st_ocean=5, xt_ocean=3600, yt_ocean=1500)
# NOTE(review): unlike the mld/eta_t cells, keep_only_selected_vars is passed here
# without `vars_to_keep` — confirm the helper has a suitable default.
ds = xr.open_mfdataset(
    '/g/data/gb6/BRAN/BRAN2020/daily/ocean_salt_*.nc',
    chunks=xarray_open_kwargs,
    parallel=True,
    preprocess=keep_only_selected_vars,
)
ds

In [None]:
%%time
# Cast the dataset to float32 and write it as the first zarr store.
# NOTE(review): `BRAN2020_ard_path` and `ard_file_ID` are not defined in the visible notebook.
ds.astype('float32').to_zarr(BRAN2020_ard_path+ard_file_ID,consolidated=True)

# load first zarr

In [None]:
# Re-open the first zarr store using its consolidated metadata
ds_1_5_1500_3600 = xr.open_zarr(BRAN2020_ard_path+ard_file_ID,consolidated=True)

In [None]:
# Inspect the re-opened first zarr store
ds_1_5_1500_3600

In [None]:
# Rechunk target for the salt store: 500 along Time, 150 along xt_ocean.
chunking_dict = dict(Time=500, xt_ocean=150)

In [None]:
# Preview the dataset with the target chunking applied; the result is displayed only, not assigned
ds_1_5_1500_3600.chunk(chunking_dict)

In [None]:
# Variable name used when building the output file ID in the next cell
var='salt'

In [None]:
# Encode chunking_dict as concatenated "<dim><size>" tags (no separators) and
# build the v07052024 store file ID for the current `var`.
chunking_string = 'chunks_' + ''.join(f'{dim}{size}' for dim, size in chunking_dict.items())
ard_rcTime_file_ID = f'BRAN2020-{var}-{chunking_string}-v07052024.zarr'

In [None]:
# Run the project helper that removes zarr encoding before handing the dataset to rechunker
ds_1_5_1500_3600 = remove_zarr_encoding(ds_1_5_1500_3600)

In [None]:
from rechunker import rechunk
!rm -rf /scratch/es60/ard/reanalysis/BRAN2020/ARD/temp_store/*
# Define target chunking
target_chunks = chunking_dict
target_store = BRAN2020_ard_path+ard_rcTime_file_ID
temp_store = BRAN2020_ard_path+'temp_store'  # Optional based on dataset size

# Execute rechunking
rechunk_plan = rechunk(ds_1_5_1500_3600, target_chunks, target_store=target_store, temp_store=temp_store,max_mem='8GB')

In [None]:
%%time
# Run the rechunking plan built in the previous cell
rechunk_plan.execute()

In [None]:
# Touch a marker file on scratch (the name suggests it flags that the rechunk step finished)
!touch /scratch/es60/ard/reanalysis/BRAN2020/ARD/rechunk_done.log

In [None]:
# Consolidate the target store's metadata first, so it can then be opened with consolidated=True
zarr.consolidate_metadata(target_store)
ds_all_1_150_150 = xr.open_zarr(target_store,consolidated=True)

In [None]:
# Inspect the rechunked store
ds_all_1_150_150