# build-cat-BRAN2020

### use `ecgtools` to build a custom `intake-esm` catalogue for BRAN2020

Date: 25 August 2023

Author = {"name": "Thomas Moore", "affiliation": "CSIRO", "email": "thomas.moore@csiro.au", "orcid": "0000-0003-3930-1946"}

#### Reference documents: https://ecgtools.readthedocs.io/en/latest

In [1]:
import glob
import pathlib
import traceback
from datetime import datetime

import xarray as xr

from ecgtools import Builder
from ecgtools.builder import INVALID_ASSET, TRACEBACK

from matplotlib import pyplot as plt

### get catalog path from config file

In [2]:
import configparser

# Create a ConfigParser object
config = configparser.ConfigParser()

# Read the config file
#########
#### you will need to specifiy your correct path the the `data-catalogue/config.ini` file 
#########
config.read('./code/BRAN2020-intake-catalog/config.ini')

# Get the value of a variable
catalog_path = config.get('paths', 'catalog_path')
catalog_path

'/g/data/v14/tm4888/code/BRAN2020-intake-catalog/catalogs/'

### build the catalogue from the listing of files on `NCI` project `gb6` = 8950 files

In [17]:
root_source_path = '/g/data/gb6/BRAN/BRAN2020/'

In [18]:
files = sorted(glob.glob(root_source_path+'*/*'))
len(files)

8950

### Write `ecgtools` parser, <br> see: https://ecgtools.readthedocs.io/en/latest/how-to/use-a-custom-parser.html

In [35]:
def parse_BRAN2020(file):
    """BRAN2020 data stored in"""
    file = pathlib.Path(file)
    info = {}

    try:
        stem = file.stem
        split = stem.split('_')
        domain = split[0]
        time_period = file.parts[-2]
        source = file.parts[-3]
        
        with xr.open_dataset(file, chunks={}, decode_times=False, decode_timedelta=False,
                             decode_coords=False, drop_variables = ['average_DT','average_T1', 'average_T2', 'Time_bnds','Time_bounds']) as ds:
            variable_list = [var for var in ds if 'long_name' in ds[var].attrs]
            #remove 'average_DT','average_T1', 'average_T2'
            #variable_list.remove('average_DT')
            #variable_list.remove('average_T1')
            #variable_list.remove('average_T2')
            #check length of variable_list
            assert len(variable_list) != 0, "empty variable list"
            #assert len(variable_list) < 1, "more than one variable"

            info = {
                'source': source,
                'domain': domain,
                'time_period': time_period,
                'variable': variable_list,
                'path': str(file),
            }

        return info

    except Exception:
        return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()}

#### test parser with single file

In [36]:
parse_BRAN2020(files[3])

{'source': 'BRAN2020',
 'domain': 'atm',
 'time_period': 'annual',
 'variable': ['land_mask',
  'ice_mask',
  'wind',
  't_surf',
  'tau_x',
  'tau_y',
  'u_star',
  'u_atm',
  'v_atm',
  'shflx',
  'lwflx',
  'rh_ref',
  'q_ref'],
 'path': '/g/data/gb6/BRAN/BRAN2020/annual/atm_flux_diag_ann_1996.nc'}

# setup builder object

In [49]:
root_source_path = '/g/data/gb6/BRAN/BRAN2020/'

In [50]:
%%time
b = Builder([root_source_path],depth=1)

CPU times: user 1.18 ms, sys: 0 ns, total: 1.18 ms
Wall time: 1.23 ms


In [51]:
%%time
b.build(parsing_func = parse_BRAN2020)

CPU times: user 1min 20s, sys: 13.2 s, total: 1min 34s
Wall time: 1min 46s


Builder(paths=['/g/data/gb6/BRAN/BRAN2020/'], storage_options={}, depth=1, exclude_patterns=[], include_patterns=[], joblib_parallel_kwargs={})

In [53]:
b.df

Unnamed: 0,source,domain,time_period,variable,path
0,BRAN2020,atm,annual,"[land_mask, ice_mask, wind, t_surf, tau_x, tau...",/g/data/gb6/BRAN/BRAN2020/annual/atm_flux_diag...
1,BRAN2020,atm,annual,"[land_mask, ice_mask, wind, t_surf, tau_x, tau...",/g/data/gb6/BRAN/BRAN2020/annual/atm_flux_diag...
2,BRAN2020,atm,annual,"[land_mask, ice_mask, wind, t_surf, tau_x, tau...",/g/data/gb6/BRAN/BRAN2020/annual/atm_flux_diag...
3,BRAN2020,atm,annual,"[land_mask, ice_mask, wind, t_surf, tau_x, tau...",/g/data/gb6/BRAN/BRAN2020/annual/atm_flux_diag...
4,BRAN2020,atm,annual,"[land_mask, ice_mask, wind, t_surf, tau_x, tau...",/g/data/gb6/BRAN/BRAN2020/annual/atm_flux_diag...
...,...,...,...,...,...
8944,BRAN2020,ocean,month,[w],/g/data/gb6/BRAN/BRAN2020/month/ocean_w_mth_20...
8945,BRAN2020,ocean,month,[w],/g/data/gb6/BRAN/BRAN2020/month/ocean_w_mth_20...
8946,BRAN2020,ocean,month,[w],/g/data/gb6/BRAN/BRAN2020/month/ocean_w_mth_20...
8947,BRAN2020,grid,static,"[x_T, y_T, x_vert_T, y_vert_T, area_T, angle_T...",/g/data/gb6/BRAN/BRAN2020/static/grid_spec.nc


#### "invalid assets" ??

In [54]:
b.invalid_assets['INVALID_ASSET'].values

KeyError: 'INVALID_ASSET'

## save your `intake-esm` catalogue

In [55]:
b.save(
    # File path - could save as .csv (uncompressed csv) or .csv.gz (compressed csv)
    name = "BRAN2020",
    directory = catalog_path,
    # Column name including filepath
    path_column_name='path',
    # Column name including variables
    variable_column_name='variable',
    # Data file format - could be netcdf or zarr (in this case, netcdf)
    data_format="netcdf",
    # Which attributes to groupby when reading in variables using intake-esm
    groupby_attrs=["domain", "time_period"],
    # Aggregations which are fed into xarray when reading in data using intake
    aggregations=[
        {
            "type": "join_existing",
            "attribute_name": "date",
            "options": {"dim": "time", "coords": "minimal", "compat": "override"},
        }
    ],
)

Successfully wrote ESM catalog json file to: file:///g/data/v14/tm4888/code/BRAN2020-intake-catalog/catalogs/BRAN2020.json


# TEST catalog

In [56]:
import intake

In [57]:
intake.open_esm_datastore(catalog_path+'BRAN2020.json')

Unnamed: 0,unique
source,1
domain,4
time_period,4
variable,16
path,8949
derived_variable,0


# THE END