# Using Xarray for ingestion of example datasets

In this notebook we look at all the example datasets provided in Parcels - seeing how we currently ingest them (ultimately using `Fieldset.from_netcdf()`), and how we can ingest them differently by first creating xarray objects.


This is important to explore as it allows us to see the limitations of Xarray for representing these datasets, allowing us to explore the design of v4 how we can then pass those xarray datasets to Parcels.

Note:

> For Arakawa B-and C-grids, Parcels requires the locations of the _corner_ points (f-points) of the grid cells for the `dimensions` dictionary of velocity `Fields`

In [1]:
from glob import glob
from pathlib import Path

import xarray as xr

import parcels

# Accounting


In [2]:
accounted = [  # Key: ✅ ingestion example exists in this notebook, 📔 -> used in notebook
    # A grid
    "OFAM_example_data",  # ✅📔
    "GlobCurrent_example_d  ata",  # ✅📔
    "Peninsula_data",  # ✅📔
    "MovingEddies_data",  # ✅📔 # idealised A grid data
    # C grid
    "NemoCurvilinear_data",  # ✅📔
    "NemoNorthSeaORCA025-N006_data",  # ✅📔
    "MITgcm_example_data",  # ✅ in example
    # uncategorized
    "DecayingMovingEddy_data",  # not used in codebase
    "POPSouthernOcean_data",  # not used in codebase
    "SWASH_data",  # 📔 # timevarying depth dimension
    "WOA_data",  # not used in codebase
    "CROCOidealized_data",  # 📔 # croco notebook (also timevarying depth dimension)
]

set(parcels.list_example_datasets()) - set(accounted)

{'GlobCurrent_example_data'}

In [3]:
DATA_HOME = Path("data")

In [4]:
for name in parcels.list_example_datasets():
    parcels.download_example_dataset(name, data_home=DATA_HOME)

# A grid datasets
## OFAM_example_data

In [5]:
ofam_data = DATA_HOME / "OFAM_example_data"


# Prior usage
filenames = {
    "U": f"{ofam_data}/OFAM_simple_U.nc",
    "V": f"{ofam_data}/OFAM_simple_V.nc",
}
variables = {"U": "u", "V": "v"}
dimensions = {
    "lat": "yu_ocean",
    "lon": "xu_ocean",
    "depth": "st_ocean",
    "time": "Time",
}

fieldset = parcels.FieldSet.from_netcdf(
    filenames,
    variables,
    dimensions,
    allow_time_extrapolation=True,
)


# xarray equivalent
ds_U = xr.open_dataset(f"{ofam_data}/OFAM_simple_U.nc").rename(
    {
        "yu_ocean": "lat",
        "xu_ocean": "lon",
        "st_ocean": "depth",
        "Time": "time",
        "u": "U",
    }
)

ds_V = xr.open_dataset(f"{ofam_data}/OFAM_simple_V.nc").rename(
    {
        "yu_ocean": "lat",
        "xu_ocean": "lon",
        "st_ocean": "depth",
        "Time": "time",
        "v": "V",
    }
)

dimensions_identity = {"lon": "lon", "lat": "lat", "depth": "depth", "time": "time"}

parcels.Field.from_xarray(ds_U["U"], name="U", dimensions=dimensions_identity)
parcels.Field.from_xarray(ds_V["V"], name="V", dimensions=dimensions_identity)

<Field>
    name            : 'V'
    grid            : RectilinearZGrid(lon=array([ 100.00,  100.10,  100.20, ...,  299.80,  299.90,  300.00], dtype=float32), lat=array([-30.00, -29.90, -29.80, ...,  29.80,  29.90,  30.00], dtype=float32), time=array([ 0.00,  86400.00,  172800.00,  259200.00]), time_origin=1993-01-01T12:00:00.000000000, mesh='spherical')
    extrapolate time: False
    time_periodic   : False
    gridindexingtype: 'nemo'
    to_write        : False

## GlobCurrent_example_data

In [6]:
globcurrent_data = DATA_HOME / "GlobCurrent_example_data"

filenames = {
    "U": f"{globcurrent_data}/20*.nc",
    "V": f"{globcurrent_data}/20*.nc",
}
variables = {
    "U": "eastward_eulerian_current_velocity",
    "V": "northward_eulerian_current_velocity",
}
dimensions = {"lat": "lat", "lon": "lon", "time": "time"}
fieldset = parcels.FieldSet.from_netcdf(filenames, variables, dimensions)


# xarray equivalent
ds = xr.open_mfdataset(f"{globcurrent_data}/20*.nc").rename(
    {
        "eastward_eulerian_current_velocity": "U",
        "northward_eulerian_current_velocity": "V",
    }
)

dimensions_identity = {"lat": "lat", "lon": "lon", "time": "time"}

parcels.Field.from_xarray(ds["U"], name="U", dimensions=dimensions_identity)
parcels.Field.from_xarray(ds["V"], name="V", dimensions=dimensions_identity)

<Field>
    name            : 'V'
    grid            : RectilinearZGrid(lon=array([ 14.88,  15.12,  15.38, ...,  34.38,  34.62,  34.88], dtype=float32), lat=array([-40.12, -39.88, -39.62, ..., -30.62, -30.38, -30.12], dtype=float32), time=array([ 0.00,  86400.00,  172800.00, ...,  31363200.00,  31449600.00,  31536000.00]), time_origin=2002-01-01T00:00:00.000000000, mesh='spherical')
    extrapolate time: False
    time_periodic   : False
    gridindexingtype: 'nemo'
    to_write        : False

## Peninsula_data

In [7]:
peninsula_data = DATA_HOME / "Peninsula_data"

example_dataset_folder = parcels.download_example_dataset("Peninsula_data")
filenames = {
    "U": peninsula_data / "peninsulaU.nc",
    "V": peninsula_data / "peninsulaV.nc",
    "P": peninsula_data / "peninsulaP.nc",
}
variables = {"U": "vozocrtx", "V": "vomecrty", "P": "P"}
dimensions = {"lon": "nav_lon", "lat": "nav_lat", "time": "time_counter"}
fieldset = parcels.FieldSet.from_netcdf(filenames, variables, dimensions, allow_time_extrapolation=True)


# xarray equivalent
ds = xr.open_mfdataset(
    [peninsula_data / "peninsulaU.nc", peninsula_data / "peninsulaV.nc", peninsula_data / "peninsulaP.nc"]
).rename(
    {
        "nav_lon": "lon",
        "nav_lat": "lat",
        "time_counter": "time",
        "vozocrtx": "U",
        "vomecrty": "V",
    }
)

dimensions_identity = {"lat": "lat", "lon": "lon", "time": "time"}

parcels.Field.from_xarray(ds["U"], name="U", dimensions=dimensions_identity)
parcels.Field.from_xarray(ds["V"], name="V", dimensions=dimensions_identity)
parcels.Field.from_xarray(ds["P"], name="P", dimensions=dimensions_identity)

<Field>
    name            : 'P'
    grid            : CurvilinearZGrid(lon=array([[ 1000.00,  1989.90,  2979.80, ...,  97020.20,  98010.10,  99000.00],
       [ 1000.00,  1989.90,  2979.80, ...,  97020.20,  98010.10,  99000.00],
       [ 1000.00,  1989.90,  2979.80, ...,  97020.20,  98010.10,  99000.00],
       ...,
       [ 1000.00,  1989.90,  2979.80, ...,  97020.20,  98010.10,  99000.00],
       [ 1000.00,  1989.90,  2979.80, ...,  97020.20,  98010.10,  99000.00],
       [ 1000.00,  1989.90,  2979.80, ...,  97020.20,  98010.10,  99000.00]], dtype=float32), lat=array([[ 1000.00,  1000.00,  1000.00, ...,  1000.00,  1000.00,  1000.00],
       [ 1979.59,  1979.59,  1979.59, ...,  1979.59,  1979.59,  1979.59],
       [ 2959.18,  2959.18,  2959.18, ...,  2959.18,  2959.18,  2959.18],
       ...,
       [ 47040.82,  47040.82,  47040.82, ...,  47040.82,  47040.82,  47040.82],
       [ 48020.41,  48020.41,  48020.41, ...,  48020.41,  48020.41,  48020.41],
       [ 49000.00,  49000.00,  490

## MovingEddies_data

In [8]:
movingeddies_data = DATA_HOME / "MovingEddies_data"

example_dataset_folder = parcels.download_example_dataset("MovingEddies_data")

filenames = {
    "U": movingeddies_data / "moving_eddiesU.nc",
    "V": movingeddies_data / "moving_eddiesV.nc",
}
variables = {"U": "vozocrtx", "V": "vomecrty"}
dimensions = {"lon": "nav_lon", "lat": "nav_lat", "time": "time_counter"}
fieldset = parcels.FieldSet.from_netcdf(filenames, variables, dimensions)

# xarray equivalent
ds = xr.open_mfdataset([movingeddies_data / "moving_eddiesU.nc", movingeddies_data / "moving_eddiesV.nc"]).rename(
    {
        "nav_lon": "lon",
        "nav_lat": "lat",
        "time_counter": "time",
        "vozocrtx": "U",
        "vomecrty": "V",
    }
)

dimensions_identity = {"lat": "lat", "lon": "lon", "time": "time"}

parcels.Field.from_xarray(ds["U"], name="U", dimensions=dimensions_identity)
parcels.Field.from_xarray(ds["V"], name="V", dimensions=dimensions_identity)

<Field>
    name            : 'V'
    grid            : CurvilinearZGrid(lon=array([[ 0.00,  2010.05,  4020.10, ...,  395979.91,  397989.94,  400000.00],
       [ 0.00,  2010.05,  4020.10, ...,  395979.91,  397989.94,  400000.00],
       [ 0.00,  2010.05,  4020.10, ...,  395979.91,  397989.94,  400000.00],
       ...,
       [ 0.00,  2010.05,  4020.10, ...,  395979.91,  397989.94,  400000.00],
       [ 0.00,  2010.05,  4020.10, ...,  395979.91,  397989.94,  400000.00],
       [ 0.00,  2010.05,  4020.10, ...,  395979.91,  397989.94,  400000.00]], dtype=float32), lat=array([[ 0.00,  0.00,  0.00, ...,  0.00,  0.00,  0.00],
       [ 2005.73,  2005.73,  2005.73, ...,  2005.73,  2005.73,  2005.73],
       [ 4011.46,  4011.46,  4011.46, ...,  4011.46,  4011.46,  4011.46],
       ...,
       [ 695988.56,  695988.56,  695988.56, ...,  695988.56,  695988.56,  695988.56],
       [ 697994.25,  697994.25,  697994.25, ...,  697994.25,  697994.25,  697994.25],
       [ 700000.00,  700000.00,  700000.

# C-grid
## NemoCurvilinear_data


In [9]:
nemocurvilinear_data = DATA_HOME / "NemoCurvilinear_data"

filenames = {
    "U": {
        "lon": nemocurvilinear_data / "mesh_mask.nc4",
        "lat": nemocurvilinear_data / "mesh_mask.nc4",
        "data": nemocurvilinear_data / "U_purely_zonal-ORCA025_grid_U.nc4",
    },
    "V": {
        "lon": nemocurvilinear_data / "mesh_mask.nc4",
        "lat": nemocurvilinear_data / "mesh_mask.nc4",
        "data": nemocurvilinear_data / "V_purely_zonal-ORCA025_grid_V.nc4",
    },
}
variables = {"U": "U", "V": "V"}

dimensions = {"lon": "glamf", "lat": "gphif", "time": "time_counter"}

fieldset = parcels.FieldSet.from_nemo(filenames, variables, dimensions, allow_time_extrapolation=True)

# xarray equivalent
ds = (
    xr.open_mfdataset(
        [
            nemocurvilinear_data / "mesh_mask.nc4",
            nemocurvilinear_data / "U_purely_zonal-ORCA025_grid_U.nc4",
            nemocurvilinear_data / "V_purely_zonal-ORCA025_grid_V.nc4",
        ]
    )
    .drop_vars("time")  # use `time_counter` as the time dimension instead, as in other implementation
    .rename(
        {
            "glamf": "lon",
            "gphif": "lat",
            "time_counter": "time",
        }
    )
    .set_coords(["lon", "lat"])  # otherwise its not brought along on the DataArray object
    .isel(z=0, z_a=0)  # Extract the depth dimension. Its size 1 anyway.
)

dimensions_identity = {"lat": "lat", "lon": "lon", "time": "time"}

parcels.Field.from_xarray(ds["U"], name="U", dimensions=dimensions_identity, gridindexingtype="nemo")
parcels.Field.from_xarray(ds["V"], name="V", dimensions=dimensions_identity, gridindexingtype="nemo")

<Field>
    name            : 'V'
    grid            : CurvilinearZGrid(lon=array([[ 72.88,  73.12,  73.38, ...,  72.62,  72.88,  73.12],
       [ 72.88,  73.12,  73.38, ...,  72.62,  72.88,  73.12],
       [ 72.88,  73.12,  73.38, ...,  72.62,  72.88,  73.12],
       ...,
       [ 73.00,  73.00,  73.00, ...,  73.00,  73.00,  73.00],
       [ 73.00,  73.00,  73.00, ...,  73.00,  73.00,  73.00],
       [ 73.00,  73.00,  73.00, ...,  73.00,  73.00,  73.00]], dtype=float32), lat=array([[-76.98, -76.98, -76.98, ..., -76.98, -76.98, -76.98],
       [-76.93, -76.93, -76.93, ..., -76.93, -76.93, -76.93],
       [-76.87, -76.87, -76.87, ..., -76.87, -76.87, -76.87],
       ...,
       [ 50.00,  50.00,  50.01, ...,  50.01,  50.00,  50.00],
       [ 50.00,  50.00,  50.01, ...,  50.01,  50.00,  50.00],
       [ 49.99,  49.99,  50.00, ...,  50.00,  49.99,  49.99]], dtype=float32), time=array([ 0.00]), time_origin=3.0, mesh='spherical')
    extrapolate time: True
    time_periodic   : False
    gr

## NemoNorthSeaORCA025-N006_data

In [10]:
nemonorthseaorca025_n006_data = DATA_HOME / "NemoNorthSeaORCA025-N006_data"


ufiles = sorted(glob(f"{nemonorthseaorca025_n006_data}/ORCA*U.nc"))
vfiles = sorted(glob(f"{nemonorthseaorca025_n006_data}/ORCA*V.nc"))
wfiles = sorted(glob(f"{nemonorthseaorca025_n006_data}/ORCA*W.nc"))
mesh_mask = f"{nemonorthseaorca025_n006_data}/coordinates.nc"

coords = xr.open_dataset(mesh_mask, decode_times=False)

filenames = {
    "U": {"lon": mesh_mask, "lat": mesh_mask, "depth": wfiles[0], "data": ufiles},
    "V": {"lon": mesh_mask, "lat": mesh_mask, "depth": wfiles[0], "data": vfiles},
    "W": {"lon": mesh_mask, "lat": mesh_mask, "depth": wfiles[0], "data": wfiles},
}

variables = {"U": "uo", "V": "vo", "W": "wo"}
dimensions = {
    "U": {"lon": "glamf", "lat": "gphif", "depth": "depthw", "time": "time_counter"},
    "V": {"lon": "glamf", "lat": "gphif", "depth": "depthw", "time": "time_counter"},
    "W": {"lon": "glamf", "lat": "gphif", "depth": "depthw", "time": "time_counter"},
}
fieldset = parcels.FieldSet.from_nemo(filenames, variables, dimensions)


# xarray equivalent
mesh_mask = (
    xr.open_mfdataset(f"{nemonorthseaorca025_n006_data}/coordinates.nc", decode_times=False)
    .isel(time=0)
    .drop_vars("time")[["glamf", "gphif"]]  # Static mesh
    .rename({"glamf": "lon", "gphif": "lat"})
    .drop_attrs(deep=False)  # prevent collision with other dataset level attributes when merging
)

ds_U = (
    xr.open_mfdataset(f"{nemonorthseaorca025_n006_data}/ORCA*U.nc")
    .rename({"uo": "U", "time_counter": "time", "depthu": "depth"})[["U"]]
    .drop_vars(["nav_lon", "nav_lat", "depth"])  # using lat and lon from mesh, and depth from W
    .drop_attrs(deep=False)  # prevent collision with other dataset level attributes when merging
)

ds_V = (
    xr.open_mfdataset(f"{nemonorthseaorca025_n006_data}/ORCA*V.nc")
    .rename({"vo": "V", "time_counter": "time", "depthv": "depth"})[["V"]]
    .drop_vars(["nav_lon", "nav_lat", "depth"])  # using lat and lon from mesh, and depth from W
    .drop_attrs(deep=False)  # prevent collision with other dataset level attributes when merging
)

ds_W = (
    xr.open_mfdataset(f"{nemonorthseaorca025_n006_data}/ORCA*W.nc")
    .rename({"wo": "W", "time_counter": "time", "depthw": "depth"})[["W"]]
    .drop_vars(["nav_lon", "nav_lat"])
    .drop_attrs(deep=False)  # prevent collision with other dataset level attributes when merging
)

ds = xr.combine_by_coords([mesh_mask, ds_U, ds_V, ds_W]).set_coords(["lon", "lat"])


dimensions_identity = {"lat": "lat", "lon": "lon", "time": "time", "depth": "depth"}

parcels.Field.from_xarray(ds["U"], name="U", dimensions=dimensions_identity, gridindexingtype="nemo")
parcels.Field.from_xarray(ds["V"], name="V", dimensions=dimensions_identity, gridindexingtype="nemo")
parcels.Field.from_xarray(ds["W"], name="W", dimensions=dimensions_identity, gridindexingtype="nemo")

  with _grid_fb_class(
  with _grid_fb_class(
  with _grid_fb_class(


<Field>
    name            : 'W'
    grid            : CurvilinearZGrid(lon=array([[-24.56, -24.30, -24.05, ...,  13.01,  13.26,  13.51],
       [-24.56, -24.30, -24.04, ...,  13.03,  13.28,  13.53],
       [-24.55, -24.30, -24.04, ...,  13.05,  13.30,  13.55],
       ...,
       [-24.48, -24.01, -23.54, ...,  31.76,  32.01,  32.25],
       [-24.50, -24.02, -23.55, ...,  31.99,  32.24,  32.48],
       [-24.52, -24.04, -23.56, ...,  32.23,  32.47,  32.72]], dtype=float32), lat=array([[ 45.37,  45.37,  45.37, ...,  44.05,  44.03,  44.02],
       [ 45.56,  45.56,  45.56, ...,  44.21,  44.19,  44.18],
       [ 45.74,  45.74,  45.74, ...,  44.37,  44.35,  44.34],
       ...,
       [ 75.57,  75.57,  75.58, ...,  68.63,  68.54,  68.46],
       [ 75.70,  75.71,  75.71, ...,  68.72,  68.64,  68.55],
       [ 75.84,  75.84,  75.84, ...,  68.81,  68.73,  68.64]], dtype=float32), time=array([ 0.00,  432000.00,  864000.00,  1296000.00,  1728000.00,  2160000.00]), time_origin=2000-01-02T12:00:00.0

## MITgcm_example_data


In [11]:
mitgcm_example_data = DATA_HOME / "MITgcm_example_data"

filenames = {
    "U": f"{mitgcm_example_data}/mitgcm_UV_surface_zonally_reentrant.nc",
    "V": f"{mitgcm_example_data}/mitgcm_UV_surface_zonally_reentrant.nc",
}
variables = {"U": "UVEL", "V": "VVEL"}
dimensions = {
    "U": {"lon": "XG", "lat": "YG", "time": "time"},
    "V": {"lon": "XG", "lat": "YG", "time": "time"},
}
fieldset = parcels.FieldSet.from_mitgcm(filenames, variables, dimensions, mesh="flat")

In [12]:
# xarray equivalent
def update_to_fpoints(ds: xr.Dataset) -> xr.Dataset:
    """Parcels assumes that data is provided on the f-points, so we need to swap the dimensions."""
    ds["UVEL"] = ds["UVEL"].swap_dims({"YC": "YG"})
    ds["VVEL"] = ds["VVEL"].swap_dims({"XC": "XG"})
    return ds


ds = (
    xr.open_dataset(mitgcm_example_data / "mitgcm_UV_surface_zonally_reentrant.nc")
    .pipe(update_to_fpoints)
    .rename({"XG": "lon", "YG": "lat", "time": "time", "UVEL": "U", "VVEL": "V"})
)

dimensions_identity = {"lat": "lat", "lon": "lon", "time": "time"}

parcels.Field.from_xarray(ds["U"], name="U", dimensions=dimensions_identity, gridindexingtype="mitgcm")
parcels.Field.from_xarray(ds["V"], name="V", dimensions=dimensions_identity, gridindexingtype="mitgcm")

<Field>
    name            : 'V'
    grid            : RectilinearZGrid(lon=array([ 0.00,  5000.00,  10000.00, ...,  985000.00,  990000.00,  995000.00], dtype=float32), lat=array([ 0.00,  5000.00,  10000.00, ...,  1985000.00,  1990000.00,  1995000.00], dtype=float32), time=array([ 0.00,  86400.00,  172800.00, ...,  604800.00,  691200.00,  777600.00]), time_origin=2000-01-02 00:00:00, mesh='spherical')
    extrapolate time: False
    time_periodic   : False
    gridindexingtype: 'mitgcm'
    to_write        : False

## Additional Notes