# Converting a simple NetCDF file to a TileDB array

## Import packages

In [None]:
import netCDF4
import numpy as np
import tiledb
from tiledb.cf import AttrMetadata, Group, GroupSchema, NetCDF4ConverterEngine
import matplotlib.pyplot as plt

## Create an example NetCDF file

### Example dataset

Create two 100x100 numpy arrays:

In [None]:
# Coordinate axes: 100 evenly spaced samples on [-5, 5] in each direction.
x_data = np.linspace(start=-5.0, stop=5.0, num=100)
y_data = np.linspace(start=-5.0, stop=5.0, num=100)

# With sparse=True the grids stay as a (1, 100) row and a (100, 1) column;
# broadcasting in the expressions below expands them to 100x100 matrices.
xv, yv = np.meshgrid(x_data, y_data, sparse=True)

# A1 = x + y
A1_data = np.add(xv, yv)

# A2 = sin((x/2)^2 + y^2)
half_x_squared = np.square(xv / 2.0)
A2_data = np.sin(half_x_squared + np.square(yv))

If the file does not exist yet, write the example data to a NetCDF file:

In [None]:
netcdf_file = "output/simple1.nc"
vfs = tiledb.VFS()

# Only write the file on the first run; later runs reuse the existing file.
if not vfs.is_file(netcdf_file):
    # Opening the file for writing fails if its parent directory is missing,
    # so create the output directory first.
    if not vfs.is_dir("output"):
        vfs.create_dir("output")
    with netCDF4.Dataset(netcdf_file, mode="w") as dataset:
        # Global (file-level) attribute.
        dataset.setncatts({"title": "Simple dataset for examples"})

        # Two fixed-size dimensions shared by all variables.
        dataset.createDimension("x", 100)
        dataset.createDimension("y", 100)

        # 2D matrices defined on (x, y), each with descriptive attributes.
        A1 = dataset.createVariable("A1", np.float64, ("x", "y"))
        A1.setncattr("full_name", "Example matrix A1")
        A1.setncattr("description", "x + y")
        A1[:, :] = A1_data
        A2 = dataset.createVariable("A2", np.float64, ("x", "y"))
        A2.setncattr("full_name", "Example matrix A2")
        # Closing parenthesis was missing from this formula.
        A2.setncattr("description", "sin((x/2)^2 + y^2)")
        A2[:, :] = A2_data

        # 1D variables holding the coordinate values along each dimension.
        x1 = dataset.createVariable("x_data", np.float64, ("x",))
        x1[:] = x_data
        y = dataset.createVariable("y_data", np.float64, ("y",))
        y[:] = y_data
    print(f"Created example NetCDF file `{netcdf_file}`.")
else:
    print(f"Example NetCDF file `{netcdf_file}` already exists.")


Examine the variables in the NetCDF file:

In [None]:
# Open the file read-only inside a context manager so the handle is closed
# as soon as the variable summary has been printed.
with netCDF4.Dataset(netcdf_file) as netcdf_data:
    print(netcdf_data.variables)

## Convert the NetCDF file to a TileDB array

Before converting the file, create a converter that contains the parameters for the conversion. The converter can be automatically generated from a NetCDF file using the `NetCDF4ConverterEngine.from_file` class method with the following parameters:

* `input_file`: The input NetCDF file to generate the converter engine from.
* `group_path`: The path to the NetCDF group to copy data from. Use `'/'` for the root group.
* `unlimited_dim_size`: The size of the domain for TileDB dimensions created from unlimited NetCDF dimensions. If `None`, the current size of the NetCDF dimension will be used.
* `dim_dtype`: The numpy dtype to use when converting a NetCDF dimension to a TileDB dimension.
* `tiles_by_var`: A map from the name of a NetCDF variable to the tiles of the dimensions of the variable in the generated TileDB array.
* `tiles_by_dims`: A map from the name of NetCDF dimensions defining a variable to the tiles of those dimensions in the generated TileDB array.
* `coords_to_dims`: If `True`, convert the NetCDF coordinate variable into a TileDB dimension for sparse arrays. Otherwise, convert the coordinate dimension into a TileDB dimension and the coordinate variable into a TileDB attribute.
* `collect_attrs`: If `True`, store all attributes with the same dimensions in the same array. Otherwise, store each attribute in a scalar array.

In [None]:
# Tile extents, keyed by the tuple of NetCDF dimensions that define a variable.
tile_extents = {
    ("x", "y"): (20, 20),
    ("x",): (20,),
    ("y",): (20,),
}

# Build the converter from the NetCDF file: unsigned 32-bit dimensions,
# variables with the same dimensions collected into one array, and coordinate
# variables kept as attributes rather than turned into dimensions.
converter = NetCDF4ConverterEngine.from_file(
    netcdf_file,
    dim_dtype=np.uint32,
    tiles_by_dims=tile_extents,
    collect_attrs=True,
    coords_to_dims=False,
)
converter

Give the auto-generated arrays more descriptive names:

In [None]:
# Map each auto-generated array name to a descriptive one, applied in order.
for generated_name, descriptive_name in (
    ("array0", "x"),
    ("array1", "matrices"),
    ("array2", "y"),
):
    converter.get_array_creator(generated_name).name = descriptive_name

Run the conversion to create a TileDB group containing three dense arrays (`x`, `matrices`, and `y`):

In [None]:
# Location of the TileDB group the converter writes to; later cells read
# the converted data back from this same URI.
group_uri = "output/tiledb_simple1"
# Run the conversion configured above, writing the result to `group_uri`.
converter.convert_to_group(group_uri)

## Examine the TileDB group schema

In [None]:
# Load the schema of the group written by the conversion step.
group_schema = GroupSchema.load(group_uri)
# Bare expression: shown as the cell output when run in a notebook.
group_schema

## Examine the data in the arrays

Open the arrays in the generated TileDB group and read their attributes:

In [None]:
# Open the group itself without selecting an array: each array is opened
# explicitly below through `open_array`, so no `attr` argument belongs here.
with Group(group_uri) as group:
    with (
        # Arrays located by the name of an attribute they contain.
        group.open_array(attr="x.data") as x_array,
        group.open_array(attr="y.data") as y_array,
        # Array located by its (renamed) array name.
        group.open_array(array="matrices") as data_array,
    ):
        # Coordinate values along each axis.
        x = x_array[:]
        y = y_array[:]

        # Read the whole `matrices` array, then pick out each attribute by name.
        data = data_array[...]
        A1 = data["A1"]
        A2 = data["A2"]

        # Per-attribute metadata (the NetCDF variable attributes) is used
        # as the plot titles in the next cell.
        a1_description = AttrMetadata(data_array.meta, "A1")["description"]
        a2_description = AttrMetadata(data_array.meta, "A2")["description"]

In [None]:
# Side-by-side filled contour plots, one panel per matrix, each titled with
# the description read from the array metadata.
fig, axes = plt.subplots(nrows=1, ncols=2)
panels = zip(axes, (A1, A2), (a1_description, a2_description))
for ax, matrix, title in panels:
    ax.contourf(x, y, matrix)
    ax.set_title(title)