# Intro to Weather Data in TileDB

## About this Tutorial

This tutorial shows how to store sparse "gridded" data in TileDB.

### What it Shows

1. Using sparse axis labels to query sparse gridded data
2. Converting between matrix form and coordinate form

### Set-up Requirements

This example requires the following Python libraries: tiledb, tiledb-cf, numpy, sparse, xarray, rasterio, cartopy, matplotlib, geopandas

In [1]:
import tiledb
import tiledb.cf

import sparse
import geopandas as gpd
import rasterio.features
import numpy as np
import xarray as xr

import cartopy.crs as ccrs
import matplotlib.pyplot as plt

# Relative path of the TileDB group that this notebook creates, writes to, and reads from.
example_uri = "arrays/sparse_toy_weather_data"

In [2]:
import shutil

# Clean up any previous runs. A missing directory is the only expected failure
# (first run); anything else, such as a permissions problem, is surfaced rather
# than silently ignored, because the group creation below would fail anyway.
try:
    shutil.rmtree(example_uri)
except FileNotFoundError:
    pass

## Create toy data

Create toy data (a random array) and store it over date, latitude, and longitude.

In [3]:
# Describe the layout of the TileDB group with tiledb.cf's DataspaceCreator:
# one sparse data array indexed by (date, latitude.index, longitude.index) and
# one sparse axis-label array for each of latitude and longitude.
creator = tiledb.cf.DataspaceCreator()

# Shared dimensions, registered in this order: (name, domain, dtype).
shared_dims = (
    ("latitude.index", (0, 180), np.uint32),
    ("latitude", (-90.0, 90.0), np.float64),
    ("longitude.index", (0, 359), np.uint32),
    ("longitude", (-180.0, 179.0), np.float64),
    (
        "date",
        (np.datetime64("2021-01-01", "D"), np.datetime64("2021-12-31", "D")),
        np.dtype("datetime64[D]"),
    ),
)
for shared_name, shared_domain, shared_dtype in shared_dims:
    creator.add_shared_dim(dim_name=shared_name, domain=shared_domain, dtype=shared_dtype)

# Sparse array holding the toy values, with a single float64 attribute.
creator.add_array_creator(
    array_name="toy_data",
    dims=("date", "latitude.index", "longitude.index"),
    sparse=True,
    capacity=100_000,
)
creator.add_attr_creator(
    attr_name="toy_data",
    array_name="toy_data",
    dtype=np.float64,
    fill=np.nan,
    filters=tiledb.FilterList([tiledb.ZstdFilter(7)]),
)

# Axis labels: each array is indexed by the coordinate value and stores the
# matching integer index as its only attribute.
for axis_name in ("latitude", "longitude"):
    label_array_name = f"{axis_name}_axis_label"
    creator.add_array_creator(
        array_name=label_array_name,
        dims=(axis_name,),
        coords_filters=tiledb.FilterList([tiledb.ZstdFilter(7)]),
        sparse=True,
    )
    creator.add_attr_creator(
        attr_name=f"{axis_name}.index",
        array_name=label_array_name,
        dtype=np.uint32,
        filters=tiledb.FilterList([tiledb.PositiveDeltaFilter(), tiledb.ZstdFilter()]),
    )

# Show the creator's summary as the cell output.
creator

0
"→ SharedDim(name=latitude.index, domain=(0, 180), dtype='uint32')"
"→ SharedDim(name=latitude, domain=(-90.0, 90.0), dtype='float64')"
"→ SharedDim(name=longitude.index, domain=(0, 359), dtype='uint32')"
"→ SharedDim(name=longitude, domain=(-180.0, 180.0), dtype='float64')"
"→ SharedDim(name=date, domain=(numpy.datetime64('2021-01-01'), numpy.datetime64('2021-12-31')), dtype='datetime64[D]')"

0
"→ tiledb.Dim(name=date, domain=(numpy.datetime64('2021-01-01'), numpy.datetime64('2021-12-31')), dtype='datetime64[D]', tile=None)"
"→ tiledb.Dim(name=latitude.index, domain=(0, 180), dtype='uint32', tile=None)"
"→ tiledb.Dim(name=longitude.index, domain=(0, 359), dtype='uint32', tile=None)"

0
"→ tiledb.Attr(name=toy_data, dtype='float64', var=False, nullable=False, filters=FilterList(FilterList([ZstdFilter(level=7)])))"

0
cell_order=row-major
tile_order=row-major
capacity=100000
sparse=True
allows_duplicates=False
coords_filters=None

0
"→ tiledb.Dim(name=latitude, domain=(-90.0, 90.0), dtype='float64', tile=None)"

0
"→ tiledb.Attr(name=latitude.index, dtype='uint32', var=False, nullable=False, filters=FilterList(FilterList([PositiveDeltaFilter(window=1024),  ZstdFilter(level=-1)])))"

0
cell_order=row-major
tile_order=row-major
capacity=0
sparse=True
allows_duplicates=False
coords_filters=FilterList([ZstdFilter(level=7)])

0
"→ tiledb.Dim(name=longitude, domain=(-180.0, 180.0), dtype='float64', tile=None)"

0
"→ tiledb.Attr(name=longitude.index, dtype='uint32', var=False, nullable=False, filters=FilterList(FilterList([PositiveDeltaFilter(window=1024),  ZstdFilter(level=-1)])))"

0
cell_order=row-major
tile_order=row-major
capacity=0
sparse=True
allows_duplicates=False
coords_filters=FilterList([ZstdFilter(level=7)])


In [4]:
# Create the TileDB group described by `creator` at `example_uri`; the cells below open its arrays by name.
creator.create_group(example_uri)

In [5]:
# Write data for the axis labels: every coordinate value is paired with its
# integer index (latitude -90..90 -> 0..180, longitude -180..179 -> 0..359).
lat_values = np.arange(-90, 91, 1, dtype=np.float64)
lon_values = np.arange(-180, 180, 1, dtype=np.float64)
with tiledb.cf.Group(example_uri, mode="w") as group:
    with (
        group.open_array("latitude_axis_label") as lat_array,
        group.open_array("longitude_axis_label") as lon_array,
    ):
        lat_array[lat_values] = np.arange(181)
        lon_array[lon_values] = np.arange(360)

In [6]:
# Use tiledb.cf.Group or tiledb.open to read slices of data from the latitude and longitude axis labels

In [7]:
# Look up the latitude/longitude indices that cover the bounding box around
# Argentina; these indices define the "dense" block of cells used further down.
boundaries = gpd.read_file("../data/boundaries.geojson")
argentina = boundaries.loc[boundaries["name"] == "Argentina"]
geom = argentina.geometry
# Bounding box is stored in order: [left, bottom, right, top]
bbox = rasterio.features.bounds(geom)
# Optional buffer added around the region on each side.
dx, dy = 1.0, 1.0
lat_range = slice(bbox[1] - dy, bbox[3] + dy)
lon_range = slice(bbox[0] - dx, bbox[2] + dx)
# Query the axis-label arrays (read-only) by coordinate range to get the indices.
with tiledb.cf.Group(example_uri, mode="r") as group:
    with (
        group.open_array("latitude_axis_label") as lat_array,
        group.open_array("longitude_axis_label") as lon_array,
    ):
        lat_indices = lat_array[lat_range]["latitude.index"]
        lon_indices = lon_array[lon_range]["longitude.index"]


In [8]:
# Create random data in the bounding box for each day in Jan. 2021
dates = np.arange(
    np.datetime64("2021-01", "D"), np.datetime64("2021-02", "D"), np.timedelta64(1, "D")
)
# Use a seeded generator so that Restart & Run All reproduces the same toy values.
rng = np.random.default_rng(seed=0)
# One uniform [0, 1) value per (date, latitude index, longitude index) cell.
data = rng.random((len(dates), len(lat_indices), len(lon_indices)))
# Display the selected longitude indices as the cell output.
lon_indices

array([287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
       300, 301, 302, 303, 304, 305, 306, 307], dtype=uint32)

In [9]:
# Convert from matrix form to coordinate form: flatten the values and build one
# flat coordinate array per dimension, covering every (date, latitude, longitude)
# cell of the bounding box. "ij" indexing keeps the coordinate order aligned with
# the row-major flattening of the data.
data = data.ravel()
date_grid, lat_grid, lon_grid = np.meshgrid(dates, lat_indices, lon_indices, indexing="ij")
coords = (date_grid.ravel(), lat_grid.ravel(), lon_grid.ravel())
print(f"Shape data: {data.shape}")
print(f"Shape date coordinates: {coords[0].shape}")
print(f"Shape latitude coordinates: {coords[1].shape}")
print(f"Shape longitude coordinates: {coords[2].shape}")

Shape data: (22785,)
Shape date coordinates: (22785,)
Shape latitude coordinates: (22785,)
Shape longitude coordinates: (22785,)


In [10]:
# Write the flattened values to the toy data array at their
# (date, latitude.index, longitude.index) coordinates.
toy_data_uri = f"{example_uri}/toy_data"
with tiledb.open(toy_data_uri, mode="w") as data_array:
    data_array[coords] = data

In [11]:
# Read the toy data back: report the non-empty domain, then fetch every cell.
with tiledb.open(f"{example_uri}/toy_data") as toy_array:
    nonempty = toy_array.nonempty_domain()
    print(nonempty)
    result = toy_array[:, :, :]

((numpy.datetime64('2021-01-01'), numpy.datetime64('2021-01-31')), (125, 159), (287, 307))


In [12]:
# Convert to sparse COO
# 1. Convert date results to a zero-based day index. The offset is the earliest
#    date in the result rather than the first returned cell, so the index stays
#    non-negative whatever order the cells come back in.
dates_result = result["date"]
dates_index = (dates_result - dates_result.min()).astype(int)
# 2. Construct array of coordinates, one row per dimension: (date index, latitude index, longitude index)
result_coords = np.array([dates_index, result["latitude.index"], result["longitude.index"]])
# 3. Create COO matrix (its shape is inferred from the largest coordinate per dimension)
sparse_results = sparse.COO(result_coords, result["toy_data"])
# Display the COO summary as the cell output.
sparse_results

0,1
Format,coo
Data Type,float64
Shape,"(31, 160, 308)"
nnz,22785
Density,0.014914772727272728
Read-only,True
Size,712.0K
Storage ratio,0.1
