# netCDF vs zarr, an imperfect comparsion

This notebook documents the differences in filestorage useage for netCDF and zarr. Zarr is very much a cloud optimized format, but if you are on a [POSIX](https://en.wikipedia.org/wiki/POSIX) file system, having a large amount of files might degrade performance. 

In [1]:
import os
import shutil
import zarr

import xarray as xr
import matplotlib.pyplot as plt

from siphon.catalog import TDSCatalog
import xarray as xr

In [2]:
zarr.__version__

'2.18.2'

## Dastasets

Standard Xarray Tutorial dataset

In [3]:
ds = xr.tutorial.open_dataset('air_temperature').load()
ds

Grabbing some global 1 degree data from our thredds data server:

In [4]:
catUrl = "https://thredds.ucar.edu/thredds/catalog/grib/NCEP/GFS/Global_onedeg_ana/catalog.xml";
datasetName = "Full Collection Dataset";

catalog = TDSCatalog(catUrl)

ds2 = catalog.datasets[datasetName].remote_access(use_xarray=True)
print('ds is a:', type(ds2))

ds is a: <class 'xarray.core.dataset.Dataset'>


In [5]:
ds2

In [6]:
ds2 = ds2.isel(time=slice(0, 20))

In [7]:
print(f"Dataset 1 (Air Temp) size: {ds.nbytes / 1e6:.2f} MB")
print(f"Dataset 2 (GFS) size: {ds2.nbytes / 1e6:.2f} MB")

Dataset 1 (Air Temp) size: 15.50 MB
Dataset 2 (GFS) size: 3179.81 MB


# Make some directories to put some science in them

In [8]:
def delete_directory_contents(directory_name):
    for filename in os.listdir(directory_name):
        file_path = os.path.join(directory_name, filename)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f"Failed to delete {file_path}. Reason: {e}")

In [9]:
# Specify the directory names
directory_name1 = 'export_directory_zarr'
directory_name2 = 'export_directory_netcdf'

# Delete the contents of the first directory
delete_directory_contents(directory_name1)

# Delete the contents of the second directory
delete_directory_contents(directory_name2)

## Export Zarr

In [10]:
# Define output paths for the datasets
paths = {
    'ds': 'export_directory_zarr/air_temperature.zarr',
    'ds2': 'export_directory_zarr/dataset2.zarr'
}

# Write the first dataset to Zarr format and print the output path
ds.to_zarr(paths['ds'], mode='w')
print(f"Dataset written to {paths['ds']} in Zarr format.")

# Write the second dataset (ds2) to Zarr format and print the output path
ds2.to_zarr(paths['ds2'], mode='w')
print(f"Second dataset written to {paths['ds2']} in Zarr format.")

  return to_zarr(  # type: ignore[call-overload,misc]


Dataset written to export_directory_zarr/air_temperature.zarr in Zarr format.
Second dataset written to export_directory_zarr/dataset2.zarr in Zarr format.


## Export netCDF

In [11]:
# Define output paths for the NetCDF files
output_dir = '/Users/thomasmartin/Downloads/export_directory_netcdf'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

paths = {
    'ds': os.path.join(output_dir, 'air_temperature.nc'),
    'ds2': os.path.join(output_dir, 'dataset2.nc')
}

# Write the first dataset to NetCDF format and print the output path
ds.to_netcdf(paths['ds'])
print(f"Dataset written to {paths['ds']} in NetCDF format.")

# Write the second dataset (ds2) to NetCDF format and print the output path
ds2.to_netcdf(paths['ds2'])
print(f"Second dataset written to {paths['ds2']} in NetCDF format.")

  ds.to_netcdf(paths['ds'])


Dataset written to /Users/thomasmartin/Downloads/export_directory_netcdf/air_temperature.nc in NetCDF format.


RuntimeError: NetCDF: Name contains illegal characters

## Count Files & Sizes

In [12]:
def count_files_and_subfolders(path):
    """
    Count the number of files and subfolders in a directory, and calculate the total size of files in bytes.

    Parameters:
    - path (str): The path to the directory.

    Returns:
    - file_count (int): The number of files in the directory.
    - subfolder_count (int): The number of subfolders in the directory.
    - total_size_bytes (int): The total size of all files in the directory in bytes.
    """
    # Initialize counters
    file_count = 0         # Count of files
    subfolder_count = 0    # Count of subfolders
    total_size_bytes = 0   # Total size of all files in bytes

    # Iterate over the directory tree
    for root, dirs, files in os.walk(path):
        # Increment file and subfolder counts
        file_count += len(files)
        subfolder_count += len(dirs)

        # Calculate total size of files
        total_size_bytes += sum(os.path.getsize(os.path.join(root, name)) for name in files)

    # Return the counts and total size
    return file_count, subfolder_count, total_size_bytes

#### Zarr

In [13]:
# Specify the path
path = 'export_directory_zarr/'

# Count files, subfolders, and calculate total size
file_count, subfolder_count, total_size_bytes = count_files_and_subfolders(path)

# Convert the total size to megabytes
total_size_mb = total_size_bytes / 1e6

# Print the results
print(f"Number of files: {file_count}")
print(f"Number of subfolders: {subfolder_count}")
print(f"Total size: {total_size_mb:.2f} MB")

Number of files: 3288
Number of subfolders: 104
Total size: 1901.61 MB


#### netCDF

In [14]:
# Specify the path
path = 'export_directory_netcdf/'

# Count files, subfolders, and calculate total size
file_count, subfolder_count, total_size_bytes = count_files_and_subfolders(path)

# Convert the total size to megabytes
total_size_mb = total_size_bytes / 1e6

# Print the results
print(f"Number of files: {file_count}")
print(f"Number of subfolders: {subfolder_count}")
print(f"Total size: {total_size_mb:.2f} MB")

Number of files: 2
Number of subfolders: 0
Total size: 7.77 MB
