In [2]:
# Script for importing netcdf, finding optimal shuffle settings for encoding, and then saving the dataset with these settings
import numpy as np
import xarray as xr
import os   
import copy
import re
from get_optimal_shuffle_encoding import get_optimal_shuffle_encoding

# Input: folder and file name of the NetCDF file to re-encode (the file itself is opened in later cells)
folder_in = r'O:\HybridDune experiment\data ADV, OBS\raw NetCDF\test'
filename_in = r'ADV_RWS1_Deployment1.nc'

# Output names are derived from filename_in by inserting a suffix before the extension.
# os.path.splitext is used instead of a regex on '.nc': the result can then never be equal to
# filename_in (which would overwrite the input), also when the extension is e.g. '.NC' or '.nc4'.
filename_in_stem, filename_in_ext = os.path.splitext(filename_in)
filename_out_shuffle_optimal = f'{filename_in_stem} shuffle_optimal V2{filename_in_ext}'
filename_out_shuffle_off = f'{filename_in_stem} shuffle_off{filename_in_ext}'
filename_out_shuffle_on  = f'{filename_in_stem} shuffle_on{filename_in_ext}'

# Temporary files used to compare the size of a single variable with shuffle off / on
filename_1var_shuffle_off = r'temp 1var shuffle_off'
filename_1var_shuffle_on  = r'temp 1var shuffle_on'

In [None]:
# Variant 1: open the dataset and spell out the scaling / dtype settings by hand
ds = xr.open_dataset(os.path.join(folder_in, filename_in))
# ds.p is cast from int64 to float64 (otherwise the NETCDF scaling gives an error)
ds['p'] = ds['p'].astype('float64')

# Encoding table -------------------------------------------------------------------
# Goal: store every ADV variable with the same number of decimals as the original text file.
# Variables sharing the same settings are grouped; each variable still gets its own dict.
int16_with_fill = {'dtype': 'int16', '_FillValue': -999}
encoding = {
    'p': {'scale_factor': 10., **int16_with_fill},
    # u, v, w: three decimals originally, so scale factor 0.001. Max value is 7 m/s, i.e. 7000 steps,
    # so the int16 range of ± 32767 is sufficient
    **{name: {'scale_factor': 0.001, **int16_with_fill} for name in ('u', 'v', 'w')},
    **{name: {'dtype': 'uint16'} for name in ('anl1', 'anl2')},
    **{name: dict(int16_with_fill) for name in ('a1', 'a2', 'a3')},
    **{name: {'dtype': 'int8', '_FillValue': -99} for name in ('cor1', 'cor2', 'cor3')},
    # one decimal originally, so scale factor 0.1
    **{name: {'scale_factor': 0.1, **int16_with_fill}
       for name in ('snr1', 'snr2', 'snr3', 'voltage', 'heading', 'pitch', 'roll', 'burst')},
}

# Compression settings ---------------------------------------------------------------
# Deflate level 4 for every variable and coordinate, once without and once with the shuffle filter
# (shuffle: flag for bit order; default is True when using deflate compression)
names_to_compress = list(ds.data_vars) + list(ds.coords)
compression_shuffle_off = {name: {"zlib": True, "complevel": 4, 'shuffle': False} for name in names_to_compress}
compression_shuffle_on  = {name: {"zlib": True, "complevel": 4, 'shuffle': True} for name in names_to_compress}

# Independent copies of the encoding table, one per shuffle setting
encoding_shuffle_off = copy.deepcopy(encoding)
encoding_shuffle_on  = copy.deepcopy(encoding)

# Merge the compression settings into each copy without dropping the scaling keys already present
for merged, compression in ((encoding_shuffle_off, compression_shuffle_off),
                            (encoding_shuffle_on, compression_shuffle_on)):
    for var, comp in compression.items():
        existing = merged.get(var)
        if existing is None:        # variable has no explicit encoding yet: use the compression dict as-is
            merged[var] = comp
        else:                       # variable already has scaling settings: extend them
            existing.update(comp)

#print(encoding_shuffle_off)   # print the compression settings to check
#print(encoding_shuffle_on)    # print the compression settings to check
#ds.to_netcdf(os.path.join(folder_in, filename_out_shuffle_off), encoding=encoding_shuffle_off)
#ds.to_netcdf(os.path.join(folder_in, filename_out_shuffle_on), encoding=encoding_shuffle_on)

In [None]:
# Variant 2: open dataset and load encoding from the dataset, if possible
ds = xr.open_dataset(os.path.join(folder_in, filename_in))

# Define two dictionaries for compression settings, one with shuffle on and one with shuffle off
compression_shuffle_off = {var: {"zlib": True, "complevel": 4, 'shuffle': False} for var in list(ds.data_vars) + list(ds.coords)}
compression_shuffle_on  = {var: {"zlib": True, "complevel": 4, 'shuffle': True} for var in list(ds.data_vars) + list(ds.coords)}

# Read the encoding of the dataset opened in THIS cell before inspecting it. Previously the check below
# used whatever `encoding` happened to be left in the kernel (NameError on a fresh kernel, or the dict of
# Variant 1, which made the check pass for the wrong reason).
encoding = getattr(ds, 'encoding', None)

# NB: if no user-defined encoding is saved in the dataset, ds.encoding (often?) is a dict starting with
# 'unlimited_dims' that is not useful, see the example output of this cell.
# NOTE(review): xarray also keeps an encoding dict per variable (ds[var].encoding); confirm whether the
# scaling settings should be read from there instead of from ds.encoding.
if encoding and next(iter(encoding)) != 'unlimited_dims':  # If useful encoding is found in the dataset
    encoding_shuffle_off = copy.deepcopy(encoding)  # make a copy of the encoding dict, to add the shuffle setting
    encoding_shuffle_on  = copy.deepcopy(encoding)
    print (f'Encoding found in dataset: {encoding}')  # print the encoding to check

    # Then add deflate compression to all variables and coordinates in netCDF, without overwriting existing keys
    for var, comp in compression_shuffle_off.items():  # for each variable in the dataset,
        if var in encoding_shuffle_off:                # if the variable already has an encoding, update it with the compression settings
            encoding_shuffle_off[var].update(comp)
        else:                                          # if the variable does not have an encoding yet, add it
            encoding_shuffle_off[var] = comp

    for var, comp in compression_shuffle_on.items():  # repeat for encoding_shuffle_on
        if var in encoding_shuffle_on:
            encoding_shuffle_on[var].update(comp)
        else:
            encoding_shuffle_on[var] = comp
elif encoding:  # encoding exists, but starts with 'unlimited_dims': not useful, so skip it
    print (f"Encoding found in dataset starts with 'unlimited dims:', is likely default/not useful. Skipped, instead only using deflate compression. Encoding found: {encoding}")  # print the encoding to check
    encoding_shuffle_off = compression_shuffle_off
    encoding_shuffle_on  = compression_shuffle_on
else:  # no encoding attribute, or an empty encoding dict (an empty dict used to raise IndexError)
    print("No encoding found in dataset (scaling etc), using only default deflate compression.")
    encoding_shuffle_off = compression_shuffle_off
    encoding_shuffle_on  = compression_shuffle_on

#print(encoding_shuffle_off)   # print the compression settings to check
#print(encoding_shuffle_on)    # print the compression settings to check

Encoding found in dataset starts with 'unlimited dims:', is likely default/not useful. Skipped, instead only using deflate compression. Encoding found: {'unlimited_dims': set(), 'source': 'O:\\HybridDune experiment\\data ADV, OBS\\raw NetCDF\\test\\ADV_RWS1_Deployment1.nc'}
{'sf': {'zlib': True, 'complevel': 4, 'shuffle': False}, 'p': {'zlib': True, 'complevel': 4, 'shuffle': False}, 'u': {'zlib': True, 'complevel': 4, 'shuffle': False}, 'v': {'zlib': True, 'complevel': 4, 'shuffle': False}, 'w': {'zlib': True, 'complevel': 4, 'shuffle': False}, 'anl1': {'zlib': True, 'complevel': 4, 'shuffle': False}, 'anl2': {'zlib': True, 'complevel': 4, 'shuffle': False}, 'a1': {'zlib': True, 'complevel': 4, 'shuffle': False}, 'a2': {'zlib': True, 'complevel': 4, 'shuffle': False}, 'a3': {'zlib': True, 'complevel': 4, 'shuffle': False}, 'cor1': {'zlib': True, 'complevel': 4, 'shuffle': False}, 'cor2': {'zlib': True, 'complevel': 4, 'shuffle': False}, 'cor3': {'zlib': True, 'complevel': 4, 'shuffle'

In [None]:
# Per-variable shuffle test: write every large variable twice (shuffle off / shuffle on) to a temporary
# file, compare the two file sizes and keep the setting that gives the smaller file.

# Start from "shuffle on" everywhere; variables that compress better without shuffle are overwritten below
encoding_shuffle_optimal = copy.deepcopy(encoding_shuffle_on)

# Full paths of the two temporary files (re-used and overwritten for every variable)
path_1var_shuffle_off = os.path.join(folder_in, filename_1var_shuffle_off)
path_1var_shuffle_on  = os.path.join(folder_in, filename_1var_shuffle_on)

for var in ds.data_vars:
    # Only variables with more than 1000 elements are tested; the others keep shuffle on
    if ds[var].size <= 1000:
        continue

    # Dataset holding only this variable, plus the encoding entries for the names it contains
    ds_1var = ds[[var]].copy()
    names_1var = list(ds_1var.data_vars) + list(ds_1var.coords)
    encoding_1var_shuffle_off = {name: encoding_shuffle_off[name] for name in names_1var}
    encoding_1var_shuffle_on  = {name: encoding_shuffle_on[name]  for name in names_1var}

    # Write without shuffle. If the write fails with the 'divide' cast error, retry once as float64
    try:
        ds_1var.to_netcdf(path_1var_shuffle_off, encoding=encoding_1var_shuffle_off)
    except Exception as e:
        if not str(e).startswith("Cannot cast ufunc 'divide' output from dtype"):
            raise
        print(f"Error for variable {var}: {e}. Casting variable to float64 and retrying.")
        ds_1var[var] = ds_1var[var].astype('float64')
        ds_1var.to_netcdf(path_1var_shuffle_off, encoding=encoding_1var_shuffle_off)
        # the original dataset is only converted after the retry succeeded
        ds[var] = ds[var].astype('float64')

    # Write with shuffle
    ds_1var.to_netcdf(path_1var_shuffle_on, encoding=encoding_1var_shuffle_on)

    # Compare the sizes on disk
    size_shuffle_off = os.path.getsize(path_1var_shuffle_off)
    size_shuffle_on  = os.path.getsize(path_1var_shuffle_on)

    if size_shuffle_on >= size_shuffle_off:
        # shuffle gives no gain (a tie also ends up here): switch this variable to the shuffle-off encoding
        encoding_shuffle_optimal[var] = encoding_shuffle_off[var]
        print(f'Variable {var}: shuffle on is larger: {size_shuffle_on/1024:.0f} kB > {size_shuffle_off/1024:.0f} kB')
    else:
        print(f'Variable {var}: shuffle on is smaller: {size_shuffle_on/1024:.0f} kB < {size_shuffle_off/1024:.0f} kB')

# Save the dataset with the optimal encoding, then remove the temporary nc files
#ds.to_netcdf(os.path.join(folder_in, filename_out_shuffle_optimal), encoding=encoding_shuffle_optimal)
#os.remove(os.path.join(folder_in, filename_1var_shuffle_off))
#os.remove(os.path.join(folder_in, filename_1var_shuffle_on))

Variable p: shuffle on is smaller: 2591 kB < 2680 kB
Variable u: shuffle on is LARGER: 44298 kB > 18358 kB
Variable v: shuffle on is LARGER: 43944 kB > 17598 kB
Variable w: shuffle on is LARGER: 42030 kB > 14908 kB
Variable anl1: shuffle on is smaller: 1951 kB < 2238 kB
Variable anl2: shuffle on is smaller: 2425 kB < 2767 kB
Variable a1: shuffle on is smaller: 2623 kB < 3750 kB
Variable a2: shuffle on is smaller: 2781 kB < 3953 kB
Variable a3: shuffle on is smaller: 2696 kB < 3841 kB
Variable cor1: shuffle on is smaller: 5729 kB < 7924 kB
Variable cor2: shuffle on is smaller: 5816 kB < 8028 kB
Variable cor3: shuffle on is smaller: 5736 kB < 7951 kB
Variable snr1: shuffle on is LARGER: 14228 kB > 5325 kB
Variable snr2: shuffle on is LARGER: 14090 kB > 5579 kB
Variable snr3: shuffle on is LARGER: 14147 kB > 5419 kB
Variable voltage: shuffle on is LARGER: 1444 kB > 658 kB
Variable heading: shuffle on is LARGER: 7161 kB > 3672 kB
Variable pitch: shuffle on is LARGER: 5651 kB > 2255 kB
Vari

In [None]:
# Obtain the encoding dict from the imported helper, given the input folder and file name.
# NOTE(review): the helper's implementation is not shown in this notebook; presumably it performs the
# same per-variable shuffle comparison as the loop in the previous cell - verify against its module.
encoding_shuffle_optimal = get_optimal_shuffle_encoding(folder_in, filename_in)




Variable p: shuffle on is smaller: 2591 kB < 2680 kB
Variable u: shuffle on is larger: 44298 kB > 18358 kB
Variable v: shuffle on is larger: 43944 kB > 17598 kB
Variable w: shuffle on is larger: 42030 kB > 14908 kB
Variable anl1: shuffle on is smaller: 1951 kB < 2238 kB
Variable anl2: shuffle on is smaller: 2425 kB < 2767 kB
Variable a1: shuffle on is smaller: 2623 kB < 3750 kB
Variable a2: shuffle on is smaller: 2781 kB < 3953 kB
Variable a3: shuffle on is smaller: 2696 kB < 3841 kB
Variable cor1: shuffle on is smaller: 5729 kB < 7924 kB
Variable cor2: shuffle on is smaller: 5816 kB < 8028 kB
Variable cor3: shuffle on is smaller: 5736 kB < 7951 kB
Variable snr1: shuffle on is larger: 14228 kB > 5325 kB
Variable snr2: shuffle on is larger: 14090 kB > 5579 kB
Variable snr3: shuffle on is larger: 14147 kB > 5419 kB
Variable voltage: shuffle on is larger: 1444 kB > 658 kB
Variable heading: shuffle on is larger: 7161 kB > 3672 kB
Variable pitch: shuffle on is larger: 5651 kB > 2255 kB
Vari

In [23]:
# Show the dataset summary. A bare last expression lets the notebook use xarray's rich repr
# instead of the plain-text output of print().
ds

<xarray.Dataset> Size: 4GB
Dimensions:         (time: 15240, profile_number: 16, echos: 3, obs_number: 720)
Dimensions without coordinates: time, profile_number, echos, obs_number
Data variables:
    time_num        (time) datetime64[ns] 122kB ...
    time_string     (time) <U23 1MB ...
    file_name       (time) <U42 3MB ...
    profile_angle   (profile_number) float32 64B ...
    ini_beam_angle  (profile_number) float32 64B ...
    radius_lidar    (echos, profile_number, obs_number, time) float32 2GB ...
    intensity       (echos, profile_number, obs_number, time) float32 2GB ...
Attributes:
    name:                   storm1_lidar1_polar polar
    summary:                Hybrid Dune campaign, data of lidar 1 during storm 1
    instrument:             lidar 1
    period:                 storm 1, 2024-12-18 to 2024-12-20
    instrument type:        Sick Multiscan 165
    time zone:              UTC+1
    contact person:         Daan Poppema
    emailadres:             d.w.poppema@tud