# OpenSense data format for CML data
Example of how an open CML (commercial microwave link) dataset can be transformed into the netCDF data format defined by the [OpenSense](https://opensenseaction.eu/) community.

The original dataset is OpenMRG from SMHI (https://zenodo.org/record/7107689/).


In [1]:
import os
import urllib
import urllib.request
import zipfile
from datetime import datetime

import numpy as np
import pandas as pd
import xarray as xr

### Download the OpenMRG data

In [2]:
# define data source and local path
url = "https://zenodo.org/record/7107689/files/OpenMRG.zip"
local_path = 'tmp_data/andersson_2022_OpenMRG/'

# create local folder; exist_ok avoids the separate existence check
# and does not fail when the folder is already there
os.makedirs(local_path, exist_ok=True)

# download to local folder, using the last URL component as file name
# (urlretrieve lives in the urllib.request submodule, which has to be
# imported explicitly; "import urllib" alone does not load it)
local_file_name = url.split("/")[-1]
request_return_meassage = urllib.request.urlretrieve(
    url, os.path.join(local_path, local_file_name)
)

### Combine data with metadata and adjust variable names

In [3]:
# For this ZIP file we cannot extract only the CML data since
# the NetCDF with the CML data is quite large. This seems to
# lead to crashes when reading directly from the ZIP file via Python.
with zipfile.ZipFile(os.path.join(local_path, local_file_name)) as zfile:
    zfile.extractall(local_path)

# Read metadata and data
df_metadata = pd.read_csv(os.path.join(local_path, 'cml/cml_metadata.csv'), index_col=0)
ds_full = xr.open_dataset(os.path.join(local_path, 'cml/cml.nc'))

# Index the metadata by sublink once, instead of masking the whole table
# again for every sublink and every column. Keeping only the first row per
# sublink corresponds to taking the first match for each sublink.
metadata_by_sublink = df_metadata.drop_duplicates(subset='Sublink').set_index('Sublink')
sublink_ids = ds_full.sublink.values

# Add metadata with OpenSense naming conventions.
# Each entry is (metadata column, OpenSense name, unit conversion factor).
# The source columns hold the length in km and the frequency in GHz, while
# the attributes attached to "length" and "frequency" declare "m" and "MHz",
# so these two columns are converted here (factor 1000 each).
for col_name, ds_var_name, unit_factor in [
    ('NearLatitude_DecDeg', 'site_0_lat', None),
    ('NearLongitude_DecDeg', 'site_0_lon', None),
    ('FarLatitude_DecDeg', 'site_1_lat', None),
    ('FarLongitude_DecDeg', 'site_1_lon', None),
    ('Frequency_GHz', 'frequency', 1000),
    ('Polarization', 'polarization', None),
    ('Length_km', 'length', 1000),
]:
    # one value per sublink, in the order of the dataset's "sublink" dimension
    col_values = metadata_by_sublink.loc[sublink_ids, col_name]
    if unit_factor is not None:
        col_values = col_values * unit_factor
    ds_full.coords[ds_var_name] = ('sublink', col_values.tolist())

# Change "sublink" to "sublink_id"
ds_full = ds_full.rename({"sublink": "sublink_id"})

In [4]:
# display the full dataset with the added metadata coordinates (notebook cell output)
ds_full

### Consider only one CML for the example

In [5]:
# take two sublinks that make up a CML
ds_ex = ds_full.isel(sublink_id=[0, 1])

# add "cml_id" as dimension and coordinate
ds_ex = ds_ex.assign_coords({"cml_id": ("cml_id", ["cml_1"])})

# coordinates that depend only on the new dimension "cml_id" (independent of
# sublink_id): use the value of the first sublink for the whole CML
for var in ["site_0_lat", "site_0_lon", "site_1_lat", "site_1_lon", "length"]:
    arr_var = ds_ex[var].isel(sublink_id=0).values[np.newaxis,]
    ds_ex[var] = (["cml_id"], arr_var)

# coordinates that depend on the new dimension "cml_id" and also on sublink_id
for var in ["frequency", "polarization"]:
    arr_var = ds_ex[var].values[np.newaxis, :]
    ds_ex[var] = (["cml_id", "sublink_id"], arr_var)

# variables that depend on the new dimension "cml_id"
for var in ["tsl", "rsl"]:
    # Order the axes by dimension name rather than with a blind ".T", so the
    # result is (sublink_id, time) regardless of the order in which the two
    # dimensions are stored in the input file.
    arr_var = ds_ex[var].transpose("sublink_id", "time").values[np.newaxis, ...]
    ds_ex[var] = (["cml_id", "sublink_id", "time"], arr_var)

### Add attributes of variables and coordinates

In [6]:
def add_cml_attributes(ds):
    """Attach OpenSense attributes to the variables and coordinates of ``ds``.

    For every coordinate and data variable of ``ds`` whose name is listed in
    the attribute table below, the ``attrs`` of that variable are replaced by
    the table entry. Variables that are not listed are left untouched.
    Additionally, the encoding units of ``ds.time`` are set to
    "seconds since 1970-01-01 00:00:00".

    The dataset is modified in place; the same object is returned.
    """
    unit_degrees = "degrees in WGS84 projection"
    unit_above_sea = "meters_above_sea"

    # optional and required attributes for variables and coordinates
    # according to OpenSense white paper, keyed by variable name
    attrs_by_name = {
        # "units" and "missing_value" are deliberately not set for time:
        # defining them here interferes with the encoding of time
        "time": {"long_name": "time_utc"},
        "cml_id": {"long_name": "commercial_microwave_link_identifier"},
        "sublink_id": {"long_name": "sub-link_identifier"},
        "site_0_lat": {"units": unit_degrees, "long_name": "site_0_latitude"},
        "site_0_lon": {"units": unit_degrees, "long_name": "site_0_longitude"},
        "site_0_elevation": {
            "units": unit_above_sea,
            "long_name": "ground_elevation_above_sea_level",
        },
        "site_0_altitude": {
            "units": unit_above_sea,
            "long_name": "antenna_altitude_above_sea_level",
        },
        "site_1_lat": {"units": unit_degrees, "long_name": "site_1_latitude"},
        "site_1_lon": {"units": unit_degrees, "long_name": "site_1_longitude"},
        "site_1_elevation": {
            "units": unit_above_sea,
            "long_name": "ground_elevation_above_sea_level",
        },
        "site_1_altitude": {
            "units": unit_above_sea,
            "long_name": "antenna_altitude_above_sea_level",
        },
        "length": {
            "units": "m",
            "long_name": "distance_between_pair_of_antennas",
        },
        "frequency": {"units": "MHz", "long_name": "sublink_frequency"},
        "tsl": {
            "units": "dBm",
            "long_name": "transmitted_signal_level",
            "missing_value": "",
        },
        "rsl": {"units": "dBm", "long_name": "received_signal_level"},
        "polarization": {
            "units": "no units",
            "long_name": "sublink_polarization",
            "missing_value": "",
        },
    }

    # walk over all coordinates first, then all data variables, and attach
    # the attributes of those that have an entry in the table
    for name in [*ds.coords, *ds.data_vars]:
        entry = attrs_by_name.get(name)
        if entry is not None:
            ds[name].attrs = entry

    # set encoding attributes
    ds.time.encoding["units"] = "seconds since 1970-01-01 00:00:00"

    return ds

# add attributes of variables and coordinates
# (add_cml_attributes modifies the dataset in place and returns it)
ds_ex = add_cml_attributes(ds_ex)

### Add global attributes

In [7]:
# current time as "YYYY-MM-DD HH:MM:SS" (sub-second part dropped)
t_now = datetime.now().isoformat(sep=" ", timespec="seconds")

# set the global attributes in one go; entries that already exist are overwritten
ds_ex.attrs.update(
    {
        "title": "Example dataset based on OpenMRG-CML",
        "file author(s)": "Nico Blettner",
        "institution": "Swedish Meteorological and Hydrological Institute (SMHI), Hydrology Research, http://www.smhi.se/hydrology-research",
        "date": t_now,
        "source": "Ericsson MINI-LINK radios",
        "history": f"{t_now}: Reduce to one CML. Change variable names to OpenSense conventions. Add attributes.",
        "naming convention": "OpenSense-0.1",
        "license restrictions": "https://creativecommons.org/licenses/by-sa/4.0",
        "reference": "https://doi.org/10.5281/zenodo.6673750",
        "comment": "Original netCDF created by Jafet Andersson and Victor Naslund, SMHI. Time is in UTC. Signal levels are measured in dBm, which is a logarithmic representation of the power (in mW): x [dBm]=10*log10(y[mW]). Metadata added with preliminary code from opensense_data_downloader.py",
    }
)

### Remove superfluous global attributes

In [8]:
# exhaustive list of all required global attributes according to OpenSense white paper
# NOTE: the keys must be spelled exactly as they are set on the dataset.
# The author attribute is set as "file author(s)"; listing it under a
# different spelling here would silently delete it below.
global_attrs = [
    "title",
    "file author(s)",
    "institution",
    "date",
    "source",
    "history",
    "naming convention",
    "license restrictions",
    "reference",
    "comment",
]

# list of attrs to be removed (collected first, since the attribute
# dictionary must not change size while it is being iterated)
rm_global_attrs = [var for var in ds_ex.attrs if var not in global_attrs]

# remove superfluous global attributes
for var in rm_global_attrs:
    del ds_ex.attrs[var]

In [9]:
# show the attributes of "tsl" as an example (notebook cell output)
ds_ex.tsl.attrs

{'units': 'dBm', 'long_name': 'transmitted_signal_level', 'missing_value': ''}

In [10]:
# display the resulting single-CML example dataset (notebook cell output)
ds_ex

### Save the dataset as netCDF

In [11]:
# target folder: "data" inside the parent of the current working directory.
# The trailing empty component keeps the trailing path separator, so that
# save_path can still be prefixed to a file name by plain concatenation.
save_path = os.path.join(os.path.abspath(os.path.join(os.getcwd(), "..")), "data", "")

# make sure the target folder exists before writing into it
os.makedirs(save_path, exist_ok=True)

# compress the two signal level variables
encoding = {
    "tsl": {"zlib": True},
    "rsl": {"zlib": True},
}

ds_ex.to_netcdf(
    os.path.join(save_path, "OpenSense_CML_example_format_data.nc"),
    encoding=encoding,
)