
# RADKLIM RW download and upload to metacatalog, including creation of metadata


In [1]:
import tarfile
from glob import glob
import os

import tqdm
import xarray as xr
import numpy as np

import radolan_to_netcdf as rtn
#import cf

from metacatalog import api, ext


## Download data from DWD CDC server (binary)

The archive covers the years 2001 - 2022.

In [3]:
%%time

!wget -q -P ./data/raklim_rw_binary/ --show-progress -r -np -A .tar.gz -R "index.html*" https://opendata.dwd.de/climate_environment/CDC/grids_germany/hourly/radolan/reproc/2017_002/bin/

opendata.dwd.de/cli     [ <=>                ]   2,75K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,80K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,80K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,80K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,80K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,80K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,80K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,80K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,80K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,80K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,80K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,80K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>           

In [5]:
# delete folder /supplement
!rm -r -v /home/alexander/Github/scripts/dwd_radar/radklim_rw/data/raklim_rw_binary/opendata.dwd.de/climate_environment/CDC/grids_germany/hourly/radolan/reproc/2017_002/bin/supplement


## Read data from nested tar file

Data is provided as monthly tar.gz files, which contain hourly binary files. To avoid extracting everything first, we use the nested loop construct below and extract only the data we want on the fly.

The goal is to produce one netCDF file covering the entire temporal and spatial range.  
The data will be aggregated to **daily sums** later.

DWD infos: https://opendata.dwd.de/climate_environment/CDC/help/landing_pages/doi_landingpage_RADKLIM_RW_V2017.002-de.html



### (cf-conform) Metadata dictionary

As the package `radolan_to_netcdf` does not yet support RADKLIM RW, the metadata dictionary is created here.


In [9]:
from radolan_to_netcdf.radolan_product_netcdf_config import metadata_per_timestamp

# netCDF layout for RADKLIM RW; handed to `rtn.create_empty_netcdf` as `product_config_dict`.
raklim_rw_netcdf_config = {
    "variables": {
        "rainfall_amount": {
            # settings used when the variable is created in the netCDF file
            "variable_parameters": {
                "datatype": "i2",
                "dimensions": ("time", "y", "x"),
                "fill_value": -9999,
                "zlib": True,
                "complevel": 5,
            },
            # attributes written to the variable
            "attributes": {
                "long_name": "Hourly rainfall",
                "standard_name": "rainfall_amount",
                # CF canonical unit of `rainfall_amount` is mass per area
                # (1 kg m-2 of water corresponds to 1 mm), not plain "kg"
                "units": "kg m-2",
                "scale_factor": 0.01,
                "add_offset": 0,
                "coordinates": "longitudes latitudes",
                "grid_mapping": "RADOLAN_grid",
            },
        },
    },
    # per-timestamp metadata definition reused unchanged from radolan_to_netcdf
    "metadata_per_timestamp": metadata_per_timestamp,
    # fixed grid size of the product
    "metadata_fixed": {
        "n_lats": 1100,
        "n_lons": 900,
    },
}


In [None]:
%%time
# NOTE(review): legacy cell, apparently carried over from the 5-minute YW workflow (see the
# `product_name='YW'` call and the `_radklim_yw.nc` file names below). It was never executed
# in this notebook (`In [None]`) and cannot run as written: `output_path` is used below, but
# its only assignment in this cell is commented out. The RW extraction is implemented by the
# `tar2netcdf` function defined further down in this notebook.
#from time import time

# absolute output_path to the folder radklim, where netCDF files are saved
#output_path = "data/radklim_rw/"
# NOTE(review): this file is created with the RW config but never appended to in this cell,
# because `fn_netcdf` is reassigned inside the loop below.
fn_netcdf = "data/radklim_rw.nc"
rtn.create_empty_netcdf(fn=fn_netcdf, product_config_dict=raklim_rw_netcdf_config)

# create the output folder
# (raises NameError unless `output_path` is defined in a cell outside this view)
os.makedirs(output_path, exist_ok=True)

for year in sorted(glob('./data/raklim_rw_binary/opendata.dwd.de/climate_environment/CDC/grids_germany/hourly/radolan/reproc/2017_002/bin/*')):
    print(f"Extracting data for the year {year[-4:]}")
    for month in tqdm.tqdm(sorted(glob(year + '/*'))):
        with tarfile.open(month, 'r') as tar:
            # fn_list: filenames of daily files ('YW2017.002_20010131.tar.gz')
            fn_list = sorted([f.name for f in tar.getmembers()])
            #fn_list = sorted(tar.getnames())

            # loop over daily files
            for fn in fn_list:
                f = tar.extractfile(fn)
                # create (empty) daily netCDF
                fn_netcdf = f"{output_path}/{fn[-15:-7]}_radklim_yw.nc"
                # t1 = time()
                rtn.create_empty_netcdf(fn=fn_netcdf, product_name='YW')
                # t2=time()
                #print(f"create_empty_netcdf: {time() - t1}")
                # daily files contain the 5-minute data (tar_inner)
                # NOTE(review): this treats every member of the monthly archive as a nested tar,
                # whereas the notebook text describes the RW archives as containing the hourly
                # binary files directly -- confirm before reusing this cell.
                with tarfile.open(fileobj=f) as tar_inner:
                    # t1 = time()
                    # fn_list_inner = sorted(tar_inner.getnames())
                    # print(f"tar_inner.getnames: {time() - t1}")
                    # t1 = time()
                    fn_list_inner = sorted([f.name for f in tar_inner.getmembers()])
                    # print(f"tar_inner.getmembers: {time() - t1}")

                    for fn_inner in fn_list_inner:
                        # extract 5-minute data, append to previously created daily netCDF
                        # t1 = time()
                        data, metadata = rtn.read_in_one_bin_file(tar_inner.extractfile(fn_inner))
                        # print(f"read_in_one_bin_file: {time() - t1}")
                        # t1 = time()
                        rtn.append_to_netcdf(
                            fn_netcdf, 
                            data_list=[data, ], 
                            metadata_list=[metadata, ],
                        )
                        # print(f"append_to_netcdf: {time() - t1}")

In [13]:
def tar2netcdf(input_path: str, output_path: str, radklim_rw_config: dict):
    """
    Unpack the DWD binary downloads and store them in one netCDF file.

    Every entry of `input_path` is treated as a yearly folder and every entry
    of a yearly folder as a monthly tar archive. Both levels are processed in
    sorted order. Each regular file inside an archive is read with
    `rtn.read_in_one_bin_file` and appended to `<output_path>/radklim_rw.nc`,
    which is (re)created empty at the start of every call.

    Parameters
    ----------
    input_path : str
        Path to the folder where the binary DWD downloads are stored (yearly folders).
        Usually something like
        *"./opendata.dwd.de/climate_environment/CDC/grids_germany/hourly/radolan/reproc/2017_002/bin"*
    output_path : str
        Folder in which the generated netCDF file is stored; created if it does not exist.
    radklim_rw_config : dict
        Dictionary holding the parameters required for building a NetCDF file with the
        correct dimensions, variable names and attributes (see package radolan_to_netcdf).

    Returns
    -------
    None
    """
    # create the output folder and work with its absolute path from here on
    os.makedirs(output_path, exist_ok=True)
    output_path = os.path.abspath(output_path)

    fn_netcdf = os.path.join(output_path, "radklim_rw.nc")

    rtn.create_empty_netcdf(fn=fn_netcdf, product_config_dict=radklim_rw_config, product_name="RW")

    # loop over yearly folders
    for year in sorted(glob(os.path.join(input_path, "*"))):
        print(f"Extracting data for the year {os.path.basename(year)}")
        # loop over the monthly archives of that year
        for month in tqdm.tqdm(sorted(glob(os.path.join(year, "*")))):
            with tarfile.open(month, 'r') as tar:
                # Only regular files hold data: for directory or link entries
                # `extractfile` returns None, which the reader cannot handle.
                # Hourly members are named like 'raa01-rw2017.002_10000-0101010050-dwd---bin'.
                members = sorted(
                    (member for member in tar.getmembers() if member.isfile()),
                    key=lambda member: member.name,
                )

                # loop over hourly files, append each one to the netCDF file
                for member in members:
                    with tar.extractfile(member) as bin_file:
                        data, metadata = rtn.read_in_one_bin_file(bin_file)
                    rtn.append_to_netcdf(
                        fn_netcdf,
                        data_list=[data, ],
                        metadata_list=[metadata, ]
                    )


Execute `tar2netcdf` to extract data and save to one netCDF file.

In [14]:
# Run the extraction on the downloaded archives; the netCDF file is written to data/radklim_rw/radklim_rw.nc.
rw_binary_dir = "data/raklim_rw_binary/opendata.dwd.de/climate_environment/CDC/grids_germany/hourly/radolan/reproc/2017_002/bin/"
rw_netcdf_dir = "data/radklim_rw/"

tar2netcdf(
    input_path=rw_binary_dir,
    output_path=rw_netcdf_dir,
    radklim_rw_config=raklim_rw_netcdf_config,
)


Extracting data for the year 2001


  0%|          | 0/12 [00:34<?, ?it/s]


KeyboardInterrupt: 

In [16]:
# Open the generated netCDF file and display its summary.
radklim_rw_ds = xr.open_dataset("data/radklim_rw/radklim_rw.nc")
radklim_rw_ds


## Hourly rainfall to daily rainfall
