
# RADOLAN RW download and upload to metacatalog, including creation of metadata

This is the final solution, using `radolan_to_netcdf` for download and splitting the netCDF daily when uploading to metacatalog!

All available RADOLAN RW (hourly resolution) data: **2005 - 2021**

In [1]:
import tarfile
import gzip
from glob import glob
import os

import tqdm
import xarray as xr

import radolan_to_netcdf as rtn
#import cf

from metacatalog import api, ext

In [2]:
%%time

!wget -q --show-progress -r -np -A .tar.gz -R "index.html*" https://opendata.dwd.de/climate_environment/CDC/grids_germany/hourly/radolan/historical/bin/

opendata.dwd.de/cli     [ <=>                ]   2,42K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,10K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,68K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,68K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,68K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,68K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,68K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,68K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,68K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,68K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,68K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,68K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>           

Function to extract downloaded binary files into netCDF files

In [4]:
def tar2netcdf(input_path: str, output_path: str, if_exists: str, delete_last=False):
    """
    Untar DWD binary downloads and store them as daily netCDF files under output_path.

    Every monthly archive found in the yearly sub-folders of input_path is opened,
    its hourly members are read one by one and appended to a netCDF file named
    ``<YYYY><MMDD>_radolan_rw.nc``. The year is taken from the last four characters
    of the yearly folder path, month and day from the name of the hourly member.

    DWD naming pattern of the hourly members: raa01-rw_10000-YYMMDDhhmm-dwd---bin.gz

    Parameters:
    ------
    input_path: str
        path to the folder where binary DWD downloads are stored (yearly folders).
        Usually something like *"./opendata.dwd.de/climate_environment/CDC/grids_germany/hourly/radolan/historical/bin"*
    output_path: str
        where to store generated netCDF files
    if_exists: {fail, replace, skip}
        What to do if a daily netCDF file already exists before this call touches it.
        'fail' raises a ValueError, 'replace' deletes the file and generates it again,
        'skip' leaves the file untouched and ignores all hourly members of that day.
        If you use 'skip', it is probably a good idea to also pass delete_last=True (or delete the
        last generated netCDF by hand), to make sure that the last file is not corrupted due to an
        interruption during its creation.
    delete_last: bool
        Whether to delete the last file (in sorted order) in output_path before processing.
        This option only makes sense if your last call of tar2netcdf() got interrupted and you are
        not sure if the netCDF file generated last was fully completed, so you can generate that
        netCDF file again.

    Raises:
    ------
    ValueError
        If if_exists is not one of the allowed values, if a daily netCDF file already
        exists and if_exists is 'fail', or if an hourly member could not be converted
        (the message then names the member and the archive it came from).
    """
    if if_exists not in ('fail', 'replace', 'skip'):
        raise ValueError(f"if_exists must be one of 'fail', 'replace' or 'skip', got {if_exists!r}")

    # get the absolute output_path, where netCDF files are saved
    output_path = os.path.abspath(output_path)

    # create folder in output path
    os.makedirs(output_path, exist_ok=True)

    # delete last created netCDF file in output_path if requested
    if delete_last:
        existing_files = sorted(glob(f"{output_path}/*"))
        if existing_files:
            os.remove(existing_files[-1])

    # daily files already decided on in this call: True -> append hours, False -> skip hours.
    # Needed to tell files created by this call apart from files left over from earlier calls.
    write_to = {}

    # loop over binary files
    for year in sorted(glob(f"{input_path}/*")):
        print(f"Extracting data for the year {year[-4:]}")
        for month in tqdm.tqdm(sorted(glob(year + '/*'))):
            with gzip.open(month, 'r') as fd, tarfile.open(fileobj=fd) as tar_month:
                for fn in sorted(f.name for f in tar_month.getmembers()):
                    # fn: 'raa01-rw_10000-0506010050-dwd---bin.gz', fn[-21:-17] -> %m%d
                    fn_netcdf = f"{output_path}/{year[-4:]}{fn[-21:-17]}_radolan_rw.nc"

                    # first hour of a day seen in this call: apply the if_exists policy once
                    if fn_netcdf not in write_to:
                        already_there = os.path.exists(fn_netcdf)
                        if already_there and if_exists == 'fail':
                            raise ValueError(f"netCDF file {fn_netcdf} already exists")
                        if already_there and if_exists == 'skip':
                            write_to[fn_netcdf] = False
                        else:
                            if already_there:
                                os.remove(fn_netcdf)
                            # create (empty) daily netCDF
                            rtn.create_empty_netcdf(fn=fn_netcdf, product_name='RW')
                            write_to[fn_netcdf] = True

                    if not write_to[fn_netcdf]:
                        continue

                    # extract hourly file
                    f_hour = tar_month.extractfile(fn)

                    try:
                        with gzip.open(f_hour) as gz_hour:
                            # extract hourly data, append to previously created daily netCDF
                            data, metadata = rtn.read_in_one_bin_file(gz_hour)
                            rtn.append_to_netcdf(
                                fn_netcdf,
                                data_list=[data, ],
                                metadata_list=[metadata, ],
                            )
                    except ValueError as err:
                        # name the offending member so a broken download can be located
                        raise ValueError(f"Failed to convert {fn} from {month}: {err}") from err

                

In [5]:
# Convert the downloaded hourly RADOLAN RW archives into daily netCDF files under ./data/
tar2netcdf(
    input_path="./opendata.dwd.de/climate_environment/CDC/grids_germany/hourly/radolan/historical/bin/",
    output_path="./data/",
    if_exists="skip",
)


Extracting data for the year 2005


 29%|██▊       | 2/7 [01:31<03:48, 45.73s/it]


ValueError: time data '0508090  700' does not match format '%d%H%M%m%y%S'


## IDEE

kaputte Datei finden -> in Text konvertieren -> Fehler beheben -> wieder als binary speichern

In [9]:
# Open one of the generated daily netCDF files to inspect its contents
xr.open_mfdataset("./data/20050603_radolan_rw.nc")

Unnamed: 0,Array,Chunk
Bytes,6.18 MiB,6.18 MiB
Shape,"(900, 900)","(900, 900)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 6.18 MiB 6.18 MiB Shape (900, 900) (900, 900) Count 2 Graph Layers 1 Chunks Type float64 numpy.ndarray",900  900,

Unnamed: 0,Array,Chunk
Bytes,6.18 MiB,6.18 MiB
Shape,"(900, 900)","(900, 900)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.18 MiB,6.18 MiB
Shape,"(900, 900)","(900, 900)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 6.18 MiB 6.18 MiB Shape (900, 900) (900, 900) Count 2 Graph Layers 1 Chunks Type float64 numpy.ndarray",900  900,

Unnamed: 0,Array,Chunk
Bytes,6.18 MiB,6.18 MiB
Shape,"(900, 900)","(900, 900)"
Count,2 Graph Layers,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,74.16 MiB,74.16 MiB
Shape,"(24, 900, 900)","(24, 900, 900)"
Count,2 Graph Layers,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 74.16 MiB 74.16 MiB Shape (24, 900, 900) (24, 900, 900) Count 2 Graph Layers 1 Chunks Type float32 numpy.ndarray",900  900  24,

Unnamed: 0,Array,Chunk
Bytes,74.16 MiB,74.16 MiB
Shape,"(24, 900, 900)","(24, 900, 900)"
Count,2 Graph Layers,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,48 B,48 B
Shape,"(24,)","(24,)"
Count,2 Graph Layers,1 Chunks
Type,int16,numpy.ndarray
"Array Chunk Bytes 48 B 48 B Shape (24,) (24,) Count 2 Graph Layers 1 Chunks Type int16 numpy.ndarray",24  1,

Unnamed: 0,Array,Chunk
Bytes,48 B,48 B
Shape,"(24,)","(24,)"
Count,2 Graph Layers,1 Chunks
Type,int16,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,74.16 MiB,74.16 MiB
Shape,"(24, 900, 900)","(24, 900, 900)"
Count,2 Graph Layers,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 74.16 MiB 74.16 MiB Shape (24, 900, 900) (24, 900, 900) Count 2 Graph Layers 1 Chunks Type float32 numpy.ndarray",900  900  24,

Unnamed: 0,Array,Chunk
Bytes,74.16 MiB,74.16 MiB
Shape,"(24, 900, 900)","(24, 900, 900)"
Count,2 Graph Layers,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,74.16 MiB,74.16 MiB
Shape,"(24, 900, 900)","(24, 900, 900)"
Count,2 Graph Layers,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 74.16 MiB 74.16 MiB Shape (24, 900, 900) (24, 900, 900) Count 2 Graph Layers 1 Chunks Type float32 numpy.ndarray",900  900  24,

Unnamed: 0,Array,Chunk
Bytes,74.16 MiB,74.16 MiB
Shape,"(24, 900, 900)","(24, 900, 900)"
Count,2 Graph Layers,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,74.16 MiB,74.16 MiB
Shape,"(24, 900, 900)","(24, 900, 900)"
Count,2 Graph Layers,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 74.16 MiB 74.16 MiB Shape (24, 900, 900) (24, 900, 900) Count 2 Graph Layers 1 Chunks Type float32 numpy.ndarray",900  900  24,

Unnamed: 0,Array,Chunk
Bytes,74.16 MiB,74.16 MiB
Shape,"(24, 900, 900)","(24, 900, 900)"
Count,2 Graph Layers,1 Chunks
Type,float32,numpy.ndarray


Metadata: 
- https://opendata.dwd.de/climate_environment/CDC/grids_germany/hourly/radolan/historical/bin/DESCRIPTION_gridsgermany-hourly-radolan-historical-bin_en.pdf
- https://opendata.dwd.de/climate_environment/CDC/grids_germany/hourly/radolan/historical/bin/BESCHREIBUNG_gridsgermany-hourly-radolan-historical-bin_de.pdf