
# RADOLAN RW download and upload to metacatalog, including creation of metadata

This is the final solution, using `radolan_to_netcdf` for download and splitting the netCDF daily when uploading to metacatalog!

All available RADOLAN RW (hourly resolution) data: **2005 - 2021**

In [14]:
import tarfile
import gzip
from glob import glob
import os

import tqdm
import xarray as xr

import radolan_to_netcdf as rtn
#import cf

from metacatalog import api, ext

In [2]:
%%time

!wget -q --show-progress -r -np -A .tar.gz -R "index.html*" https://opendata.dwd.de/climate_environment/CDC/grids_germany/hourly/radolan/historical/bin/

opendata.dwd.de/cli     [ <=>                ]   2,42K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,10K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,68K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,68K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,68K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,68K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,68K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,68K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,68K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,68K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,68K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>                ]   1,68K  --.-KB/s    in 0s      
opendata.dwd.de/cli     [ <=>           

Function to extract downloaded binary files into netCDF files

In [45]:
def tar2netcdf(input_path: str, output_path: str, if_exists: str, delete_last=False):
    """
    Untar DWD binary downloads and store as daily netCDF files under path.

    DWD naming pattern of inner tar: raa01-rw_10000-YYMMDDhhmm-dwd--bin

    Parameters:
    ------
    input_path: str 
        path to the folder where binary DWD downloads are stored (yearly folders).
        Usually something like *"./opendata.dwd.de/climate_environment/CDC/grids_germany/5_minutes/radolan/reproc/2017_002/bin"*
    output_path: str
        where to store generated netCDF files
    if_exists: {fail, replace, skip}
        What to do if netcdf file already exists.
        If you use 'skip', it is probably a good idea to delete the last generated netCDF in the output_folder by hand to 
        generate this file again and make sure that the file is not corrupted due an interruption while last creation of the file.
    delete_last: bool
        Whether to delete the last generated file in the output_folder.
        This option makes only sense if your last call of tar2netcdf() got interrupted and you are not sure if the netCDF file 
        generated last was fully completed, so you can generate that netCDF file again.  
    """
    # get the absolute output_path, where netCDF files are saved
    output_path = os.path.abspath(output_path)

    # create folder in output path
    os.makedirs(output_path, exist_ok=True)

    # delete last created netCDF file in output_path if delete_last == True
    if delete_last:
        existing_files = sorted(glob(f"{output_path}/*"))
        if len(existing_files) >= 1:
            os.remove(existing_files[-1])

    # loop over binary files
    for year in sorted(glob(f"{input_path}/*")):
        print(f"Extracting data for the year {year[-4:]}")
        for month in tqdm.tqdm(sorted(glob(year + '/*'))):
            print(month)
            with gzip.open(month, 'r') as fd:
                with tarfile.open(fileobj = fd) as tar_month:
                    fn_list_month = sorted([f.name for f in tar_month.getmembers()])
                    
                    for fn in fn_list_month:
                        # fn: 'raa01-rw_10000-0506010050-dwd---bin.gz'
                        # netCDF file name
                        fn_netcdf = f"{output_path}/{year[-4:]}{fn[-21:-17]}_radolan_rw.nc" # fn[-21:-17] -> %m%d
                        
                        if os.path.exists(fn_netcdf):
                            if if_exists == 'fail':
                                raise ValueError(f"netCDF file {output_path}/{fn[-15:-7]}_radklim_yw.nc already exists")
                            elif if_exists == 'skip':
                                continue
                        
                        # create (empty) daily netCDF                    
                        rtn.create_empty_netcdf(fn=fn_netcdf, product_name='YW')
                        
                        f_month = tar_month.extractfile(fn)

                        with gzip.open(f_month) as gz_hour:
                            print(gz_hour)
                            data, metadata = rtn.read_in_one_bin_file(gz_hour)
                            return data, metadata
                

                raise
                # loop over daily files
                for fn in fn_list:
                    f = tar.extractfile(fn)

                    # netCDF file name
                    fn_netcdf = f"{output_path}/{fn[-15:-7]}_radklim_yw.nc"

                    if os.path.exists(fn_netcdf):
                        if if_exists == 'fail':
                            raise ValueError(f"netCDF file {output_path}/{fn[-15:-7]}_radklim_yw.nc already exists")
                        elif if_exists == 'skip':
                            continue
                        
                    # create (empty) daily netCDF                    
                    rtn.create_empty_netcdf(fn=fn_netcdf, product_name='YW')
                    
                    # daily files contain the 5-minute data (tar_inner)
                    with tarfile.open(fileobj=f) as tar_inner:
                        fn_list_inner = sorted([f.name for f in tar_inner.getmembers()])

                        for fn_inner in fn_list_inner:
                            # extract 5-minute data, append to previously created daily netCDF
                            data, metadata = rtn.read_in_one_bin_file(tar_inner.extractfile(fn_inner))
                            rtn.append_to_netcdf(
                                fn_netcdf, 
                                data_list=[data, ], 
                                metadata_list=[metadata, ],
                            )



In [54]:
'raa01-rw_10000-0506010050-dwd---bin.gz'[-21:-17] # %m%d -> new 

'0601'

In [46]:
tar2netcdf(input_path="./opendata.dwd.de/climate_environment/CDC/grids_germany/hourly/radolan/historical/bin/",
           output_path="./data/", if_exists="skip")


Extracting data for the year 2005


  0%|          | 0/7 [00:00<?, ?it/s]

./opendata.dwd.de/climate_environment/CDC/grids_germany/hourly/radolan/historical/bin/2005/RW-200506.tar.gz





'raa01-rw_10000-0506010050-dwd---bin.gz'

In [20]:
import gzip
import tarfile
import radolan_to_netcdf as rtn

with gzip.open("./opendata.dwd.de/climate_environment/CDC/grids_germany/hourly/radolan/historical/bin/2005/RW-200506.tar.gz") as fd: 
    with tarfile.open(fileobj = fd) as tar_month:
        for item in tar_month:
            #print(item)
            with tar_month.extractfile(item) as tar_hour:
                #print(tar_hour)
                with gzip.open(tar_hour) as gz_inner:
                    data, metadata = rtn.read_in_one_bin_file(gz_inner)
print(data)
print(metadata)

[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
{'producttype': 'RW', 'datetime': datetime.datetime(2005, 6, 30, 23, 45), 'radarid': '10000', 'datasize': 1620000, 'formatversion': 2, 'maxrange': '128 km', 'radolanversion': '01.00.00', 'precision': 0.1, 'intervalseconds': 3600, 'nrow': 900, 'ncol': 900, 'radarlocations': ['bln', 'drs', 'eis', 'emd', 'ess', 'fbg', 'fld', 'fra', 'ham', 'han', 'muc', 'neu', 'nhb', 'ros', 'tur', 'umd'], 'nodataflag': nan, 'secondary': array([], dtype=int64), 'nodatamask': array([     0,      1,      2, ..., 809997, 809998, 809999]), 'cluttermask': array([], dtype=int64)}


In [88]:
data, metadata = rtn.read_in_one_bin_file("./opendata.dwd.de/climate_environment/CDC/grids_germany/hourly/radolan/historical/bin/2005/RW-200506/raa01-rw_10000-0506010050-dwd---bin")

In [87]:
f.read()

ValueError: I/O operation on closed file

In [40]:
with tarfile.open("opendata.dwd.de/climate_environment/CDC/grids_germany/hourly/radolan/historical/bin/2005/RW-200506.tar.gz") as tar:
    fn_list = sorted([f.name for f in tar.getmembers()])
    fn = fn_list[0]
    hour = tar.extractfile(fn)
    print(tar.extractfile(hour))
    with tarfile.open(fileobj = hour) as tar_inner:
        print(tar_inner)

AttributeError: 'ExFileObject' object has no attribute 'isreg'

In [38]:
tar.extractfile(fn)

OSError: TarFile is closed

In [33]:
os.path.abspath(fn)

'/home/alexander/Github/scripts/dwd_radar/radolan_rw/raa01-rw_10000-0506010050-dwd---bin.gz'

In [20]:
tar.extractfile(fn)

OSError: TarFile is closed