In [None]:
import xarray as xr
from ocf_blosc2 import Blosc2
from tqdm import tqdm
import h5py

In [None]:
# Per-month (start_hour, stop_hour) pairs, keyed by calendar month number
# (1-12). The processing loop compares these bounds (after its own
# adjustments) against the hour of each sample's `time` coordinate.
month_to_times = {
    # January - March
    1: (8, 16), 2: (8, 17), 3: (7, 18),
    # April - June
    4: (7, 19), 5: (6, 20), 6: (5, 20),
    # July - September
    7: (5, 20), 8: (6, 20), 9: (7, 19),
    # October - December
    10: (7, 18), 11: (7, 16), 12: (8, 16),
}
import h5py
import pandas as pd
# Dataset family to process. It selects the input directory
# (/data/<data_type>/...), the output directory (/data/<data_type>_proc/...),
# and whether the extra forward-looking hours are kept.
# Named `data_type` rather than `type` so the builtin `type` is not shadowed.
data_type = "weather"
# Variables copied from each monthly dataset into the HDF5 output.
NWP_FEATURES = ["t_500", "clcl", "alb_rad", "tot_prec", "ww", "relhum_2m", "h_snow", "aswdir_s", "td_2m", "omega_1000"]


def _hour_window(month, data_type):
    """Return the inclusive (start, stop) hour-of-day bounds kept for *month*.

    The base window comes from ``month_to_times`` and is widened by one hour
    at the start for every data type, and by four hours at the end when
    *data_type* is ``"weather"``.
    """
    start, stop = month_to_times[month]
    start -= 1  # all data types look backward an hour
    if data_type == "weather":
        stop += 4  # ONLY FOR WEATHER, since we look forward 4 hrs
    return start, stop


def _write_hdf5(dataset, output_path):
    """Write the indexes and data variables of *dataset* to *output_path*.

    Each index except ``channel`` and each data variable becomes one
    LZF-compressed HDF5 dataset named after the variable. The ``time`` index
    is stored as POSIX timestamps (float seconds) instead of datetimes.
    """
    with h5py.File(output_path, 'w') as hdf_file:
        for var in dataset.indexes:
            print(f"outputing var {var}")
            if var == 'channel':
                # Skip before touching `.values` so nothing is loaded for an
                # index that is never written.
                continue
            data = dataset[var].values
            if var == "time":
                # special handling is required: convert each datetime to a
                # float POSIX timestamp before handing it to h5py
                data = [pd.to_datetime(stamp).timestamp() for stamp in data]
                print("number of times:", len(data))
            hdf_file.create_dataset(var, data=data, compression='lzf')
        for var in dataset.data_vars:
            print(f"outputing var {var}")
            # `.values` materialises the (possibly dask-backed) variable in memory.
            data = dataset[var].values
            hdf_file.create_dataset(var, data=data, compression='lzf')


# Convert each month of 2021 from a zipped zarr store to an HDF5 file that
# holds only the selected features within that month's hour window.
for month in tqdm(range(1, 12 + 1)):
    print('opening dataset')
    # The context manager closes the zarr store once the month is written,
    # instead of leaving one open store behind per iteration.
    with xr.open_dataset(
        f"/data/{data_type}/2021/{month}.zarr.zip",
        engine="zarr",
        consolidated=True,
        chunks={"time": "auto"},
    ) as dataset:
        start, stop = _hour_window(month, data_type)
        print('filtering')
        hour_of_day = dataset['time'].dt.hour
        in_window = (hour_of_day >= start) & (hour_of_day <= stop)
        # Select the needed variables first so masking and rechunking are not
        # set up for variables that are discarded anyway; the mask depends
        # only on `time`, so the result for the kept variables is the same.
        filtered_dataset_lazy = dataset[NWP_FEATURES].where(in_window, drop=True)
        print('chunking')
        filtered_dataset_lazy = filtered_dataset_lazy.chunk("auto")
        print('writing')
        output_path = f"/data/{data_type}_proc/2021/{month}.hdf5"
        # Must run inside the `with` block: the data is read lazily from the
        # open store while it is being written.
        _write_hdf5(filtered_dataset_lazy, output_path)

