## Setting Up:

In [None]:
import pandas as pd
import os
import warnings
from tqdm.notebook import tqdm
import zipfile
import cdsapi
import zipfile
import numpy as np
import glob
import xarray as xr

from scripts.config_CH import *

warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

## Download monthly ERA5-Land variables:

In [None]:
# Download switch: set RUN to True to fetch the raw files from the Copernicus
# Climate Data Store. With RUN = False this cell does nothing.
RUN = False
if RUN:
    os.makedirs(path_ERA5_raw, exist_ok=True)
    c = cdsapi.Client()

    # Monthly-mean ERA5-Land fields, delivered as a zip archive of NetCDF files.
    zip_path = path_ERA5_raw + 'download.netcdf.zip'
    c.retrieve(
        'reanalysis-era5-land-monthly-means',
        {
            'product_type': ['monthly_averaged_reanalysis'],
            'variable': [
                '10m_u_component_of_wind',
                '10m_v_component_of_wind',
                '2m_temperature',
                'forecast_albedo',
                'snow_cover',
                'snow_density',
                'snow_depth_water_equivalent',
                'snowfall',
                'snowmelt',
                'surface_latent_heat_flux',
                'surface_net_thermal_radiation',
                'surface_sensible_heat_flux',
                'surface_solar_radiation_downwards',
                'total_precipitation',
            ],
            # Every year from 1950 to 2024 inclusive, as strings.
            'year': [str(year) for year in range(1950, 2025)],
            # Every calendar month, zero-padded ('01' ... '12').
            'month': [f'{month:02d}' for month in range(1, 13)],
            'time': ['00:00'],
            'data_format': 'netcdf',
            'download_format': 'zip',
            # NOTE(review): presumably [north, west, south, east] in degrees,
            # following the CDS API convention -- confirm.
            'area': [50, 0, 45, 20],
        },
        zip_path,
    )

    # Unpack next to the archive. The handle is deliberately not named `zip`,
    # which would shadow the builtin for the rest of the notebook session.
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(path_ERA5_raw)

    # Single geopotential field from ERA5 single levels (no 'area' is given in
    # this request). It is saved as a .nc file in the same raw directory.
    c.retrieve(
        'reanalysis-era5-single-levels',
        {
            'product_type': ['reanalysis'],
            'variable': ['geopotential'],
            'year': ['2024'],
            'month': ['06'],
            'day': ['01'],
            'time': ['12:00'],
            'data_format': 'netcdf',
        },
        path_ERA5_raw + 'era5_geopotential_pressure.nc',
    )

In [None]:
# List the contents of the raw ERA5-Land directory (IPython shell escape).
# NOTE(review): the absolute path is hard-coded; presumably it is the same
# directory as path_ERA5_raw -- confirm before running on another machine.
! ls '/home/mburlet/scratch/data/DATA_MB/DEV/ERA5Land/raw/'

In [None]:
# Open one of the files in the raw directory so the notebook displays its
# structure (dimensions, coordinates, variables).
xr.open_dataset(f"{path_ERA5_raw}data_0.nc")

In [None]:
# Open every NetCDF file of the raw directory that belongs to the monthly
# download.
#
# Two files that this notebook itself places in the same directory are skipped:
# the merged output written by the last cell (re-running the notebook would
# otherwise feed its own result back in) and the single-levels geopotential
# download, which comes from a different dataset and was requested without the
# 'area' restriction.
# Sorting makes the processing order reproducible; glob order is unspecified.
skipped_files = {
    "era5_monthly_averaged_data.nc",
    "era5_geopotential_pressure.nc",
}
dcs = [
    xr.open_dataset(path)
    for path in sorted(glob.glob(path_ERA5_raw+'*.nc'))
    if os.path.basename(path) not in skipped_files
]

In [None]:
# Sorted, de-duplicated names of all coordinates and data variables that
# appear in any of the opened datacubes.
coordsVar = sorted({name for dc in dcs for name in dc.coords})
dataVar = sorted({name for dc in dcs for name in dc.data_vars})
print(f"{coordsVar=}")
print(f"{dataVar=}")

# Pool the values of every non-scalar coordinate over all cubes, and remember
# the dtype of each data variable as found in the first cube that contains it.
coords = {name: [] for name in coordsVar}
dataTypes = {}
for dc in dcs:
    for name in dc.coords:
        if dc[name].ndim > 0:
            coords[name].extend(dc[name].values)
    for name in dc.data_vars:
        dataTypes.setdefault(name, dc[name].dtype)

# Reduce each pooled list to a sorted array of unique values; coordinates
# that were scalar in every cube end up as empty arrays.
coords = {
    name: np.sort(np.unique(np.array(values)))
    for name, values in coords.items()
}

In [None]:
# Coordinates that take part in the array shape but are not treated as
# regular dimensions of the merged cube.
discardVar = ['expver']

# Only coordinates that actually carry values contribute an axis.
non_empty = [k for k in coords if len(coords[k]) > 0]

# Full array shape: one axis per non-empty coordinate, in `coords` order
# (this includes the discarded coordinate when it has values).
size = tuple(len(coords[k]) for k in non_empty)
print(f"{size=}")

# Dimension names, i.e. the non-empty coordinates minus the discarded ones.
dims = tuple(k for k in non_empty if k not in discardVar)
print(f"{dims=}")

# One zero-initialised array per data variable, using the dtype recorded
# for that variable.
data = {k: np.zeros(size, dtype=dataTypes[k]) for k in dataVar}

In [None]:
# Copy every datacube into the pre-allocated merged arrays.
#
# The previous version assigned through a chain of advanced-index expressions
# (`arr[idx0, :, :][:, idx1, :][:, :, idx2] = ...`). Advanced indexing returns
# a copy, so the values were written to a temporary and `data[k]` stayed all
# zeros. A single `np.ix_` assignment on a view writes in place.

def _positions(cube, name):
    """Return, for each value of coordinate `name` in `cube`, its index in coords[name].

    coords[name] is sorted and unique, so a plain binary search gives the
    position of every value.
    """
    return np.searchsorted(coords[name], list(np.atleast_1d(cube[name].values)))


if len(discardVar) != 1:
    raise ValueError(
        f"expected exactly one discarded coordinate, got {discardVar!r}"
    )

for e, dc in enumerate(dcs):
    print(f"Processing datacube n°{e} / {len(dcs)}")

    # Positions of this cube's coordinate values along each merged dimension.
    # They do not depend on the variable, so compute them once per cube.
    idx = [_positions(dc, dim) for dim in dims]
    target = np.ix_(*idx)

    # Slabs along the discarded coordinate that occur in this cube. As before,
    # each of them receives the complete cube; duplicates are dropped so the
    # same slab is not rewritten once per time step.
    slabs = np.unique(_positions(dc, discardVar[0]))

    for k in tqdm(dc.data_vars):
        # Order the axes like `dims` by name instead of a hard-coded permutation.
        cube_values = dc[k].transpose(*dims).values
        for v in slabs:
            # data[k][v] is a view (integer index), so the assignment lands in
            # data[k]. NOTE(review): this relies on the discarded coordinate
            # being the leading axis of data[k] -- confirm against `size`.
            data[k][v][target] = cube_values


In [None]:
# Wrap the merged arrays in a Dataset: each variable is labelled with the
# discarded coordinate(s) first, followed by the regular dimensions.
merged_dims = tuple(discardVar) + dims
merged_vars = {k: (merged_dims, data[k]) for k in data}
ds = xr.Dataset(merged_vars, coords=coords)

# Store the merged cube alongside the raw files.
ds.to_netcdf(path_ERA5_raw + "era5_monthly_averaged_data.nc")