In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import xarray as xr
import geopandas as gpd

import hvplot.pandas
import hvplot.xarray
import geoviews as gv

import rioxarray as rxr
from rioxarray.merge import merge_arrays
from tqdm.auto import tqdm

# Root folder for the notebook's datasets, resolved relative to the current working directory.
data_root = Path("./data")

In [2]:
def load_kp(data_root):
    """Load the CAMELS-FR gauge outlets with an added ``area`` column.

    Parameters
    ----------
    data_root : pathlib.Path
        Folder containing the ``CAMELS_FR_attributes`` and
        ``CAMELS_FR_geography`` directories.

    Returns
    -------
    geopandas.GeoDataFrame
        Gauge outlet layer reprojected to EPSG:4326, carrying the
        ``sit_area_topo`` attribute in a column named ``area``.
    """
    attributes_csv = data_root / "CAMELS_FR_attributes/static_attributes/CAMELS_FR_site_general_attributes.csv"
    outlets_gpkg = data_root / "./CAMELS_FR_geography/CAMELS_FR_gauge_outlet.gpkg"

    site_attrs = pd.read_csv(attributes_csv, sep=";")
    outlets = gpd.read_file(outlets_gpkg)

    # NOTE(review): this assignment aligns on the row index, so it presumes
    # both files list the sites in the same order - confirm, or join on a key.
    outlets["area"] = site_attrs["sit_area_topo"]

    return outlets.to_crs("EPSG:4326")

In [3]:
# Gauge outlets, already reprojected to EPSG:4326 by load_kp.
gdf = load_kp(data_root)
# Overlay the gauges on the ESRI tile source; being the last expression, the plot is the cell output.
gv.tile_sources.ESRI * gdf.hvplot(geo=True)

### MERIT

In [1]:
# MERIT section setup. This cell needs `gpd` and `data_root` from the first
# cell, so that cell has to be executed in the same kernel beforehand.
var = "dir"
tiles = ["n30e000", "n30w030"]

# Geometry of the first feature in the shapefile; reused further down as the clipping polygon.
# NOTE(review): this path is relative to the working directory, not to data_root - confirm.
poly = gpd.read_file("hydro_data/data/shp/fr.shp").loc[0,"geometry"]

# Every GeoTIFF found in the per-tile folders of `var`, in tile order.
paths = [
    tif
    for tile in tiles
    for tif in (data_root / "MERIT" / f"{var}_{tile}").glob("*.tif")
]

NameError: name 'gpd' is not defined

In [2]:
def read_func(var, tiles=("n30e000", "n30w030"), root=None):
    """Mosaic the MERIT tiles of one variable and crop them to a fixed window.

    Tiles are searched in ``<root>/MERIT/<var>_<tile>/*.tif``.

    Parameters
    ----------
    var : str
        Variable prefix of the tile folders (the notebook uses "upa" and "dir").
    tiles : sequence of str, optional
        Tile identifiers to include, in merge order. Defaults to the two
        tiles originally hard-coded here.
    root : path-like, optional
        Data root folder. When omitted, the notebook-level ``data_root`` is used.

    Returns
    -------
    xarray.DataArray
        Merged raster without the ``band`` dimension, restricted to
        x in [-5, 10] and y in [51, 41] (y is sliced from high to low).

    Raises
    ------
    FileNotFoundError
        If no ``*.tif`` file is found for ``var`` in any of the tile folders.
    """
    merit_dir = Path(data_root if root is None else root) / "MERIT"

    # glob() order depends on the filesystem; sorting inside each tile keeps
    # the merge input reproducible while preserving the requested tile order.
    paths = [
        tif
        for tile in tiles
        for tif in sorted((merit_dir / f"{var}_{tile}").glob("*.tif"))
    ]
    if not paths:
        # Fail with a clear message instead of an obscure error from merge_arrays.
        raise FileNotFoundError(
            f"no MERIT tiles found for variable {var!r} under {merit_dir} (tiles: {list(tiles)})"
        )

    # Open all tiles lazily
    das = [
        rxr.open_rasterio(p, masked=True).squeeze("band", drop=True)
        for p in paths
    ]

    # Merge into a single grid
    da = merge_arrays(das)

    # Crop to the study window.
    return da.sel(x=slice(-5, 10), y=slice(51, 41))

In [28]:
# Destination folder for the clipped MERIT rasters.
merit_out = data_root / "MERIT_processed"
merit_out.mkdir(exist_ok=True)

# GeoTIFF creation options shared by every exported variable.
gtiff_options = dict(
    driver="GTiff",
    compress="DEFLATE",
    tiled=True,
    BIGTIFF="IF_SAFER",
)

# Only "upa" and "dir" are exported here.
for var in tqdm(["upa", "dir"]):
    da = read_func(var)

    # Drop nodata-related attributes when present, before clipping and writing.
    for k in ("_FillValue", "missing_value"):
        da.attrs.pop(k, None)

    # Clip to the polygon loaded in the MERIT setup cell, then write one GeoTIFF per variable.
    da = da.rio.clip([poly])
    da.rio.to_raster(merit_out / f"{var}.tif", **gtiff_options)

  0%|          | 0/2 [00:00<?, ?it/s]

  result = runner(coro)
  return data.astype(dtype, **kwargs)


### Dump processed SIM data to NetCDF

In [7]:
# Convert every SIM CSV export into a gridded NetCDF file (one output per input file).

# Output folder, created up front so to_netcdf does not fail when it is missing.
sim_out = data_root / "SIM_processed"
sim_out.mkdir(parents=True, exist_ok=True)

# CSV columns that are rasterised into the dataset.
variables = [
    'PRENEI', 'PRELIQ', 'T', 'FF', 'Q', 'DLI',
    'SSI', 'HU', 'EVAP', 'ETP', 'PE', 'SWI', 'SSWI_10J', 'DRAINC', 'RUNC',
    'RESR_NEIGE', 'RESR_NEIGE6', 'HTEURNEIGE', 'HTEURNEIGE6', 'HTEURNEIGEX',
    'SNOW_FRAC', 'ECOULEMENT', 'WG_RACINE', 'WGI_RACINE', 'TINF_H',
    'TSUP_H'
]

# Grid coordinates (identical for every file): spacing of 80, y stored in descending order.
x_coords = np.arange(600, 11960+80, 80)
y_coords = np.arange(16170, 26810+80, 80)[::-1]

for fp in tqdm(sorted((data_root / "SIM").glob("*.csv"))):
    x = pd.read_csv(fp, sep=";")

    start_x, diff_x = x["LAMBX"].min(), 80
    start_y, diff_y = x["LAMBY"].min(), 80

    # Integer cell indices relative to the smallest LAMBX / LAMBY found in this file.
    x["x"] = ((x["LAMBX"] - start_x) / diff_x).astype(int)
    x["y"] = ((x["LAMBY"] - start_y) / diff_y).astype(int)
    x["time"] = pd.to_datetime(x["DATE"].astype(str), format="%Y%m%d")

    # Day offset from the earliest date in the file. Using min() instead of
    # the first row keeps the offsets non-negative even if rows are unsorted.
    t0 = x["time"].min()
    x["t"] = (x["time"] - t0).dt.days
    n_days = int(x["t"].max()) + 1

    # Build the time axis from the same origin and length as the `t` index,
    # so labels always match the data axis (missing days stay NaN).
    time_coords = pd.date_range(t0, periods=n_days, freq="D")
    ds = xr.Dataset(coords={"x": x_coords, "y": y_coords, "time": time_coords})

    ix = x["x"].to_numpy()
    iy = x["y"].to_numpy()
    it = x["t"].to_numpy()
    shape = (ix.max() + 1, iy.max() + 1, n_days)

    for var in tqdm(variables):
        # Scatter the long-format records into a dense (x, y, time) cube; cells without a record stay NaN.
        data = np.full(shape, np.nan)
        data[ix, iy, it] = x[var].to_numpy()
        # Flip the y axis so it lines up with the descending y_coords.
        ds[var] = (("x", "y", "time"), data[:, ::-1])

    # Output name is the last "_"-separated token of the CSV file stem.
    ds.to_netcdf(sim_out / (fp.stem.split("_")[-1] + ".nc"))
    del ds

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/26 [00:00<?, ?it/s]

  0%|          | 0/26 [00:00<?, ?it/s]

  0%|          | 0/26 [00:00<?, ?it/s]

  0%|          | 0/26 [00:00<?, ?it/s]

  0%|          | 0/26 [00:00<?, ?it/s]

  0%|          | 0/26 [00:00<?, ?it/s]

In [8]:
# Open all NetCDF files in SIM_processed as one dataset; %time reports how long the open takes.
%time ds = xr.open_mfdataset((data_root / "SIM_processed").glob("*.nc"))

CPU times: user 205 ms, sys: 38.5 ms, total: 244 ms
Wall time: 432 ms


In [9]:
# Write the combined dataset to a single NetCDF file (timed).
# NOTE(review): the "dataset" folder is not created in the visible cells - confirm it exists before running.
%time ds.to_netcdf(data_root / "dataset" / "inp_dyn_.nc")

CPU times: user 11.9 s, sys: 4min 5s, total: 4min 17s
Wall time: 4min 24s
