### Note: the I/O paths in this notebook need to be adjusted to the infrastructure it runs on.

In [None]:
import os
import sys

import numpy as np
import pandas as pd
import xarray as xr
from PyStemmusScope import variable_conversion as vc
from sklearn.preprocessing import OneHotEncoder

# Extend the module search path with '.' before importing `utils`.
sys.path.append('.')
from utils import interpolation, era5land_accumulated_vars, map_landcover_to_igbp, landcover_to_igbp

# Time window of the data to process. Both strings are interpolated verbatim
# into the file names below, so they must match the names on disk exactly.
# NOTE(review): start_time is not zero-padded ("2014-1-31") while end_time is
# ("2014-02-05") -- confirm this matches the actual file names.
start_time = "2014-1-31"
end_time = "2014-02-05"

# Root directory of the input data; adjust to the infrastructure in use.
parent_in_path = "./data"

# The per-dataset NetCDF files all follow one naming pattern:
#   <parent_in_path>/NL/<dataset>_<start_time>_<end_time>_NL.nc
# so the paths are generated instead of being spelled out one by one.
nl_datasets = (
    "era5land",
    "lai",
    "ssm",
    "co2",
    "landcover",
    "vcmax",
    "canopyheight",
    "all_data",
)
data_paths = {
    dataset: f"{parent_in_path}/NL/{dataset}_{start_time}_{end_time}_NL.nc"
    for dataset in nl_datasets
}
# CSV lookup tables used for the land cover -> IGBP conversion.
data_paths["igbp_table"] = f"{parent_in_path}/auxiliary/lccs_to_igbp_table.csv"
data_paths["igbp_class"] = f"{parent_in_path}/auxiliary/IGBP11unique.csv"

# Directory that receives the files written by this notebook; it is created
# up front (including missing parents) and re-running the cell is harmless.
# `os` must be imported at the top of the notebook for this call to work.
parent_out_path = "/scratch/outputs/NL"
os.makedirs(parent_out_path, exist_ok=True)

In [None]:
# For every auxiliary dataset: the name of the data variable to pull out of
# the file once it has gone through the `interpolation` helper.
variable_names = {
    "lai": "LAI",
    "ssm": "band_data",
    "co2": "co2",
    "canopyheight": "__xarray_dataarray_variable__",
    "vcmax": "__xarray_dataarray_variable__",
    "landcover": "lccs_class",
}

# interpolation
# The ERA5-Land dataset provides the time/longitude/latitude coordinates that
# are handed to the `interpolation` helper, and it is also the dataset into
# which the resulting variables are merged.
era5land = xr.open_dataset(data_paths["era5land"])
other_coords = {
    dim: era5land[dim] for dim in ("time", "longitude", "latitude")
}

for name, variable in variable_names.items():
    ds = xr.open_dataset(data_paths[name])
    ds_interpolated = interpolation(ds, other_coords)
    # Store the selected variable under the short dataset name.
    era5land[name] = ds_interpolated[variable]

# save the merged dataset, overwriting any existing file
out_path = f"{parent_out_path}/all_data_{start_time}_{end_time}_NL.nc"
era5land.to_netcdf(out_path, mode="w")
print(f"{out_path} is saved")

In [None]:
# variable derivation
# read data
# NOTE(review): the interpolation step writes its all_data file under
# parent_out_path, whereas data_paths["all_data"] is built from
# parent_in_path -- confirm that both refer to the same file on this infra.
all_data = xr.open_dataset(data_paths["all_data"])

# variable calculations
# `era5land_accumulated_vars` (from utils) derives a new variable (3rd arg)
# from an ERA5-Land input variable (2nd arg) using a scaling constant (4th arg).
all_data = era5land_accumulated_vars(all_data, "ssrd", "Rin", 3600)
all_data = era5land_accumulated_vars(all_data, "strd", "Rli", 3600)
all_data = era5land_accumulated_vars(all_data, "tp", "Precip_msr", 0.001) # to mm
all_data["p"] = all_data["sp"] / 100  # Pa -> hPa
all_data["Ta"] = all_data["t2m"] - 273.15  # K -> degC
all_data["ea"] = vc.calculate_es(all_data["d2m"] - 273.15)*10 # *10 is for kPa -> hPa
# magnitude of the (u10, v10) vector
all_data["u"] = (all_data["u10"] ** 2 + all_data["v10"] ** 2) ** 0.5
# NOTE(review): the reason for the 1/1000 scaling is not documented here --
# confirm the intended units of ssm.
all_data["ssm"] = all_data["ssm"] / 1000

# convert landcover to IGBP
# lookup tables
igbp_table = pd.read_csv(data_paths["igbp_table"])
igbp_class = pd.read_csv(data_paths["igbp_class"])['0'].unique()

# define one hot encoding for IGBP using scikit-learn's OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Fit the encoder on the sorted category values, shaped as a single column.
# The values are sorted while still 1-D: np.sort works along the last axis by
# default, so sorting the (n, 1) column itself would leave the order unchanged.
igbp_stemmus_scope = np.sort(igbp_table["IGBP_STEMMUS_SCOPE"].to_numpy()).reshape(-1, 1)
encoder = encoder.fit(igbp_stemmus_scope)
# Mapping lccs_class -> value of the first remaining column of the table.
# NOTE(review): this relies on the column order of the CSV -- confirm that the
# first column after lccs_class is the intended one.
lookup_table = igbp_table.set_index("lccs_class").T.to_dict('records')[0]

ds = landcover_to_igbp(all_data, "landcover", encoder, lookup_table, igbp_class)

# rename some variables
rename_vars = {"co2": "CO2", "lai": "LAI", "canopyheight": "hc", "ssm": "SSM", "vcmax": "Vcmo"}
ds = ds.rename(rename_vars)

# save the final dataset, overwriting any existing file
out_path = f"{parent_out_path}/model_input_{start_time}_{end_time}_NL.nc"
ds.to_netcdf(out_path, mode='w')
print(f"{out_path} is saved")