Stricter build_path / better end_of_period #237

Merged
Merged 6 commits on Aug 15, 2023
Changes from 4 commits
3 changes: 2 additions & 1 deletion HISTORY.rst
@@ -14,7 +14,7 @@ New features and enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
* `xscen` now tracks code coverage using `coveralls <https://coveralls.io/>`_. (:pull:`187`).
* New function `get_warming_level` to search within the IPCC CMIP global temperatures CSV without requiring data. (:issue:`208`, :pull:`210`).
-* File re-structuration from catalogs with ``xscen.catutils.build_path``. (:pull:`205`).
+* File re-structuration from catalogs with ``xscen.catutils.build_path``. (:pull:`205`, :pull:`237`).
* New scripting functions `save_and_update` and `move_and_delete`. (:pull:`214`).
* Spatial dimensions can be generalized as X/Y when rechunking and will be mapped to rlon/rlat or lon/lat accordingly. (:pull:`221`).
* New argument `var_as_string` for `get_cat_attrs` to return variable names as strings. (:pull:`233`).
@@ -29,6 +29,7 @@ Breaking changes
^^^^^^^^^^^^^^^^
* Columns ``date_start`` and ``date_end`` now use a ``datetime64[ms]`` dtype. (:pull:`222`).
* The default output of ``date_parser`` is now ``pd.Timestamp`` (``output_dtype='datetime'``). (:pull:`222`).
+* ``date_parser(date, end_of_period=True)`` has time "23:59:59", instead of "23:00". (:pull:`222`, :pull:`237`).
* ``driving_institution`` was removed from the "default" xscen columns. (:pull:`222`).
* Folder parsing utilities (``parse_directory``) moved to ``xscen.catutils``. Signature changed : ``globpattern`` removed, ``dirglob`` added, new ``patterns`` specifications. See doc for all changes. (:pull:`205`).
* ``compute_indicators`` now returns all outputs produced by indicators with multiple outputs (such as `rain_season`). (:pull:`228`).
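A quick illustration of the end_of_period change noted in this changelog. This is a minimal sketch, assuming date_parser is imported from xscen.utils; the inputs and expected values are taken from the updated tests in tests/test_utils.py below.

from xscen.utils import date_parser

# end_of_period=True now fills the timestamp down to the last second of
# the period, instead of stopping at 23:00:
date_parser("2001", end_of_period=True)    # Timestamp('2001-12-31 23:59:59')
date_parser("150004", end_of_period=True)  # Timestamp('1500-04-30 23:59:59')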
2 changes: 1 addition & 1 deletion tests/test_catutils.py
@@ -250,6 +250,6 @@ def test_build_path_multivar(samplecat):
info["variable"] = ("tas", "tasmin")
with pytest.raises(
ValueError,
match="Selected schema raw-sims is meant to be used with single-variable datasets.",
match="Selected schema raw-sims-raw is meant to be used with single-variable datasets.",
):
cu.build_path(info)
6 changes: 5 additions & 1 deletion tests/test_scripting.py
@@ -23,6 +23,10 @@ class TestScripting:
"cat:source": "CanESM5",
"cat:experiment": "ssp585",
"cat:member": "r1i1p1f1",
"cat:domain": "global",
"cat:mip_era": "CMIP6",
"cat:institution": "CCCma",
"cat:activity": "ScenarioMIP",
}

def test_save_and_update(self):
@@ -50,7 +54,7 @@ def test_save_and_update(self):
assert (
cat.df.path[1]
== root
+ "/simulation/raw/CanESM5/ssp585/r1i1p1f1/yr/tas/tas_yr_CanESM5_ssp585_r1i1p1f1_2000-2049.nc"
+ "/simulation/raw/CMIP6/ScenarioMIP/global/CCCma/CanESM5/ssp585/r1i1p1f1/yr/tas/tas_yr_CMIP6_ScenarioMIP_global_CCCma_CanESM5_ssp585_r1i1p1f1_2000-2049.nc"
)
assert cat.df.source[1] == "CanESM5"

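For reference, a sketch of the call behind the updated assertion above. Only the cat: attributes appear in this diff; the facets marked "assumed" are hypothetical values chosen to reproduce the asserted path, not names confirmed by the PR.

import xscen.catutils as cu

facets = {
    "type": "simulation",        # assumed
    "processing_level": "raw",   # assumed
    "mip_era": "CMIP6",
    "activity": "ScenarioMIP",
    "domain": "global",
    "institution": "CCCma",
    "source": "CanESM5",
    "experiment": "ssp585",
    "member": "r1i1p1f1",
    "frequency": "yr",           # assumed
    "xrfreq": "YS",              # assumed
    "variable": "tas",           # assumed
    "date_start": "2000",        # assumed
    "date_end": "2049",          # assumed
    "format": "nc",              # assumed
}
cu.build_path(facets, root="root")
# PosixPath('root/simulation/raw/CMIP6/ScenarioMIP/global/CCCma/CanESM5/ssp585/r1i1p1f1/yr/tas/tas_yr_CMIP6_ScenarioMIP_global_CCCma_CanESM5_ssp585_r1i1p1f1_2000-2049.nc')

With a tuple of variables instead, the matched schema refuses to build a path, as pinned down in tests/test_catutils.py above: "Selected schema raw-sims-raw is meant to be used with single-variable datasets."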
11 changes: 8 additions & 3 deletions tests/test_utils.py
@@ -13,8 +13,8 @@ class TestDateParser:
@pytest.mark.parametrize(
"date,end_of_period,dtype,exp",
[
("2001", True, "datetime", pd.Timestamp("2001-12-31")),
("150004", True, "datetime", pd.Timestamp("1500-04-30")),
("2001", True, "datetime", pd.Timestamp("2001-12-31 23:59:59")),
("150004", True, "datetime", pd.Timestamp("1500-04-30 23:59:59")),
("31231212", None, "datetime", pd.Timestamp("3123-12-12")),
("2001-07-08", None, "period", pd.Period("2001-07-08", "H")),
(pd.Timestamp("1993-05-20T12:07"), None, "str", "1993-05-20"),
@@ -24,7 +24,12 @@ class TestDateParser:
"datetime",
pd.Timestamp("1981-02-28"),
),
-(np.datetime64("1200-11-12"), "Y", "datetime", pd.Timestamp("1200-12-31")),
+(
+np.datetime64("1200-11-12"),
+"Y",
+"datetime",
+pd.Timestamp("1200-12-31 23:59:59"),
+),
(
datetime(2045, 5, 2, 13, 45),
None,
103 changes: 49 additions & 54 deletions xscen/catutils.py
@@ -13,7 +13,7 @@
from functools import partial, reduce
from multiprocessing import Pool
from pathlib import Path, PosixPath
-from typing import Any, Optional, Union
+from typing import Any, Optional, Tuple, Union

import cftime
import netCDF4
@@ -834,30 +834,34 @@ def _parse_from_nc(path: os.PathLike, get_vars=True, get_time=True):
# ## Path building ## #
def _schema_option(option: dict, facets: dict):
"""Parse an option element of the facet schema tree."""
facet_value = facets.get(option["option"])
facet_value = facets.get(option["facet"])
if "value" in option:
if isinstance(option["value"], str):
answer = facet_value == option["value"]
else: # A list
answer = facet_value in option["value"]
else:
answer = not isna(facet_value)
-
-if "is_true" in option and answer:
-return option["is_true"]
-if "else" in option and not answer:
-return option["else"]
return answer


def _schema_level(schema: Union[dict, str], facets: dict):
if isinstance(schema, str):
+if schema.startswith("(") and schema.endswith(")"):
+optional = True
+schema = schema[1:-1]
+else:
+optional = False
if schema == "DATES":
-return _schema_dates(facets)
+return _schema_dates(facets, optional=optional)

# A single facet:
if isna(facets.get(schema)):
-return None
+if optional:
+return None
+raise ValueError(
+f"Facet {schema} is needed but None-like or missing in the data."
+)
return facets[schema]
if isinstance(schema, list):
parts = []
@@ -866,39 +870,18 @@ def _schema_level(schema: Union[dict, str], facets: dict):
if not isna(part):
parts.append(part)
return "_".join(parts)
if "option" in schema:
answer = _schema_option(schema, facets)
if isinstance(answer, bool) and not answer:
# Test failed with no "else" value, we skip this level.
return None
return _schema_level(answer, facets)
if "text" in schema:
if isinstance(schema, dict) and "text" in schema:
return schema["text"]
raise ValueError(f"Invalid schema : {schema}")


-class KeyRecorder:
-"""A dummy object but recording requested keys.
-
-Bracket-access (a[key]) returns the key as-is, but adds it to the `keys` property.
-"""
-
-def __init__(self):
-self.keys = set()
-
-def __getitem__(self, k):
-self.keys.add(k)
-return k
-
-def get(self, k):
-return self[k]
-
-
-def _schema_dates(facets):
-if isinstance(facets, KeyRecorder):
-# Record that these three fields are needed
-facets["date_start"], facets["date_end"], facets["xrfreq"]
-return "dates"
+def _schema_dates(facets, optional=False):
+if any([facets.get(f) is None for f in ["date_start", "date_end", "xrfreq"]]):
+if optional:
+return None
+raise ValueError(
+"Facets date_start, date_end and xrfreq are needed, but at least one is missing or None-like in the data."
+)

if facets["xrfreq"] == "fx":
return "fx"
@@ -952,10 +935,19 @@ def _schema_folders(schema: list, facets: dict) -> list:

def _get_needed_fields(schema: dict):
"""Return the list of facets that is needed for a given schema."""
-facet_rec = KeyRecorder()
-_schema_folders(schema["folders"], facet_rec)
-_schema_filename(schema["filename"], facet_rec)
-return facet_rec.keys
+needed = set()
+for level in schema["folders"]:
+if isinstance(level, str):
+if not (level.startswith("(") and level.endswith(")")):
+needed.add(level)
+elif isinstance(level, list):
+for lvl in level:
+needed.add(lvl)
+elif not (isinstance(level, dict) and list(level.keys()) == ["text"]):
+raise ValueError(
+f"Invalid schema with unknown {level} of type {type(level)}."
+)
+return needed


def _read_schemas(schemas):
@@ -982,9 +974,9 @@ def _build_path(
data: Union[dict, xr.Dataset, xr.DataArray, pd.Series],
schemas: dict,
root: Path,
-strict: bool = False,
+get_type: bool = False,
**extra_facets,
-) -> Path:
+) -> Union[Path, tuple[Path, str]]:
# Get all known metadata
if isinstance(data, (xr.Dataset, xr.DataArray)):
facets = (
@@ -1021,7 +1013,7 @@ def _build_path(
if match:
# Checks
needed_fields = _get_needed_fields(schema)
-if strict and (missing_fields := needed_fields - set(facets.keys())):
+if missing_fields := needed_fields - set(facets.keys()):
raise ValueError(
f"Missing facets {missing_fields} are needed to build the path according to selected schema {name}."
)
@@ -1036,7 +1028,9 @@ def _build_path(
out = root / out
if "format" in facets: # Add extension
# Can't use `with_suffix` in case there are dots in the name
-return out.parent / f"{out.name}.{facets['format']}"
+out = out.parent / f"{out.name}.{facets['format']}"
+if get_type:
+return out, name
return out

raise ValueError(f"This file doesn't match any schema. Facets:\n{facets}")
@@ -1047,7 +1041,6 @@ def build_path(
data: Union[dict, xr.Dataset, xr.DataArray, pd.Series, DataCatalog, pd.DataFrame],
schemas: Optional[Union[str, os.PathLike, list[dict], dict]] = None,
root: os.PathLike = None,
-strict: bool = False,
**extra_facets,
) -> Union[Path, DataCatalog, pd.DataFrame]:
"""Parse the schema from a configuration and construct path using a dictionary of facets.
@@ -1066,9 +1059,6 @@ def build_path(
Or a single schema dict (single element of the yaml).
root : Path, optional
If given, the generated path(s) is given under this root one.
-strict : bool
-If True, the data must include all facets referred in the schema.
-If False (default), the output paths might be inconsistent if some fields are missing.
extra_facets : str
Extra facets to supplement or override metadadata missing from the first input.

@@ -1077,6 +1067,7 @@ def build_path(
Path or catalog
Constructed path. If "format" is absent from the facets, it has no suffix.
If `data` was a catalog, a copy with a "new_path" column is returned.
Another "new_path_type" column is also added if `schemas` was a collection of schemas (like the default).

Examples
--------
@@ -1100,13 +1091,17 @@ def build_path(

df = df.copy()

df["new_path"] = df.apply(
paths = df.apply(
_build_path,
axis=1,
result_type="expand",
schemas=schemas,
root=root,
-strict=strict,
+get_type=True,
**extra_facets,
-).apply(str)
+)
+df["new_path"] = paths[0].apply(str)
+if len(schemas) > 1:
+df["new_path_type"] = paths[1]
return df
-return _build_path(data, schemas=schemas, root=root, **extra_facets)
+return _build_path(data, schemas=schemas, root=root, get_type=False, **extra_facets)
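Finally, a minimal sketch of the reworked catalog branch above, with hypothetical facet values. Each row gets its constructed path and, because the default schemas argument is a collection, the name of the schema it matched:

import pandas as pd
import xscen.catutils as cu

# One catalog-like row; all facet values here are hypothetical:
df = pd.DataFrame(
    [
        {
            "type": "simulation", "processing_level": "raw",
            "mip_era": "CMIP6", "activity": "ScenarioMIP",
            "domain": "global", "institution": "CCCma",
            "source": "CanESM5", "experiment": "ssp585",
            "member": "r1i1p1f1", "frequency": "yr", "xrfreq": "YS",
            "variable": "tas", "date_start": "2000", "date_end": "2049",
            "format": "nc",
        }
    ]
)
out = cu.build_path(df, root="root")
out["new_path"]       # constructed path for each row, as a string
out["new_path_type"]  # name of the matched schema, per the docstring above

Note the design shift: the strict flag is gone, and a facet required by the matched schema but missing from the data now always raises a ValueError instead of silently producing an inconsistent path.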