Stricter build_path / better end_of_period #237

Merged
Merged 6 commits on Aug 15, 2023
Changes from 4 commits
3 changes: 2 additions & 1 deletion HISTORY.rst
@@ -14,7 +14,7 @@ New features and enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
* `xscen` now tracks code coverage using `coveralls <https://coveralls.io/>`_. (:pull:`187`).
* New function `get_warming_level` to search within the IPCC CMIP global temperatures CSV without requiring data. (:issue:`208`, :pull:`210`).
-* File re-structuration from catalogs with ``xscen.catutils.build_path``. (:pull:`205`).
+* File re-structuration from catalogs with ``xscen.catutils.build_path``. (:pull:`205`, :pull:`237`).
* New scripting functions `save_and_update` and `move_and_delete`. (:pull:`214`).
* Spatial dimensions can be generalized as X/Y when rechunking and will be mapped to rlon/rlat or lon/lat accordingly. (:pull:`221`).
* New argument `var_as_string` for `get_cat_attrs` to return variable names as strings. (:pull:`233`).
@@ -29,6 +29,7 @@ Breaking changes
^^^^^^^^^^^^^^^^
* Columns ``date_start`` and ``date_end`` now use a ``datetime64[ms]`` dtype. (:pull:`222`).
* The default output of ``date_parser`` is now ``pd.Timestamp`` (``output_dtype='datetime'``). (:pull:`222`).
+* ``date_parser(date, end_of_period=True)`` has time "23:59:59", instead of "23:00". (:pull:`222`, :pull:`237`).
* ``driving_institution`` was removed from the "default" xscen columns. (:pull:`222`).
* Folder parsing utilities (``parse_directory``) moved to ``xscen.catutils``. Signature changed : ``globpattern`` removed, ``dirglob`` added, new ``patterns`` specifications. See doc for all changes. (:pull:`205`).
* ``compute_indicators`` now returns all outputs produced by indicators with multiple outputs (such as `rain_season`). (:pull:`228`).
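A quick illustration of the end_of_period change noted in this changelog. This is a minimal sketch, assuming date_parser is imported from xscen.utils; the inputs and expected values are taken from the updated tests in tests/test_utils.py below.

from xscen.utils import date_parser

# end_of_period=True now fills the timestamp down to the last second of
# the period, instead of stopping at 23:00:
date_parser("2001", end_of_period=True)    # Timestamp('2001-12-31 23:59:59')
date_parser("150004", end_of_period=True)  # Timestamp('1500-04-30 23:59:59')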
2 changes: 1 addition & 1 deletion tests/test_catutils.py
@@ -250,6 +250,6 @@ def test_build_path_multivar(samplecat):
info["variable"] = ("tas", "tasmin")
with pytest.raises(
ValueError,
match="Selected schema raw-sims is meant to be used with single-variable datasets.",
match="Selected schema raw-sims-raw is meant to be used with single-variable datasets.",
):
cu.build_path(info)
6 changes: 5 additions & 1 deletion tests/test_scripting.py
@@ -23,6 +23,10 @@ class TestScripting:
"cat:source": "CanESM5",
"cat:experiment": "ssp585",
"cat:member": "r1i1p1f1",
"cat:domain": "global",
"cat:mip_era": "CMIP6",
"cat:institution": "CCCma",
"cat:activity": "ScenarioMIP",
}

def test_save_and_update(self):
@@ -50,7 +54,7 @@ def test_save_and_update(self):
assert (
cat.df.path[1]
== root
+ "/simulation/raw/CanESM5/ssp585/r1i1p1f1/yr/tas/tas_yr_CanESM5_ssp585_r1i1p1f1_2000-2049.nc"
+ "/simulation/raw/CMIP6/ScenarioMIP/global/CCCma/CanESM5/ssp585/r1i1p1f1/yr/tas/tas_yr_CMIP6_ScenarioMIP_global_CCCma_CanESM5_ssp585_r1i1p1f1_2000-2049.nc"
)
assert cat.df.source[1] == "CanESM5"

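For reference, a sketch of the call behind the updated assertion above. Only the cat: attributes appear in this diff; the facets marked "assumed" are hypothetical values chosen to reproduce the asserted path, not names confirmed by the PR.

import xscen.catutils as cu

facets = {
    "type": "simulation",        # assumed
    "processing_level": "raw",   # assumed
    "mip_era": "CMIP6",
    "activity": "ScenarioMIP",
    "domain": "global",
    "institution": "CCCma",
    "source": "CanESM5",
    "experiment": "ssp585",
    "member": "r1i1p1f1",
    "frequency": "yr",           # assumed
    "xrfreq": "YS",              # assumed
    "variable": "tas",           # assumed
    "date_start": "2000",        # assumed
    "date_end": "2049",          # assumed
    "format": "nc",              # assumed
}
cu.build_path(facets, root="root")
# PosixPath('root/simulation/raw/CMIP6/ScenarioMIP/global/CCCma/CanESM5/ssp585/r1i1p1f1/yr/tas/tas_yr_CMIP6_ScenarioMIP_global_CCCma_CanESM5_ssp585_r1i1p1f1_2000-2049.nc')

With a tuple of variables instead, the matched schema refuses to build a path, as pinned down in tests/test_catutils.py above: "Selected schema raw-sims-raw is meant to be used with single-variable datasets."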
11 changes: 8 additions & 3 deletions tests/test_utils.py
@@ -13,8 +13,8 @@ class TestDateParser:
@pytest.mark.parametrize(
"date,end_of_period,dtype,exp",
[
("2001", True, "datetime", pd.Timestamp("2001-12-31")),
("150004", True, "datetime", pd.Timestamp("1500-04-30")),
("2001", True, "datetime", pd.Timestamp("2001-12-31 23:59:59")),
("150004", True, "datetime", pd.Timestamp("1500-04-30 23:59:59")),
("31231212", None, "datetime", pd.Timestamp("3123-12-12")),
("2001-07-08", None, "period", pd.Period("2001-07-08", "H")),
(pd.Timestamp("1993-05-20T12:07"), None, "str", "1993-05-20"),
@@ -24,7 +24,12 @@ class TestDateParser:
"datetime",
pd.Timestamp("1981-02-28"),
),
-(np.datetime64("1200-11-12"), "Y", "datetime", pd.Timestamp("1200-12-31")),
+(
+np.datetime64("1200-11-12"),
+"Y",
+"datetime",
+pd.Timestamp("1200-12-31 23:59:59"),
+),
(
datetime(2045, 5, 2, 13, 45),
None,
103 changes: 49 additions & 54 deletions xscen/catutils.py
@@ -13,7 +13,7 @@
from functools import partial, reduce
from multiprocessing import Pool
from pathlib import Path, PosixPath
-from typing import Any, Optional, Union
+from typing import Any, Optional, Tuple, Union

import cftime
import netCDF4
@@ -834,30 +834,34 @@ def _parse_from_nc(path: os.PathLike, get_vars=True, get_time=True):
# ## Path building ## #
def _schema_option(option: dict, facets: dict):
"""Parse an option element of the facet schema tree."""
facet_value = facets.get(option["option"])
facet_value = facets.get(option["facet"])
if "value" in option:
if isinstance(option["value"], str):
answer = facet_value == option["value"]
else: # A list
answer = facet_value in option["value"]
else:
answer = not isna(facet_value)
-
-if "is_true" in option and answer:
-return option["is_true"]
-if "else" in option and not answer:
-return option["else"]
return answer


def _schema_level(schema: Union[dict, str], facets: dict):
if isinstance(schema, str):
+if schema.startswith("(") and schema.endswith(")"):
+optional = True
+schema = schema[1:-1]
+else:
+optional = False
if schema == "DATES":
-return _schema_dates(facets)
+return _schema_dates(facets, optional=optional)

# A single facet:
if isna(facets.get(schema)):
-return None
+if optional:
+return None
+raise ValueError(
+f"Facet {schema} is needed but None-like or missing in the data."
+)
return facets[schema]
if isinstance(schema, list):
parts = []
@@ -866,39 +870,18 @@ def _schema_level(schema: Union[dict, str], facets: dict):
if not isna(part):
parts.append(part)
return "_".join(parts)
if "option" in schema:
answer = _schema_option(schema, facets)
if isinstance(answer, bool) and not answer:
# Test failed with no "else" value, we skip this level.
return None
return _schema_level(answer, facets)
if "text" in schema:
if isinstance(schema, dict) and "text" in schema:
return schema["text"]
raise ValueError(f"Invalid schema : {schema}")


-class KeyRecorder:
-"""A dummy object but recording requested keys.
-
-Bracket-access (a[key]) returns the key as-is, but adds it to the `keys` property.
-"""
-
-def __init__(self):
-self.keys = set()
-
-def __getitem__(self, k):
-self.keys.add(k)
-return k
-
-def get(self, k):
-return self[k]
-
-
-def _schema_dates(facets):
-if isinstance(facets, KeyRecorder):
-# Record that these three fields are needed
-facets["date_start"], facets["date_end"], facets["xrfreq"]
-return "dates"
+def _schema_dates(facets, optional=False):
+if any([facets.get(f) is None for f in ["date_start", "date_end", "xrfreq"]]):
+if optional:
+return None
+raise ValueError(
+"Facets date_start, date_end and xrfreq are needed, but at least one is missing or None-like in the data."
+)

if facets["xrfreq"] == "fx":
return "fx"
@@ -952,10 +935,19 @@ def _schema_folders(schema: list, facets: dict) -> list:

def _get_needed_fields(schema: dict):
"""Return the list of facets that is needed for a given schema."""
-facet_rec = KeyRecorder()
-_schema_folders(schema["folders"], facet_rec)
-_schema_filename(schema["filename"], facet_rec)
-return facet_rec.keys
+needed = set()
+for level in schema["folders"]:
+if isinstance(level, str):
+if not (level.startswith("(") and level.endswith(")")):
+needed.add(level)
+elif isinstance(level, list):
+for lvl in level:
+needed.add(lvl)
+elif not (isinstance(level, dict) and list(level.keys()) == ["text"]):
+raise ValueError(
+f"Invalid schema with unknown {level} of type {type(level)}."
+)
+return needed


def _read_schemas(schemas):
@@ -982,9 +974,9 @@ def _build_path(
data: Union[dict, xr.Dataset, xr.DataArray, pd.Series],
schemas: dict,
root: Path,
-strict: bool = False,
+get_type: bool = False,
**extra_facets,
-) -> Path:
+) -> Union[Path, tuple[Path, str]]:
# Get all known metadata
if isinstance(data, (xr.Dataset, xr.DataArray)):
facets = (
@@ -1021,7 +1013,7 @@ def _build_path(
if match:
# Checks
needed_fields = _get_needed_fields(schema)
-if strict and (missing_fields := needed_fields - set(facets.keys())):
+if missing_fields := needed_fields - set(facets.keys()):
raise ValueError(
f"Missing facets {missing_fields} are needed to build the path according to selected schema {name}."
)
@@ -1036,7 +1028,9 @@ def _build_path(
out = root / out
if "format" in facets: # Add extension
# Can't use `with_suffix` in case there are dots in the name
-return out.parent / f"{out.name}.{facets['format']}"
+out = out.parent / f"{out.name}.{facets['format']}"
+if get_type:
+return out, name
return out

raise ValueError(f"This file doesn't match any schema. Facets:\n{facets}")
@@ -1047,7 +1041,6 @@ def build_path(
data: Union[dict, xr.Dataset, xr.DataArray, pd.Series, DataCatalog, pd.DataFrame],
schemas: Optional[Union[str, os.PathLike, list[dict], dict]] = None,
root: os.PathLike = None,
-strict: bool = False,
**extra_facets,
) -> Union[Path, DataCatalog, pd.DataFrame]:
"""Parse the schema from a configuration and construct path using a dictionary of facets.
@@ -1066,9 +1059,6 @@ def build_path(
Or a single schema dict (single element of the yaml).
root : Path, optional
If given, the generated path(s) is given under this root one.
-strict : bool
-If True, the data must include all facets referred in the schema.
-If False (default), the output paths might be inconsistent if some fields are missing.
extra_facets : str
Extra facets to supplement or override metadadata missing from the first input.

@@ -1077,6 +1067,7 @@ def build_path(
Path or catalog
Constructed path. If "format" is absent from the facets, it has no suffix.
If `data` was a catalog, a copy with a "new_path" column is returned.
Another "new_path_type" column is also added if `schemas` was a collection of schemas (like the default).

Examples
--------
@@ -1100,13 +1091,17 @@ def build_path(

df = df.copy()

df["new_path"] = df.apply(
paths = df.apply(
_build_path,
axis=1,
result_type="expand",
schemas=schemas,
root=root,
-strict=strict,
+get_type=True,
**extra_facets,
-).apply(str)
+)
+df["new_path"] = paths[0].apply(str)
+if len(schemas) > 1:
+df["new_path_type"] = paths[1]
return df
-return _build_path(data, schemas=schemas, root=root, **extra_facets)
+return _build_path(data, schemas=schemas, root=root, get_type=False, **extra_facets)
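Finally, a minimal sketch of the reworked catalog branch above, with hypothetical facet values. Each row gets its constructed path and, because the default schemas argument is a collection, the name of the schema it matched:

import pandas as pd
import xscen.catutils as cu

# One catalog-like row; all facet values here are hypothetical:
df = pd.DataFrame(
    [
        {
            "type": "simulation", "processing_level": "raw",
            "mip_era": "CMIP6", "activity": "ScenarioMIP",
            "domain": "global", "institution": "CCCma",
            "source": "CanESM5", "experiment": "ssp585",
            "member": "r1i1p1f1", "frequency": "yr", "xrfreq": "YS",
            "variable": "tas", "date_start": "2000", "date_end": "2049",
            "format": "nc",
        }
    ]
)
out = cu.build_path(df, root="root")
out["new_path"]       # constructed path for each row, as a string
out["new_path_type"]  # name of the matched schema, per the docstring above

Note the design shift: the strict flag is gone, and a facet required by the matched schema but missing from the data now always raises a ValueError instead of silently producing an inconsistent path.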