From 863508d0112a0abf38fc0aeb7cc4983974940b52 Mon Sep 17 00:00:00 2001 From: RondeauG Date: Thu, 11 Apr 2024 11:41:27 -0400 Subject: [PATCH 1/7] first tests --- tests/test_reduce.py | 0 xscen/reduce.py | 3 --- 2 files changed, 3 deletions(-) create mode 100644 tests/test_reduce.py diff --git a/tests/test_reduce.py b/tests/test_reduce.py new file mode 100644 index 00000000..e69de29b diff --git a/xscen/reduce.py b/xscen/reduce.py index a49f571e..50448f6a 100644 --- a/xscen/reduce.py +++ b/xscen/reduce.py @@ -1,6 +1,5 @@ """Functions to reduce an ensemble of simulations.""" -import logging from typing import Optional, Union import numpy as np @@ -9,8 +8,6 @@ from .config import parse_config -logger = logging.getLogger(__name__) - __all__ = ["build_reduction_data", "reduce_ensemble"] From 8cfedd9cb91586f84e87b04596e8b1dcd83719d7 Mon Sep 17 00:00:00 2001 From: RondeauG Date: Thu, 18 Apr 2024 09:32:59 -0400 Subject: [PATCH 2/7] use xclim instead --- xscen/reduce.py | 49 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/xscen/reduce.py b/xscen/reduce.py index 50448f6a..6bb6916a 100644 --- a/xscen/reduce.py +++ b/xscen/reduce.py @@ -1,5 +1,6 @@ """Functions to reduce an ensemble of simulations.""" +import warnings from typing import Optional, Union import numpy as np @@ -37,6 +38,13 @@ def build_reduction_data( xr.DataArray 2D DataArray of dimensions "realization" and "criteria", to be used as input for ensemble reduction. """ + warnings.warn( + "This function will be dropped in a future version, as it is now redundant with xclim.ensembles.make_criteria." + "Either use xclim.ensembles.make_criteria directly (preceded by xclim.ensembles.create_ensemble if needed) or " + "use xscen's reduce_ensemble function to build the criteria and reduce the ensemble in one step.", + FutureWarning, + ) + # Use metadata to identify the simulation attributes info = {} keys = datasets.keys() if isinstance(datasets, dict) else range(len(datasets)) @@ -77,8 +85,15 @@ def build_reduction_data( @parse_config -def reduce_ensemble(data: xr.DataArray, method: str, kwargs: dict): - """Reduce an ensemble of simulations using clustering algorithms from xclim.ensembles. +def reduce_ensemble( + data: Union[xr.DataArray, dict, list, xr.Dataset], + method: str, + *, + horizons: Optional[list[str]] = None, + create_kwargs: Optional[dict] = None, + **kwargs, +): + r"""Reduce an ensemble of simulations using clustering algorithms from xclim.ensembles. Parameters ---------- @@ -86,11 +101,17 @@ def reduce_ensemble(data: xr.DataArray, method: str, kwargs: dict): Selection criteria data : 2-D xr.DataArray with dimensions 'realization' and 'criteria'. These are the values used for clustering. Realizations represent the individual original ensemble members and criteria the variables/indicators used in the grouping algorithm. - This data can be generated using build_reduction_data(). + This data can be generated using py:func:`xclim.ensembles.make_criteria`. + Alternatively, either a xr.Dataset, a list of xr.Dataset or a dictionary of xr.Dataset can be passed, + in which case the data will be built using py:func:`xclim.ensembles.create_ensemble` and py:func:`xclim.ensembles.make_criteria`. method : str ['kkz', 'kmeans']. Clustering method. - kwargs : dict - Arguments to send to either xclim.ensembles.kkz_reduce_ensemble or xclim.ensembles.kmeans_reduce_ensemble + horizons : list of str, optional + Subset of horizons on which to create the data. Only used if `data` needs to be built. + create_kwargs : dict, optional + Arguments to pass to py:func:`xclim.ensembles.create_ensemble` if `data` is not an xr.DataArray. + \*\*kwargs : dict + Arguments to send to either py:func:`xclim.ensembles.kkz_reduce_ensemble` or py:func:`xclim.ensembles.kmeans_reduce_ensemble`. Returns ------- @@ -99,8 +120,24 @@ def reduce_ensemble(data: xr.DataArray, method: str, kwargs: dict): clusters : dict If using kmeans clustering, realizations grouped by cluster. fig_data : dict - If using kmeans clustering, data necessary to call xclim.ensembles.plot_rsqprofile() + If using kmeans clustering, data necessary to call py:func:`xclim.ensembles.plot_rsqprofile`. + + Notes + ----- + If building `data` to be constructed by this function, the datasets should already have a climatology computed on them, such that the data + has no temporal dimension aside from the "horizon" coordinate (which is optional and might be used to subset the data). + If the indicators are a mix of yearly, seasonal, and monthly, they should be stacked on the same time/horizon axis and put in the same dataset. + You can use py:func:`xscen.utils.unstack_dates` on seasonal or monthly indicators to this end. """ + if isinstance(data, (list, dict)): + data = xce.create_ensemble(datasets=data, **(create_kwargs or {})) + if horizons: + if "horizon" not in data.dims: + raise ValueError("Data must have a 'horizon' dimension to be subsetted.") + data = data.sel(horizon=horizons) + if "criteria" not in data.dims: + data = xce.make_criteria(data) + selected = getattr(xce, f"{method}_reduce_ensemble")(data=data, **kwargs) clusters = {} From 95f01b2c39589c8a82798f9bbe73fd4db87cb411 Mon Sep 17 00:00:00 2001 From: RondeauG Date: Thu, 18 Apr 2024 09:41:00 -0400 Subject: [PATCH 3/7] remove empty file --- tests/test_reduce.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/test_reduce.py diff --git a/tests/test_reduce.py b/tests/test_reduce.py deleted file mode 100644 index e69de29b..00000000 From 75e32e0911c06ae90c94f068178874351aff732e Mon Sep 17 00:00:00 2001 From: RondeauG Date: Tue, 23 Apr 2024 11:42:51 -0400 Subject: [PATCH 4/7] moved function, added tests, fixed docs --- docs/notebooks/4_ensembles.ipynb | 61 +++++++------------------- tests/test_ensembles.py | 55 ++++++++++++++++++++++++ xscen/__init__.py | 2 - xscen/ensembles.py | 74 ++++++++++++++++++++++++++++++++ xscen/reduce.py | 39 +++++------------ 5 files changed, 156 insertions(+), 75 deletions(-) diff --git a/docs/notebooks/4_ensembles.ipynb b/docs/notebooks/4_ensembles.ipynb index 16e6e803..abd7b709 100644 --- a/docs/notebooks/4_ensembles.ipynb +++ b/docs/notebooks/4_ensembles.ipynb @@ -55,11 +55,13 @@ "for d in datasets:\n", " ds = open_dataset(datasets[d]).isel(lon=slice(0, 4), lat=slice(0, 4))\n", " ds = xs.climatological_op(\n", - " ds, op=\"mean\", window=30, periods=[[1981, 2010], [2021, 2050]]\n", - " )\n", - " datasets[d] = xs.compute_deltas(ds, reference_horizon=\"1981-2010\")\n", - " datasets[d].attrs[\"cat:id\"] = d # Required by build_reduction_data\n", - " datasets[d].attrs[\"cat:xrfreq\"] = \"AS-JAN\"" + " ds,\n", + " op=\"mean\",\n", + " window=30,\n", + " periods=[[1981, 2010], [2021, 2050]],\n", + " horizons_as_dim=True,\n", + " ).drop_vars(\"time\")\n", + " datasets[d] = xs.compute_deltas(ds, reference_horizon=\"1981-2010\")" ] }, { @@ -68,47 +70,14 @@ "source": [ "### Preparing the data\n", "\n", - "Ensemble reduction is built upon climate indicators that are relevant to represent the ensemble's variability for a given application. In this case, we'll use the mean temperature delta between 2021-2050 and 1981-2010.\n", + "Ensemble reduction is built upon climate indicators that are relevant to represent the ensemble's variability for a given application. In this case, we'll use the mean temperature delta between 2021-2050 and 1981-2010, but monthly or seasonal indicators could also be required. The `horizons_as_dim` argument in `climatological_op` can help combine indicators of multiple frequencies into a single dataset. Alternatively, `xscen.utils.unstack_dates` can also accomplish the same thing if the climatological operations have already been computed.\n", "\n", - "However, the functions implemented in `xclim.ensembles._reduce` require a very specific 2-D DataArray of dimensions \"realization\" and \"criteria\". That means that all the variables need to be combined and renamed, and that all dimensions need to be stacked together.\n", + "The functions implemented in `xclim.ensembles._reduce` require a very specific 2-D DataArray of dimensions \"realization\" and \"criteria\". The first solution is to first create an ensemble using `xclim.ensembles.create_ensemble`, then pass the result to `xclim.ensembles.make_criteria`. Alternatively, the datasets can be passed directly to `xscen.ensembles.reduce_ensemble` and the necessary preliminary steps will be accomplished automatically.\n", "\n", - "`xs.build_reduction_data` can be used to prepare the data for ensemble reduction. Its arguments are:\n", + "In this example, the number of criteria will corresponds to: `indicators x horizons x longitude x latitude`, but criteria that are purely NaN across all realizations will be removed.\n", "\n", - "- `datasets` (dict, list)\n", - "- `xrfreqs` are the unique frequencies of the indicators.\n", - "- `horizons` is used to instruct on which horizon(s) to build the data from.\n", + "Note that `xs.spatial_mean` could have been used prior to calling that function to remove the spatial dimensions.\n", "\n", - "Because a simulation could have multiple datasets (in the case of multiple frequencies), an attempt will be made to decipher the ID and frequency from the metadata." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = xs.build_reduction_data(\n", - " datasets=datasets,\n", - " xrfreqs=[\"AS-JAN\"],\n", - " horizons=[\"2021-2050\"],\n", - ")\n", - "\n", - "data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The number of criteria corresponds to: `indicators x horizons x longitude x latitude`, but criteria that are purely NaN across all realizations are removed.\n", - "\n", - "Note that `xs.spatial_mean` could have been used prior to calling that function to remove the spatial dimensions." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ "### Selecting a reduced ensemble\n", "\n", "
NOTE\n", @@ -117,9 +86,11 @@ "
\n", "\n", "Ensemble reduction through `xscen.reduce_ensemble` consists in a simple call to `xclim`. The arguments are:\n", - "- `data`, which is the 2D DataArray that is created by using `xs.build_reduction_data`.\n", + "- `data`, which is the aforementioned 2D DataArray, or the list/dict of datasets required to build it.\n", "- `method` is either `kkz` or `kmeans`. See the link above for further details on each technique.\n", - "- `kwargs` is a dictionary of arguments to send to the method chosen." + "- `horizons` is used to instruct on which horizon(s) to build the data from, if data needs to be constructed.\n", + "- `create_kwargs`, the arguments to pass to `xclim.ensembles.create_ensemble` if data needs to be constructed.\n", + "- `kwargs` is a dictionary of arguments to send to the clustering method chosen." ] }, { @@ -129,7 +100,7 @@ "outputs": [], "source": [ "selected, clusters, fig_data = xs.reduce_ensemble(\n", - " data=data, method=\"kmeans\", kwargs={\"method\": {\"n_clusters\": 3}}\n", + " data=datasets, method=\"kmeans\", horizons=[\"2021-2050\"], max_clusters=3\n", ")" ] }, diff --git a/tests/test_ensembles.py b/tests/test_ensembles.py index 8971c116..2f141e4e 100644 --- a/tests/test_ensembles.py +++ b/tests/test_ensembles.py @@ -10,6 +10,7 @@ import xesmf as xe except ImportError: xe = None +from xclim.testing import open_dataset from xclim.testing.helpers import test_timeseries as timeseries import xscen as xs @@ -1115,3 +1116,57 @@ def test_build_partition_data(self, samplecat, tmp_path): subset_kw=dict(name="mtl", method="gridpoint", lat=[45.0], lon=[-74]), indicators_kw=dict(indicators=[xc.atmos.tg_mean, xc.indicators.cf.tg]), ) + + +class TestReduceEnsemble: + def test_with_criteria(self): + ds = open_dataset("EnsembleReduce/TestEnsReduceCriteria.nc") + selected, clusters, fig_data = xs.reduce_ensemble( + ds["data"], method="kmeans", max_clusters=3 + ) + assert selected.shape == (3,) + np.testing.assert_array_equal(selected, [4, 7, 23]) + assert len(clusters) == 3 + assert fig_data is not None + + @pytest.mark.parametrize("horizon", ["1981-2010", "2021-2050"]) + def test_without_criteria(self, horizon): + datasets = { + "ACCESS": "EnsembleStats/BCCAQv2+ANUSPLIN300_ACCESS1-0_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc", + "BNU-ESM": "EnsembleStats/BCCAQv2+ANUSPLIN300_BNU-ESM_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc", + "CCSM4-r1": "EnsembleStats/BCCAQv2+ANUSPLIN300_CCSM4_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc", + "CCSM4-r2": "EnsembleStats/BCCAQv2+ANUSPLIN300_CCSM4_historical+rcp45_r2i1p1_1950-2100_tg_mean_YS.nc", + "CNRM-CM5": "EnsembleStats/BCCAQv2+ANUSPLIN300_CNRM-CM5_historical+rcp45_r1i1p1_1970-2050_tg_mean_YS.nc", + } + for d in datasets: + ds = open_dataset(datasets[d]).isel(lon=slice(0, 4), lat=slice(0, 4)) + ds = xs.climatological_op( + ds, + op="mean", + window=30, + periods=[["1981", "2010"], ["2021", "2050"]], + horizons_as_dim=True, + ).drop_vars("time") + datasets[d] = xs.compute_deltas(ds, reference_horizon="1981-2010") + + selected, clusters, fig_data = xs.reduce_ensemble( + datasets, method="kkz", horizons=[horizon], num_select=4 + ) + assert selected.shape == (4,) + answer = ( + ["ACCESS", "BNU-ESM", "CNRM-CM5", "CCSM4-r1"] + if horizon == "2021-2050" + else ["ACCESS", "BNU-ESM", "CCSM4-r1", "CCSM4-r2"] + ) + np.testing.assert_array_equal(selected, answer) + assert clusters == {} + assert fig_data == {} + + def test_errors(self): + ds = open_dataset("EnsembleReduce/TestEnsReduceCriteria.nc") + with pytest.raises( + ValueError, match="Data must have a 'horizon' dimension to be subsetted." + ): + xs.reduce_ensemble( + ds["data"], method="kmeans", horizons=["1981-2010"], max_clusters=3 + ) diff --git a/xscen/__init__.py b/xscen/__init__.py index a21729c1..b94a45fe 100644 --- a/xscen/__init__.py +++ b/xscen/__init__.py @@ -14,7 +14,6 @@ extract, indicators, io, - reduce, regrid, scripting, spatial, @@ -38,7 +37,6 @@ ) from .indicators import compute_indicators from .io import save_to_netcdf, save_to_table, save_to_zarr -from .reduce import build_reduction_data, reduce_ensemble from .regrid import * from .scripting import ( TimeoutException, diff --git a/xscen/ensembles.py b/xscen/ensembles.py index 0251500f..a8c747d6 100644 --- a/xscen/ensembles.py +++ b/xscen/ensembles.py @@ -25,6 +25,7 @@ "build_partition_data", "ensemble_stats", "generate_weights", + "reduce_ensemble", ] @@ -764,3 +765,76 @@ def build_partition_data( ens = ens.rename(rename_dict) return ens + + +@parse_config +def reduce_ensemble( + data: Union[xr.DataArray, dict, list, xr.Dataset], + method: str, + *, + horizons: Optional[list[str]] = None, + create_kwargs: Optional[dict] = None, + **kwargs, +): + r"""Reduce an ensemble of simulations using clustering algorithms from xclim.ensembles. + + Parameters + ---------- + data : xr.DataArray + Selection criteria data : 2-D xr.DataArray with dimensions 'realization' and 'criteria'. + These are the values used for clustering. Realizations represent the individual original + ensemble members and criteria the variables/indicators used in the grouping algorithm. + This data can be generated using py:func:`xclim.ensembles.make_criteria`. + Alternatively, either a xr.Dataset, a list of xr.Dataset or a dictionary of xr.Dataset can be passed, + in which case the data will be built using py:func:`xclim.ensembles.create_ensemble` and py:func:`xclim.ensembles.make_criteria`. + method : str + ['kkz', 'kmeans']. Clustering method. + horizons : list of str, optional + Subset of horizons on which to create the data. Only used if `data` needs to be built. + create_kwargs : dict, optional + Arguments to pass to py:func:`xclim.ensembles.create_ensemble` if `data` is not an xr.DataArray. + \*\*kwargs : dict + Arguments to send to either py:func:`xclim.ensembles.kkz_reduce_ensemble` or py:func:`xclim.ensembles.kmeans_reduce_ensemble`. + + Returns + ------- + selected : xr.DataArray + DataArray of dimension 'realization' with the selected simulations. + clusters : dict + If using kmeans clustering, realizations grouped by cluster. + fig_data : dict + If using kmeans clustering, data necessary to call py:func:`xclim.ensembles.plot_rsqprofile`. + + Notes + ----- + If building `data` to be constructed by this function, the datasets should already have a climatology computed on them, such that the data + has no temporal dimension aside from the "horizon" coordinate (which is optional and might be used to subset the data). + If the indicators are a mix of yearly, seasonal, and monthly, they should be stacked on the same time/horizon axis and put in the same dataset. + You can use py:func:`xscen.utils.unstack_dates` on seasonal or monthly indicators to this end. + """ + if isinstance(data, (list, dict)): + data = ensembles.create_ensemble(datasets=data, **(create_kwargs or {})) + if horizons: + if "horizon" not in data.dims: + raise ValueError("Data must have a 'horizon' dimension to be subsetted.") + data = data.sel(horizon=horizons) + if "criteria" not in data.dims: + data = ensembles.make_criteria(data) + + selected = getattr(ensembles, f"{method}_reduce_ensemble")(data=data, **kwargs) + + clusters = {} + fig_data = {} + if method == "kmeans": + fig_data = selected[2] + clusters_tmp = selected[1] + selected = selected[0] + realization = np.arange(len(clusters_tmp)) + + clusters = { + g: data.realization.isel(realization=realization[clusters_tmp == g]) + for g in np.unique(clusters_tmp) + } + selected = data.realization.isel(realization=selected) + + return selected, clusters, fig_data diff --git a/xscen/reduce.py b/xscen/reduce.py index 6bb6916a..b7310769 100644 --- a/xscen/reduce.py +++ b/xscen/reduce.py @@ -9,8 +9,6 @@ from .config import parse_config -__all__ = ["build_reduction_data", "reduce_ensemble"] - @parse_config def build_reduction_data( @@ -129,32 +127,17 @@ def reduce_ensemble( If the indicators are a mix of yearly, seasonal, and monthly, they should be stacked on the same time/horizon axis and put in the same dataset. You can use py:func:`xscen.utils.unstack_dates` on seasonal or monthly indicators to this end. """ - if isinstance(data, (list, dict)): - data = xce.create_ensemble(datasets=data, **(create_kwargs or {})) - if horizons: - if "horizon" not in data.dims: - raise ValueError("Data must have a 'horizon' dimension to be subsetted.") - data = data.sel(horizon=horizons) - if "criteria" not in data.dims: - data = xce.make_criteria(data) - - selected = getattr(xce, f"{method}_reduce_ensemble")(data=data, **kwargs) - - clusters = {} - fig_data = {} - if method == "kmeans": - fig_data = selected[2] - clusters_tmp = selected[1] - selected = selected[0] - realization = np.arange(len(clusters_tmp)) - - clusters = { - g: data.realization.isel(realization=realization[clusters_tmp == g]) - for g in np.unique(clusters_tmp) - } - selected = data.realization.isel(realization=selected) - - return selected, clusters, fig_data + warnings.warn( + "This function has been moved to xscen.ensembles.reduce_ensemble. This version will be dropped in a future release.", + FutureWarning, + ) + return reduce_ensemble( + data=data, + method=method, + horizons=horizons, + create_kwargs=create_kwargs, + **kwargs, + ) def _concat_criteria(criteria: Optional[xr.DataArray], ens: xr.Dataset): From 2a9509b163d6baea8d595fa89ee1443a8865934b Mon Sep 17 00:00:00 2001 From: RondeauG Date: Tue, 23 Apr 2024 11:50:59 -0400 Subject: [PATCH 5/7] upd changes --- CHANGES.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index d3a05a67..775b8688 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -6,11 +6,18 @@ v0.9.0 (unreleased) ------------------- Contributors to this version: Trevor James Smith (:user:`Zeitsperre`), Pascal Bourgault (:user:`aulemahal`), Gabriel Rondeau-Genesse (:user:`RondeauG`), Juliette Lavoie (:user:`juliettelavoie`), Marco Braun (:user:`vindelico`). +New features and enhancements +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +* ``xs.reduce_ensemble`` will now call ``xclim.ensembles.create_ensemble`` and ``xclim.ensembles.make_critera`` if required. (:pull:`386`). + Breaking changes ^^^^^^^^^^^^^^^^ * Removed support for the old instances of the `region` argument in ``spatial_mean``, ``extract_dataset``, and ``subset``. (:pull:`367`). * Removed ``xscen.extract.clisops_subset``. (:pull:`367`). * ``dtr`` (the function) was renamed to ``dtr_from_minmax`` to avoid confusion with the `dtr` variable. (:pull:`372`). +* The ``xscen.reduce`` module has been abandoned. (:pull:`386`). + * ``build_reduction_data`` has been made redundant by ``xclim.ensembles.make_critera`` and will be removed in a future release. + * ``xscen.reduce.reduce_ensemble`` has been moved to ``xscen.ensembles.reduce_ensemble``, as a module was no longer necessary. Internal changes ^^^^^^^^^^^^^^^^ From 6b1c526d7d9ea0cf92da6e00b4210d8620e131a5 Mon Sep 17 00:00:00 2001 From: RondeauG Date: Tue, 23 Apr 2024 11:54:44 -0400 Subject: [PATCH 6/7] small fix to templates --- templates/1-basic_workflow_with_config/workflow1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/1-basic_workflow_with_config/workflow1.py b/templates/1-basic_workflow_with_config/workflow1.py index bcbaed34..ac8f70a4 100644 --- a/templates/1-basic_workflow_with_config/workflow1.py +++ b/templates/1-basic_workflow_with_config/workflow1.py @@ -417,7 +417,7 @@ xs.measure_time(name=f"{cur}", logger=logger), ): # Compute climatological mean - ds_mean = xs.climatological_mean(ds=ds_input) + ds_mean = xs.climatological_op(ds=ds_input) # Save to zarr path = f"{CONFIG['paths']['task']}".format(**cur) From 2c47fc5774251e0fd054fecbd36ec272b317dfbf Mon Sep 17 00:00:00 2001 From: RondeauG Date: Tue, 23 Apr 2024 12:05:56 -0400 Subject: [PATCH 7/7] omit reduce.py fromcoveralls --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index cf16822a..049a27cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -162,7 +162,7 @@ values = [ [tool.coverage.run] relative_files = true include = ["xscen/*"] -omit = ["docs/notebooks/*.ipynb", "tests/*.py"] +omit = ["docs/notebooks/*.ipynb", "tests/*.py", "xscen/reduce.py"] # FIXME: Remove xscen/reduce.py when it's fully deleted. [tool.isort] append_only = true