Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Metrics for temporal subgroups #266

Merged
merged 24 commits into from Aug 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
ead9b03
Update environment
wpreimes Jan 14, 2022
98d5024
First implementation of flexible set metrics
wpreimes Jan 17, 2022
1e6bf06
Fix keyword for metrics calculation when reference dataset must be in…
wpreimes Jan 19, 2022
285f600
Merge branch 'fix-only-with-temporal-ref' into set-metrics
wpreimes Jan 19, 2022
4852b56
Update tests
wpreimes Jan 19, 2022
b176281
Merge branch 'fix-only-with-temporal-ref' into set-metrics
wpreimes Jan 19, 2022
19bba5b
Update CHANGELOG.rst
wpreimes Jan 19, 2022
cb43e20
Update CHANGELOG.rst
wpreimes Jan 19, 2022
5f16b71
Merge branch 'fix-only-with-temporal-ref' into set-metrics
wpreimes Jan 19, 2022
589476f
Update env
wpreimes Jan 19, 2022
f9bdc2e
Remove unnecessary checks for data availability
wpreimes Feb 14, 2022
7409972
Merge branch 'master' into set-metrics
wpreimes Feb 14, 2022
a3d3c7c
Undo
wpreimes Feb 14, 2022
b1a11a0
Fix Test
wpreimes Apr 4, 2022
e3cf56a
Merge branch 'master' of github.com:wpreimes/pytesmo into set-metrics
wpreimes Apr 12, 2022
c357fa6
Make bootstrapping settings better accessible when using the validati…
wpreimes Apr 14, 2022
cea4496
Merge branch 'frm4sm_ci' into set-metrics
wpreimes Apr 14, 2022
7ccbd7a
Merge branch 'master' of https://github.com/TUW-GEO/pytesmo into set-…
wpreimes Apr 19, 2022
b4eda8f
Renamed GenericDatetime to YearlessDatetime and moved to grouping module
wpreimes Aug 21, 2023
26cc09b
Update notebook to include subset metrics and reader adapters
wpreimes Aug 21, 2023
83be9dc
Update tests
wpreimes Aug 21, 2023
445f33c
Change yearless date name
wpreimes Aug 21, 2023
4cc1c0c
Fix merge conflicts
wpreimes Aug 21, 2023
9a6f103
Fix tests
wpreimes Aug 21, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
546 changes: 474 additions & 72 deletions docs/examples/validation_framework.ipynb

Large diffs are not rendered by default.

245 changes: 241 additions & 4 deletions src/pytesmo/time_series/grouping.py
Expand Up @@ -26,22 +26,23 @@
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# Author: Christoph Paulik christoph.paulik@geo.tuwien.ac.at
# Creation date: 2014-06-30


"""
Module provides grouping functions that can be used together with pandas
to create a few unusual time groupings, e.g. decadal products where
there are three products per month with timestamps on the 10th, 20th and last
day of the month.
"""
from dataclasses import dataclass
from typing import Optional, Union, Tuple, List

import pandas as pd
import numpy as np
from datetime import date
from datetime import date, datetime
import calendar

from cadati.conv_doy import doy


def group_by_day_bin(df, bins=[1, 11, 21, 32], start=False,
dtindex=None):
Expand Down Expand Up @@ -153,3 +154,239 @@ def grouped_dates_between(start_date, end_date, bins=[1, 11, 21, 32], start=Fals
tstamps = grp.sum().index.to_pydatetime().tolist()

return tstamps


@dataclass
class YearlessDatetime:
    """
    Date and time of day (resolution: seconds) that is not bound to a year.

    Useful for grouping data when the year is irrelevant, e.g. for seasonal
    analyses. Instances are ordered by their position within a leap year.
    Used by
    :class:`pytesmo.validation_framework.metric_calculators_adapters.TsDistributor`
    """
    month: int

    day: int = 1
    hour: int = 0
    minute: int = 0
    second: int = 0

    @property
    def __ly(self):
        # Any leap year works here; a leap year is required so that
        # Feb. 29th can take part in comparisons.
        return 2400

    def __anchored(self, other: 'YearlessDatetime'):
        # Place both operands into the same leap year so that they can be
        # compared as ordinary datetimes.
        anchor_year = self.__ly
        mine = self.to_datetime(anchor_year)
        theirs = other.to_datetime(anchor_year)
        return mine, theirs

    def __ge__(self, other: 'YearlessDatetime'):
        mine, theirs = self.__anchored(other)
        return mine >= theirs

    def __le__(self, other: 'YearlessDatetime'):
        mine, theirs = self.__anchored(other)
        return mine <= theirs

    def __lt__(self, other: 'YearlessDatetime'):
        mine, theirs = self.__anchored(other)
        return mine < theirs

    def __gt__(self, other: 'YearlessDatetime'):
        mine, theirs = self.__anchored(other)
        return mine > theirs

    def __repr__(self):
        date_part = f"****-{self.month:02}-{self.day:02}"
        time_part = f"T{self.hour:02}:{self.minute:02}:{self.second:02}"
        return date_part + time_part

    @property
    def doy(self) -> int:
        """
        Day of year of this date, computed as if the year was a leap year,
        i.e.: 1=Jan.1st, 60=Feb.29th, 366=Dec.31st.
        """
        return doy(self.month, self.day, year=None)

    @classmethod
    def from_datetime(cls, dt: datetime):
        """
        Create a yearless datetime from a datetime by dropping its year
        (and anything below second resolution).
        """
        return cls(month=dt.month, day=dt.day, hour=dt.hour,
                   minute=dt.minute, second=dt.second)

    def to_datetime(self, years: Optional[Union[Tuple[int, ...], int]]) \
            -> Union[datetime, List, None]:
        """
        Attach one or several years to this yearless datetime.

        Feb. 29th is skipped for all non-leap years.

        Parameters
        ----------
        years : int or tuple of int
            Year(s) to create datetimes for.

        Returns
        -------
        dt : datetime or list of datetime or None
            A single datetime if exactly one date could be created, a list
            if there are several, None if there is none (Feb. 29th and only
            non-leap years were passed).
        """
        # The leap-year test comes first on purpose: the day of year is only
        # looked up for non-leap years.
        resolved = [
            datetime(year, self.month, self.day, self.hour, self.minute,
                     self.second)
            for year in np.atleast_1d(years)
            if calendar.isleap(year) or self.doy != 60.
        ]

        if not resolved:
            return None
        if len(resolved) == 1:
            return resolved[0]
        return resolved


class TsDistributor:
    """
    Select the time stamps of a datetime index that match a set of dates,
    date ranges, yearless dates and/or yearless date ranges.
    """

    def __init__(self,
                 dates=None,
                 date_ranges=None,
                 yearless_dates=None,
                 yearless_date_ranges=None):
        """
        Build a data distributor from individual dates, date ranges, generic
        dates (without specific year) and generic date ranges.

        Components:
            - individual datetime objects for distinct dates
            - generic datetime objects for dates without a specific year
            - date range / datetime tuple
                i.e. ALL datetimes between the 2 passed dates (start, end)
                the start date must be earlier than the end date
            - generic date range / generic datetime tuple
                i.e. ALL datetimes between 2 generic dates (for any year)

        A time stamp is selected if it matches ANY of the components.

        Parameters
        ----------
        dates : Tuple[datetime, ...] or Tuple[str, ...] or pd.DatetimeIndex
            Individual dates (that also have a year assigned).
        date_ranges: Tuple[Tuple[datetime, datetime], ...]
            A list of date ranges, consisting of a start and end date for each
            range. The start date must be earlier in time than the end date
            (swapped automatically otherwise). Both ends are inclusive.
        yearless_dates: Tuple[YearlessDatetime,...] or Tuple[datetime...]
            A list of generic dates (that apply to any year).
            Can be passed as a list of
             - YearlessDatetime objects
                e.g. YearlessDatetime(5,31,12,1,10), ie. May 31st 12:01:10
             - pydatetime objects (years will be ignored, duplicates dropped)
        yearless_date_ranges: [Tuple[YearlessDatetime, YearlessDatetime], ...]
            A list of generic date ranges (that apply to any year). Both ends
            are inclusive. If the end is earlier in the year than the start,
            the range wraps around the turn of the year (e.g. Dec. to Feb.).
        """

        self.dates = dates
        self.date_ranges = date_ranges
        self.yearless_dates = yearless_dates
        self.yearless_date_ranges = yearless_date_ranges

    def __repr__(self):
        s = []
        for var in ['dates', 'date_ranges', 'yearless_dates',
                    'yearless_date_ranges']:
            val = getattr(self, var)
            s.append(f"#{var}={len(val) if val is not None else 0}")

        return f"{self.__class__.__name__}({', '.join(s)})"

    @staticmethod
    def _as_yearless(d):
        """Return `d` as yearless date; plain datetimes lose their year."""
        if isinstance(d, datetime):
            return YearlessDatetime.from_datetime(d)
        return d

    @staticmethod
    def _resolve(yearless, year, fallback):
        """
        Place a yearless date into `year`; use `fallback` if the date does
        not exist in that year (Feb. 29th in a non-leap year).
        """
        dt = yearless.to_datetime(years=year)
        if dt is None:
            dt = fallback.to_datetime(years=year)
        return dt

    def select(self,
               df: Union[pd.DataFrame, pd.Series, pd.DatetimeIndex],
               set_nan=False):
        """
        Select rows from data frame or series with matching date time indices.

        Parameters
        ----------
        df: pd.DataFrame or pd.Series or pd.DatetimeIndex
            Must have (or be) a date time index, which is then filtered based
            on the dates. The passed object is never modified.
        set_nan: bool, optional (default: False)
            Instead of dropping rows that are not selected, set their values to
            nan (NaT if a DatetimeIndex was passed).

        Returns
        -------
        df: pd.DataFrame or pd.Series or pd.DatetimeIndex
            The filtered input data (a new object of the same kind as the
            input).

        Raises
        ------
        ValueError
            If the index of the passed data is not a DatetimeIndex.
        """
        idx = df if isinstance(df, pd.DatetimeIndex) else df.index

        if not isinstance(idx, pd.DatetimeIndex):
            raise ValueError(f"Expected a DatetimeIndex, "
                             f"but got {type(idx)}.")

        mask = self.filter(idx)

        if not set_nan:
            return df[mask]

        if isinstance(df, pd.DatetimeIndex):
            # An index is immutable, item assignment would fail.
            return df.where(mask.to_numpy())

        # Work on a copy so that the data of the caller stays untouched.
        out = df.copy()
        out[~mask] = np.nan
        return out

    def filter(self, idx: pd.DatetimeIndex):
        """
        Create a boolean mask that marks all time stamps of the passed index
        that are selected by at least one component of this distributor.

        Parameters
        ----------
        idx: pd.DatetimeIndex
            Datetime index to split using the set

        Returns
        -------
        mask: pd.Series
            Boolean series with (a copy of) idx as its index. True for all
            time stamps that belong to the set. All False if no component
            was defined.
        """
        selected = np.zeros(len(idx), dtype=bool)

        # Years that are covered by the index (NaT entries have no year).
        years = [int(y) for y in np.unique(idx.year.dropna())]

        if self.dates is not None:
            selected |= idx.isin(pd.DatetimeIndex(self.dates))

        if self.date_ranges is not None:
            for drange in self.date_ranges:
                start, end = drange[0], drange[1]
                if start > end:
                    start, end = end, start
                selected |= (idx >= start) & (idx <= end)

        if self.yearless_dates is not None:
            stamps = []
            for d in self.yearless_dates:
                yearless = self._as_yearless(d)
                for y in years:
                    dt = yearless.to_datetime(y)
                    if dt is not None:  # None: Feb. 29th in non-leap year
                        stamps.append(dt)
            selected |= idx.isin(pd.DatetimeIndex(stamps))

        if self.yearless_date_ranges is not None:
            for gdrange in self.yearless_date_ranges:
                first = self._as_yearless(gdrange[0])
                last = self._as_yearless(gdrange[1])

                # Decide on the original dates whether the range crosses the
                # turn of the year, not on the leap-year adjusted ones.
                wraps = last < first

                if wraps:
                    # A range that started in the previous year can reach
                    # into each year of the index.
                    start_years = sorted(
                        set(years) | {y - 1 for y in years})
                else:
                    start_years = years

                for y in start_years:
                    end_year = y + 1 if wraps else y
                    # Without Feb. 29th, a range starts on Mar. 1st / ends
                    # at the last second of Feb. 28th. This is evaluated
                    # for the year that each end actually falls into.
                    start_dt = self._resolve(
                        first, y, YearlessDatetime(3, 1))
                    end_dt = self._resolve(
                        last, end_year,
                        YearlessDatetime(2, 28, 23, 59, 59))

                    selected |= (idx >= start_dt) & (idx <= end_dt)

        return pd.Series(selected, index=idx.copy())