In [None]:
#| default_exp validation

# Validation

> Utilities to validate input data

In [None]:
#| export
import re
from typing import Optional, Union

import pandas as pd

from utilsforecast.compat import DFType, DataFrame, Series, pl_DataFrame, pl_Series, pl

In [None]:
import datetime

from fastcore.test import test_eq, test_fail

In [None]:
#| polars
import polars.testing

In [None]:
#| exporti
def _is_int_dtype(s: Union[pd.Index, Series]) -> bool:
    """Tell whether `s` holds integer values.

    pandas objects are checked with `pd.api.types.is_integer_dtype`. Any other
    input is asked through `s.dtype.is_integer()`, falling back to
    `s.is_integer()` when that raises `AttributeError`
    (presumably older polars releases -- TODO confirm).
    """
    if isinstance(s, (pd.Index, pd.Series)):
        return pd.api.types.is_integer_dtype(s.dtype)
    try:
        return s.dtype.is_integer()
    except AttributeError:
        return s.is_integer()

def _is_dt_dtype(s: Union[pd.Index, Series]) -> bool:
    """Tell whether `s` holds dates/timestamps.

    pandas objects are checked with `pd.api.types.is_datetime64_any_dtype`;
    any other input is accepted when its dtype is `pl.Date` or `pl.Datetime`.
    """
    if isinstance(s, (pd.Index, pd.Series)):
        return pd.api.types.is_datetime64_any_dtype(s.dtype)
    polars_time_dtypes = (pl.Date, pl.Datetime)
    return s.dtype in polars_time_dtypes

In [None]:
#| hide
# integers are detected on both Series and Index; floats are rejected
for int_like in (pd.Series([1, 2]), pd.Index([1, 2], dtype='uint8')):
    assert _is_int_dtype(int_like)
assert not _is_int_dtype(pd.Series([1.0]))
# naive, tz-aware and second-resolution timestamps all count as datetimes
for dt_like in (
    pd.to_datetime(['2000-01-01']),
    pd.to_datetime(['2000-01-01'], utc=True),
    pd.to_datetime(['2000-01-01']).astype('datetime64[s]'),
):
    assert _is_dt_dtype(dt_like)

In [None]:
#| hide
#| pyarrow
# pyarrow-backed pandas dtypes are recognised as well
arrow_ints = pd.Series([1, 2], dtype='int32[pyarrow]')
assert _is_int_dtype(arrow_ints)
arrow_times = pd.to_datetime(['2000-01-01']).astype('timestamp[ns][pyarrow]')
assert _is_dt_dtype(arrow_times)

In [None]:
#| hide
#| polars
# signed and unsigned integers are detected; floats are rejected
for int_series in (pl.Series([1, 2]), pl.Series([1, 2], dtype=pl.UInt8)):
    assert _is_int_dtype(int_series)
assert not _is_int_dtype(pl.Series([1.0]))
# dates, naive datetimes and tz-aware datetimes all count as timestamps
for moment in (
    datetime.date(2000, 1, 1),
    datetime.datetime(2000, 1, 1),
    datetime.datetime(2000, 1, 1, tzinfo=datetime.timezone.utc),
):
    assert _is_dt_dtype(pl.Series([moment]))

In [None]:
#| exporti
def _is_dt_or_int(s: Series) -> bool:
    """Tell whether `s` is usable as a time column (timestamps or integers)."""
    is_dt = _is_dt_dtype(s)
    # the integer check only runs when the timestamp check fails
    return is_dt if is_dt else _is_int_dtype(s)

In [None]:
#| export
def ensure_shallow_copy(df: pd.DataFrame) -> pd.DataFrame:
    """Return `df`, replaced by `df.copy()` when running on pandas older than 1.4.

    On pandas 1.4 or newer the input object is returned untouched.
    See https://github.com/pandas-dev/pandas/pull/43406 for the reason
    behind the version threshold.
    """
    from packaging.version import Version

    needs_copy = Version(pd.__version__) < Version("1.4")
    return df.copy() if needs_copy else df

In [None]:
#| export
def ensure_time_dtype(df: DFType, time_col: str = 'ds') -> DFType:
    """Make sure that `time_col` contains timestamps or integers.
    If it contains strings, try to cast them as timestamps.

    Parameters
    ----------
    df : pandas or polars DataFrame
        Input dataframe.
    time_col : str (default='ds')
        Name of the column holding the times.

    Returns
    -------
    pandas or polars DataFrame
        `df` itself when `time_col` already holds timestamps or integers,
        otherwise a new dataframe whose `time_col` has been parsed to datetime.

    Raises
    ------
    ValueError
        If the strings can't be parsed as timestamps or if the column has
        a dtype that is neither timestamp, integer nor string.
    """
    times = df[time_col]
    if _is_dt_or_int(times):
        return df
    parse_err_msg = (
        f"Failed to parse '{time_col}' from string to datetime. "
        'Please make sure that it contains valid timestamps or integers.'
    )
    # strings may be stored as `object` or with one of the dedicated pandas
    # string dtypes (e.g. `string[python]`, `string[pyarrow]`), which are not
    # reported as object dtype, so both checks are needed.
    if isinstance(times, pd.Series) and (
        pd.api.types.is_object_dtype(times) or pd.api.types.is_string_dtype(times)
    ):
        try:
            times = pd.to_datetime(times)
        except (ValueError, TypeError) as exc:
            # object columns can hold arbitrary python objects, for which
            # `pd.to_datetime` may raise TypeError instead of ValueError
            raise ValueError(parse_err_msg) from exc
        # work on a shallow copy so the caller's dataframe isn't modified
        df = ensure_shallow_copy(df.copy(deep=False))
        df[time_col] = times
    elif isinstance(times, pl_Series) and times.dtype == pl.Utf8:
        try:
            times = times.str.to_datetime()
        except pl.exceptions.ComputeError as exc:
            raise ValueError(parse_err_msg) from exc
        # the parsed series keeps its name, so it replaces the original column
        df = df.with_columns(times)
    else:
        raise ValueError(f"'{time_col}' should have valid timestamps or integers.")
    return df

In [None]:
# strings are parsed into timestamps
parsed = ensure_time_dtype(pd.DataFrame({'ds': ['2000-01-01']}))
expected = pd.DataFrame({'ds': pd.to_datetime(['2000-01-01'])})
pd.testing.assert_frame_equal(parsed, expected)

# integer times are returned as the very same object
df = pd.DataFrame({'ds': [1, 2]})
assert ensure_time_dtype(df) is df

# strings that aren't valid dates produce the parsing error
invalid_dates = pd.DataFrame({'ds': ['2000-14-14']})
test_fail(
    lambda: ensure_time_dtype(invalid_dates),
    contains='Please make sure that it contains valid timestamps',
)

In [None]:
#| polars
# strings are parsed into timestamps
parsed = ensure_time_dtype(pl.DataFrame({'ds': ['2000-01-01']}))
expected = pl.DataFrame().with_columns(ds=pl.datetime(2000, 1, 1))
pl.testing.assert_frame_equal(parsed, expected)

# integer times are returned as the very same object
df = pl.DataFrame({'ds': [1, 2]})
assert ensure_time_dtype(df) is df

# strings that aren't dates produce the parsing error
not_dates = pl.DataFrame({'ds': ['hello']})
test_fail(
    lambda: ensure_time_dtype(not_dates),
    contains='Please make sure that it contains valid timestamps',
)

In [None]:
#| export
def validate_format(
    df: DataFrame,
    id_col: str = 'unique_id',
    time_col: str = 'ds',
    target_col: Optional[str] = 'y',
) -> None:
    """Ensure DataFrame has expected format.

    Parameters
    ----------
    df : pandas or polars DataFrame
        DataFrame with time series in long format.
    id_col : str (default='unique_id')
        Column that identifies each series.
    time_col : str (default='ds')
        Column that identifies each timestamp.
    target_col : str, optional (default='y')
        Column that contains the target.
        If None, the target column is neither required nor checked.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If `df` is not a pandas or polars dataframe, if any of the expected
        columns is missing, if the time column doesn't hold timestamps or
        integers, or if the target column isn't numeric.
    """
    if not isinstance(df, (pd.DataFrame, pl_DataFrame)):
        raise ValueError(
            f'`df` must be either pandas or polars dataframe, got {type(df)}'
        )

    # required columns
    expected_cols = {id_col, time_col}
    if target_col is not None:
        expected_cols.add(target_col)
    # sorted so that the error message is deterministic
    missing_cols = sorted(expected_cols - set(df.columns))
    if missing_cols:
        raise ValueError(f"The following columns are missing: {missing_cols}")

    # time col
    times = df[time_col]
    if not _is_dt_or_int(times):
        times_dtype = times.dtype
        raise ValueError(f"The time column ('{time_col}') should have either timestamps or integers, got '{times_dtype}'.")

    # target col
    if target_col is None:
        return None
    target = df[target_col]
    if isinstance(target, pd.Series):
        is_numeric = pd.api.types.is_numeric_dtype(target.dtype)
    else:
        # non-pandas input: ask the dtype first and fall back to the
        # series-level method when the dtype doesn't provide `is_numeric`
        try:
            is_numeric = target.dtype.is_numeric()
        except AttributeError:
            is_numeric = target.is_numeric()
    if not is_numeric:
        raise ValueError(f"The target column ('{target_col}') should have a numeric data type, got '{target.dtype}'.")

In [None]:
#| hide
from nbdev import show_doc

In [None]:
# display the rendered signature and docstring of `validate_format`
show_doc(validate_format)

---

[source](https://github.com/Nixtla/utilsforecast/blob/main/utilsforecast/validation.py#L75){target="_blank" style="float:right; font-size:smaller"}

### validate_format

>      validate_format
>                       (df:Union[pandas.core.frame.DataFrame,polars.dataframe.f
>                       rame.DataFrame], id_col:str='unique_id',
>                       time_col:str='ds', target_col:Optional[str]='y')

Ensure DataFrame has expected format.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| df | Union |  | DataFrame with time series in long format. |
| id_col | str | unique_id | Column that identifies each series. |
| time_col | str | ds | Column that identifies each timestamp. |
| target_col | Optional | y | Column that contains the target. |
| **Returns** | **None** |  |  |

In [None]:
import datetime

from utilsforecast.compat import POLARS_INSTALLED, pl
from utilsforecast.data import generate_series

In [None]:
# anything that isn't a dataframe is rejected up front
test_fail(lambda: validate_format(1), contains="got <class 'int'>")

# run the same checks against every available dataframe library
constructors = [pd.DataFrame] + ([pl.DataFrame] if POLARS_INSTALLED else [])
for constructor in constructors:
    # missing columns are reported in sorted order
    df = constructor({'unique_id': [1]})
    test_fail(lambda: validate_format(df), contains="missing: ['ds', 'y']")

    # a string time column is rejected
    df = constructor({'unique_id': [1], 'time': ['x'], 'y': [1]})
    test_fail(
        lambda: validate_format(df, time_col='time'),
        contains="('time') should have either timestamps or integers",
    )

    # a non-numeric target is rejected for integer and timestamp times alike
    for time in (1, datetime.datetime(2000, 1, 1)):
        df = constructor({'unique_id': [1], 'ds': [time], 'sales': ['x']})
        test_fail(
            lambda: validate_format(df, target_col='sales'),
            contains="('sales') should have a numeric data type",
        )

In [None]:
#| export
def validate_freq(
    times: Series,
    freq: Union[str, int],
) -> None:
    """Check that `freq` is compatible with the values in `times`.

    Parameters
    ----------
    times : pandas or polars Series
        Times of the series.
    freq : str or int
        Frequency of the series.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If `times` has integers and `freq` isn't an integer, if `times` has
        timestamps and `freq` is an integer, or if `times` is a polars series
        and the string `freq` has no digits or is uppercase once its digits
        are removed.
    """
    freq_is_int = isinstance(freq, int)
    if _is_int_dtype(times) and not freq_is_int:
        raise ValueError(
            "Time column contains integers but the specified frequency is not an integer. "
            "Please provide a valid integer, e.g. `freq=1`"
        )
    if _is_dt_dtype(times) and freq_is_int:
        raise ValueError(
            "Time column contains timestamps but the specified frequency is an integer. "
            "Please provide a valid pandas or polars offset, e.g. `freq='D'` or `freq='1d'`."
        )
    # the remaining check only applies to string frequencies on polars series
    if not (isinstance(times, pl_Series) and isinstance(freq, str)):
        return None
    # try to catch pandas frequency in polars dataframe
    has_multiple = re.search(r"\d+", freq) is not None
    unit_is_upper = re.sub(r"\d+", "", freq).isupper()
    if not has_multiple or unit_is_upper:
        raise ValueError(
            "You must specify a valid polars offset when using polars dataframes. "
            "You can find the available offsets in "
            "https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.dt.offset_by.html"
        )

In [None]:
# integer times need an integer frequency
int_times = pd.Series([1, 2])
test_fail(lambda: validate_freq(int_times, 'D'), contains='provide a valid integer')
# timestamp times need an offset alias
ts_times = pd.to_datetime(['2000-01-01']).to_series()
test_fail(lambda: validate_freq(ts_times, 1), contains='provide a valid pandas or polars offset')

In [None]:
#| polars
# each case: (times, freq, expected fragment of the error message)
failing_cases = (
    (pl.Series([1, 2]), '1d', 'provide a valid integer'),
    (pl.Series([datetime.datetime(2000, 1, 1)]), 1, 'provide a valid pandas or polars offset'),
    (pl.Series([datetime.datetime(2000, 1, 1)]), 'D', 'valid polars offset'),
)
for case_times, case_freq, expected_msg in failing_cases:
    test_fail(lambda: validate_freq(case_times, case_freq), contains=expected_msg)