In [None]:
#| default_exp validation

# Validation

> Utilities to validate input data

In [None]:
#| export
import numpy as np
import pandas as pd

from utilsforecast.compat import DataFrame, pl_DataFrame

In [None]:
#| export
def validate_format(
    df: DataFrame,
    id_col: str = 'unique_id',
    time_col: str = 'ds',
    target_col: str = 'y',
) -> None:
    """Ensure DataFrame has expected format.

    Parameters
    ----------
    df : pandas or polars DataFrame
        DataFrame with time series in long format.
    id_col : str (default='unique_id')
        Column that identifies each series.
    time_col : str (default='ds')
        Column that identifies each timestamp.
    target_col : str (default='y')
        Column that contains the target.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If `df` is not a pandas or polars DataFrame, if any of `id_col`,
        `time_col` or `target_col` is missing from its columns, if the time
        column is neither datetime nor integer, or if the target column is
        not numeric.
    """
    # `DataFrame` from compat is only used as a type hint; the runtime check
    # relies on the concrete pandas and polars classes.
    if not isinstance(df, (pd.DataFrame, pl_DataFrame)):
        raise ValueError(
            f'`df` must be either pandas or polars dataframe, got {type(df)}'
        )

    # required columns (sorted so the error message is deterministic)
    missing_cols = sorted({id_col, time_col, target_col} - set(df.columns))
    if missing_cols:
        raise ValueError(f"The following columns are missing: {missing_cols}")

    # time col
    # only the first row is converted to numpy, which is enough to read the dtype
    times_dtype = df[time_col].head(1).to_numpy().dtype
    is_datetime = np.issubdtype(times_dtype, np.datetime64)
    is_integer = np.issubdtype(times_dtype, np.integer)
    if not (is_datetime or is_integer):
        raise ValueError(
            f"The time column ('{time_col}') should have either timestamps or integers, "
            f"got '{times_dtype}'."
        )

    # target col
    target_dtype = df[target_col].head(1).to_numpy().dtype
    if not np.issubdtype(target_dtype, np.number):
        raise ValueError(
            f"The target column ('{target_col}') should have a numeric data type, "
            f"got '{target_dtype}'."
        )

In [None]:
#| hide
from nbdev import show_doc

In [None]:
# render the API documentation for `validate_format` (signature, summary and parameter table)
show_doc(validate_format)

---

[source](https://github.com/Nixtla/utilsforecast/blob/main/utilsforecast/validation.py#L12){target="_blank" style="float:right; font-size:smaller"}

### validate_format

>      validate_format
>                       (df:Union[pandas.core.frame.DataFrame,polars.dataframe.f
>                       rame.DataFrame], id_col:str='unique_id',
>                       time_col:str='ds', target_col:str='y')

Ensure DataFrame has expected format.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| df | Union |  | DataFrame with time series in long format. |
| id_col | str | unique_id | Column that identifies each serie. |
| time_col | str | ds | Column that identifies each timestamp. |
| target_col | str | y | Column that contains the target. |
| **Returns** | **None** |  |  |

In [None]:
import datetime

import pandas as pd
from fastcore.test import test_fail

from utilsforecast.compat import POLARS_INSTALLED, pl
from utilsforecast.data import generate_series

In [None]:
# Each `test_fail` call expects `validate_format` to raise, with an error
# message that contains the given text.

# anything that is not a pandas/polars dataframe is rejected
test_fail(lambda: validate_format(1), contains="got <class 'int'>")

# run the remaining checks against every dataframe backend that is available
constructors = [pd.DataFrame]
if POLARS_INSTALLED:
    constructors.append(pl.DataFrame)
for constructor in constructors:
    # missing required columns are reported in sorted order
    df = constructor({'unique_id': [1]})
    test_fail(lambda: validate_format(df), contains="missing: ['ds', 'y']")
    # a string time column is neither a timestamp nor an integer
    df = constructor({'unique_id': [1], 'time': ['x'], 'y': [1]})
    test_fail(lambda: validate_format(df, time_col='time'), contains="('time') should have either timestamps or integers")
    # with a valid time column (integer or datetime) a string target must still fail
    for time in [1, datetime.datetime(2000, 1, 1)]:
        df = constructor({'unique_id': [1], 'ds': [time], 'sales': ['x']})
        test_fail(lambda: validate_format(df, target_col='sales'), contains="('sales') should have a numeric data type")