In [None]:
#| default_exp preprocessing

# Preprocessing
> Utilities for processing data before training/analysis

In [None]:
#| export
from typing import Union

import numpy as np
import pandas as pd

In [None]:
#| hide
from nbdev import show_doc

from utilsforecast.data import generate_series

In [None]:
#| exporti
def _determine_bound(bound, freq, times_by_id, agg) -> np.ndarray:
    """Build the array holding one bound (start or end) per serie.

    Parameters
    ----------
    bound : str or int
        'per_serie', 'global' or a fixed value that is shared by all series.
    freq : str or int
        A str is used as the numpy datetime unit of the result,
        anything else leaves the values as they are.
    times_by_id : pandas DataFrame
        One row per serie, with the columns 'min' and 'max'.
    agg : str
        Column of `times_by_id` to use ('min' or 'max'). It's also the name
        of the reduction applied to that column when `bound='global'`.

    Returns
    -------
    bounds : numpy ndarray
        Array with one entry per row of `times_by_id`.
    """
    uses_datetimes = isinstance(freq, str)
    n_series = times_by_id.shape[0]
    if bound == 'per_serie':
        bounds = times_by_id[agg].values
    elif bound == 'global':
        # reduce the per-serie values to a single scalar
        extreme = getattr(times_by_id[agg].values, agg)()
        if uses_datetimes:
            extreme = np.datetime64(extreme)
        bounds = np.full(n_series, extreme)
    else:
        # np.datetime64 raises a nice error message if it isn't a valid datetime
        fixed = np.datetime64(bound) if uses_datetimes else bound
        bounds = np.full(n_series, fixed)
    if uses_datetimes:
        return bounds.astype(f'datetime64[{freq}]')
    return bounds

In [None]:
#| export
def fill_gaps(
    df: pd.DataFrame,
    freq: Union[str, int],
    start: Union[str, int] = 'per_serie',
    end: Union[str, int] = 'global',
    id_col: str = 'unique_id',
    time_col: str = 'ds',
) -> pd.DataFrame:
    """Enforce start and end datetimes for dataframe.

    Parameters
    ----------
    df : pandas DataFrame
        Input data
    freq : str or int
        Series' frequency
    start : str or int
        Initial timestamp for the series.
            * 'per_serie' uses each serie's first timestamp
            * 'global' uses the first timestamp seen in the data
            * Can also be a specific timestamp, e.g. '2000-01-01'
              (or an integer when the times are integers)
    end : str or int
        Final timestamp for the series.
            * 'per_serie' uses each serie's last timestamp
            * 'global' uses the last timestamp seen in the data
            * Can also be a specific timestamp, e.g. '2000-01-01'
              (or an integer when the times are integers)
    id_col : str (default='unique_id')
        Column that identifies each serie.
    time_col : str (default='ds')
        Column that identifies each timestamp.

    Returns
    -------
    filled_df : pandas DataFrame
        Dataframe with gaps filled.
    """
    if isinstance(freq, str):
        if freq == freq.upper():
            # abbreviations like MS = 'Month Start', YS = 'Year Start'
            # only the first letter is used as the numpy unit
            delta_freq = freq[0]
        else:
            delta_freq = freq
        delta = np.timedelta64(1, delta_freq)
    else:
        delta_freq = freq
        delta = freq
    times_by_id = df.groupby(id_col, observed=True)[time_col].agg(['min', 'max'])
    starts = _determine_bound(start, delta_freq, times_by_id, 'min')
    # np.arange excludes the stop, so move it one step forward to keep the end
    ends = _determine_bound(end, delta_freq, times_by_id, 'max') + delta
    sizes = ((ends - starts) / delta).astype(np.int64)
    times = pd.Index(
        np.concatenate(
            [np.arange(first, last, delta) for first, last in zip(starts, ends)]
        )
    )
    if isinstance(freq, str):
        times = times.astype('datetime64[ns]')
        # use the configured time column instead of a hard-coded 'ds'
        first_time = np.datetime64(df[time_col].iloc[0])
        # if casting to the numpy unit changed the timestamp, the generated
        # times are shifted by the pandas offset of the original frequency
        was_truncated = first_time != first_time.astype(f'datetime64[{delta_freq}]')
        if was_truncated:
            times += pd.tseries.frequencies.to_offset(freq)
    uids = np.repeat(times_by_id.index, sizes)
    idx = pd.MultiIndex.from_arrays([uids, times], names=[id_col, time_col])
    return df.set_index([id_col, time_col]).reindex(idx).reset_index()

In [None]:
# render the signature and docstring of `fill_gaps` as documentation
show_doc(fill_gaps)

---

[source](https://github.com/Nixtla/utilsforecast/blob/main/utilsforecast/preprocessing.py#L34){target="_blank" style="float:right; font-size:smaller"}

### fill_gaps

>      fill_gaps (df:pandas.core.frame.DataFrame, freq:Union[str,int],
>                 start:str='per_serie', end:str='global',
>                 id_col:str='unique_id', time_col:str='ds')

Enforce start and end datetimes for dataframe.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| df | DataFrame |  | Input data |
| freq | Union |  | Series' frequency |
| start | str | per_serie | Initial timestamp for the series.<br>    * 'per_serie' uses each serie's first timestamp<br>    * 'global' uses the first timestamp seen in the data<br>    * Can also be a specific timestamp, e.g. '2000-01-01' |
| end | str | global | Final timestamp for the series.<br>    * 'per_serie' uses each serie's last timestamp<br>    * 'global' uses the last timestamp seen in the data<br>    * Can also be a specific timestamp, e.g. '2000-01-01' |
| id_col | str | unique_id | Column that identifies each serie. |
| time_col | str | ds | Column that identifies each timestamp. |
| **Returns** | **DataFrame** |  | **Dataframe with gaps filled.** |

In [None]:
# two yearly series: serie 0 is missing 2022 and serie 1 ends one year earlier
df = pd.DataFrame(
    dict(
        unique_id=[0, 0, 0, 1, 1],
        ds=pd.to_datetime(['2020', '2021', '2023', '2021', '2022']),
        y=np.arange(5),
    )
)
df

Unnamed: 0,unique_id,ds,y
0,0,2020-01-01,0
1,0,2021-01-01,1
2,0,2023-01-01,2
3,1,2021-01-01,3
4,1,2022-01-01,4


By default, each serie keeps its own start and only the end is extended, so that all series finish at the last timestamp seen in the data.

In [None]:
fill_gaps(df, freq='YS')

Unnamed: 0,unique_id,ds,y
0,0,2020-01-01,0.0
1,0,2021-01-01,1.0
2,0,2022-01-01,
3,0,2023-01-01,2.0
4,1,2021-01-01,3.0
5,1,2022-01-01,4.0
6,1,2023-01-01,


We can also specify `end='per_serie'` to only fill possible gaps within each serie.

In [None]:
fill_gaps(df, freq='YS', end='per_serie')

Unnamed: 0,unique_id,ds,y
0,0,2020-01-01,0.0
1,0,2021-01-01,1.0
2,0,2022-01-01,
3,0,2023-01-01,2.0
4,1,2021-01-01,3.0
5,1,2022-01-01,4.0


We can also specify an end date in the future.

In [None]:
fill_gaps(df, freq='YS', end='2024')

Unnamed: 0,unique_id,ds,y
0,0,2020-01-01,0.0
1,0,2021-01-01,1.0
2,0,2022-01-01,
3,0,2023-01-01,2.0
4,0,2024-01-01,
5,1,2021-01-01,3.0
6,1,2022-01-01,4.0
7,1,2023-01-01,
8,1,2024-01-01,


We can set all series to start at the same time.

In [None]:
fill_gaps(df, freq='YS', start='global')

Unnamed: 0,unique_id,ds,y
0,0,2020-01-01,0.0
1,0,2021-01-01,1.0
2,0,2022-01-01,
3,0,2023-01-01,2.0
4,1,2020-01-01,
5,1,2021-01-01,3.0
6,1,2022-01-01,4.0
7,1,2023-01-01,


We can also set a common start date for all series (which can be earlier than their current starts).

In [None]:
fill_gaps(df, freq='YS', start='2019')

Unnamed: 0,unique_id,ds,y
0,0,2019-01-01,
1,0,2020-01-01,0.0
2,0,2021-01-01,1.0
3,0,2022-01-01,
4,0,2023-01-01,2.0
5,1,2019-01-01,
6,1,2020-01-01,
7,1,2021-01-01,3.0
8,1,2022-01-01,4.0
9,1,2023-01-01,


If the times are integers, then the frequency, start and end must also be integers.

In [None]:
# same series as before, but with the times stored as integers
df = pd.DataFrame(
    dict(
        unique_id=[0, 0, 0, 1, 1],
        ds=[2020, 2021, 2023, 2021, 2022],
        y=np.arange(5),
    )
)
df

Unnamed: 0,unique_id,ds,y
0,0,2020,0
1,0,2021,1
2,0,2023,2
3,1,2021,3
4,1,2022,4


In [None]:
fill_gaps(df, freq=1, start=2019, end=2024)

Unnamed: 0,unique_id,ds,y
0,0,2019,
1,0,2020,0.0
2,0,2021,1.0
3,0,2022,
4,0,2023,2.0
5,0,2024,
6,1,2019,
7,1,2020,
8,1,2021,3.0
9,1,2022,4.0


In [None]:
#| hide
n_series = 1_000
series = generate_series(n_series, equal_ends=True)

# keep the start date of each serie and randomly drop rows from the rest
first_dates = series.groupby('unique_id', observed=True).head(1)
uneven = pd.concat(
    [
        first_dates,
        series.drop(first_dates.index).sample(
            int(0.5 * series.shape[0]), random_state=0
        ),
    ]
)

# filling up to the original end must restore the original number of rows
end = series['ds'].max()
actual = fill_gaps(uneven, freq='D', end=end)
assert len(uneven) < len(series)
assert len(actual) == len(series)

# the first and last dates of every serie must match the original ones
actual_bounds = actual.groupby('unique_id', observed=True)['ds'].agg(['min', 'max'])
expected_bounds = series.groupby('unique_id', observed=True)['ds'].agg(['min', 'max'])
pd.testing.assert_frame_equal(actual_bounds, expected_bounds)

In [None]:
#| hide

def check_fill(df, filled):
    """Check that `filled` is a consistent gap-filled version of `df`.

    Parameters
    ----------
    df : pandas DataFrame
        Original data, with the columns 'ds' and 'y'.
    filled : pandas DataFrame
        Result of filling the gaps of `df`.
    """
    # filling must add rows
    assert filled.shape[0] > df.shape[0]
    # the original observations must survive the fill. If the generated
    # timestamps didn't line up with the original ones they'd become nulls
    assert filled['y'].notnull().sum() == df['y'].notnull().sum()
    # the overall time span must stay the same
    pd.testing.assert_series_equal(
        df['ds'].agg(['min', 'max']),
        filled['ds'].agg(['min', 'max']),
    )

def _frame_with_dates(dates):
    """Build a two-serie frame (3 + 2 rows) that has `dates` as timestamps."""
    return pd.DataFrame(
        {
            'unique_id': [0, 0, 0, 1, 1],
            'ds': pd.to_datetime(dates),
            'y': np.arange(5),
        }
    )

# year end
df_y = _frame_with_dates(
    ['2020-12-31', '2021-12-31', '2023-12-31', '2021-12-31', '2022-12-31']
)
y_filled = fill_gaps(df_y, freq='Y')
check_fill(df_y, y_filled)

# week (sunday)
df_w = _frame_with_dates(
    ['2018-11-11', '2018-11-18', '2018-12-02', '2018-11-11', '2018-12-02']
)
w_filled = fill_gaps(df_w, 'W')
check_fill(df_w, w_filled)

# week (tuesday)
df_wt = _frame_with_dates(
    ['2018-11-13', '2018-11-20', '2018-12-04', '2018-11-13', '2018-12-04']
)
wt_filled = fill_gaps(df_wt, 'W-TUE')
check_fill(df_wt, wt_filled)