In [None]:
#default_exp data.datasets.tourism

# Tourism dataset

> Download the Tourism dataset and load its yearly, quarterly and monthly groups.

In [None]:
#export
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from pandas.tseries.frequencies import to_offset

from nixtla.data.datasets.utils import download_file, Info, TimeSeriesDataclass
from nixtla.data.ts_dataset import TimeSeriesDataset

In [None]:
#export
# Location of the zip archive with the Tourism data; `Tourism.download`
# fetches and decompresses it.
SOURCE_URL = 'https://robjhyndman.com/data/27-3-Athanasopoulos1.zip'

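Tourism meta information: one dataclass per group (`Yearly`, `Quarterly`, `Monthly`) holding the constants used to describe and parse that group's series.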

In [None]:
#export
@dataclass
class Yearly:
    """Metadata for the 'Yearly' group of the Tourism dataset."""
    # presumably the seasonal period, in units of `freq` — not read in this file; confirm with consumers
    seasonality: int = 1
    # presumably the forecast horizon, in steps of `freq` — not read in this file; confirm with consumers
    horizon: int = 4
    # pandas offset alias; `Tourism.load` converts it with `to_offset`
    freq: str = 'Y'
    # number of leading metadata rows per CSV column that `Tourism.load` skips
    # before the observations (series length and start year)
    rows: int = 2
    # group label; `Tourism.load` lower-cases it to build the CSV file name
    name: str = 'Yearly'

@dataclass
class Quarterly:
    """Metadata for the 'Quarterly' group of the Tourism dataset."""
    # presumably the seasonal period, in units of `freq` — not read in this file; confirm with consumers
    seasonality: int = 4
    # presumably the forecast horizon, in steps of `freq` — not read in this file; confirm with consumers
    horizon: int = 8
    # pandas offset alias; `Tourism.load` converts it with `to_offset`
    freq: str = 'Q'
    # number of leading metadata rows per CSV column that `Tourism.load` skips
    # before the observations (series length, start year, start offset)
    rows: int = 3
    # group label; `Tourism.load` lower-cases it to build the CSV file name
    name: str = 'Quarterly'

@dataclass
class Monthly:
    """Metadata for the 'Monthly' group of the Tourism dataset."""
    # presumably the seasonal period, in units of `freq` — not read in this file; confirm with consumers
    seasonality: int = 12
    # presumably the forecast horizon, in steps of `freq` — not read in this file; confirm with consumers
    horizon: int = 24
    # pandas offset alias; `Tourism.load` converts it with `to_offset`
    freq: str = 'M'
    # number of leading metadata rows per CSV column that `Tourism.load` skips
    # before the observations (series length, start year, start offset)
    rows: int = 3
    # group label; `Tourism.load` lower-cases it to build the CSV file name
    name: str = 'Monthly'

In [None]:
#export
# Registry pairing each group name with its metadata dataclass;
# `Tourism.load` looks groups up through `TourismInfo.get_group`.
TourismInfo = Info(
    class_groups=(Yearly, Quarterly, Monthly),
    groups=('Yearly', 'Quarterly', 'Monthly'),
)

In [None]:
#export
class Tourism(TimeSeriesDataclass):
    """Downloader and loader for the Tourism dataset published at `SOURCE_URL`."""

    @staticmethod
    def load(directory: str,
             group: str,
             training: bool = True,
             return_tensor: bool = True) -> Union[TimeSeriesDataset, TimeSeriesDataclass]:
        """
        Downloads (when not already present) and loads Tourism data.

        Parameters
        ----------
        directory: str
            Directory where data will be downloaded.
        group: str
            Group name.
            Allowed groups: 'Yearly', 'Quarterly', 'Monthly'.
        training: bool
            Whether to return training data (`<group>_in.csv`) or
            testing data (`<group>_oos.csv`). Default True.
        return_tensor: bool
            Whether to return a TimeSeriesDataset (tensors, True) or a
            TimeSeriesDataclass (dataframes, False). Default True.

        Returns
        -------
        Union[TimeSeriesDataset, TimeSeriesDataclass]
            Container built from a long-format dataframe with columns
            'unique_id', 'ds' and 'y', sorted by 'unique_id' and 'ds'.
        """
        datasets_dir = Path(directory) / 'tourism' / 'datasets'

        Tourism.download(directory)

        class_group = TourismInfo.get_group(group)

        csv_name = (f'{class_group.name.lower()}_in.csv' if training
                    else f'{class_group.name.lower()}_oos.csv')
        raw = pd.read_csv(datasets_dir / csv_name)

        step = to_offset(class_group.freq)
        header_rows = class_group.rows

        # Every CSV column is one series: metadata rows first, observations after.
        frames = []
        for uid in raw.columns:
            series = raw[uid]
            # The first two rows hold the series length and its starting year.
            length, year = series[:2].astype(int)
            first_ds = pd.to_datetime(f'{year}-01-01')
            if group != 'Yearly':
                # Non-yearly columns carry a third metadata row: the number of
                # periods to shift the start date by.
                first_ds += series[2].astype(int) * step
            elif uid == 'Y18' and not training:
                # This series comes wrong in the file; shift its start by two periods.
                first_ds += 2 * step

            frame = series[header_rows:length + header_rows].rename('y').to_frame()
            frame['unique_id'] = uid
            frame['ds'] = pd.date_range(first_ds, periods=length, freq=step)
            frames.append(frame)

        # Stack all series, drop the positional index and order by id and date.
        df = (pd.concat(frames)
                .reset_index()
                .filter(items=['unique_id', 'ds', 'y'])
                .sort_values(['unique_id', 'ds']))

        if not return_tensor:
            return TimeSeriesDataclass(Y=df, S=None, X=None, group=group)
        return TimeSeriesDataset(y_df=df, X_s_df=None, X_t_df=None)

    @staticmethod
    def download(directory: str) -> None:
        """
        Downloads Tourism Dataset.

        The archive at `SOURCE_URL` is fetched and decompressed into
        `<directory>/tourism/datasets`; nothing is done when that path
        already exists.
        """
        target = Path(directory) / 'tourism' / 'datasets'
        if target.exists():
            return
        download_file(target, SOURCE_URL, decompress=True)

In [None]:
# Smoke test: load the training split of every group as a TimeSeriesDataset
# (the default `return_tensor=True`) and print its tensor. The archive is
# downloaded under ./data when that folder does not hold it yet.
for group in TourismInfo.groups:
    print(group)
    tourism_dataset = Tourism.load(directory='data', group=group)
    print(tourism_dataset.ts_tensor)

Yearly
Processing dataframes ...
Creating ts tensor ...
tensor([[[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 3.6053e+04,
          3.8473e+04, 3.8421e+04],
         [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 1.0000e+00,
          1.0000e+00, 1.0000e+00]],

        [[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 2.9310e+05,
          3.2960e+05, 2.3426e+05],
         [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 1.0000e+00,
          1.0000e+00, 1.0000e+00]],

        [[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 8.1042e+02,
          9.1592e+02, 1.0107e+03],
         [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 1.0000e+00,
          1.0000e+00, 1.0000e+00]],

        ...,

        [[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 4.8070e+00,
          4.3470e+00, 4.9200e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 1.0000e+00,
          1.0000e+00, 1.0000e+00]],

        [[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 2.0900e+01,
          2.1000e+01, 2.0164e+01],
         [0.0000e+00, 0.0000e+0

In [None]:
# Consistency check: for every series of every group, the test split must
# start exactly one `freq` step after the last training timestamp.
for group in TourismInfo.class_groups:
    train_data = Tourism.load(directory='data', group=group.name,
                              training=True, return_tensor=False).Y
    valid_data = Tourism.load(directory='data', group=group.name,
                              training=False, return_tensor=False).Y

    train_end_dates = train_data.groupby('unique_id')['ds'].max()
    valid_start_dates = valid_data.groupby('unique_id')['ds'].min()
    all_dates = train_end_dates.to_frame('end').join(valid_start_dates.rename('start'))

    expected_start = all_dates['end'] + to_offset(group.freq)
    assert (expected_start == all_dates['start']).all()