In [None]:
#default_exp data.datasets.tourism

# Tourism dataset

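> Download the Tourism dataset and load its Yearly, Quarterly and Monthly groups as long-format data frames with `unique_id`, `ds` and `y` columns.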

In [None]:
#export
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from pandas.tseries.frequencies import to_offset

from nixtla.data.datasets.utils import download_file, Info, TimeSeriesDataclass

In [None]:
#export
# Zip archive that `Tourism.download` fetches and decompresses; `Tourism.load`
# then reads the per-group CSV files from the download directory.
SOURCE_URL = 'https://robjhyndman.com/data/27-3-Athanasopoulos1.zip'

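Tourism meta information: one dataclass per group holding its seasonality, horizon, frequency, number of header rows in the raw CSV columns, and name.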

In [None]:
#export
@dataclass
class Yearly:
    """Meta information for the 'Yearly' group of the Tourism dataset."""
    # Presumably the seasonal period in observations (1 = none) -- not read in this file; TODO confirm.
    seasonality: int = 1
    # Presumably the forecast horizon in periods -- not read in this file; TODO confirm.
    horizon: int = 4
    # Pandas offset alias; `Tourism.load` converts it with `to_offset`.
    freq: str = 'Y'
    # Number of leading rows of each CSV column that `Tourism.load` skips before
    # the observations; it reads the first two as series length and start year.
    rows: int = 2
    # Group name; `Tourism.load` lower-cases it to build the CSV file name.
    name: str = 'Yearly'

@dataclass
class Quarterly:
    """Meta information for the 'Quarterly' group of the Tourism dataset."""
    # Presumably the seasonal period in observations -- not read in this file; TODO confirm.
    seasonality: int = 4
    # Presumably the forecast horizon in periods -- not read in this file; TODO confirm.
    horizon: int = 8
    # Pandas offset alias; `Tourism.load` converts it with `to_offset`.
    freq: str = 'Q'
    # Number of leading rows of each CSV column that `Tourism.load` skips before
    # the observations; it reads them as series length, start year and the
    # number of `freq` offsets added to January 1st of the start year.
    rows: int = 3
    # Group name; `Tourism.load` lower-cases it to build the CSV file name.
    name: str = 'Quarterly'

@dataclass
class Monthly:
    """Meta information for the 'Monthly' group of the Tourism dataset."""
    # Presumably the seasonal period in observations -- not read in this file; TODO confirm.
    seasonality: int = 12
    # Presumably the forecast horizon in periods -- not read in this file; TODO confirm.
    horizon: int = 24
    # Pandas offset alias; `Tourism.load` converts it with `to_offset`.
    freq: str = 'M'
    # Number of leading rows of each CSV column that `Tourism.load` skips before
    # the observations; it reads them as series length, start year and the
    # number of `freq` offsets added to January 1st of the start year.
    rows: int = 3
    # Group name; `Tourism.load` lower-cases it to build the CSV file name.
    name: str = 'Monthly'

In [None]:
#export
# Pairs each group name with its meta-information class, in the same order.
# `Tourism.load` resolves a group name through `TourismInfo.get_group`.
TourismInfo = Info(groups=('Yearly', 'Quarterly', 'Monthly'),
                   class_groups=(Yearly, Quarterly, Monthly))

In [None]:
#export
class Tourism(TimeSeriesDataclass):
    """Tourism dataset: downloads the archive and loads one group as a long table."""

    @staticmethod
    def load(directory: str,
             group: str,
             training: bool = True) -> 'Tourism':
        """
        Downloads and loads Tourism data.

        Parameters
        ----------
        directory: str
            Directory where data will be downloaded.
        group: str
            Group name.
            Allowed groups: 'Yearly', 'Quarterly', 'Monthly'.
        training: bool
            Whether to return training or testing data. Default True.

        Returns
        -------
        Tourism
            Instance whose `Y` is a data frame with the columns
            'unique_id', 'ds' and 'y', sorted by 'unique_id' and 'ds'.
            `S` and `X` are None.
        """
        data_dir = Path(directory) / 'tourism' / 'datasets'

        Tourism.download(directory)

        class_group = TourismInfo.get_group(group)

        # '<group>_in.csv' is the training split, '<group>_oos.csv' the testing one.
        stem = class_group.name.lower()
        file_name = f'{stem}_in.csv' if training else f'{stem}_oos.csv'
        raw = pd.read_csv(data_dir / file_name)

        freq = to_offset(class_group.freq)
        header_rows = class_group.rows
        has_offset_row = group != 'Yearly'

        frames = []
        # Each CSV column is one series: header rows first, then the observations.
        for uid, column in raw.items():
            # First two header rows: number of observations and start year.
            length, year = column.iloc[:2].astype(int)
            start_date = pd.to_datetime(f'{year}-01-01')
            if has_offset_row:
                # Third header row: how many `freq` steps to move from January 1st.
                start_date += column.iloc[2].astype(int) * freq
            elif uid == 'Y18' and not training:
                # This series comes wrong in the file, so its start is shifted by hand.
                start_date += 2 * freq

            observations = column.iloc[header_rows:length + header_rows]
            frames.append(
                observations.rename('y').to_frame().assign(
                    unique_id=uid,
                    ds=pd.date_range(start_date, periods=length, freq=freq),
                )
            )

        # `filter` both drops the old positional index column and orders the columns.
        Y = (
            pd.concat(frames)
            .reset_index()
            .filter(items=['unique_id', 'ds', 'y'])
            .sort_values(['unique_id', 'ds'])
        )

        return Tourism(Y=Y, S=None, X=None, group=group)

    @staticmethod
    def download(directory: str) -> None:
        """
        Downloads Tourism Dataset.

        Nothing is fetched when `<directory>/tourism/datasets` already exists.

        Parameters
        ----------
        directory: str
            Directory where data will be downloaded.
        """
        target = Path(directory) / 'tourism' / 'datasets'
        if target.exists():
            return
        download_file(target, SOURCE_URL, decompress=True)

In [None]:
# Smoke check: load the training split (the default) of every group and
# print the first rows of its `Y` frame.
for group in TourismInfo.groups:
    print(group)
    tourism_dataset = Tourism.load(directory='data', group=group)
    print(tourism_dataset.Y.head())

Yearly
  unique_id         ds           y
0        Y1 1979-12-31  25092.2284
1        Y1 1980-12-31  24271.5134
2        Y1 1981-12-31  25828.9883
3        Y1 1982-12-31  27697.5047
4        Y1 1983-12-31  27956.2276
Quarterly
  unique_id         ds           y
0        q1 1979-03-31   3592.5500
1        q1 1979-06-30   6409.3403
2        q1 1979-09-30  10953.4928
3        q1 1979-12-31   4136.8453
4        q1 1980-03-31   3369.4600
Monthly
  unique_id         ds          y
0        m1 1979-01-31  1149.8700
1        m1 1979-02-28  1053.8002
2        m1 1979-03-31  1388.8798
3        m1 1979-04-30  1783.3702
4        m1 1979-05-31  1921.0252


In [None]:
# Consistency check between splits: for every series of every group, the first
# testing date must be exactly one period after the last training date.
for group_name, group in zip(TourismInfo.groups, TourismInfo.class_groups):
    train_data = Tourism.load(directory='data', group=group_name, training=True).Y
    valid_data = Tourism.load(directory='data', group=group_name, training=False).Y
    # Last training timestamp and first testing timestamp per series.
    train_end_dates = train_data.groupby('unique_id').ds.max()
    valid_start_dates = valid_data.groupby('unique_id').ds.min()
    # Align both on `unique_id`: 'end' comes from training, 'start' from testing.
    all_dates = train_end_dates.to_frame('end').join(valid_start_dates.rename('start'))

    assert (all_dates['end'] + to_offset(group.freq) == all_dates['start']).all()