In [None]:
#default_exp data.datasets.m3

# M3 dataset

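> Download the M3 forecasting competition workbook and load its Yearly, Quarterly, Monthly and Other groups as long-format time series.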

In [None]:
#export
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd

from nixtla.data.datasets.utils import download_file, Info, TimeSeriesDataclass

In [None]:
#export
# Remote location of the M3 competition workbook; M3.download passes it to
# download_file, and M3.load later reads the resulting 'M3C.xls'.
SOURCE_URL = 'https://forecasters.org/data/m3comp/M3C.xls'

M3 meta information

In [None]:
#export
@dataclass
class Yearly:
    """Metadata describing the yearly group of the M3 dataset."""
    # Seasonal period. Not referenced elsewhere in this file -- presumably
    # consumed by downstream models; verify against callers.
    seasonality: int = 1
    # Number of trailing observations per series that M3.load holds out as
    # the test split.
    horizon: int = 6
    # pandas offset alias that M3.load turns into the date frequency.
    freq: str = 'Y'
    # Sheet of M3C.xls that M3.load reads for this group.
    sheet_name: str = 'M3Year'
    # Group name; M3.load uses its first letter as the unique_id prefix.
    name: str = 'Yearly'

@dataclass
class Quarterly:
    """Metadata describing the quarterly group of the M3 dataset."""
    # Seasonal period. Not referenced elsewhere in this file -- presumably
    # consumed by downstream models; verify against callers.
    seasonality: int = 4
    # Number of trailing observations per series that M3.load holds out as
    # the test split.
    horizon: int = 8
    # pandas offset alias that M3.load turns into the date frequency.
    freq: str = 'Q'
    # Sheet of M3C.xls that M3.load reads for this group.
    sheet_name: str = 'M3Quart'
    # Group name; M3.load uses its first letter as the unique_id prefix.
    name: str = 'Quarterly'

@dataclass
class Monthly:
    """Metadata describing the monthly group of the M3 dataset."""
    # Seasonal period. Not referenced elsewhere in this file -- presumably
    # consumed by downstream models; verify against callers.
    seasonality: int = 12
    # Number of trailing observations per series that M3.load holds out as
    # the test split.
    horizon: int = 18
    # pandas offset alias that M3.load turns into the date frequency.
    freq: str = 'M'
    # Sheet of M3C.xls that M3.load reads for this group.
    sheet_name: str = 'M3Month'
    # Group name; M3.load uses its first letter as the unique_id prefix.
    name: str = 'Monthly'

@dataclass
class Other:
    """Metadata describing the 'Other' group of the M3 dataset."""
    # Seasonal period. Not referenced elsewhere in this file -- presumably
    # consumed by downstream models; verify against callers.
    seasonality: int = 1
    # Number of trailing observations per series that M3.load holds out as
    # the test split.
    horizon: int = 8
    # pandas offset alias that M3.load turns into the date frequency. These
    # series carry no real dates, so they are treated as daily and M3.load
    # forces their start year to 1970.
    freq: str = 'D'
    # Sheet of M3C.xls that M3.load reads for this group.
    sheet_name: str = 'M3Other'
    # Group name; M3.load uses its first letter as the unique_id prefix.
    name: str = 'Other'

In [None]:
#export
# Registry of the four M3 groups: `groups` lists the names accepted by
# M3.load and `class_groups` the matching metadata classes in the same order.
# M3.load resolves a name to its class through M3Info.get_group
# (presumably a positional lookup -- confirm in Info).
M3Info = Info(groups=('Yearly', 'Quarterly', 'Monthly', 'Other'),
              class_groups=(Yearly, Quarterly, Monthly, Other))

In [None]:
#exporti
def _return_year(ts):
    year = ts.iloc[0]
    year = year if year != 0 else 1970

    return year

In [None]:
#export
@dataclass
class M3(TimeSeriesDataclass):
    """M3 competition dataset: helpers to download the workbook and load one group."""

    @staticmethod
    def load(directory: str,
             group: str,
             training: bool = True) -> 'M3':
        """
        Downloads and loads M3 data.

        Parameters
        ----------
        directory: str
            Directory where data will be downloaded.
        group: str
            Group name.
            Allowed groups: 'Yearly', 'Quarterly', 'Monthly', 'Other'.
        training: bool
            Whether to return training or testing data. Default True.

        Returns
        -------
        M3
            Instance whose `Y` is a long-format frame with columns
            'unique_id', 'ds' and 'y', and whose `S` holds 'unique_id' and
            'category' for every series.

        Notes
        -----
        Every series is dated from January 1st of its starting year (1970
        when the year is 0, and always 1970 for the 'Other' group); the
        'month' column is not used. For the testing split, 'ds' is replaced
        by the 1-based step within the horizon instead of a timestamp.
        """
        path = Path(directory) / 'm3' / 'datasets'

        M3.download(directory)

        class_group = M3Info.get_group(group)

        df = pd.read_excel(path / 'M3C.xls', sheet_name=class_group.sheet_name)

        df = df.rename(columns={'Series': 'unique_id',
                                'Category': 'category',
                                'Starting Year': 'year',
                                'Starting Month': 'month'})

        # Rebuild ids as <first letter of the group name><1-based row position>.
        df['unique_id'] = [class_group.name[0] + str(i + 1) for i in range(len(df))]
        # Static features: one row per series, taken before reshaping.
        S = df.filter(items=['unique_id', 'category'])

        # The first six columns are kept as per-series metadata; every
        # remaining column is melted into one (ds, y) row per observation.
        id_vars = list(df.columns[:6])
        df = pd.melt(df, id_vars=id_vars, var_name='ds', value_name='y')
        # Rows with missing values are dropped (presumably the padding of
        # series shorter than the widest one -- verify against the workbook).
        df = df.dropna().sort_values(['unique_id', 'ds']).reset_index(drop=True)

        freq = pd.tseries.frequencies.to_offset(class_group.freq)

        if group == 'Other':
            # 'Other' series have no real dates; pin them to 1970.
            df['year'] = 1970

        # Replace the melted column label with a timestamp: one date per row
        # of each series, starting on January 1st of its starting year.
        df['ds'] = df.groupby('unique_id')['year'] \
                     .transform(lambda years: pd.date_range(f'{_return_year(years)}-01-01',
                                                            periods=years.shape[0],
                                                            freq=freq))

        df = df.filter(items=['unique_id', 'ds', 'y'])

        if training:
            # Training split: everything except the last `horizon` rows of each series.
            df = df.groupby('unique_id').apply(lambda series_df: series_df.head(-class_group.horizon)).reset_index(drop=True)
        else:
            # Testing split: the last `horizon` rows, indexed 1..horizon.
            df = df.groupby('unique_id').tail(class_group.horizon)
            df['ds'] = df.groupby('unique_id').cumcount() + 1

        return M3(Y=df, S=S, X=None, idx_categorical_static=[0], group=group)

    @staticmethod
    def download(directory: Union[str, Path]) -> None:
        """Download M3 Dataset.

        Parameters
        ----------
        directory: str or Path
            Base directory; the workbook is stored under
            '<directory>/m3/datasets'.
        """
        path = Path(directory) / 'm3' / 'datasets'
        # Guard on the workbook that `load` reads rather than on the folder:
        # an existing but empty/partial folder must not suppress the download.
        if not (path / 'M3C.xls').exists():
            download_file(path, SOURCE_URL)

Notes:

1. Some monthly series have no starting year; in that case they start in 1970.
2. The `Other` series have no actual dates. They are treated as daily series and start in 1970.

In [None]:
# Smoke test: load the training split (the default) of every M3 group and
# print the first rows of the series frame Y and the static frame S.
# M3.load calls M3.download first, so the workbook is fetched under 'data'
# when it is not already there.
for group in M3Info.groups:
    print(group)
    m3_dataset = M3.load(directory='data', group=group)
    print(m3_dataset.Y.head(), '\n', m3_dataset.S.head())

Yearly
  unique_id         ds        y
0        Y1 1975-01-01   940.66
1        Y1 1975-01-02  1084.86
2        Y1 1975-01-03  1244.98
3        Y1 1975-01-04  1445.02
4        Y1 1975-01-05  1683.17 
   unique_id      category
0        Y1  MICRO       
1        Y2  MICRO       
2        Y3  MICRO       
3        Y4  MICRO       
4        Y5  MICRO       
Quarterly
  unique_id         ds        y
0        Q1 1984-03-31  3142.63
1        Q1 1984-06-30  3190.75
2        Q1 1984-09-30  3178.69
3        Q1 1984-12-31  3170.94
4        Q1 1985-03-31  3124.38 
   unique_id      category
0        Q1  MICRO       
1        Q2  MICRO       
2        Q3  MICRO       
3        Q4  MICRO       
4        Q5  MICRO       
Monthly
  unique_id         ds       y
0        M1 1990-01-31  2640.0
1        M1 1990-02-28  2640.0
2        M1 1990-03-31  2160.0
3        M1 1990-04-30  4200.0
4        M1 1990-05-31  3360.0 
   unique_id      category
0        M1  MICRO       
1        M2  MICRO       
2        