In [None]:
#default_exp data.datasets.wth

# Weather (WTH) dataset

> Download the WTH dataset: https://www.ncei.noaa.gov/data/local-climatological-data/.

In [None]:
#hide
from nbdev import *
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#export
import os
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union

import gdown
import numpy as np
import pandas as pd

from nixtlats.data.datasets.utils import Info, time_features_from_frequency_str
from nixtlats.data.datasets.ett import process_multiple_ts

## WTH meta information

In [None]:
#export
@dataclass
class WTH:
    """Meta information describing the WTH dataset group.

    NOTE(review): the module-level name `WTH` is bound a second time further
    down in this file, by the download/loader class of the same name. When
    the cells run top to bottom, `WTHInfo` is created in between and so keeps
    its reference to this meta class, while later uses of `WTH` resolve to
    the loader.
    """
    freq: str = 'H'    # frequency string; presumably the pandas hourly alias -- confirm
    name: str = 'WTH'  # group name; same string as the `groups` entry given to `WTHInfo`
    n_ts: int = 12     # presumably the number of series in the dataset -- confirm

In [None]:
#export
# Group registry for this dataset: a single group named 'WTH', paired with the
# meta dataclass `WTH` as it is bound at this point in the file.
WTHInfo = Info(
    groups=('WTH',),
    class_groups=(WTH,),
)

## Download data class

In [None]:
#export
@dataclass
class WTH:
    """Downloader / loader for the Weather (WTH) dataset.

    Everything is exposed through static methods, so the class does not need
    to be instantiated.
    """

    # Google Drive location of the raw `WTH.csv` file, fetched with `gdown`.
    source_url: str = 'https://drive.google.com/uc?id=1UBRz-aM_57i_KCC-iaSWoKDPTGGv6EaG'

    @staticmethod
    def load(directory: str,
             cache: bool = True) -> Tuple[pd.DataFrame,
                                          Optional[pd.DataFrame],
                                          Optional[pd.DataFrame]]:
        """Downloads and loads WTH data.

        Parameters
        ----------
        directory: str
            Directory where data will be downloaded.
        cache: bool
            If `True`, loads the processed data from
            `<directory>/wth/datasets/WTH.p` when that file exists, and
            otherwise saves the processed data there after building it.

        Returns
        -------
        Tuple of `(y_df, X_df, S_df)`: the two frames produced by
        `process_multiple_ts` from the raw csv, and `S_df`, which is
        always `None` when the data is processed here.

        Notes
        -----
        [1] Returns train+val+test sets.
        """
        path = f'{directory}/wth/datasets'
        file_cache = f'{path}/WTH.p'

        if cache and os.path.exists(file_cache):
            # NOTE: unpickling executes arbitrary code; the cache file is
            # expected to be one written by this method, not an external file.
            y_df, X_df, S_df = pd.read_pickle(file_cache)
            return y_df, X_df, S_df

        # Makes sure the raw csv is present (no-op when already downloaded).
        WTH.download(directory)

        y_df = pd.read_csv(f'{path}/WTH.csv')
        y_df, X_df = process_multiple_ts(y_df)

        S_df = None
        if cache:
            pd.to_pickle((y_df, X_df, S_df), file_cache)

        return y_df, X_df, S_df

    @staticmethod
    def download(directory: str) -> None:
        """Download WTH Dataset.

        The raw file is stored at `<directory>/wth/datasets/WTH.csv`. The
        download is skipped only when that file already exists, so an
        existing but empty target directory still triggers the download.

        Parameters
        ----------
        directory: str
            Directory where data will be downloaded.
        """
        path = f'{directory}/wth/datasets'
        file_path = f'{path}/WTH.csv'

        # Test for the file, not the directory: the directory may be left
        # behind by an interrupted download or created by something else.
        if os.path.exists(file_path):
            return

        os.makedirs(path, exist_ok=True)
        gdown.download(WTH.source_url, file_path)

In [None]:
# Smoke check: load the dataset once per registered group and print a summary.
for group, meta in WTHInfo:
    y_df, x_df, s_df = WTH.load(directory='data', cache=False)
    n_series = len(np.unique(y_df.unique_id.values))

    # Exogenous variables are every column of x_df except the two key columns.
    ex_vars = x_df.columns.to_list()
    for key_col in ('unique_id', 'ds'):
        ex_vars.remove(key_col)

    display_str = (
        f'Group: {group} '
        f'n_series: {n_series} '
        f'ex_vars: {", ".join(ex_vars)}'
    )

    print(display_str)

Group: WTH n_series: 12 ex_vars: HourOfDay, DayOfWeek, DayOfMonth, DayOfYear


In [None]:
def test_wth(expected_first_ds_y: np.ndarray,
             expected_first_ds_x: np.ndarray) -> None:
    y_df, x_df, _ = WTH.load(directory='data', cache=False)
    first_ds_y = y_df.groupby('unique_id').head(1)['y'].values
    first_ds_x = x_df.groupby('unique_id').head(1).drop(['unique_id', 'ds'], axis=1).values
    expected_x = np.repeat(expected_first_ds_x.reshape(1, -1), first_ds_x.shape[0], axis=0)
    
    np.testing.assert_array_almost_equal(first_ds_y, expected_first_ds_y)
    np.testing.assert_array_almost_equal(first_ds_x, expected_x)

In [None]:
# Regression check: first `y` value of each series, and the first row of
# exogenous values, which test_wth compares against every series.
expected_first_y = np.array([10., 16., -9., 13., 7., -14., 67., 7., 130., 21.65, 30.35, -10.3])
expected_first_x = np.array([-0.5, 0.16666667, -0.5, -0.5])

test_wth(expected_first_ds_y=expected_first_y,
         expected_first_ds_x=expected_first_x)