In [None]:
#default_exp data.datasets.ett

# Electricity Transformer (ETT) dataset

> Download the ETT dataset.

In [None]:
#hide
from nbdev import *
%load_ext autoreload
%autoreload 2

In [None]:
#export
import os
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd

from nixtlats.data.datasets.utils import download_file, Info, time_features_from_frequency_str

## ETT meta information

In [None]:
#export
@dataclass
class ETTh1:
    """Metadata for the 'ETTh1' group of the ETT dataset."""
    # Sampling frequency string; 'H' is the hourly alias in pandas offset notation.
    freq: str = 'H'
    # Group name; the same string is registered in `ETTInfo.groups`.
    name: str = 'ETTh1'
    # Number of time series in the group.
    n_ts: int = 7

@dataclass
class ETTh2:
    """Metadata for the 'ETTh2' group of the ETT dataset."""
    # Sampling frequency string; 'H' is the hourly alias in pandas offset notation.
    freq: str = 'H'
    # Group name; the same string is registered in `ETTInfo.groups`.
    name: str = 'ETTh2'
    # Number of time series in the group.
    n_ts: int = 7

@dataclass
class ETTm1:
    """Metadata for the 'ETTm1' group of the ETT dataset."""
    # Sampling frequency string; '15T' is the 15-minute alias in pandas offset notation.
    freq: str = '15T'
    # Group name; the same string is registered in `ETTInfo.groups`.
    name: str = 'ETTm1'
    # Number of time series in the group.
    n_ts: int = 7

@dataclass
class ETTm2:
    """Metadata for the 'ETTm2' group of the ETT dataset."""
    # Sampling frequency string; '15T' is the 15-minute alias in pandas offset notation.
    freq: str = '15T'
    # Group name; the same string is registered in `ETTInfo.groups`.
    name: str = 'ETTm2'
    # Number of time series in the group.
    n_ts: int = 7

In [None]:
#export
# Registry pairing each group name with its metadata dataclass; the two tuples
# list the groups in the same order. Elsewhere in this notebook it is indexed
# as `ETTInfo[group]`, read via `ETTInfo.groups`, and iterated as
# `(group, meta)` pairs.
ETTInfo = Info(groups=('ETTh1', 'ETTh2', 'ETTm1', 'ETTm2'),
               class_groups=(ETTh1, ETTh2, ETTm1, ETTm2))

In [None]:
#export
def process_multiple_ts(y_df: pd.DataFrame, freq: str = 'h') -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Transforms multiple timeseries as columns to long format.

    Parameters
    ----------
    y_df: pd.DataFrame
        Wide frame with a `date` column and one column per time series.
        The frame passed in is left untouched.
    freq: str
        Frequency string handed to `time_features_from_frequency_str`
        to pick the calendar features. Defaults to 'h', the value that
        was previously hard-coded.

    Returns
    -------
    y_df: pd.DataFrame
        Long frame with columns `unique_id`, `ds` and `y`, sorted by
        `unique_id` then `ds`. `unique_id` is categorical with its
        categories in the order of the input columns.
    X_df: pd.DataFrame
        Time features left-joined on `ds` onto the `unique_id`/`ds`
        pairs of the long frame.

    Raises
    ------
    KeyError
        If `y_df` has no `date` column.
    """
    # Build a new frame instead of editing the argument: the caller's
    # DataFrame keeps its `date` column and gains no feature columns.
    wide_df = (y_df
               .assign(date=pd.to_datetime(y_df['date']))
               .rename(columns={'date': 'ds'}))

    # Every column other than the timestamp is one series identifier.
    # Collected before the feature columns are appended below.
    u_ids = [col for col in wide_df.columns if col != 'ds']

    # Each time feature is a callable object; its class name becomes the
    # name of the column that stores its values.
    for feature in time_features_from_frequency_str(freq):
        wide_df[feature.__class__.__name__] = feature(wide_df['ds'].dt)

    # Exogenous part: timestamp + time features (series columns removed).
    X_df = wide_df.drop(u_ids, axis=1)

    # Target part: stack the series columns into (ds, unique_id, y) rows.
    long_df = wide_df.filter(items=['ds'] + u_ids).set_index('ds').stack()
    long_df = long_df.rename('y').rename_axis(['ds', 'unique_id']).reset_index()
    # Explicit categories keep the series in input-column order when sorting.
    long_df['unique_id'] = pd.Categorical(long_df['unique_id'], u_ids)
    long_df = long_df[['unique_id', 'ds', 'y']].sort_values(['unique_id', 'ds'])

    # Repeat the per-timestamp features once for every series.
    X_df = long_df[['unique_id', 'ds']].merge(X_df, how='left', on=['ds'])

    return long_df, X_df

## Download data class

In [None]:
#export
@dataclass
class ETT:
    """Downloads the ETT CSV files and loads them as long-format frames."""

    # Base URL hosting one `{group}.csv` file per group.
    source_url: str = 'https://raw.githubusercontent.com/zhouhaoyi/ETDataset/main/ETT-small/'

    @staticmethod
    def load(directory: str,
             group: str,
             cache: bool = True) -> Tuple[pd.DataFrame,
                                          Optional[pd.DataFrame],
                                          Optional[pd.DataFrame]]:
        """Downloads and loads ETT data.

        Parameters
        ----------
        directory: str
            Directory where data will be downloaded.
        group: str
            Group name.
            Allowed groups: 'ETTh1', 'ETTh2',
                            'ETTm1', 'ETTm2'.
        cache: bool
            If `True`, loads the processed frames from
            `{directory}/ett/datasets/{group}.p` when that pickle exists,
            and writes it after processing otherwise.
            If `False`, the pickle is neither read nor written.

        Returns
        -------
        y_df: pd.DataFrame
            Target series in long format (`unique_id`, `ds`, `y`).
        X_df: pd.DataFrame
            Time features keyed by `unique_id` and `ds`.
        S_df: None
            Always `None`; no static features are produced here.

        Notes
        -----
        [1] Returns train+val+test sets.
        """
        path = f'{directory}/ett/datasets'
        file_cache = f'{path}/{group}.p'

        if cache and os.path.exists(file_cache):
            # NOTE(security): unpickling can execute arbitrary code. This is
            # only safe because the file is a cache written by this method;
            # do not point `directory` at untrusted content.
            y_df, X_df, S_df = pd.read_pickle(file_cache)
            return y_df, X_df, S_df

        # NOTE(review): the looked-up metadata is not used. The lookup is kept
        # because it presumably rejects unknown group names -- confirm against
        # `Info.__getitem__`. It runs before the download so that a bad name
        # does not trigger network traffic first.
        ETTInfo[group]

        ETT.download(directory)

        y_df = pd.read_csv(f'{path}/{group}.csv')
        y_df, X_df = process_multiple_ts(y_df)

        S_df = None
        if cache:
            pd.to_pickle((y_df, X_df, S_df), file_cache)

        return y_df, X_df, S_df

    @staticmethod
    def download(directory: str) -> None:
        """Download ETT Dataset.

        Fetches `{group}.csv` for every group in `ETTInfo.groups` into
        `{directory}/ett/datasets/`, skipping files that are already there.

        Parameters
        ----------
        directory: str
            Directory where data will be downloaded.
        """
        path = f'{directory}/ett/datasets/'
        # `source_url` ends with '/'; strip it so the joined URL has a single
        # separator instead of '//'.
        base_url = ETT.source_url.rstrip('/')

        for group in ETTInfo.groups:
            # Check each file rather than the directory: an interrupted or
            # partial download is then completed on the next call instead of
            # leaving `load` to fail on a missing CSV.
            if not os.path.exists(f'{path}{group}.csv'):
                download_file(path, f'{base_url}/{group}.csv')

In [None]:
# Smoke check: load every registered group without the cache and print the
# number of series plus the names of the exogenous columns that came back.
for group, meta in ETTInfo:
    y_df, x_df, s_df = ETT.load(directory='data', group=group, cache=False)
    n_series = len(np.unique(y_df.unique_id.values))

    # Exogenous variables are all columns of x_df except the two key columns.
    ex_vars = x_df.columns.to_list()
    for key_col in ('unique_id', 'ds'):
        ex_vars.remove(key_col)

    display_str = ''.join([
        f'Group: {group} ',
        f'n_series: {n_series} ',
        f'ex_vars: {", ".join(ex_vars)}',
    ])

    print(display_str)

2.59MiB [00:00, 2.89MiB/s]
ERROR:nixtlats.data.datasets.utils:ERROR, something went wrong downloading data
INFO:nixtlats.data.datasets.utils:Successfully downloaded ETTh1.csv, 2589657, bytes.
2.42MiB [00:00, 6.61MiB/s]
ERROR:nixtlats.data.datasets.utils:ERROR, something went wrong downloading data
INFO:nixtlats.data.datasets.utils:Successfully downloaded ETTh2.csv, 2417960, bytes.
10.4MiB [00:00, 19.5MiB/s]
ERROR:nixtlats.data.datasets.utils:ERROR, something went wrong downloading data
INFO:nixtlats.data.datasets.utils:Successfully downloaded ETTm1.csv, 10360719, bytes.
9.68MiB [00:00, 37.3MiB/s]
ERROR:nixtlats.data.datasets.utils:ERROR, something went wrong downloading data
INFO:nixtlats.data.datasets.utils:Successfully downloaded ETTm2.csv, 9677236, bytes.


Group: ETTh1 n_series: 7 ex_vars: HourOfDay, DayOfWeek, DayOfMonth, DayOfYear
Group: ETTh2 n_series: 7 ex_vars: HourOfDay, DayOfWeek, DayOfMonth, DayOfYear
Group: ETTm1 n_series: 7 ex_vars: HourOfDay, DayOfWeek, DayOfMonth, DayOfYear
Group: ETTm2 n_series: 7 ex_vars: HourOfDay, DayOfWeek, DayOfMonth, DayOfYear


In [None]:
def test_group(group: str, expected_first_ds_y: np.ndarray,
               expected_first_ds_x: np.ndarray) -> None:
    """Checks the first row of every series in `group` against known values.

    Parameters
    ----------
    group: str
        Group name passed to `ETT.load`.
    expected_first_ds_y: np.ndarray
        Expected first `y` value of each series, one entry per series.
    expected_first_ds_x: np.ndarray
        Expected exogenous values of the first row; the same vector is
        expected for every series.

    Raises
    ------
    AssertionError
        If either comparison differs beyond the default tolerance of
        `np.testing.assert_array_almost_equal`.
    """
    target_df, exog_df, _ = ETT.load(directory='data', group=group, cache=False)

    # First observation of each series.
    first_targets = target_df.groupby('unique_id').head(1)['y'].values

    # First exogenous row of each series, with the key columns removed.
    first_exog_rows = exog_df.groupby('unique_id').head(1)
    first_exog = first_exog_rows.drop(['unique_id', 'ds'], axis=1).values

    # One copy of the expected feature vector per series.
    n_rows = first_exog.shape[0]
    expected_exog = np.tile(expected_first_ds_x.reshape(1, -1), (n_rows, 1))

    np.testing.assert_array_almost_equal(first_targets, expected_first_ds_y)
    np.testing.assert_array_almost_equal(first_exog, expected_exog)

In [None]:
# ETTh1: expected first `y` value of each series and the time-feature values of the first row.
test_group(group='ETTh1', 
           expected_first_ds_y=np.array([ 5.82700014,  2.00900006,  1.59899998,  0.46200001,  4.20300007, 1.34000003, 30.53100014]),
           expected_first_ds_x=np.array([-0.5       ,  0.16666667, -0.5       , -0.00136986]))

In [None]:
# ETTh2: expected first `y` value of each series and the time-feature values of the first row.
test_group(group='ETTh2', 
           expected_first_ds_y=np.array([41.13000107, 12.48099995, 36.5359993 ,  9.35499954,  4.42399979, 1.31099999, 38.66199875]),
           expected_first_ds_x=np.array([-0.5       ,  0.16666667, -0.5       , -0.00136986]))

In [None]:
# ETTm1: expected first `y` value of each series and the time-feature values of the first row.
# The expected values are the same as those used for ETTh1.
test_group(group='ETTm1', 
           expected_first_ds_y=np.array([ 5.82700014,  2.00900006,  1.59899998,  0.46200001,  4.20300007, 1.34000003, 30.53100014]),
           expected_first_ds_x=np.array([-0.5       ,  0.16666667, -0.5       , -0.00136986]))

In [None]:
# ETTm2: expected first `y` value of each series and the time-feature values of the first row.
# The expected values are the same as those used for ETTh2.
test_group(group='ETTm2', 
           expected_first_ds_y=np.array([41.13000107, 12.48099995, 36.5359993 ,  9.35499954,  4.42399979, 1.31099999, 38.66199875]),
           expected_first_ds_x=np.array([-0.5       ,  0.16666667, -0.5       , -0.00136986]))