In [2]:
import os
from collections.abc import Generator
from pathlib import Path
from typing import Any

import datasets
import pandas as pd
from datasets import Features, Sequence, Value

# 1. Prepare a univariate dataset for pre-training/fine-tuning
In this example, we will see how to use the Hugging Face ```datasets``` library to prepare your custom datasets for use with ```uni2ts```.

First, we load our data, which comes in the form of a wide dataframe. Here, each column represents a _univariate_ time series.

In [14]:
# Fetch the wide example CSV: first column is the datetime index, every other
# column is one time series.
url_wide = "".join(
    [
        "https://gist.githubusercontent.com/rsnirwan/c8c8654a98350fadd229b00167174ec4",
        "/raw/a42101c7786d4bc7695228a0f2c8cea41340e18f/ts_wide.csv",
    ]
)
df = pd.read_csv(url_wide, parse_dates=True, index_col=0)

df.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
2021-01-01 00:00:00,-1.3378,0.1268,-0.3645,-1.0864,-2.3803,-0.2447,2.2647,-0.7917,0.7071,1.3763
2021-01-01 01:00:00,-1.6111,0.0926,-0.1364,-1.1613,-2.1421,-0.3477,2.4262,-0.9609,0.6413,1.275
2021-01-01 02:00:00,-1.9259,-0.142,0.1063,-1.0405,-2.1426,-0.3271,2.4434,-0.9034,0.4323,0.6767
2021-01-01 03:00:00,-1.9184,-0.493,0.6269,-0.8531,-1.706,-0.3088,2.4307,-0.9602,0.3193,0.515
2021-01-01 04:00:00,-1.9168,-0.5057,0.9419,-0.7666,-1.4287,-0.4284,2.3258,-1.2504,0.366,0.1708


In [4]:
from data_loader import read_parquet_hive

# Cell 2: Parameters
# Root of the OHLCV data lake. The default is a machine-specific path, so it can
# be overridden with the OHLCV_LAKE_ROOT environment variable instead of editing
# the notebook.
LAKE_ROOT = Path(os.environ.get("OHLCV_LAKE_ROOT", "/home/dev/data/ohlcv"))

asset_class = "crypto"   # one of: crypto, fx, index, etf, equity, futures
symbol = "ETH"           # e.g., ETH, BTC, USDJPY, SPY, AAPL
freq = "15min"           # one of: 1min, 15min, 1h, 4h, 1d

# Optional partition narrowing
year = 2025              # set to None to read all years
month = 7                # set to None to read all months

# Optional lazy-scan date filter window (UTC)
# NOTE(review): this window (2024-01-01 .. 2024-04-01) does not overlap the
# year/month partition selected above (2025-07) — confirm which one is intended
# before combining them in a single read.
date_start = pd.Timestamp("2024-01-01", tz="UTC")
date_end   = pd.Timestamp("2024-04-01", tz="UTC")


In [6]:
# Cell 4: Load a partition and sort
# The result gets its own name rather than `df`: the generator functions defined
# further down read the pandas frame `df` through .iloc / .index / .columns, and
# rebinding `df` here breaks them on a fresh Restart & Run All.
# NOTE(review): the .sort("ts") call and the displayed dtypes suggest this is a
# polars DataFrame — confirm against data_loader.read_parquet_hive.
ohlcv_df = read_parquet_hive(
    LAKE_ROOT,
    asset_class=asset_class,
    symbol=symbol,
    freq=freq,
    year=year,
    month=month,
).sort("ts")

ohlcv_df.head()


ts,open,high,low,close,volume,asset_class,symbol
"datetime[μs, UTC]",f64,f64,f64,f64,f64,str,str
2025-07-01 00:00:00 UTC,2486.16,2496.4,2485.38,2493.36,1151.93403,"""crypto""","""ETH"""
2025-07-01 00:15:00 UTC,2493.43,2498.5,2485.0,2485.15,650.784791,"""crypto""","""ETH"""
2025-07-01 00:30:00 UTC,2485.25,2490.5,2477.72,2489.4,1360.341981,"""crypto""","""ETH"""
2025-07-01 00:45:00 UTC,2489.47,2496.2,2489.3,2493.94,592.136844,"""crypto""","""ETH"""
2025-07-01 01:00:00 UTC,2493.89,2499.9,2491.0,2499.36,685.831934,"""crypto""","""ETH"""


### Method 1: Example generator function
1. Create an example generator function, i.e. a function which yields each individual time series. Each time series consists of:
    1. target: target time series that should be predicted
    2. start: timestamp of the first time step
    3. freq: frequency str of time series
    4. item_id: identifier 
    5. (optional) past_feat_dynamic_real: time series for which only the context values are known
    6. (optional) feat_dynamic_real: time series for which the context and prediction values are known
2. Define the schema for the features to ensure the datasets library saves the correct data types.
3. Write the data to disk using the ```from_generator``` function.

In [11]:
def example_gen_func() -> Generator[dict[str, Any], None, None]:
    """Yield one univariate example per column of the module-level ``df``.

    Each yielded dict has the keys:
        target: the column's values as a 1-D array of shape (time,)
        start: the first entry of ``df.index``
        freq: the frequency string inferred from ``df.index``
        item_id: ``"item_<column position>"``

    Yields:
        dict[str, Any]: one example per column, in column order.
    """
    n_series = len(df.columns)
    if n_series == 0:
        # No columns means no examples; return before touching the index.
        return

    # start and freq are the same for every column, so look them up once
    # instead of re-running pd.infer_freq on each iteration.
    series_start = df.index[0]
    series_freq = pd.infer_freq(df.index)

    for i in range(n_series):
        yield {
            "target": df.iloc[:, i].to_numpy(),  # array of shape (time,)
            "start": series_start,
            "freq": series_freq,
            "item_id": f"item_{i}",
        }

In [12]:
# Explicit schema so the datasets library stores each field with the intended type.
features = Features(
    {
        "target": Sequence(Value("float32")),
        "start": Value("timestamp[s]"),
        "freq": Value("string"),
        "item_id": Value("string"),
    }
)

In [13]:
# Build the dataset from the generator, then persist it to disk.
hf_dataset = datasets.Dataset.from_generator(
    generator=example_gen_func,
    features=features,
)
hf_dataset.save_to_disk(Path("example_dataset_1"))

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

### Method 2: Sharded example generator function
For larger datasets, the Hugging Face ```datasets``` library can use multiprocessing to speed up the generation of examples. Since the ```from_generator``` function takes as input a generator function which iterates through every example, naively using it with multiprocessing does not lead to any speed-up. Instead, we need to provide a _sharded_ generator function, which is able to index into specific examples based on its inputs. See the following example for a simple recipe:

In [16]:
def sharded_example_gen_func(
    examples: list[int],
) -> Generator[dict[str, Any], None, None]:
    """Yield univariate examples for the requested column positions of ``df``.

    Args:
        examples: column positions (into the module-level ``df``) to emit.

    Yields:
        dict[str, Any]: one example per entry of ``examples``, in the given
        order, with keys ``target`` (the column's values as a 1-D array),
        ``start`` (first entry of ``df.index``), ``freq`` (frequency string
        inferred from ``df.index``) and ``item_id`` (``"item_<position>"``).
    """
    if len(examples) == 0:
        # Nothing requested; return before touching the index.
        return

    # start and freq do not depend on the column, so look them up once
    # instead of re-running pd.infer_freq on each iteration.
    series_start = df.index[0]
    series_freq = pd.infer_freq(df.index)

    for i in examples:
        yield {
            "target": df.iloc[:, i].to_numpy(),
            "start": series_start,
            "freq": series_freq,
            "item_id": f"item_{i}",
        }

In [17]:
# Schema for the sharded univariate examples: a float32 series plus its metadata.
features = Features(
    {
        "target": Sequence(Value("float32")),
        "start": Value("timestamp[s]"),
        "freq": Value("string"),
        "item_id": Value("string"),
    }
)

In [18]:
# Pass every column position as `examples` and generate with two worker processes.
hf_dataset = datasets.Dataset.from_generator(
    generator=sharded_example_gen_func,
    features=features,
    gen_kwargs={"examples": list(range(len(df.columns)))},
    num_proc=2,
)
hf_dataset.save_to_disk(Path("example_dataset_2"))

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

# 2. Prepare a multivariate dataset for pre-training/fine-tuning
Finally, we can also prepare _multivariate_ time series:

In [19]:
# Load the wide CSV again; its columns are used together as the variates of
# one multivariate series below.
url_wide = (
    "https://gist.githubusercontent.com/rsnirwan/c8c8654a98350fadd229b00167174ec4"
    + "/raw/a42101c7786d4bc7695228a0f2c8cea41340e18f/ts_wide.csv"
)
df = pd.read_csv(
    url_wide,
    index_col=0,
    parse_dates=True,
)

df.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
2021-01-01 00:00:00,-1.3378,0.1268,-0.3645,-1.0864,-2.3803,-0.2447,2.2647,-0.7917,0.7071,1.3763
2021-01-01 01:00:00,-1.6111,0.0926,-0.1364,-1.1613,-2.1421,-0.3477,2.4262,-0.9609,0.6413,1.275
2021-01-01 02:00:00,-1.9259,-0.142,0.1063,-1.0405,-2.1426,-0.3271,2.4434,-0.9034,0.4323,0.6767
2021-01-01 03:00:00,-1.9184,-0.493,0.6269,-0.8531,-1.706,-0.3088,2.4307,-0.9602,0.3193,0.515
2021-01-01 04:00:00,-1.9168,-0.5057,0.9419,-0.7666,-1.4287,-0.4284,2.3258,-1.2504,0.366,0.1708


In [20]:
def multivar_example_gen_func() -> Generator[dict[str, Any], None, None]:
    """Yield a single multivariate example built from all columns of ``df``.

    Yields:
        dict[str, Any]: one example whose ``target`` is the frame's values
        transposed to shape (var, time), with ``start`` taken from the first
        index entry, ``freq`` inferred from the index, and ``item_id`` fixed
        to ``"item_0"``.
    """
    values = df.to_numpy()
    timeline = df.index
    example = {
        "target": values.transpose(),  # (time, var) -> (var, time)
        "start": timeline[0],
        "freq": pd.infer_freq(timeline),
        "item_id": "item_0",
    }
    yield example

In [21]:
# Multivariate schema: the target is stored as (var, time), i.e. an outer
# sequence of fixed length (one entry per column of df) holding float32 series.
features = Features(
    {
        "target": Sequence(
            Sequence(Value("float32")), length=len(df.columns)
        ),
        "start": Value("timestamp[s]"),
        "freq": Value("string"),
        "item_id": Value("string"),
    }
)

In [22]:
# Build the one-example multivariate dataset and persist it to disk.
hf_dataset = datasets.Dataset.from_generator(
    generator=multivar_example_gen_func,
    features=features,
)
hf_dataset.save_to_disk("example_dataset_multi")

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

# 3. Inspecting the processed data
Let's inspect the processed datasets to ensure that our data has been processed correctly.

In [23]:
# Reload the three saved datasets, each with the "numpy" output format.
ds1, ds2, ds_multi = (
    datasets.load_from_disk(saved_path).with_format("numpy")
    for saved_path in (
        "example_dataset_1",
        "example_dataset_2",
        "example_dataset_multi",
    )
)

```example_dataset_1``` and ```example_dataset_2``` are univariate datasets, which should have 10 time series each, and ```example_dataset_multi``` should be a single multivariate time series (with 10 variates). 

In [24]:
# Number of examples stored in each dataset.
tuple(len(ds) for ds in (ds1, ds2, ds_multi))

(10, 10, 1)

Inspecting the features returned when we index into a time series from the dataset...

In [25]:
# Field names of the first example in each dataset.
tuple(ds[0].keys() for ds in (ds1, ds2, ds_multi))

(dict_keys(['target', 'start', 'freq', 'item_id']),
 dict_keys(['target', 'start', 'freq', 'item_id']),
 dict_keys(['target', 'start', 'freq', 'item_id']))

We should get 2 univariate and 1 multivariate target time series...

In [26]:
# Shape of the first example's target array in each dataset.
tuple(ds[0]["target"].shape for ds in (ds1, ds2, ds_multi))

((240,), (240,), (10, 240))

In [None]:
# Field names of the first example in the sharded dataset.
ds2[0].keys()

dict_keys(['target', 'start', 'freq', 'item_id'])

In [38]:
# Indexing by column name returns that field for every example in the dataset.
ds2['item_id']

array(['item_0', 'item_1', 'item_2', 'item_3', 'item_4', 'item_5',
       'item_6', 'item_7', 'item_8', 'item_9'], dtype='<U6')

In [28]:
# The first (and only) example of the multivariate dataset, with all its fields.
ds_multi[0]

{'target': array([[-1.3378, -1.6111, -1.9259, ..., -0.1209, -0.5072, -0.6661],
        [ 0.1268,  0.0926, -0.142 , ...,  0.5297,  0.5816,  0.3736],
        [-0.3645, -0.1364,  0.1063, ..., -0.7084, -0.5735, -0.5547],
        ...,
        [-0.7917, -0.9609, -0.9034, ..., -0.6366, -0.5672, -0.913 ],
        [ 0.7071,  0.6413,  0.4323, ...,  1.1286,  1.0786,  0.9624],
        [ 1.3763,  1.275 ,  0.6767, ...,  1.1485,  1.3248,  1.1657]],
       dtype=float32),
 'start': array('2021-01-01T00:00:00', dtype='datetime64[s]'),
 'freq': 'H',
 'item_id': 'item_0'}