In [4]:
from collections.abc import Generator
from pathlib import Path
from typing import Any

import datasets
import pandas as pd
from datasets import Features, Sequence, Value

In [5]:
# Helper: turn a compact 'YYYYMM' string into the Timestamp of that month's first day
def convert_date(date_string):
    """
    Build the pandas Timestamp for the first day of a 'YYYYMM' period.

    Leading and trailing whitespace is ignored. The first four characters
    are read as the year and everything after them as the month.

    Parameters:
    - date_string: str, date in 'YYYYMM' format

    Returns:
    - pd.Timestamp set to day 1 of the given year and month
    """
    cleaned = date_string.strip()
    year_part, month_part = cleaned[:4], cleaned[4:]
    return pd.Timestamp(year=int(year_part), month=int(month_part), day=1)

# 1. Prepare a univariate dataset for pre-training/fine-tuning

In [27]:
def example_gen_func(df, item_id: str = "item_0") -> Generator[dict[str, Any], None, None]:
    """
    Yield one record describing the univariate series held in `df`.

    Parameters:
    - df: DataFrame with an "m3" column (the target values) and a
      "timestamp" column. The first timestamp row is used as the series
      start, so rows are presumably expected in chronological order --
      TODO confirm with callers.
    - item_id: identifier stored with the record; defaults to "item_0",
      the value that was previously hard-coded.

    Yields:
    - dict with keys "target" (numpy array of the "m3" values), "start"
      (first timestamp), "freq" (result of pd.infer_freq on the timestamps)
      and "item_id".
    """
    # The whole frame used to be printed here on every call; that debugging
    # output has been removed so building datasets no longer floods stdout.
    yield {
        "target": df["m3"].to_numpy(),
        "start": df["timestamp"].iloc[0],
        "freq": pd.infer_freq(df["timestamp"]),
        "item_id": item_id,
    }

In [28]:
# Schema for the records produced by example_gen_func: one float32 sequence
# plus the start timestamp, frequency string and item identifier.
features = Features(
    {
        "target": Sequence(Value("float32")),
        "start": Value("timestamp[s]"),
        "freq": Value("string"),
        "item_id": Value("string"),
    }
)

In [29]:
# Number of trailing rows dropped from every series before it is saved.
TRAIN_HOLDOUT_STEPS = 12
# pd.infer_freq (called inside example_gen_func) raises ValueError when given
# fewer than three timestamps, so shorter series cannot be converted.
TRAIN_MIN_SERIES_LENGTH = 3

# Load the combined dataset (semicolon-separated CSV)
all_data = pd.read_csv('../database/combined_data.csv', sep=";")

# Map every state to the list of products (derivatives) that occur for it
state_derivative_dict = {
    state: list(all_data.loc[all_data['state'] == state, 'product'].unique())
    for state in all_data['state'].unique()
}

# Build and save one dataset per (state, derivative) pair
for state, derivatives in state_derivative_dict.items():
    for derivative in derivatives:
        print(f"========== State: {state}, derivative: {derivative} ==========")

        # Select the rows of the current state and derivative
        data_filtered = all_data[(all_data['state'] == state) & (all_data['product'] == derivative)].copy()
        data_filtered["timestamp"] = pd.to_datetime(data_filtered["timestamp"])

        # Order chronologically before truncating: both the positional
        # hold-out below and the "start" field (first row) depend on row
        # order, which the CSV does not guarantee. A stable sort keeps the
        # file order for equal timestamps.
        data_filtered = data_filtered.sort_values("timestamp", kind="stable")
        data_filtered = data_filtered.iloc[:-TRAIN_HOLDOUT_STEPS]

        # Skip series that are too short instead of aborting the whole loop
        if len(data_filtered) < TRAIN_MIN_SERIES_LENGTH:
            print(
                f"Skipping {state}/{derivative}: only {len(data_filtered)} rows left "
                f"after holding out {TRAIN_HOLDOUT_STEPS}"
            )
            continue

        # Hand the frame over through gen_kwargs rather than a lambda that
        # closes over the loop variable, so the data each dataset is built
        # from is an explicit argument of the generator.
        hf_dataset = datasets.Dataset.from_generator(
            example_gen_func,
            gen_kwargs={"df": data_filtered},
            features=features,
        )
        hf_dataset.save_to_disk(Path(f"dataset_individual/dataset_individual_{state}_{derivative}"))



Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 642.02 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 755.19 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 763.16 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 707.66 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 778.45 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 721.79 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 693.27 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 733.27 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 632.82 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 730.08 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 696.03 examples/s]






Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 693.04 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 660.10 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 658.34 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 713.20 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 566.87 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 781.94 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 809.87 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 753.83 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 728.30 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 775.72 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 664.92 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 715.75 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 742.35 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 768.75 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 719.56 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 741.70 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 784.42 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 788.40 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 785.89 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 709.82 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 766.36 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 785.74 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 806.60 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 802.89 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 706.11 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 784.86 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 853.89 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 678.91 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 767.63 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 649.17 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 782.08 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 779.18 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 790.63 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 793.47 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 779.47 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 817.92 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 793.62 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 863.91 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 777.44 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 726.92 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 806.13 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 736.36 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 723.53 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 755.32 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 694.08 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 733.65 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 690.08 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 776.72 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 719.19 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 698.58 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 787.96 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 732.76 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 729.32 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 705.87 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 704.33 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 686.24 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 741.57 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 777.59 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 771.86 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 717.34 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 703.51 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 678.69 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 659.79 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 783.98 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 702.92 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 813.16 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 638.69 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 840.71 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 700.45 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 742.35 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 822.41 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 842.91 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 838.19 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 833.86 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 766.64 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 868.93 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 849.57 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 854.41 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 867.67 examples/s] 




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 811.28 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 836.19 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 824.19 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 858.43 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 854.06 examples/s] 




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 775.00 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 819.52 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 793.02 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 840.54 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 789.44 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 725.91 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 792.72 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 716.85 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 754.10 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 799.37 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 779.76 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 827.77 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 830.23 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 768.33 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 816.97 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 786.33 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 812.85 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 821.61 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 822.57 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 806.13 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 788.85 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 827.44 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 880.42 examples/s] 




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 765.52 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 806.13 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 777.73 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 842.23 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 817.60 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 823.22 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 714.17 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 741.31 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 860.37 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 797.55 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 801.82 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 833.53 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 688.95 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 793.17 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 727.42 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 748.18 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 700.22 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 672.38 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 805.98 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 755.87 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 817.28 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 828.91 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 729.19 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 768.33 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 797.55 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 770.59 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 775.29 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 779.03 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 805.51 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 788.25 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 794.83 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 807.22 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 713.44 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 773.43 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 704.33 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 822.09 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 858.78 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 731.73 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 818.40 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 818.88 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 827.61 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 646.57 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 745.65 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 843.08 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 753.29 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 729.70 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 761.35 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 702.21 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 805.05 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 795.88 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 814.11 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 822.57 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 817.60 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 817.13 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 768.19 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 768.05 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 818.72 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 733.53 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 839.87 examples/s] 




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 759.42 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 766.78 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 770.30 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 721.66 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 781.06 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 774.43 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 780.34 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 753.83 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 728.81 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 749.92 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 749.12 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 784.13 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 763.43 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 693.16 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 774.86 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 772.86 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 744.07 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 749.79 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 726.41 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 757.64 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 828.75 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 857.56 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 747.12 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 726.54 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 782.37 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 740.13 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 770.30 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 763.43 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 728.56 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 769.03 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 755.73 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 764.69 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 766.64 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 742.35 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 779.61 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 782.52 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 841.72 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 834.19 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 688.49 examples/s]


## Validation

In [9]:
# Number of trailing rows dropped from every series before it is saved.
VAL_HOLDOUT_STEPS = 24
# pd.infer_freq (called inside example_gen_func) raises ValueError when given
# fewer than three timestamps, so shorter series cannot be converted.
VAL_MIN_SERIES_LENGTH = 3

# Load the combined dataset (semicolon-separated CSV)
all_data = pd.read_csv('../database/combined_data.csv', sep=";")

# Map every state to the list of products (derivatives) that occur for it
state_derivative_dict = {
    state: list(all_data.loc[all_data['state'] == state, 'product'].unique())
    for state in all_data['state'].unique()
}

# Build and save one dataset per (state, derivative) pair
for state, derivatives in state_derivative_dict.items():
    for derivative in derivatives:
        print(f"========== State: {state}, derivative: {derivative} ==========")

        # Select the rows of the current state and derivative
        data_filtered = all_data[(all_data['state'] == state) & (all_data['product'] == derivative)].copy()
        data_filtered["timestamp"] = pd.to_datetime(data_filtered["timestamp"])

        # Order chronologically before truncating: both the positional
        # hold-out below and the "start" field (first row) depend on row
        # order, which the CSV does not guarantee. A stable sort keeps the
        # file order for equal timestamps.
        data_filtered = data_filtered.sort_values("timestamp", kind="stable")
        data_filtered = data_filtered.iloc[:-VAL_HOLDOUT_STEPS]

        # Skip series that are too short instead of aborting the whole loop
        if len(data_filtered) < VAL_MIN_SERIES_LENGTH:
            print(
                f"Skipping {state}/{derivative}: only {len(data_filtered)} rows left "
                f"after holding out {VAL_HOLDOUT_STEPS}"
            )
            continue

        # Hand the frame over through gen_kwargs rather than a lambda that
        # closes over the loop variable, so the data each dataset is built
        # from is an explicit argument of the generator.
        hf_dataset = datasets.Dataset.from_generator(
            example_gen_func,
            gen_kwargs={"df": data_filtered},
            features=features,
        )
        hf_dataset.save_to_disk(Path(f"dataset_individual_val/dataset_individual_{state}_{derivative}_val"))



Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 650.89 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 645.48 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 744.86 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 751.13 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 723.90 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 700.69 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 767.06 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 722.28 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 749.52 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 724.66 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 779.61 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 717.22 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 703.03 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 706.47 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 675.85 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 777.88 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 723.53 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 763.16 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 813.80 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 762.74 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 820.16 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 784.57 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 614.73 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 820.00 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 768.89 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 828.10 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 721.66 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 729.70 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 760.66 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 676.17 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 754.51 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 668.84 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 702.21 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 752.07 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 718.08 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 804.12 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 838.02 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 769.17 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 798.92 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 805.05 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 753.42 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 768.47 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 744.07 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 676.94 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 755.87 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 755.46 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 828.42 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 773.14 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 704.57 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 721.79 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 820.96 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 757.23 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 784.72 examples/s] 




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 663.45 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 831.87 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 786.19 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 710.54 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 723.90 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 683.78 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 765.80 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 748.98 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 742.75 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 787.07 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 700.57 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 753.83 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 726.66 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 793.47 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 760.11 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 775.86 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 678.25 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 747.12 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 806.29 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 667.14 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 726.54 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 780.48 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 731.22 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 777.73 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 764.69 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 805.51 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 826.30 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 715.26 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 739.87 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 689.29 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 708.74 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 713.56 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 745.92 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 730.71 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 667.56 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 642.02 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 767.63 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 766.92 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 756.55 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 718.33 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 765.10 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 838.86 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 842.06 examples/s] 




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 809.87 examples/s] 




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 779.03 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 768.33 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 821.93 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 762.74 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 734.94 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 740.65 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 781.06 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 772.29 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 772.86 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 680.56 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 783.54 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 838.69 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 628.93 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 728.94 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 606.38 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 734.17 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 793.92 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 825.49 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 632.24 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 747.65 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 822.90 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 859.14 examples/s] 




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 700.80 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 786.63 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 770.30 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 848.36 examples/s] 




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 730.46 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 808.46 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 748.18 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 683.89 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 773.71 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 723.78 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 759.15 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 764.55 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 749.52 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 678.36 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 704.93 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 714.41 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 599.53 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 601.85 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 664.71 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 706.71 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 749.79 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 763.85 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 717.71 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 817.92 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 818.40 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 763.29 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 781.06 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 781.35 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 829.90 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 789.59 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 805.51 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 801.97 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 724.15 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 800.13 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 745.65 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 836.85 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 763.43 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 706.71 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 725.16 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 708.14 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 749.25 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 740.78 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 723.90 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 760.66 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 818.40 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 758.74 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 758.88 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 690.53 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 761.08 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 772.01 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 762.32 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 771.72 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 22.07 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 778.74 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 863.03 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 810.81 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 801.51 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 764.27 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 770.87 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 776.00 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 772.72 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 756.41 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 834.52 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 766.64 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 756.41 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 788.40 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 734.30 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 766.08 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 681.11 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 733.53 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 662.71 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 701.27 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 711.62 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 657.11 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 679.13 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 773.00 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 751.13 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 748.05 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 770.02 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 730.46 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 709.22 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 735.97 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 756.82 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 706.59 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 753.29 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 805.98 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 814.74 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 769.74 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 709.10 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 771.01 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 840.54 examples/s] 




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 767.77 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 775.72 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 744.73 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 780.77 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 789.44 examples/s]




Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 789.59 examples/s]


# 2. Prepare a multivariate dataset for pre-training/fine-tuning: GLOBAL (all product–state series in one dataset)

In [10]:
# Read the semicolon-delimited combined dataset and parse its timestamp column
all_data = pd.read_csv('../database/combined_data.csv', sep=";").assign(
    timestamp=lambda frame: pd.to_datetime(frame["timestamp"])
)

In [11]:
# Label every series as "<product>_<state>", with surrounding whitespace trimmed.
all_data["column_name"] = (
    all_data["product"].str.strip() + "_" + all_data["state"].str.strip()
)

# Reshape to wide form (one row per timestamp, one column per series), then
# drop the final 12 rows so they are not part of this dataset.
df_pivot = (
    all_data
    .pivot(index="timestamp", columns="column_name", values="m3")
    .reset_index()
    .iloc[:-12]
)

In [12]:
def multivar_example_gen_func(frame=None) -> Generator[dict[str, Any], None, None]:
    """
    Yield the single multivariate example built from the pivoted frame.

    Parameters:
    - frame: optional wide DataFrame with a "timestamp" column first and one
      column per series after it. When omitted, the notebook-level `df_pivot`
      is used, so the function can still be handed to
      `datasets.Dataset.from_generator` without arguments.

    Yields:
    - dict with "target" (array of shape (var, time)), "start" (first timestamp
      of the pivoted frame), "freq" ('MS') and "item_id" ("item_0")
    """
    pivot = df_pivot if frame is None else frame
    yield {
        "target": pivot.iloc[:, 1:].to_numpy().T,  # array of shape (var, time)
        # Read the start from the same frame the target comes from. The first
        # row of the raw CSV is not guaranteed to hold the earliest timestamp.
        "start": pivot["timestamp"].iloc[0],
        "freq": 'MS',
        "item_id": "item_0",
    }

In [13]:
# Schema of the single stored example. The outer Sequence length is fixed to the
# number of series columns (every df_pivot column after the leading timestamp),
# because multivariate time series are saved as (var, time).
num_series = len(df_pivot.columns[1:])
features = Features(
    {
        "target": Sequence(Sequence(Value("float32")), length=num_series),
        "start": Value("timestamp[s]"),
        "freq": Value("string"),
        "item_id": Value("string"),
    }
)

In [14]:
# Build the one-example dataset from the generator and write it to disk
hf_dataset = datasets.Dataset.from_generator(
    generator=multivar_example_gen_func,
    features=features,
)
hf_dataset.save_to_disk("dataset_global/dataset_global")

Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 577.25 examples/s]


## Validation (last 24 rows held out instead of 12)

In [15]:
# Read the semicolon-delimited combined dataset and parse its timestamp column
all_data = pd.read_csv('../database/combined_data.csv', sep=";").assign(
    timestamp=lambda frame: pd.to_datetime(frame["timestamp"])
)

In [16]:
# Label every series as "<product>_<state>", with surrounding whitespace trimmed.
all_data["column_name"] = (
    all_data["product"].str.strip() + "_" + all_data["state"].str.strip()
)

# Reshape to wide form (one row per timestamp, one column per series), then
# drop the final 24 rows so they are not part of the validation dataset.
df_pivot = (
    all_data
    .pivot(index="timestamp", columns="column_name", values="m3")
    .reset_index()
    .iloc[:-24]
)

In [17]:
def multivar_example_gen_func(frame=None) -> Generator[dict[str, Any], None, None]:
    """
    Yield the single multivariate validation example built from the pivoted frame.

    Parameters:
    - frame: optional wide DataFrame with a "timestamp" column first and one
      column per series after it. When omitted, the notebook-level `df_pivot`
      is used, so the function can still be handed to
      `datasets.Dataset.from_generator` without arguments.

    Yields:
    - dict with "target" (array of shape (var, time)), "start" (first timestamp
      of the pivoted frame), "freq" ('MS') and "item_id" ("item_0")
    """
    pivot = df_pivot if frame is None else frame
    yield {
        "target": pivot.iloc[:, 1:].to_numpy().T,  # array of shape (var, time)
        # Read the start from the same frame the target comes from. The first
        # row of the raw CSV is not guaranteed to hold the earliest timestamp.
        "start": pivot["timestamp"].iloc[0],
        "freq": 'MS',
        "item_id": "item_0",
    }

In [18]:
# Schema of the single stored validation example. The outer Sequence length is
# fixed to the number of series columns (every df_pivot column after the leading
# timestamp), because multivariate time series are saved as (var, time).
num_series = len(df_pivot.columns[1:])
features = Features(
    {
        "target": Sequence(Sequence(Value("float32")), length=num_series),
        "start": Value("timestamp[s]"),
        "freq": Value("string"),
        "item_id": Value("string"),
    }
)

In [19]:
# Build the one-example validation dataset from the generator and write it to disk
hf_dataset = datasets.Dataset.from_generator(
    generator=multivar_example_gen_func,
    features=features,
)
hf_dataset.save_to_disk("dataset_global_val/dataset_global_val")

Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 533.49 examples/s]


# 3. Prepare a multivariate dataset for pre-training/fine-tuning: PRODUCT (one dataset per product, one variate per state)

In [20]:
# Read the semicolon-delimited combined dataset and parse its timestamp column
all_data = pd.read_csv('../database/combined_data.csv', sep=";").assign(
    timestamp=lambda frame: pd.to_datetime(frame["timestamp"])
)

In [21]:
# Build and save one multivariate dataset per product: each state of the product
# becomes one variate, i.e. one row of the (var, time) target.
unique_products = all_data["product"].unique()

for product in unique_products:
    product_data = all_data[all_data["product"] == product].copy()

    # Series identifier "<product>_<state>", with surrounding whitespace trimmed.
    product_data["column_name"] = (
        product_data["product"].str.strip() + "_" + product_data["state"].str.strip()
    )

    # Wide frame: one row per timestamp, one column per series. The final 12
    # rows are dropped so they are not part of the saved data.
    df_pivot = (
        product_data
        .pivot(index="timestamp", columns="column_name", values="m3")
        .reset_index()
        .iloc[:-12]
    )

    def multivar_example_gen_func() -> Generator[dict[str, Any], None, None]:
        """Yield the single multivariate example for the current product."""
        yield {
            "target": df_pivot.iloc[:, 1:].to_numpy().T,  # array of shape (var, time)
            # Read the start from the same frame the target comes from. The first
            # raw row of this product is not guaranteed to hold the earliest timestamp.
            "start": df_pivot["timestamp"].iloc[0],
            "freq": 'MS',
            "item_id": f"item_{product}",
        }

    # The outer Sequence length equals the number of series columns (everything
    # after the leading timestamp column); it is recomputed for each product.
    features = Features(
        dict(
            target=Sequence(
                Sequence(Value("float32")), length=len(df_pivot.iloc[:, 1:].columns)
            ),  # multivariate time series are saved as (var, time)
            start=Value("timestamp[s]"),
            freq=Value("string"),
            item_id=Value("string"),
        )
    )

    # The generator is consumed here, within the same iteration, so it reads
    # this iteration's df_pivot and product.
    hf_dataset = datasets.Dataset.from_generator(multivar_example_gen_func, features=features)
    hf_dataset.save_to_disk(f"dataset_product/dataset_product_{product}")

Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 682.44 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 718.94 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 632.53 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 710.78 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 655.87 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 631.77 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 701.74 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 769.88 examples/s]


## Validation (last 24 rows held out instead of 12)

In [22]:
# Read the semicolon-delimited combined dataset and parse its timestamp column
all_data = pd.read_csv('../database/combined_data.csv', sep=";").assign(
    timestamp=lambda frame: pd.to_datetime(frame["timestamp"])
)

In [23]:
# Build and save one multivariate validation dataset per product: each state of
# the product becomes one variate, i.e. one row of the (var, time) target.
unique_products = all_data["product"].unique()

for product in unique_products:
    product_data = all_data[all_data["product"] == product].copy()

    # Series identifier "<product>_<state>", with surrounding whitespace trimmed.
    product_data["column_name"] = (
        product_data["product"].str.strip() + "_" + product_data["state"].str.strip()
    )

    # Wide frame: one row per timestamp, one column per series. The final 24
    # rows are dropped so they are not part of the saved validation data.
    df_pivot = (
        product_data
        .pivot(index="timestamp", columns="column_name", values="m3")
        .reset_index()
        .iloc[:-24]
    )

    def multivar_example_gen_func() -> Generator[dict[str, Any], None, None]:
        """Yield the single multivariate validation example for the current product."""
        yield {
            "target": df_pivot.iloc[:, 1:].to_numpy().T,  # array of shape (var, time)
            # Read the start from the same frame the target comes from. The first
            # raw row of this product is not guaranteed to hold the earliest timestamp.
            "start": df_pivot["timestamp"].iloc[0],
            "freq": 'MS',
            "item_id": f"item_{product}",
        }

    # The outer Sequence length equals the number of series columns (everything
    # after the leading timestamp column); it is recomputed for each product.
    features = Features(
        dict(
            target=Sequence(
                Sequence(Value("float32")), length=len(df_pivot.iloc[:, 1:].columns)
            ),  # multivariate time series are saved as (var, time)
            start=Value("timestamp[s]"),
            freq=Value("string"),
            item_id=Value("string"),
        )
    )

    # The generator is consumed here, within the same iteration, so it reads
    # this iteration's df_pivot and product.
    hf_dataset = datasets.Dataset.from_generator(multivar_example_gen_func, features=features)
    hf_dataset.save_to_disk(f"dataset_product_val/dataset_product_{product}_val")

Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 423.58 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 643.00 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 682.00 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 712.95 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 723.65 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 721.54 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 691.44 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 638.89 examples/s]


# NOTE: Inspecting the processed data

In [30]:
# Reload one train/validation pair of each dataset flavour (individual, global,
# per-product) from disk as NumPy-formatted datasets.
test1, test2, test3, test4, test5, test6 = (
    datasets.load_from_disk(saved_path).with_format("numpy")
    for saved_path in (
        "dataset_individual/dataset_individual_sp_glp",
        "dataset_individual_val/dataset_individual_sp_glp_val",
        "dataset_global/dataset_global",
        "dataset_global_val/dataset_global_val",
        "dataset_product/dataset_product_glp",
        "dataset_product_val/dataset_product_glp_val",
    )
)

In [31]:
# Target shape of the first example in each reloaded dataset
tuple(ds[0]["target"].shape for ds in (test1, test2, test3, test4, test5, test6))

((398,), (386,), (216, 398), (216, 386), (27, 398), (27, 386))