In [6]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import os


def load_and_transform_subsets(dataset_name, subset_names, export_dir, split="train"):
    """Export the first time series of each subset to ``<export_dir>/<subset>.csv``.

    A subset whose CSV file already exists is skipped without being loaded, so
    it does not appear in the returned mapping.

    Args:
        dataset_name: Dataset identifier forwarded to ``load_dataset``.
        subset_names: Iterable of subset names; each is passed to
            ``load_dataset`` as its second positional argument.
        export_dir: Directory that receives the CSV files; created if missing.
        split: Split name forwarded to ``load_dataset``.

    Returns:
        dict mapping subset name to the series that was exported by this call.
    """
    os.makedirs(export_dir, exist_ok=True)

    exported = {}
    for subset_name in subset_names:
        csv_path = os.path.join(export_dir, f"{subset_name}.csv")

        # An existing file acts as a cache marker: nothing is downloaded again.
        if os.path.exists(csv_path):
            print(f"Data for {subset_name} already exported. Skipping...")
            continue

        frame = load_dataset(dataset_name, subset_name, split=split).to_pandas()

        series = expand_time_series(frame, subset_name)
        if series is None:
            # The subset had no rows; nothing to record or write.
            continue

        exported[subset_name] = series
        # Values only: the datetime index and the header are not written.
        series.to_csv(csv_path, index=False, header=False)
        print(f"Exported {csv_path}")

    return exported


def expand_time_series(df, subset_name):
    """Turn the first row of ``df`` into a datetime-indexed ``pd.Series``.

    Only the first row is used. It must provide ``"start"`` (parsed with
    ``pd.to_datetime``), ``"freq"`` (passed to ``pd.date_range``) and
    ``"target"`` (the values). A two-dimensional target is reduced to its
    first column.

    Args:
        df: DataFrame with ``start``, ``freq`` and ``target`` columns.
        subset_name: Accepted for the caller's convenience; not used here.

    Returns:
        The series built from the first row, or ``None`` when ``df`` has no rows.
    """
    first_entry = next(df.iterrows(), None)
    if first_entry is None:
        return None  # no rows at all

    _, record = first_entry
    series_start = pd.to_datetime(record["start"])
    step = record["freq"]
    values = np.array(record["target"])

    # For a 2-D target keep only the first column, for simplicity.
    if values.ndim == 2:
        values = values[:, 0]

    timestamps = pd.date_range(start=series_start, periods=len(values), freq=step)
    return pd.Series(data=values, index=timestamps)


# Example usage: choose the subsets to export, then run the export once.
dataset_name = "Salesforce/lotsa_data"

# Entries left as comments are disabled; uncomment a line to include that subset.
subset_names = [
    # "bdg-2_fox",
    # "bdg-2_rat",
    # "bdg-2_bear",
    "london_smart_meters_with_missing",
    "smart",
    "sceaux",
    # "largest",  # too large
    # "PEMS03",
    # "PEMS07",
    # "PEMS_BAY",
    # "LOS_LOOP",
    "LOOP_SEATTLE",
    # "oikolab_weather",
    "elecdemand",
    "traffic_hourly",
    # "saugeenday",
    "elf",
    "subseasonal_precip",
    # "solar_power",
    # "wind_power",
]

# Directory that receives one CSV per subset; set a custom name here.
export_dir = "experiment_export_data"

# Load and transform data. Subsets whose CSV already exists are skipped.
transformed_data = load_and_transform_subsets(
    dataset_name=dataset_name,
    subset_names=subset_names,
    export_dir=export_dir,
)


Data for london_smart_meters_with_missing already exported. Skipping...
Data for smart already exported. Skipping...
Data for sceaux already exported. Skipping...
Data for LOOP_SEATTLE already exported. Skipping...
Data for elecdemand already exported. Skipping...
Data for traffic_hourly already exported. Skipping...
Data for elf already exported. Skipping...
Data for subseasonal_precip already exported. Skipping...
Data for solar_power already exported. Skipping...
Data for wind_power already exported. Skipping...


In [7]:
import os
import pandas as pd


def analyze_data_files(directory, subset_order):
    """Summarise size, length and decimal precision of every CSV in ``directory``.

    Each ``*.csv`` file is read as header-less text and only its first column
    is analysed. Decimal places are counted on the text exactly as written in
    the file (``"1.50"`` has 2 decimals), not on a re-formatted float. Values
    without a ``"."`` (integers, blanks, ``nan``, plain exponent notation such
    as ``1e-05``) count as 0 decimals.

    Args:
        directory: Folder to scan; files not ending in ``.csv`` are ignored.
        subset_order: Subset names (file names without ``.csv``) in the order
            the rows should appear. Names without a matching file produce a
            row of missing values; files not listed are dropped.

    Returns:
        DataFrame with one row per entry of ``subset_order`` and the columns
        ``Subset Name``, ``File Size (bytes)``, ``Series Length``,
        ``Most Common Precision``, ``Most Common Precision Proportion (%)``
        and ``Average Precision``.
    """
    summary_columns = [
        "Subset Name",
        "File Size (bytes)",
        "Series Length",
        "Most Common Precision",
        "Most Common Precision Proportion (%)",
        "Average Precision",
    ]

    summary_data = []
    for filename in os.listdir(directory):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(directory, filename)
        subset_name = filename[:-4]  # Remove the ".csv" from the filename
        file_size = os.path.getsize(file_path)

        try:
            # dtype=str + keep_default_na=False keeps every cell as the exact
            # text from the file, so trailing zeros survive and no cell turns
            # into a float NaN.
            df = pd.read_csv(
                file_path, header=None, dtype=str, keep_default_na=False
            )
        except pd.errors.EmptyDataError:
            # read_csv raises on a file with nothing to parse; report it as a
            # zero-length series instead of aborting the whole summary.
            df = pd.DataFrame()
        series_length = len(df)

        if not df.empty:
            # The first column holds the series values.
            column = df.iloc[:, 0].astype(str)
            decimals = column.apply(
                lambda x: len(x.split(".")[1]) if "." in x else 0
            )
            precision_counts = decimals.value_counts()
            precision_proportions = (precision_counts / series_length * 100).round(2)

            # Most common precision and its share of all values.
            most_common_precision = precision_counts.idxmax()
            most_common_proportion = precision_proportions[most_common_precision]

            average_precision = round(float(decimals.sum()) / series_length, 2)
        else:
            most_common_precision = 0
            most_common_proportion = 0.0
            average_precision = 0.0

        summary_data.append(
            {
                "Subset Name": subset_name,
                "File Size (bytes)": file_size,
                "Series Length": series_length,
                "Most Common Precision": most_common_precision,
                "Most Common Precision Proportion (%)": most_common_proportion,
                "Average Precision": average_precision,
            }
        )

    # Passing the columns explicitly keeps "Subset Name" available even when no
    # CSV file was found, so set_index below cannot raise KeyError.
    df_summary = pd.DataFrame(summary_data, columns=summary_columns)

    # Order the rows like subset_order.
    return df_summary.set_index("Subset Name").reindex(subset_order).reset_index()


In [8]:
# Summarise the exported CSV files, print the table and save it as summary_df.csv.
directory = export_dir  # Specify your data directory here
summary_df = analyze_data_files(directory=directory, subset_order=subset_names)
print(summary_df)
summary_df.to_csv("summary_df.csv")


                        Subset Name  File Size (bytes)  Series Length  \
0  london_smart_meters_with_missing             140896          23904   
1                             smart             267583          25919   
2                            sceaux             328246          34223   
3                      LOOP_SEATTLE            1003466         105120   
4                        elecdemand             167219          17520   
5                    traffic_hourly             119719          17376   
6                               elf             211782          21792   
7                subseasonal_precip             109296          11323   
8                       solar_power           33781822        7397222   
9                        wind_power           36202451        7397147   

   Most Common Precision  Most Common Precision Proportion (%)  \
0                      3                                 90.50   
1                      7                                 49.70  