# Script to test all data

In [21]:
import os
import numpy as np
import pandas as pd
import importlib
import zstandard as zstd
import matplotlib.pyplot as plt
from chronos_utils import (
    rolling_forecast_and_reconstruct,
    compute_error_metrics,
    plot_error_distribution,
    plot_time_series_with_forecast,
    decompress_to_csv,
    check_data_valid,
    check_file_valid,
    compress_to_file,
    decompress_to_mem,
    compress_to_file_debug,
    LinearQuantizer,
    decompress_data
)

import numpy as np
import zstandard as zstd

def naive_compress_with_zstd(data, EB):
    """Quantize ``data`` onto multiples of ``EB`` and return its zstd-compressed size.

    Args:
        data: NumPy array of samples to quantize.
        EB: Quantization step, also used as the absolute tolerance of the
            sanity check.

    Returns:
        Number of bytes produced by Zstandard (level 22) for the quantized
        float32 buffer.
    """
    # Casting to int64 drops the fractional part, so every sample snaps to a
    # multiple of EB before being stored back as float32.
    step_index = (data / EB).astype('int64')
    quantized = (step_index * EB).astype('float32')

    # Validation only: report, but do not fail, when the quantized values
    # stray from the input by more than the tolerance.
    within_bound = np.allclose(quantized, data, atol=EB)
    if not within_bound:
        print(f"All elements in naive zstd are not within the error bound ({EB}): {within_bound}")

    # Level 22 is the compression level requested from the zstandard binding.
    payload = quantized.tobytes()
    return len(zstd.ZstdCompressor(level=22).compress(payload))

import numpy as np
import os
import subprocess

def compress_with_sz3(array, EB, verbose=False):
    """Compress ``array`` with the external ``./sz3`` executable and return the output size.

    The array is dumped to ``sz_temp.bin`` in the current working directory,
    ``./sz3`` is run on it to produce ``sz_temp.bin.sz``, and both temporary
    files are removed again before returning, including when the run fails.

    Args:
        array: NumPy array of dtype float32, float64, int32 or int64.
        EB: Value passed after ``-M ABS`` on the sz3 command line
            (presumably the absolute error bound; confirm against the SZ3 CLI).
        verbose: When true, print the command and let sz3 write to the
            console; otherwise its stdout/stderr are discarded.

    Returns:
        Size in bytes of the file written by sz3.

    Raises:
        ValueError: If ``array.dtype`` is not one of the supported dtypes.
        subprocess.CalledProcessError: If sz3 exits with a non-zero status.
    """
    temp_input_filename = 'sz_temp.bin'
    temp_output_filename = 'sz_temp.bin.sz'

    # Pick the dtype flag first so an unsupported dtype is rejected before
    # anything is written to disk.
    if array.dtype == np.float32:
        dtype_args = ['-f']
    elif array.dtype == np.float64:
        dtype_args = ['-d']
    elif array.dtype == np.int32:
        dtype_args = ['-I', '32']
    elif array.dtype == np.int64:
        dtype_args = ['-I', '64']
    else:
        raise ValueError("Unsupported data type for SZ3 compression")

    # An argument list (no shell) avoids quoting problems and shell injection.
    command = [
        './sz3', *dtype_args,
        '-i', temp_input_filename,
        '-z', temp_output_filename,
        '-1', str(array.size),
        '-M', 'ABS', str(EB),
    ]

    try:
        # Dump the raw array buffer for sz3 to read.
        array.tofile(temp_input_filename)

        if verbose:
            print("Executing command:", " ".join(command))
            subprocess.run(command, check=True)
        else:
            subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

        return os.path.getsize(temp_output_filename)
    finally:
        # Always clean up, even when sz3 fails or produces no output file.
        for temp_path in (temp_input_filename, temp_output_filename):
            try:
                os.remove(temp_path)
            except FileNotFoundError:
                pass


def compress_array_with_zstd(array):
    """Return the compressed size, in bytes, of a NumPy array's raw buffer.

    The array is serialized with ``tobytes()`` and compressed with Zstandard
    at level 22; only the length of the compressed output is returned.
    """
    zstd_compressor = zstd.ZstdCompressor(level=22)
    raw_bytes = array.tobytes()
    return len(zstd_compressor.compress(raw_bytes))


def calculate_entropy_with_scipy(data):
    """Return the entropy, in bits, of the empirical distribution of ``data``.

    Each distinct value in ``data`` is one symbol; its occurrence count is
    handed to ``scipy.stats.entropy`` with base 2.
    """
    # Only the occurrence counts matter; the distinct values are discarded.
    occurrence_counts = np.unique(data, return_counts=True)[1]
    return entropy(occurrence_counts, base=2)



import numpy as np

def calculate_rrmse(error, actual):
    """Return the root mean square of ``error`` scaled by the mean of ``actual``.

    Returns ``float('inf')`` when the mean of ``actual`` is exactly zero,
    since the errors cannot be normalized in that case.
    """
    scale = np.mean(actual)
    if scale == 0:
        return float('inf')
    mean_square = np.mean((error / scale) ** 2)
    return np.sqrt(mean_square)

def calculate_rmse(error, actual):
    """Return the mean of the squared errors after scaling by the mean of ``actual``.

    Note that, despite the name, no square root is taken: the result is the
    mean squared relative error. Returns ``float('inf')`` when the mean of
    ``actual`` is exactly zero.
    """
    scale = np.mean(actual)
    if scale == 0:
        return float('inf')
    scaled_error = error / scale
    return np.mean(scaled_error ** 2)

def calculate_rmae(error, actual):
    """Return the relative mean absolute error of ``error``.

    Each error is divided by the mean of ``actual`` and the absolute values
    are averaged. No square root is applied: a root belongs to squared-error
    metrics, not to an absolute-error one.

    Args:
        error: Array of errors.
        actual: Array whose mean is used as the normalization scale.

    Returns:
        The mean absolute relative error, or ``float('inf')`` when the mean
        of ``actual`` is exactly zero.
    """
    mean_actual = np.mean(actual)
    if mean_actual == 0:
        return float('inf')
    absolute_relative_errors = np.abs(error / mean_actual)
    return np.mean(absolute_relative_errors)



In [22]:
import os
import numpy as np
import time


# Folder names used by the experiments, relative to the working directory
DATA_FOLDER = 'experiment_export_data'
BIN_FOLDER = 'bin_data'
COMPRESSED_FOLDER = 'compressed_data'

# Create whichever of the folders is missing; existing ones are left alone
for _folder in (DATA_FOLDER, BIN_FOLDER, COMPRESSED_FOLDER):
    os.makedirs(_folder, exist_ok=True)

import numpy as np
import os
import time
from scipy.stats import entropy

import numpy as np
import os
import time
from scipy.stats import entropy

# Metric columns reported for every run, in the exact order they appear in a
# result row. Rows are appended to the results CSV without a header, so this
# order must not change.
_RESULT_METRIC_KEYS = (
    "csv_size", "bin_size", "compressed_size", "sz3_size", "zstd_size",
    "is_accurate", "compression_time_sz3", "compression_time_zstd",
    "compression_time_debug", "decompression_time", "zstd_error_size",
    "df_entropy", "df_min", "df_max", "df_mean", "df_variance",
    "error_entropy", "error_min", "error_max", "error_mean",
    "error_variance", "mean_absolute_error", "mean_squared_error",
    "RRMSE", "RMSE", "RMAE",
)


def perform_compression_tests(data_file, CTX, PDT, EB, model, test_len=None, single_mode=True):
    """Run one compression experiment and return its measurements as a dict.

    Loads ``{BIN_FOLDER}/{data_file}.bin`` as float32, compresses it with SZ3,
    with naive quantization + zstd, and with ``compress_to_file_debug``, then
    gathers sizes, timings and summary statistics.

    Args:
        data_file: Dataset name (file stem, without folder or extension).
        CTX: Passed to ``compress_to_file_debug``; the first ``CTX`` samples
            are also excluded from the data statistics.
        PDT: Passed to ``compress_to_file_debug``.
        EB: Error bound handed to every compressor and used as the tolerance
            of the round-trip accuracy check.
        model: Model identifier passed to ``compress_to_file_debug``.
        test_len: If not ``None``, only the first ``test_len`` samples are used.
        single_mode: When true (default) the decompression round trip is
            skipped; ``is_accurate`` is then ``'skipped'`` and
            ``decompression_time`` is 0.

    Returns:
        A dict with the run parameters, ``"Success"``, ``"Error Message"`` and
        one entry per metric. If anything inside the measured section raises,
        ``"Success"`` is ``False``, ``"Error Message"`` holds ``str(e)`` and
        every metric is ``None``.
    """
    bin_file = f"{BIN_FOLDER}/{data_file}.bin"
    csv_file = f"{DATA_FOLDER}/{data_file}.csv"
    unique_name = f"{data_file}_model{model}_EB{str(EB)}_CTX{CTX}_PDT{PDT}.tar.gz"
    compressed_file = f"{COMPRESSED_FOLDER}/{unique_name}"

    try:
        df = np.fromfile(bin_file, dtype=np.float32)
        if test_len is not None:
            df = df[:test_len]

        # Baseline compressors. perf_counter() is monotonic and high
        # resolution, which makes it the right clock for measuring intervals.
        start_time = time.perf_counter()
        sz3_size = compress_with_sz3(df, EB)
        mid_time = time.perf_counter()
        zstd_size = naive_compress_with_zstd(df, EB)
        end_time = time.perf_counter()

        compression_time_sz3 = mid_time - start_time
        compression_time_zstd = end_time - mid_time

        # Forecast-based compressor under test; the returned forecast is not
        # used here, only the error array.
        start_time = time.perf_counter()
        _forecast, error = compress_to_file_debug(df, compressed_file, CTX, PDT, EB, model)
        mid_time = time.perf_counter()
        compression_time_debug = mid_time - start_time

        # Optional round trip; only the reconstructed data is needed.
        decompression_time = 0
        if not single_mode:
            de_data, _prediction, _context, _recovered_error = decompress_to_mem(compressed_file)
            end_time = time.perf_counter()
            decompression_time = end_time - mid_time

        # Statistics of the data after the initial context window.
        df_segment = df[CTX:]
        df_entropy = calculate_entropy_with_scipy(df_segment)
        df_min = np.min(df_segment)
        df_max = np.max(df_segment)
        df_mean = np.mean(df_segment)
        df_variance = np.var(df_segment)

        # Statistics of the error array returned by the compressor.
        error_entropy = calculate_entropy_with_scipy(error)
        error_min = np.min(error)
        error_max = np.max(error)
        error_mean = np.mean(error)
        error_variance = np.var(error)

        # NOTE(review): these subtract the data segment from `error`. If
        # `error` already holds residuals this is probably not the intended
        # MAE/MSE; confirm against compress_to_file_debug. It also assumes
        # `error` and `df[CTX:]` have matching shapes.
        mean_absolute_error = np.mean(np.abs(error - df_segment))
        mean_squared_error = np.mean((error - df_segment)**2)

        rrmse = calculate_rrmse(error, df_segment)
        rmse = calculate_rmse(error, df_segment)
        rmae = calculate_rmae(error, df_segment)

        # File sizes for comparison
        csv_size = os.path.getsize(csv_file)
        bin_size = os.path.getsize(bin_file)
        fm_size = os.path.getsize(compressed_file)

        # Size of the raw error array when compressed directly with zstd
        zstd_error_size = compress_array_with_zstd(error)

        # Round-trip accuracy check (only possible when decompression ran)
        is_accurate = np.allclose(de_data, df, atol=EB) if not single_mode else 'skipped'

        # Consolidate all results into a dictionary with clearly named keys
        results = {
            "Data File": data_file,
            "Model": model,
            "EB": EB,
            "CTX": CTX,
            "PDT": PDT,
            "Success": True,
            "Error Message": '',
            "csv_size": csv_size,
            "bin_size": bin_size,
            "compressed_size": fm_size,
            "sz3_size": sz3_size,
            "zstd_size": zstd_size,
            "is_accurate": is_accurate,
            "compression_time_sz3": compression_time_sz3,
            "compression_time_zstd": compression_time_zstd,
            "compression_time_debug": compression_time_debug,
            "decompression_time": decompression_time,
            "zstd_error_size": zstd_error_size,
            "df_entropy": df_entropy,
            "df_min": df_min,
            "df_max": df_max,
            "df_mean": df_mean,
            "df_variance": df_variance,
            "error_entropy": error_entropy,
            "error_min": error_min,
            "error_max": error_max,
            "error_mean": error_mean,
            "error_variance": error_variance,
            "mean_absolute_error": mean_absolute_error,
            "mean_squared_error": mean_squared_error,
            "RRMSE": rrmse,
            "RMSE": rmse,
            "RMAE": rmae
        }
        return results

    except Exception as e:
        # Deliberately broad: a failed configuration is recorded as a row with
        # Success=False so the remaining experiments can still run.
        error_details = {
            "Data File": data_file,
            "Model": model,
            "EB": EB,
            "CTX": CTX,
            "PDT": PDT,
            "Success": False,
            "Error Message": str(e),
        }
        # Every metric is None, in the same column order as a successful row.
        error_details.update(dict.fromkeys(_RESULT_METRIC_KEYS))
        return error_details


In [23]:

import pandas as pd
import os

def run_experiments(data_with_precision, models, EB_values, CTX_values, PDT_values, test_len=None, results_file_path="compression_results.csv", create_new_file=False):
    """Run every requested experiment combination and append each result to a CSV.

    Args:
        data_with_precision: Iterable of ``(data_file, mcp)`` pairs. Error
            bounds not greater than ``10 ** -mcp`` are skipped for that file.
        models: Model identifiers to test.
        EB_values: Error bounds to test.
        CTX_values: CTX values to test.
        PDT_values: PDT values to test; a PDT greater than the current CTX is
            skipped.
        test_len: Forwarded to ``perform_compression_tests``.
        results_file_path: CSV file that receives one row per experiment.
        create_new_file: When true, the CSV is overwritten with a fresh header
            and no earlier results are taken into account. Otherwise rows
            already in the file are used to skip completed experiments.
    """
    standard_columns = [
        'Data File', 'Model', 'EB', 'CTX', 'PDT', 'Success', 'Error Message',
        'csv_size', 'bin_size', 'compressed_size', 'sz3_size', 'zstd_size',
        'is_accurate', 'compression_time_sz3', 'compression_time_zstd',
        'compression_time_debug', 'decompression_time', 'zstd_error_size',
        'df_entropy', 'df_min', 'df_max', 'df_mean', 'df_variance',
        'error_entropy', 'error_min', 'error_max', 'error_mean',
        'error_variance', 'mean_absolute_error', 'mean_squared_error',
        'RRMSE', 'RMSE', 'RMAE',
    ]

    # Write a header-only file when starting fresh or when none exists yet.
    needs_header = create_new_file or not os.path.exists(results_file_path)
    if needs_header:
        pd.DataFrame(columns=standard_columns).to_csv(results_file_path, index=False)

    # Earlier results, used only to decide which combinations to skip.
    if create_new_file:
        results_df = pd.DataFrame(columns=standard_columns)
    else:
        results_df = pd.read_csv(results_file_path)

    for dataset, mcp in data_with_precision:
        for eb in EB_values:
            if eb <= 10 ** -mcp:
                print(f"Skipping: {dataset} with EB: {eb} no greater than MCP limit (10^-{mcp})")
                continue
            for ctx in CTX_values:
                for pdt in PDT_values:
                    if pdt > ctx:
                        print(f"Skipping: {dataset} with PDT: {pdt} greater than CTX: {ctx}")
                        continue
                    for model in models:
                        run_label = f"{dataset} with Model: {model}, EB: {eb}, CTX: {ctx}, PDT: {pdt}"
                        if experiment_already_run(dataset, model, eb, ctx, pdt, results_df):
                            print(f"Skipping already completed experiment: {run_label}")
                            continue
                        print(f"Testing {run_label}")
                        outcome = perform_compression_tests(dataset, ctx, pdt, eb, model, test_len)
                        # Append immediately so finished runs survive an interruption.
                        outcome_row = pd.DataFrame([outcome])
                        outcome_row.to_csv(results_file_path, mode='a', header=False, index=False)

    print("All experiments are done and results are saved.")

def experiment_already_run(data_file, model, EB, CTX, PDT, results_df):
    """Check if the experiment configuration has already been tested and recorded.

    Args:
        data_file: Dataset name to look for in the ``'Data File'`` column.
        model: Model identifier to look for in the ``'Model'`` column.
        EB: Error bound to look for in the ``'EB'`` column.
        CTX: Value to look for in the ``'CTX'`` column.
        PDT: Value to look for in the ``'PDT'`` column.
        results_df: DataFrame of recorded results; may be empty.

    Returns:
        ``True`` if at least one row matches all five parameters.
    """
    # EB is a float that may have been written to and re-read from CSV, where
    # the parsed value can differ from the original in the last bits. Compare
    # with a tight relative tolerance instead of exact equality. Converting to
    # a float array also copes with the empty, object-dtype frame.
    recorded_eb = pd.to_numeric(results_df['EB'], errors='coerce').to_numpy(dtype=float)
    eb_matches = np.isclose(recorded_eb, EB, rtol=1e-9, atol=0.0)

    matches = (
        (results_df['Data File'] == data_file)
        & (results_df['Model'] == model)
        & eb_matches
        & (results_df['CTX'] == CTX)
        & (results_df['PDT'] == PDT)
    )
    return bool(matches.any())


In [24]:

# Datasets to run, as (data_file, mcp) pairs. run_experiments skips every
# error bound that is not greater than 10 ** -mcp for that dataset.
# Commented-out entries are currently disabled.
data_with_precision = [
   #("london_smart_meters_with_missing", 3),
    ("smart", 7),
    # ("sceaux", 7),
    # ("LOOP_SEATTLE", 6),
    # ("oikolab_weather", 7),
    # ("elecdemand", 7),
    # ("traffic_hourly", 4),
    # ("elf", 4),
    # ("subseasonal_precip", 6)
]



# Model identifiers forwarded to perform_compression_tests
models = [1,2,3,4]
# Create DP values from -1 to -6, i.e. error bounds 1e-1 down to 1e-6
DP_values = list(range(-1, -7, -1))
EB_values = [float(10**dp) for dp in DP_values]
CTX_values = [256]
PDT_values = [1]
# create_new_file=True overwrites all_model_results.csv, so no earlier runs are skipped
run_experiments(data_with_precision, models, EB_values, CTX_values, PDT_values, test_len=None, results_file_path="all_model_results.csv",create_new_file=True)

Testing smart with Model: 0, EB: 0.1, CTX: 256, PDT: 1


compress processing:   0%|          | 0/25663 [00:00<?, ?step/s]

KeyboardInterrupt: 