# SSA dataset

In [1]:
import pandas as pd
import os
from typing import Final
from collections.abc import Callable
from pathlib import Path
import matplotlib.pyplot as plt
from config import data_raw_folder, data_processed_folder
from timeeval import Datasets

In [2]:
plt.rcParams["figure.figsize"] = (20, 10)

In [3]:
dataset_collection_name = "SSA"
source_folder = Path(data_raw_folder) / "SSA"
target_folder = data_processed_folder

print(f"Looking for source datasets in {Path(source_folder).absolute()} and\nsaving processed datasets in {Path(target_folder).absolute()}")

Looking for source datasets in /home/projects/akita/data/benchmark-data/data-raw/SSA and
saving processed datasets in /home/projects/akita/data/benchmark-data/data-processed


file handling and transformations

In [4]:
def list_files() -> list[str]:
    return [f.name for f in (source_folder / "data").iterdir() if f.is_file()]

def get_source_file(name: str, label: bool = False) -> Path:
    if label:
        parts = name.split("_")
        prefix = parts[0]
        postfix = parts[2]
        return source_folder / "label" / f"{prefix}_truth_{postfix}.csv"
    else:
        return source_folder / "data" / name

def process(file: str, target: Path, is_train: bool = False) -> int:
    df_data = pd.read_csv(get_source_file(file), names=["value"])
    df_label = pd.read_csv(get_source_file(file, label=True), names=["is_anomaly"])
    df = pd.merge(df_data, df_label, left_index=True, right_index=True)
    df.index.name = "timestamp"
    df.to_csv(target, index=True)
    return len(df)

In [8]:
# shared by all datasets
dataset_type = "real"
train_is_normal = False
train_type = "unsupervised"
input_type = "univariate"
datetime_index = False

# create target directory
dataset_subfolder = Path(input_type) / dataset_collection_name
target_subfolder = target_folder / dataset_subfolder
try:
    os.makedirs(target_subfolder)
    print(f"Created directories {target_subfolder}")
except FileExistsError:
    print(f"Directories {target_subfolder} already exist")
    pass

dm = Datasets(target_folder)

Directories /home/projects/akita/data/benchmark-data/data-processed/univariate/SSA already exist


In [11]:
for f in list_files():
    parts = f.split("_")
    dataset_name = f"{parts[0]}-{parts[2]}"
    filename = f"{dataset_name}.test.csv"
    path = dataset_subfolder / filename
    target_filepath = target_subfolder / filename

    dataset_length = process(f, target_filepath)

    # save metadata
    dm.add_dataset((dataset_collection_name, dataset_name),
        train_path = None,
        test_path = path,
        dataset_type = dataset_type,
        datetime_index = datetime_index,
        split_at = None,
        train_type = train_type,
        train_is_normal = train_is_normal,
        input_type = input_type,
        dataset_length = dataset_length
    )
    print(f"Processed source dataset {f} -> {target_filepath}")

# save metadata of benchmark
dm.save()

Processed source dataset stb_data_15 -> /home/projects/akita/data/benchmark-data/data-processed/univariate/SSA/stb-15.test.csv
Processed source dataset stb_data_9 -> /home/projects/akita/data/benchmark-data/data-processed/univariate/SSA/stb-9.test.csv
Processed source dataset stb_data_32 -> /home/projects/akita/data/benchmark-data/data-processed/univariate/SSA/stb-32.test.csv
Processed source dataset stb_data_10 -> /home/projects/akita/data/benchmark-data/data-processed/univariate/SSA/stb-10.test.csv
Processed source dataset stb_data_5 -> /home/projects/akita/data/benchmark-data/data-processed/univariate/SSA/stb-5.test.csv
Processed source dataset stb_data_20 -> /home/projects/akita/data/benchmark-data/data-processed/univariate/SSA/stb-20.test.csv
Processed source dataset stb_data_19 -> /home/projects/akita/data/benchmark-data/data-processed/univariate/SSA/stb-19.test.csv
Processed source dataset stb_data_3 -> /home/projects/akita/data/benchmark-data/data-processed/univariate/SSA/stb-3

In [12]:
dm.refresh()
dm.df().loc[(slice(dataset_collection_name,dataset_collection_name), slice(None))]

Unnamed: 0_level_0,Unnamed: 1_level_0,train_path,test_path,dataset_type,datetime_index,split_at,train_type,train_is_normal,input_type,length
collection_name,dataset_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
SSA,stb-10,,univariate/SSA/stb-10.test.csv,real,False,,unsupervised,False,univariate,29518
SSA,stb-11,,univariate/SSA/stb-11.test.csv,real,False,,unsupervised,False,univariate,29517
SSA,stb-12,,univariate/SSA/stb-12.test.csv,real,False,,unsupervised,False,univariate,29517
SSA,stb-13,,univariate/SSA/stb-13.test.csv,real,False,,unsupervised,False,univariate,29517
SSA,stb-14,,univariate/SSA/stb-14.test.csv,real,False,,unsupervised,False,univariate,29518
SSA,stb-15,,univariate/SSA/stb-15.test.csv,real,False,,unsupervised,False,univariate,30493
SSA,stb-17,,univariate/SSA/stb-17.test.csv,real,False,,unsupervised,False,univariate,25296
SSA,stb-18,,univariate/SSA/stb-18.test.csv,real,False,,unsupervised,False,univariate,25283
SSA,stb-19,,univariate/SSA/stb-19.test.csv,real,False,,unsupervised,False,univariate,30493
SSA,stb-2,,univariate/SSA/stb-2.test.csv,real,False,,unsupervised,False,univariate,29553


## Experimentation

In [None]:
files = list_files()
f = files[0]
print("data", get_source_file(f))
print("label", get_source_file(f, label=True))
df_data = pd.read_csv(get_source_file(f), names=["value"])
df_label = pd.read_csv(get_source_file(f, label=True), names=["is_anomaly"])
df = pd.merge(df_data, df_label, left_index=True, right_index=True)
df.index.name = "timestamp"
df