# TSB-UAD: An End-to-End Benchmark Suite for Univariate Time-Series Anomaly Detection

- Source and description: https://github.com/TheDatumOrg/TSB-UAD
- Paper: https://dl.acm.org/doi/pdf/10.14778/3529337.3529354

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from config import data_raw_folder, data_processed_folder
from timeeval import DatasetManager, Datasets
from timeeval.datasets import DatasetAnalyzer, DatasetRecord

In [None]:
# Set the default figure size to (20, 10) for every plot created in this notebook.
plt.rcParams["figure.figsize"] = (20, 10)

In [None]:
def find_datasets(folder):
    """Return the sorted list of regular ``*.out`` files directly inside *folder*.

    *folder* may be a ``Path`` or anything ``Path()`` accepts (e.g. a string).
    Sub-directories are not searched, and directory entries that merely match
    the ``*.out`` pattern are excluded.
    """
    root = folder if isinstance(folder, Path) else Path(folder)
    return sorted(entry for entry in root.glob("*.out") if entry.is_file())

In [None]:
# Collection name recorded for every dataset, plus the raw-input and
# processed-output locations taken from the local `config` module.
dataset_collection_name = "TSB-UAD"
source_folder = Path(data_raw_folder, "TSB-UAD")
target_folder = Path(data_processed_folder)

# Both folders are already Path objects, so they can be resolved directly.
print(
    f"Looking for source datasets in {source_folder.absolute()} and\n"
    f"saving processed datasets in {target_folder.absolute()}"
)

In [None]:
# shared by all datasets
# These module-level values are read as globals by process_dataset() and
# copied into every DatasetRecord it registers.
# NOTE(review): "synthetic" is also applied to the "artificial" sub-collection.
dataset_type = "synthetic"
input_type = "univariate"  # also used as the first component of the target sub-folder path
datetime_index = False
split_at = None
train_is_normal = False
train_type = "unsupervised"

# Dataset manager operating on the processed-data folder.
# NOTE(review): create_if_missing=False presumably requires an existing
# dataset index in target_folder - confirm against the TimeEval docs.
dm = DatasetManager(target_folder, create_if_missing=False)

In [None]:
def process_dataset(dm: DatasetManager, idx: int, f: Path, name_prefix: str = "") -> None:
    """Convert one raw ``.out`` file into a test CSV, analyze it, and register it in *dm*.

    The function relies on notebook globals that must be set before it is
    called: ``dataset_subfolder`` and ``target_subfolder`` (where the files go),
    ``dataset_collection_name`` and the shared metadata values
    (``dataset_type``, ``input_type``, ``datetime_index``, ``split_at``,
    ``train_type``, ``train_is_normal``).

    Parameters:
        dm: dataset manager that receives the new ``DatasetRecord``.
        idx: running number of the dataset; only used in progress messages.
        f: path of the raw source file (header-less CSV with two columns).
        name_prefix: prepended to the file stem, separated by ``-``.

    NOTE(review): with the default empty ``name_prefix`` the dataset name
    starts with a ``-`` (``f"{name_prefix}-{f.stem}"``) - confirm that this is
    intended before changing it, because it determines the file names on disk.
    """
    print(f"> Processing source dataset {idx}")
    dataset_name = f"{name_prefix}-{f.stem}"
    csv_name = f"{dataset_name}.test.csv"
    # Path stored in the dataset record (relative to the processed-data folder).
    relative_csv_path = dataset_subfolder / csv_name
    # Paths actually written to on disk.
    csv_path = target_subfolder / csv_name
    meta_path = csv_path.parent / f"{dataset_name}.{Datasets.METADATA_FILENAME_SUFFIX}"

    def run_analysis(frame):
        """Analyze *frame*, persist the metadata JSON (overwriting), and return the metadata."""
        analyzer = DatasetAnalyzer(
            (dataset_collection_name, dataset_name),
            is_train=False,
            df=frame,
            ignore_stationarity=True,
        )
        analyzer.save_to_json(meta_path, overwrite=True)
        metadata = analyzer.metadata
        print(f"  analyzed test dataset {idx}")
        return metadata

    # Prepare datasets: (re-)write the CSV unless both CSV and metadata are present.
    frame = None
    if csv_path.exists() and meta_path.exists():
        print(f"  skipped writing dataset {idx} to disk, because it already exists.")
    else:
        frame = pd.read_csv(f, header=None)
        frame.columns = ["value", "is_anomaly"]
        # The positional row index doubles as the timestamp column.
        frame.insert(0, "timestamp", frame.index)
        frame.to_csv(csv_path, index=False)
        print(f"  written dataset {idx}")

    # Prepare metadata: reuse a stored analysis when it can be loaded.
    if not meta_path.exists():
        meta = run_analysis(frame)
    else:
        try:
            meta = DatasetAnalyzer.load_from_json(meta_path, train=False)
            print(f"  skipped analyzing dataset {idx}, because metadata already exists.")
        except ValueError:
            # The metadata file exists but could not be loaded for the test
            # split; fall back to a fresh analysis of the processed CSV.
            if frame is None:
                frame = pd.read_csv(csv_path)
            meta = run_analysis(frame)

    record = DatasetRecord(
        collection_name=dataset_collection_name,
        dataset_name=dataset_name,
        train_path=None,
        test_path=relative_csv_path,
        dataset_type=dataset_type,
        datetime_index=datetime_index,
        split_at=split_at,
        train_type=train_type,
        train_is_normal=train_is_normal,
        input_type=input_type,
        length=meta.length,
        dimensions=meta.dimensions,
        contamination=meta.contamination,
        num_anomalies=meta.num_anomalies,
        min_anomaly_length=meta.anomaly_length.min,
        median_anomaly_length=meta.anomaly_length.median,
        max_anomaly_length=meta.anomaly_length.max,
        mean=meta.mean,
        stddev=meta.stddev,
        trend=meta.trend,
        stationarity=meta.get_stationarity_name(),
        period_size=np.nan,
    )
    dm.add_dataset(record)
    print(f"... processed source dataset {idx}: {name_prefix}-{f.name} -> {csv_path}")

subcollection = "artificial"

# Progress banner for this sub-collection.
print(
    "#############",
    f"# Processing sub-collection {subcollection}",
    "#############",
    sep="\n",
)

# create target directory
# (dataset_subfolder and target_subfolder are read as globals by process_dataset)
dataset_subfolder = Path(input_type, f"{dataset_collection_name}-{subcollection}")
target_subfolder = target_folder / dataset_subfolder
target_subfolder.mkdir(parents=True, exist_ok=True)
print(f"Created directories {target_subfolder}")

folder = source_folder / subcollection

# The raw files of this sub-collection lie directly in `folder`; they are
# processed without a name prefix.
artificial_files = find_datasets(folder)
for i, file in enumerate(artificial_files):
    process_dataset(dm, i, file)
# Leave the running counter at the number of processed datasets, because the
# next sub-collection continues counting from `i`.
i = len(artificial_files)
dm.save()


subcollection = "synthetic"

print("#############")
print(f"# Processing sub-collection {subcollection}")
print("#############")

# create target directory
# (dataset_subfolder and target_subfolder are read as globals by process_dataset)
dataset_subfolder = Path(input_type) / f"{dataset_collection_name}-{subcollection}"
target_subfolder = target_folder / dataset_subfolder
target_subfolder.mkdir(parents=True, exist_ok=True)
print(f"Created directories {target_subfolder}")

folder = source_folder / subcollection

# The raw files of this sub-collection are grouped in sub-directories; the
# directory name becomes the dataset name prefix. `i` continues the running
# counter left behind by the previous sub-collection.
# iterdir() yields entries in arbitrary order, so sort them to get the same
# processing order (and indices) on every run, like find_datasets() does for files.
for subfolder in sorted(folder.iterdir()):
    if subfolder.is_dir():
        for file in find_datasets(subfolder):
            # Use the full directory name: `.stem` would cut everything after
            # the last dot (e.g. "..._0.05" -> "..._0"), so different folders
            # could collapse to the same prefix and later ones would be
            # skipped as "already exists".
            process_dataset(dm, i, file, name_prefix=subfolder.name)
            i += 1
dm.save()

In [None]:
# Refresh the dataset manager, then display only the entries of this collection.
# NOTE(review): the tuple-of-slices selection assumes the index returned by
# dm.df() has two levels with the collection name as the first - confirm.
dm.refresh()
dm.df().loc[(slice(dataset_collection_name,dataset_collection_name), slice(None))]

## Exploration

In [None]:
# List the file names of all raw "artificial" datasets.
# The last expression is left bare so that the notebook displays it.
folder = source_folder / "artificial"
datasets = find_datasets(folder)
[d.name for d in datasets]

In [None]:
# Load one raw dataset the same way process_dataset() does: header-less CSV
# with the columns "value" and "is_anomaly", plus the row index as "timestamp".
# NOTE(review): the hard-coded index 10 raises IndexError when fewer than 11
# datasets were found.
f = datasets[10]
df = pd.read_csv(f, header=None)
df.columns = ["value", "is_anomaly"]
df.insert(0, "timestamp", df.index)
df

In [None]:
# Plot every column of the loaded dataset in its own subplot.
df.plot(subplots=True)
# Optional zoom around a single anomaly; `anomaly` is not defined anywhere in
# this notebook, so it must be set to a (start, end) pair before enabling this.
# plt.xlim(anomaly[0]-1500, anomaly[1]+1500)
plt.show()