# Numenta Anomaly Benchmark

In [1]:
import pandas as pd
import json
import os
from typing import Final
from collections.abc import Callable
from datetime import datetime
from config import data_raw_folder, data_processed_folder
from timeeval import Datasets

In [2]:
from pathlib import Path

# Name of the collection and the folders the raw data is read from / the processed data is written to
dataset_collection_name = "NAB"
source_folder = os.path.join(data_raw_folder, "Community-NAB")
target_folder = data_processed_folder

# Show the resolved absolute locations so that a wrong working directory is noticed early
print(f"Looking for source datasets in {Path(source_folder).absolute()} and\nsaving processed datasets in {Path(target_folder).absolute()}")

Looking for source datasets in /home/projects/akita/data/benchmark-data/data-raw/Community-NAB and
saving processed datasets in /home/projects/akita/data/benchmark-data/data-processed


In [3]:
def calc_size(filename: str) -> int:
    """Count the data rows of a CSV file, i.e. every line after the header line.

    Args:
        filename: Path of the text/CSV file to inspect.

    Returns:
        The number of lines following the first line; 0 for a file that
        contains only a header or is completely empty.
    """
    with open(filename, 'r') as f:
        # Skip the header; the default value keeps an empty file from raising StopIteration.
        next(f, None)
        return sum(1 for _ in f)

def transform_and_label(source: str, target: str, anomaly_windows: list[list[str]]) -> None:
    """Read a source CSV, add a binary ``is_anomaly`` column, and write the result to ``target``.

    Args:
        source: Path of the CSV file to read; it must contain a ``timestamp`` column.
        target: Path the labelled CSV is written to (the index is not written).
        anomaly_windows: ``[start, end]`` pairs of timestamp strings. Every row whose
            timestamp lies inside a pair (both bounds inclusive) is labelled 1,
            all remaining rows are labelled 0.
    """
    df = pd.read_csv(source)
    # `infer_datetime_format` is deprecated since pandas 2.0 (a strict version of the
    # inference is the default now), so it is no longer passed.
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df["is_anomaly"] = 0

    for window_start, window_end in anomaly_windows:
        # pd.Timestamp accepts the window bounds with or without fractional seconds.
        in_window = df["timestamp"].between(pd.Timestamp(window_start), pd.Timestamp(window_end))
        # Label through .loc instead of writing into `.values`: that array is addressed by
        # position rather than by index label and is read-only under pandas Copy-on-Write.
        df.loc[in_window, "is_anomaly"] = 1

    df.to_csv(target, index=False)

In [5]:
# Metadata values that are identical for every dataset of this collection
input_type = "univariate"
datetime_index = True
train_type = "unsupervised"
train_is_normal = False

# Processed files are stored in <target_folder>/<input_type>/<collection name>
dataset_subfolder = os.path.join(input_type, dataset_collection_name)
target_subfolder = os.path.join(target_folder, dataset_subfolder)
try:
    os.makedirs(target_subfolder)
except FileExistsError:
    print(f"Directories {target_subfolder} already exist")
else:
    print(f"Created directories {target_subfolder}")

# Dataset metadata manager rooted at the target folder
dm = Datasets(target_folder)

# Anomaly windows, keyed by each dataset's path relative to the source `data` folder
windows_file = os.path.join(source_folder, "labels", "combined_windows.json")
with open(windows_file, 'r') as f:
    windows = json.load(f)

Directories /home/projects/akita/data/benchmark-data/data-processed/univariate/NAB already exist


In [6]:
# dataset transformation
# Each dataset's windows are a list of [start, end] timestamp-string pairs.
transform_file: Callable[[str, str, list[list[str]]], None] = transform_and_label

for dataset, anomaly_windows in windows.items():
    # `dataset` is the file's path relative to the source `data` folder
    source_file = os.path.join(source_folder, "data", dataset)
    dataset_type = "real" if dataset.startswith("real") else "synthetic"

    # get basename for target filename
    basename = os.path.splitext(os.path.basename(source_file))[0]
    filename = f"{basename}.test.csv"

    # save metadata
    # Use the basename directly: splitting the filename at the first "." would
    # truncate dataset names that contain a dot themselves.
    dataset_name = basename
    path = os.path.join(dataset_subfolder, filename)
    target_filepath = os.path.join(target_subfolder, filename)
    dataset_length = calc_size(source_file)
    dm.add_dataset((dataset_collection_name, dataset_name),
        train_path = None,
        test_path = path,
        dataset_type = dataset_type,
        datetime_index = datetime_index,
        split_at = None,
        train_type = train_type,
        train_is_normal = train_is_normal,
        input_type = input_type,
        dataset_length = dataset_length
    )
    # transform file
    transform_file(source_file, target_filepath, anomaly_windows)
    print(f"Processed source dataset {source_file} -> {target_filepath}")

# save metadata of benchmark
dm.save()

Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Community-NAB/data/artificialNoAnomaly/art_daily_no_noise.csv -> /home/projects/akita/data/benchmark-data/data-processed/univariate/NAB/art_daily_no_noise.test.csv
Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Community-NAB/data/artificialNoAnomaly/art_daily_perfect_square_wave.csv -> /home/projects/akita/data/benchmark-data/data-processed/univariate/NAB/art_daily_perfect_square_wave.test.csv
Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Community-NAB/data/artificialNoAnomaly/art_daily_small_noise.csv -> /home/projects/akita/data/benchmark-data/data-processed/univariate/NAB/art_daily_small_noise.test.csv
Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Community-NAB/data/artificialNoAnomaly/art_flatline.csv -> /home/projects/akita/data/benchmark-data/data-processed/univariate/NAB/art_flatline.test.csv
Processed source dataset /ho

In [7]:
# Reload the stored metadata and show only the rows of this collection
dm.refresh()
collection_rows = (slice(dataset_collection_name, dataset_collection_name), slice(None))
dm.df().loc[collection_rows]

Unnamed: 0_level_0,Unnamed: 1_level_0,train_path,test_path,dataset_type,datetime_index,split_at,train_type,train_is_normal,input_type,length
collection_name,dataset_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
NAB,TravelTime_387,,univariate/NAB/TravelTime_387.test.csv,real,True,,unsupervised,False,univariate,2500
NAB,TravelTime_451,,univariate/NAB/TravelTime_451.test.csv,real,True,,unsupervised,False,univariate,2162
NAB,Twitter_volume_AAPL,,univariate/NAB/Twitter_volume_AAPL.test.csv,real,True,,unsupervised,False,univariate,15902
NAB,Twitter_volume_AMZN,,univariate/NAB/Twitter_volume_AMZN.test.csv,real,True,,unsupervised,False,univariate,15831
NAB,Twitter_volume_CRM,,univariate/NAB/Twitter_volume_CRM.test.csv,real,True,,unsupervised,False,univariate,15902
NAB,Twitter_volume_CVS,,univariate/NAB/Twitter_volume_CVS.test.csv,real,True,,unsupervised,False,univariate,15853
NAB,Twitter_volume_FB,,univariate/NAB/Twitter_volume_FB.test.csv,real,True,,unsupervised,False,univariate,15833
NAB,Twitter_volume_GOOG,,univariate/NAB/Twitter_volume_GOOG.test.csv,real,True,,unsupervised,False,univariate,15842
NAB,Twitter_volume_IBM,,univariate/NAB/Twitter_volume_IBM.test.csv,real,True,,unsupervised,False,univariate,15893
NAB,Twitter_volume_KO,,univariate/NAB/Twitter_volume_KO.test.csv,real,True,,unsupervised,False,univariate,15851


## Experimentation

In [None]:
# Label a single dataset in memory and show the rows that fall into its anomaly windows
dataset = "realAdExchange/exchange-4_cpc_results.csv"
source_file = os.path.join(source_folder, "data", dataset)
df = pd.read_csv(source_file)
# `infer_datetime_format` is deprecated since pandas 2.0, so it is no longer passed.
df["timestamp"] = pd.to_datetime(df["timestamp"])
df["is_anomaly"] = 0

for t1, t2 in windows[dataset]:
    # pd.Timestamp accepts the window bounds with or without fractional seconds
    t1 = pd.Timestamp(t1)
    t2 = pd.Timestamp(t2)
    # Label through .loc (both bounds inclusive) instead of writing into `.values`:
    # that array is addressed by position and is read-only under pandas Copy-on-Write.
    df.loc[df["timestamp"].between(t1, t2), "is_anomaly"] = 1

df[df["is_anomaly"] == 1]

In [None]:
# Load the labels from the `labels` folder below `source_folder` instead of a path
# relative to the current working directory.
labels_file = os.path.join(source_folder, "labels", "combined_labels.json")
with open(labels_file, 'r') as f:
    labels = json.load(f)
labels

In [None]:
# Load the anomaly windows from the `labels` folder below `source_folder` instead of a
# path relative to the current working directory.
windows_file = os.path.join(source_folder, "labels", "combined_windows.json")
with open(windows_file, 'r') as f:
    windows = json.load(f)
windows

In [None]:
def to_datetime(str):
    """Parse a ``YYYY-MM-DD HH:MM:SS`` timestamp string, with or without fractional seconds.

    The format without a fraction is tried first; if it does not match, the
    string is parsed with a trailing ``.%f`` part. A string matching neither
    format raises ``ValueError``.
    """
    seconds_format = "%Y-%m-%d %H:%M:%S"
    try:
        parsed = datetime.strptime(str, seconds_format)
    except ValueError:
        parsed = datetime.strptime(str, seconds_format + ".%f")
    return parsed

In [None]:
# Check whether each labelled anomaly lies exactly in the middle of its anomaly window.
# Labels and windows of a dataset are paired by position; mismatches are printed.
matches = 0
for dataset in windows:
    for anomaly, anomaly_window in zip(labels[dataset], windows[dataset]):
        maybe_middle = to_datetime(anomaly)
        start, end = map(to_datetime, anomaly_window)
        diff1 = maybe_middle - start
        diff2 = end - maybe_middle
        if diff1 != diff2:
            print(dataset)
            print(f"{start} - ({diff1})- {anomaly} -({diff2})- {end}")
            continue
        matches += 1

# Compare against the total number of labels over all datasets
total_labels = sum(len(dataset_labels) for dataset_labels in labels.values())
print(f"matches: {matches}/{total_labels}")