# Bad performing algorithms

Analysis of badly performing algorithms when run with their default parameters on the benchmark datasets. Quality/performance is evaluated using the AUC_ROC scores.

In [None]:
# Automatically reload packages:
%load_ext autoreload
%autoreload 2

In [None]:
# imports
import warnings
import pandas as pd
import numpy as np
import scipy as sp
import plotly.offline as py
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.express as px
from plotly.subplots import make_subplots
from pathlib import Path
from timeeval import Datasets

## Configuration

In [None]:
# constants and configuration
# Both paths are relative, so they resolve against the notebook's working directory.
data_path = Path("../data") / "test-cases"
result_path = Path("../results") / "2021-08-22_default-params-merged"

# load results
# result_path = result_path / "results"
print(f"Reading results from {result_path.resolve()}")

# Result table read from results.csv; the helper functions in this notebook
# filter it by the "algorithm", "collection", "dataset" and "status" columns.
df = pd.read_csv(result_path / "results.csv")
# Part of the dataset identifier before the first "." (plot_scores builds
# identifiers such as "<name>.unsupervised" from this name again).
df["dataset_name"] = df["dataset"].str.split(".").str[0]

def load_scores_df(algorithm_name, dataset_id, repetition=1):
    """Read the anomaly scores of one successful run into a DataFrame.

    ``dataset_id`` is indexed as ``(collection, dataset)``. The hyper parameter
    ID of the run is looked up in the global result table ``df``; the lookup
    uses ``Series.item()`` and therefore raises ``ValueError`` unless exactly
    one row with status ``"Status.OK"`` matches.

    The file is read without a header row from
    ``result_path/<algorithm>/<params_id>/<collection>/<dataset>/<repetition>/anomaly_scores.ts``.
    """
    collection = dataset_id[0]
    dataset = dataset_id[1]
    is_successful_run = (
        (df["algorithm"] == algorithm_name)
        & (df["collection"] == collection)
        & (df["dataset"] == dataset)
        & (df["status"] == "Status.OK")
    )
    params_id = df.loc[is_successful_run, "hyper_params_id"].item()
    run_dir = result_path.joinpath(algorithm_name, params_id, collection, dataset, str(repetition))
    return pd.read_csv(run_dir / "anomaly_scores.ts", header=None)

# load dataset metadata
dmgr = Datasets(data_path)

def plot_scores(algorithm_name, dataset_name):
    """Plot a GutenTAG dataset together with the anomaly scores of algorithms.

    ``algorithm_name`` is either a single algorithm name or a collection of
    names. The upper subplot shows the time series (all channels for a
    multivariate dataset), the lower subplot shows the ``is_anomaly`` label and
    one score trace per algorithm, named ``<algorithm>=<ROC_AUC>``.

    Algorithms without a single successful ("Status.OK") result row or without
    a readable score file trigger a warning and are left out of the plot.
    """
    # accept a single name as well as a collection of names
    algorithms = [algorithm_name] if isinstance(algorithm_name, str) else algorithm_name

    # time series and labels are always taken from the unsupervised variant
    df_dataset = dmgr.get_dataset_df(("GutenTAG", f"{dataset_name}.unsupervised"))

    # the dimensionality decides how many channels are drawn
    dimensionality = df.loc[df["dataset_name"] == dataset_name, "dataset_input_dimensionality"]
    is_multivariate = dimensionality.unique().item().lower() == "multivariate"

    auroc = {}
    df_scores = pd.DataFrame(index=df_dataset.index)
    failed = []
    for name in algorithms:
        # metric of the successful run; .item() raises ValueError unless exactly one row matches
        try:
            auroc[name] = df.loc[
                (df["algorithm"] == name) & (df["dataset_name"] == dataset_name) & (df["status"] == "Status.OK"),
                "ROC_AUC",
            ].item()
        except ValueError as e:
            warnings.warn(f"No ROC_AUC score found! Probably {name} was not executed on {dataset_name}: {repr(e)}")
            auroc[name] = -1
            failed.append(name)
            continue

        # the scores live under the dataset variant matching the algorithm's training type
        training_type = df.loc[df["algorithm"] == name, "algo_training_type"].values[0].lower().replace("_", "-")
        scores_id = ("GutenTAG", f"{dataset_name}.{training_type}")
        try:
            df_scores[name] = load_scores_df(name, scores_id).iloc[:, 0]
        except (ValueError, FileNotFoundError) as e:
            warnings.warn(f"No scores found! Probably {name} was not executed on {dataset_name}: {repr(e)}")
            df_scores[name] = np.nan
            failed.append(name)
    algorithms = [a for a in algorithms if a not in failed]

    # upper subplot: the time series itself
    if is_multivariate:
        channels = [(i, f"channel-{i}") for i in range(1, df_dataset.shape[1] - 1)]
    else:
        channels = [(1, "timeseries")]
    fig = make_subplots(2, 1)
    for column, trace_name in channels:
        fig.add_trace(go.Scatter(x=df_dataset.index, y=df_dataset.iloc[:, column], name=trace_name), 1, 1)

    # lower subplot: ground truth label and the scores of every remaining algorithm
    fig.add_trace(go.Scatter(x=df_dataset.index, y=df_dataset["is_anomaly"], name="label"), 2, 1)
    for name in algorithms:
        fig.add_trace(go.Scatter(x=df_scores.index, y=df_scores[name], name=f"{name}={auroc[name]:.4f}"), 2, 1)

    fig.update_xaxes(matches="x")
    fig.update_layout(title=f"Results of {','.join(algorithms)} on {dataset_name}", height=400)
    return py.iplot(fig)

def _anomaly_regions(labels):
    """Return ``(start, end)`` index pairs for every run of 1s in ``labels``.

    ``start`` is the index of the first anomalous point and ``end`` the index
    of the first normal point after the run. The series is treated as if it
    were preceded by a 0, so a run at the very beginning is detected; a run
    that lasts until the end of the series is closed at the last index.
    """
    if labels.empty:
        return []
    values = labels.astype(float)
    changes = values.diff()
    # diff() leaves NaN at the first position; use the label itself so that a
    # series starting inside an anomaly produces a proper start marker
    changes.iloc[0] = values.iloc[0]
    starts = list(changes[changes == 1].index)
    ends = list(changes[changes == -1].index)
    if len(ends) < len(starts):
        # the last anomaly never returns to 0
        ends.append(labels.index[-1])
    return list(zip(starts, ends))


def plot_datasets(datasets, max_channels=20):
    """Plot GutenTAG datasets below each other and shade their anomalies.

    ``datasets`` is a single dataset name or a collection of names; the
    ``unsupervised`` variant of each dataset is loaded. At most
    ``max_channels`` channels are drawn per dataset. Each trace name contains
    the median ROC_AUC over all result rows of that dataset (NaN if there are
    none). Datasets that cannot be loaded trigger a warning and leave their
    subplot empty.
    """
    if isinstance(datasets, str):
        datasets = [datasets]
    n_datasets = len(datasets)

    # Create plot
    fig = make_subplots(n_datasets, 1)
    for row, name in enumerate(datasets, start=1):
        # load dataset details
        try:
            df_dataset = dmgr.get_dataset_df(("GutenTAG", f"{name}.unsupervised"))
        except Exception as e:
            warnings.warn(f"Could not load dataset {name}, because {repr(e)}")
            continue

        # median() returns NaN instead of raising when there are no results,
        # so no exception handling is needed here
        auroc = df.loc[df["dataset_name"] == name, "ROC_AUC"].median()

        # column 0 and the last column are skipped; the columns in between are the channels
        for j in range(1, min(df_dataset.shape[1] - 1, max_channels + 1)):
            fig.add_trace(go.Scatter(
                x=df_dataset.index,
                y=df_dataset.iloc[:, j],
                name=f"{name} channel {j} ({auroc:.4f})",
            ), row, 1)

        # mark anomaly regions
        first_index = df_dataset.index[0] if len(df_dataset.index) > 0 else 0
        for start, end in _anomaly_regions(df_dataset["is_anomaly"]):
            # begin the shading one step before the first anomalous point,
            # but never before the start of the series
            fig.add_vrect(x0=max(start - 1, first_index), x1=end,
                          exclude_empty_subplots=True,
                          line_width=0,
                          fillcolor="red",
                          opacity=0.3,
                          annotation_text="anomaly",
                          annotation_position="top left",
                          row=row,
                          col=1)

#     fig.update_xaxes(matches="x")
    fig.update_layout(
        title=f"Datasets and ground truth of {','.join(datasets)} datasets",
        height=200*n_datasets if n_datasets > 1 else 400
    )
    return py.iplot(fig)

In [None]:
def select(algo, dataset, column):
    """Look up the successful ("Status.OK") result rows of ``algo`` on ``dataset``.

    Returns a tuple ``(values, record)``: ``record`` holds the matching rows of
    the global result table ``df`` and ``values`` is ``record[column]``
    (``column`` may be a single column name or a list of names).
    """
    is_match = (
        (df["algorithm"] == algo)
        & (df["dataset_name"] == dataset)
        & (df["status"] == "Status.OK")
    )
    record = df[is_match]
    values = record[column]
    return values, record

## Overview over bad performing algorithms

#### Overall algorithm performance based on ROC_AUC

In [None]:
aggregations = ["min", "mean", "median", "max"]
df_overall_scores = df.pivot_table(index="algorithm", values="ROC_AUC", aggfunc=aggregations)
df_overall_scores.columns = aggregations
df_overall_scores = df_overall_scores.sort_values(by="median", ascending=False)

df_overall_scores.tail(10)

#### Algorithms that are bad on any dataset

In [None]:
threshold = 0.9
df_tmp = df_overall_scores[df_overall_scores["max"] < threshold]
df_tmp.sort_values(by="max", ascending=True)

#### Algorithms that are bad on a simple point anomaly dataset "sinus-type-extremum"

In [None]:
threshold = 0.6
dataset = "sinus-type-extremum"
df_tmp = df[(df["dataset_name"] == dataset) & (df["ROC_AUC"] < threshold)][["algorithm", "ROC_AUC"]]
df_tmp.sort_values(by="ROC_AUC")

In [None]:
plot_scores(df_tmp["algorithm"].values, dataset)

#### Algorithms that are bad on a simple subsequence anomaly dataset "sinus-type-platform"

In [None]:
threshold = 0.6
dataset = "sinus-type-platform"
df_tmp = df[(df["dataset_name"] == dataset) & (df["ROC_AUC"] < threshold)][["algorithm", "ROC_AUC"]]
df_tmp.sort_values(by="ROC_AUC")

In [None]:
plot_scores(df_tmp["algorithm"].values, dataset)

## Detailed inspection of bad performing algorithms

List of algorithms to inspect (based on above criteria):

- [KMeans](#KMeans)
- [AutoEncoder](#(Denoising-)AutoEncoder)
- [**Bagel**](#Bagel)
- [**DBStream**](#DBStream)
- [DenoisingAutoEncoder](#(Denoising-)AutoEncoder)
- [DSPOT](#DSPOT)
- [FFT](#FFT)
- [**HOT SAX**](#HOT-SAX)
- [Isolation Forest - Local Outier Factor](#Isolation-Forest---Local-Outier-Factor)
- [LOF](#LOF)
- [MedianMethod](#MedianMethod)
- [**MultiHMM**](#MultiHMM)
- [NormA](#NormA)
- [NumentaHTM](#NumentaHTM)
- [PCC](#PCC)
- [PCI](#PCI)
- [PST](#PST)
- [Robust PCA](#Robust-PCA)
- [SR-CNN](#SR-CNN)
- [SSA](#SSA)
- [Subsequence LOF](#Subsequence-LOF)
- [**TARZAN**](#TARZAN)
- [**TripleES**](#TripleES)
- [TSBitmap](#TSBitmap)
- [XGBoost Regressor](#XGBoost-Regressor)

### COF, LOF, CBLOF

In [None]:
algo = "CBLOF"
df_overall_scores.loc[algo]

In [None]:
df_tmp = df.pivot_table(index="algorithm", values="RANGE_PR_AUC", aggfunc=aggregations)
df_tmp.columns = aggregations
df_tmp.loc[algo]

In [None]:
plot_scores(algo, "sinus-type-extremum")

In [None]:
plot_scores(algo, "ecg-type-mean")

In [None]:
plot_scores(algo, "sinus-noise-01%")

In [None]:
plot_scores(algo, "rw-diff-count-6")

In [None]:
plot_scores([algo, "KMeans", "DWT-MLEAD"], "poly-combined-diff-2")

### HBOS

- same as for LOF

In [None]:
algo = "HBOS"
plot_scores(algo, "poly-combined-diff-2")

In [None]:
plot_scores(algo, "sinus-noise-10%")

In [None]:
select(algo, "sinus-noise-10%", ["ROC_AUC", "RANGE_PR_AUC"])[0]

In [None]:
plot_scores(algo, "sinus-diff-count-5")

In [None]:
select(algo, "sinus-diff-count-5", "RANGE_PR_AUC")[0]

In [None]:
select("Random", "sinus-diff-count-5", "RANGE_PR_AUC")[0]

### KMeans

- `window_size` should actually be `anomaly_window_size`, because if `window_size` is too small (for small period sizes), then the anomaly is not correctly detected

In [None]:
df[(df["algorithm"] == "KMeans") & (df["dataset_name"] == "sinus-type-platform")]

### DBStream

- sometimes the scores seem to be inverted, other times the scores are correct
- score inversion does not depend on the dataset, but can be seen within a single dataset containing multiple anomalies (e.g. `ecg-diff-count-4`)
- Scores are higher at the beginning and decrease over time. This is likely due to the streaming character. (non-issue)

In [None]:
plot_scores("DBStream", "sinus-type-mean")

In [None]:
plot_scores("DBStream", "ecg-diff-count-4")

In [None]:
df.loc[(df["algorithm"] == "DBStream") & (df["dataset_name"] == "ecg-diff-count-6"), "hyper_params"].item()

In [None]:
def get_path(algorithm_name, dataset_id, repetition=1):
    """Return the result directory of one run.

    ``dataset_id`` is indexed as ``(collection, dataset)``. The hyper parameter
    ID is looked up in the global result table ``df`` regardless of the run's
    status; ``Series.item()`` raises ``ValueError`` unless exactly one row
    matches.
    """
    collection = dataset_id[0]
    dataset = dataset_id[1]
    is_run = (
        (df["algorithm"] == algorithm_name)
        & (df["collection"] == collection)
        & (df["dataset"] == dataset)
    )
    params_id = df.loc[is_run, "hyper_params_id"].item()
    return result_path.joinpath(algorithm_name, params_id, collection, dataset, str(repetition))
# Plot the scores stored in "docker-algorithm-scores.csv" for DBStream on ecg-diff-count-6.
dd = pd.read_csv(get_path("DBStream", ("GutenTAG", "ecg-diff-count-6.unsupervised")) / "docker-algorithm-scores.csv", header=None)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=dd.index,
    y=dd.iloc[:, 0],
    name="scores",
))
fig.update_layout(
    title={"text":"DBStream original scores", "xanchor": "center", "x": 0.5},
    # the x-axis is the row index of the score file and the y-axis the score
    # itself; it is not an AUC_ROC axis
    xaxis_title="index",
    yaxis_title="score",
    legend_title="Algorithms"
)
py.iplot(fig)

### TARZAN

- Anomaly scores seem to be shifted far to the left.
- Additional noise confuses algorithm (almost all of our datasets have noise)

In [None]:
plot_scores("TARZAN", "sinus-type-mean")

In [None]:
plot_scores("TARZAN", "sinus-noise-00%")

In [None]:
plot_scores("TARZAN", "sinus-noise-10%")

In [None]:
res, rec = select("TARZAN", "sinus-noise-10%", "ROC_AUC")
rec

In [None]:
plot_scores("TARZAN", "sinus-diff-count-5")

In [None]:
plot_scores("TARZAN", "sinus-type-frequency")

### Bagel

- seems to work better on non-periodic datasets (poly, rw) than the sinus and ecg base oscillations
- maybe the parameters are bad or the training data is not sufficient, so that the algorithm cannot correctly learn the recurring patterns

In [None]:
df[(df["algorithm"] == "Bagel") & (df["dataset_name"] == "sinus-type-mean")]["hyper_params"].item()

In [None]:
plot_scores("Bagel", "sinus-type-mean")

In [None]:
plot_scores("Bagel", "rw-type-variance")

In [None]:
plot_scores("Bagel", "poly-type-variance")

### (Denoising-)AutoEncoder

- They are just very bad?!
- I guess, we could exclude them. They don't have a very good implementation and there is no real paper behind them!

In [None]:
plot_scores(["AutoEncoder", "DenoisingAutoEncoder"], "sinus-type-mean")

In [None]:
plot_scores(["AutoEncoder", "DenoisingAutoEncoder"], "poly-type-mean")

In [None]:
plot_scores(["AutoEncoder", "DenoisingAutoEncoder"], "ecg-type-platform")

### DSPOT

- has binary output for each point: anomaly or no anomaly
- the current metrics do not capture this correctly
- does DSPOT fit into our evaluation scheme?

In [None]:
print("hyper params:")
df.loc[(df["algorithm"] == "DSPOT") & (df["dataset_name"] == "sinus-type-mean"), "hyper_params"].item()

In [None]:
plot_scores("DSPOT", "sinus-type-mean")

In [None]:
plot_scores("DSPOT", "poly-type-mean")

### FFT

In [None]:
algo = "FFT"
dataset = "ecg-diff-count-5"
plot_scores(algo, dataset)

In [None]:
df[(df["algorithm"] == algo) & (df["dataset_name"] == dataset)]["hyper_params"].item()

In [None]:
# Print the execution log of the first repetition of the selected algorithm/dataset run.
param_id = df.loc[(df["algorithm"] == algo) & (df["dataset_name"] == dataset), "hyper_params_id"].item()
path = result_path.joinpath(algo, param_id, "GutenTAG", dataset + ".unsupervised", "1", "execution.log")
with path.open() as fh:
    print(fh.read())

### LOF

### MedianMethod

### MultiHMM

### NormA

- scores look very broken --> **INVESTIGATE**

In [None]:
algo = "NormA"
dataset = "sinus-diff-count-5"
print(f"Execution Status: {select(algo, dataset, 'status')[0].item()}")
print(f"Params: {select(algo, dataset, 'hyper_params')[0].item()}")
print(f"Time:  {select(algo, dataset, 'train_main_time')[0].item()}/{select(algo, dataset, 'execute_main_time')[0].item()} seconds")
plot_scores(algo, dataset)

In [None]:
plot_scores(algo, "ecg-type-mean")

### NumentaHTM

### PCC

### PCI

### PST

In [None]:
df_overall_scores.loc["PST"]

In [None]:
# Successful PST runs with ROC_AUC < 0.8 on datasets whose name does not start with "rw".
df_tmp = df[(df["algorithm"] == "PST") & (df["status"] == "Status.OK")][["dataset_name", "execute_main_time", "RANGE_PR_AUC", "ROC_AUC"]]
# build both masks from df_tmp so that their index matches the frame being
# filtered (a mask built from df has a different index and would have to be reindexed)
df_tmp = df_tmp[(df_tmp["ROC_AUC"] < 0.8) & (~df_tmp["dataset_name"].str.startswith("rw"))]
df_tmp.sort_values(by="ROC_AUC", ascending=False)

In [None]:
algo = "PST"
dataset = "sinus-diff-count-9"
print(f"Execution Status: {select(algo, dataset, 'status')[0].item()}")
print(f"Params: {select(algo, dataset, 'hyper_params')[0].item()}")
print(f"Time:  {select(algo, dataset, 'train_main_time')[0].item()}/{select(algo, dataset, 'execute_main_time')[0].item()} seconds")
plot_scores(algo, dataset)

In [None]:
plot_scores(algo, "ecg-type-amplitude")

In [None]:
plot_scores(algo, "sinus-type-extremum")

In [None]:
plot_scores(algo, "sinus-type-platform")

### Robust PCA

### SR-CNN

### SSA

- very sensitive to `window_size` parameter (should be set to `2x period`)

In [None]:
algo = "SSA"
dataset = "sinus-diff-count-5"
print(f"Execution Status: {select(algo, dataset, 'status')[0].item()}")
print(f"Params: {select(algo, dataset, 'hyper_params')[0].item()}")
print(f"Time:  {select(algo, dataset, 'train_main_time')[0].item()}/{select(algo, dataset, 'execute_main_time')[0].item()} seconds")
plot_scores(algo, dataset)

In [None]:
# the printed value is the hyper parameter set, so label it as such
print(f"Params: {select(algo, 'ecg-diff-count-5', 'hyper_params')[0].item()}")
plot_scores(algo, "ecg-diff-count-5")

In [None]:
# the printed value is the hyper parameter set, so label it as such
print(f"Params: {select(algo, 'poly-diff-count-5', 'hyper_params')[0].item()}")
plot_scores(algo, "poly-diff-count-5")

### Subsequence LOF

### TripleES

In [None]:
algo = "TripleES"
dataset = "sinus-diff-count-9"
print(f"Execution Status: {select(algo, dataset, 'status')[0].item()}")
print(f"Params: {select(algo, dataset, 'hyper_params')[0].item()}")
print(f"Time:  {select(algo, dataset, 'train_main_time')[0].item()}/{select(algo, dataset, 'execute_main_time')[0].item()} seconds")
plot_scores(algo, dataset)

In [None]:
print(select(algo, "sinus-type-mean", "hyper_params")[0].item())
plot_scores(algo, "sinus-type-mean")

### TSBitmap

In [None]:
algo = "TSBitmap"
dataset = "sinus-diff-count-9"
print(f"Execution Status: {select(algo, dataset, 'status')[0].item()}")
print(f"Params: {select(algo, dataset, 'hyper_params')[0].item()}")
print(f"Time:  {select(algo, dataset, 'train_main_time')[0].item()}/{select(algo, dataset, 'execute_main_time')[0].item()} seconds")
plot_scores(algo, dataset)

In [None]:
plot_scores(algo, "sinus-type-variance")

In [None]:
plot_scores(algo, "ecg-noise-01%")

### XGBoost Regressor

- learning rate was too small, so that algorithm could not really learn the training data

In [None]:
algo = "XGBoost Regressor"
dataset = "ecg-diff-count-5"
plot_scores(algo, dataset)

In [None]:
df[(df["algorithm"] == algo) & (df["dataset_name"] == dataset)]["hyper_params"].item()

### Fast-MCD

- regards every point as a (multidimensional in the case of multivariate data) single object and estimates the covariance matrix from a clean training dataset
- comparing just the different points does not work well
- we introduced a variant that works on univariate data and regards a subsequence as an object: `Subsequence Fast-MCD`

In [None]:
plot_scores("FastMCD", "sinus-type-mean")

In [None]:
plot_scores("FastMCD", "sinus-type-frequency")

In [None]:
plot_scores("FastMCD", "poly-type-mean")

In [None]:
df[(df["algorithm"] == "FastMCD")].iloc[0]["hyper_params"]

### HOT-SAX

- in the implementation of HOT-SAX (the code, where we call it from), we only mark the starting index instead of the whole window as anomalous
- runtime comparison might not be fair, because we let HOT-SAX search for all discords
  - actually, we know the number of anomalies in the dataset
  - just letting HOT-SAX search for a specific number of anomalies results in bad scores, because two discords could sit within the same anomaly window

In [None]:
algo = "HOT SAX"
dataset = "sinus-diff-count-5"
print(f"Execution Status: {select(algo, dataset, 'status')[0].item()}")
print(f"Params: {select(algo, dataset, 'hyper_params')[0].item()}")
print(f"Time:  {select(algo, dataset, 'train_main_time')[0].item()}/{select(algo, dataset, 'execute_main_time')[0].item()} seconds")
plot_scores(algo, dataset)

In [None]:
print(f"Status {select(algo, 'rw-type-extremum', 'status')[0].item()}")
plot_scores(algo, "rw-type-extremum")

### KNN

- same issues as all point-based methods

In [None]:
algo = "KNN"
dataset = "sinus-diff-count-5"
print(f"Execution Status: {select(algo, dataset, 'status')[0].item()}")
print(f"Params: {select(algo, dataset, 'hyper_params')[0].item()}")
print(f"Time:  {select(algo, dataset, 'train_main_time')[0].item()}/{select(algo, dataset, 'execute_main_time')[0].item()} seconds")
plot_scores(algo, dataset)

In [None]:
plot_scores(algo, "ecg-type-variance")

### OmniAnomaly

In [None]:
algo = "OmniAnomaly"
dataset = "sinus-type-platform"
print(f"Execution Status: {select(algo, dataset, 'status')[0].item()}")
print(f"Params: {select(algo, dataset, 'hyper_params')[0].item()}")
print(f"Time:  {select(algo, dataset, 'train_main_time')[0].item()}/{select(algo, dataset, 'execute_main_time')[0].item()} seconds")
plot_scores(algo, dataset)

In [None]:
plot_scores(algo, "sinus-type-mean")

In [None]:
plot_scores(algo, "sinus-type-mean")

In [None]:
plot_scores(algo, "sinus-type-variance")

### Median method

- is good for poly and rw based datasets
- has no way to capture a seasonal aspect and therefore fails for those datasets and anomalies that break the cycles
- smaller window sizes (smaller contexts)

In [None]:
algo = "MedianMethod"
dataset = "sinus-type-platform"
print(f"Execution Status: {select(algo, dataset, 'status')[0].item()}")
print(f"Params: {select(algo, dataset, 'hyper_params')[0].item()}")
print(f"Time:  {select(algo, dataset, 'train_main_time')[0].item()}/{select(algo, dataset, 'execute_main_time')[0].item()} seconds")
plot_scores(algo, dataset)

In [None]:
plot_scores(algo, "ecg-diff-count-5")

In [None]:
plot_scores(algo, "poly-diff-count-5")

In [None]:
df_tmp = df[(df["algorithm"] == algo) & (df["dataset_name"].str.startswith("poly"))][["dataset_name", "execute_main_time", "RANGE_PR_AUC", "ROC_AUC"]]
df_tmp.sort_values(by="ROC_AUC", ascending=False, inplace=True)
df_tmp

In [None]:
plot_scores([algo, "STOMP", "Subsequence LOF"], "poly-diff-count-5")

### S-H-ESD

- Annotates points
- Cannot deal with non-periodic data (it even assumes timestamps)
- Cannot deal with ECG data despite its periodicity
- Cannot deal with trends in the signal

In [None]:
algo = "S-H-ESD"
dataset = "sinus-diff-count-5"
print(f"Execution Status: {select(algo, dataset, 'status')[0].item()}")
print(f"Params: {select(algo, dataset, 'hyper_params')[0].item()}")
print(f"Time:  {select(algo, dataset, 'train_main_time')[0].item()}/{select(algo, dataset, 'execute_main_time')[0].item()} seconds")
plot_scores(algo, dataset)

In [None]:
plot_scores(algo, "sinus-type-mean")

In [None]:
plot_scores(algo, "sinus-position-middle")

In [None]:
plot_scores(algo, "ecg-same-count-1")

In [None]:
plot_scores(algo, "poly-type-extremum")