# TimeEval parameter optimization result analysis

In [1]:
# Automatically reload packages:
%load_ext autoreload
%autoreload 2

In [2]:
# imports
import json
import warnings
import pandas as pd
import numpy as np
import scipy as sp
import plotly.offline as py
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.express as px
from plotly.subplots import make_subplots
from pathlib import Path
from timeeval import Datasets

## Configuration

Define data and results folder:

In [3]:
# constants and configuration
data_path = Path("../../data") / "test-cases"
result_root_path = Path("../timeeval_experiments/results")
experiment_result_folder = "2021-10-06_optim-part1"

# build paths
result_paths = [d for d in result_root_path.iterdir() if d.is_dir()]
print("Available result directories:")
display(result_paths)

result_path = result_root_path / experiment_result_folder
print("\nSelecting:")
print(f"Data path: {data_path.resolve()}")
print(f"Result path: {result_path.resolve()}")

Available result directories:


[PosixPath('../timeeval_experiments/results/2021-10-06_optim-part1'),
 PosixPath('../timeeval_experiments/results/2021-09-30-torsk'),
 PosixPath('../timeeval_experiments/results/2021-09-27_shared-optim'),
 PosixPath('../timeeval_experiments/results/2021_10_07_13_15_14'),
 PosixPath('../timeeval_experiments/results/2021_10_07_12_02_23'),
 PosixPath('../timeeval_experiments/results/2021-10-04_shared-optim2')]


Selecting:
Data path: /home/sebastian/Documents/Projects/akita/data/test-cases
Result path: /home/sebastian/Documents/Projects/akita/timeeval/timeeval_experiments/results/2021-10-06_optim-part1


Load results and dataset metadata:

In [4]:
# load results
print(f"Reading results from {result_path.resolve()}")
df = pd.read_csv(result_path / "results.csv")

# add dataset_name column
df["dataset_name"] = df["dataset"].str.split(".").str[0]

# load dataset metadata
dmgr = Datasets(data_path)

Reading results from /home/sebastian/Documents/Projects/akita/timeeval/timeeval_experiments/results/2021-10-06_optim-part1


Extract target optimized parameter names that were iterated in this run (per algorithm):

In [5]:
algo_param_mapping = {}
algorithms = df["algorithm"].unique()
param_ignore_list = ["max_anomaly_window_size", "window_size"]

for algo in algorithms:
    param_sets = df.loc[df["algorithm"] == algo, "hyper_params"].unique()
    param_sets = [json.loads(ps) for ps in param_sets]
    param_names = np.unique([name for ps in param_sets for name in ps if name not in param_ignore_list])
    search_space = set()
    for param_name in param_names:
        values = []
        for ps in param_sets:
            try:
                values.append(ps[param_name])
            except:
                pass
        values = np.unique(values)
        if values.shape[0] > 1:
            search_space.add(param_name)
    algo_param_mapping[algo] = list(search_space)

for algo in algo_param_mapping:
    print(algo, algo_param_mapping[algo])

DBStream ['alpha', 'lambda', 'radius', 'min_weight', 'shared_density']
DeepAnT []
Donut ['linear_hidden_size', 'latent_size']
DWT-MLEAD ['quantile_epsilon']
FFT ['local_outlier_threshold', 'fft_parameters', 'max_sign_change_distance']


Extract optimized parameters and their values (columns: optim_param_name and optim_param_value) for each experiment:

In [6]:
def extract_hyper_params(algo):
    param_names = algo_param_mapping[algo]
    def extract(value):
        params = json.loads(value)
        result = None
        for name in param_names:
            try:
                value = params[name]
                result = pd.Series([name, value], index=["optim_param_name", "optim_param_value"])
                break
            except KeyError:
                pass
        if result is None:
            return pd.Series([np.nan, np.nan], index=["optim_param_name", "optim_param_value"])
        return result
    return extract

df[["optim_param_name", "optim_param_value"]] = ""
for algo in algo_param_mapping:
    df_algo = df.loc[df["algorithm"] == algo]
    df.loc[df_algo.index, ["optim_param_name", "optim_param_value"]] = df_algo["hyper_params"].apply(extract_hyper_params(algo))

Extract window size parameters (dependent params) and convert them into multiples of the dataset period size:

In [7]:
dependent_param_names = ["neighbourhood_size", "window_size"]

def extract_window_param(value, param_name=""):
    params = json.loads(value)
    try:
        return params[param_name]
    except KeyError:
        return 0

for param_name in dependent_param_names:
    s_windows = df["hyper_params"].apply(extract_window_param, param_name=param_name)
    df2 = df[s_windows > 0][["dataset"]].copy()
    df2[param_name] = s_windows[df2.index]
    df2["period_size"] = df2["dataset"].apply(lambda d: dmgr.get(("GutenTAG", d)).period_size)
    df2["optim_param_name"] = param_name
    df2["optim_param_value"] = df2[param_name] / df2["period_size"]
    df2["optim_param_value"] = (df2["optim_param_value"]
                                .fillna(df2[param_name])
                                .round(1)
                                .replace(50., 0.5)
                                .replace(100, 1.0)
                                .replace(150, 1.5)
                                .replace(200, 2.0))
    df.loc[df2.index, ["optim_param_name", "optim_param_value"]] = df2[["optim_param_name", "optim_param_value"]]

Define plotting functions:

In [8]:
def load_scores_df(algorithm_name, dataset_id, optim_params, repetition=1):
    params_id = df.loc[(df["algorithm"] == algorithm_name) & (df["collection"] == dataset_id[0]) & (df["dataset"] == dataset_id[1]) & (df["optim_param_name"] == optim_params[0]) & (df["optim_param_value"] == optim_params[1]), "hyper_params_id"].item()
    path = (
        result_path /
        algorithm_name /
        params_id /
        dataset_id[0] /
        dataset_id[1] /
        str(repetition) /
        "anomaly_scores.ts"
    )
    return pd.read_csv(path, header=None)

def plot_scores(algorithm_name, dataset_name):
    if isinstance(algorithm_name, tuple):
        algorithms = [algorithm_name]
    elif not isinstance(algorithm_name, list):
        raise ValueError("Please supply a tuple (algorithm_name, optim_param_name, optim_param_value) or a list thereof as first argument!")
    else:
        algorithms = algorithm_name
    # construct dataset ID
    dataset_id = ("GutenTAG", f"{dataset_name}.unsupervised")

    # load dataset details
    df_dataset = dmgr.get_dataset_df(dataset_id)

    # check if dataset is multivariate
    dataset_dim = df.loc[df["dataset_name"] == dataset_name, "dataset_input_dimensionality"].unique().item()
    dataset_dim = dataset_dim.lower()
    
    auroc = {}
    df_scores = pd.DataFrame(index=df_dataset.index)
    skip_algos = []
    algos = []
    for algo, optim_param_name, optim_param_value in algorithms:
        optim_params = f"{optim_param_name}={optim_param_value}"
        algos.append((algo, optim_params))
        # get algorithm metric results
        try:
            auroc[(algo, optim_params)] = df.loc[
                (df["algorithm"] == algo) & (df["dataset_name"] == dataset_name) & (df["optim_param_name"] == optim_param_name) & (df["optim_param_value"] == optim_param_value),
                "ROC_AUC"
            ].item()
        except ValueError:
            warnings.warn(f"No ROC_AUC score found! Probably {algo} with params {optim_params} was not executed on {dataset_name}.")
            auroc[(algo, optim_params)] = -1
            skip_algos.append((algo, optim_params))
            continue

        # load scores
        training_type = df.loc[df["algorithm"] == algo, "algo_training_type"].values[0].lower().replace("_", "-")
        try:
            df_scores[(algo, optim_params)] = load_scores_df(algo, ("GutenTAG", f"{dataset_name}.{training_type}"), (optim_param_name, optim_param_value)).iloc[:, 0]
        except (ValueError, FileNotFoundError):
            warnings.warn(f"No anomaly scores found! Probably {algo} was not executed on {dataset_name} with params {optim_params}.")
            df_scores[(algo, optim_params)] = np.nan
            skip_algos.append((algo, optim_params))
    algorithms = [a for a in algos if a not in skip_algos]

    # Create plot
    fig = make_subplots(2, 1)
    if dataset_dim == "multivariate":
        for i in range(1, df_dataset.shape[1]-1):
            fig.add_trace(go.Scatter(x=df_dataset.index, y=df_dataset.iloc[:, i], name=f"channel-{i}"), 1, 1)
    else:
        fig.add_trace(go.Scatter(x=df_dataset.index, y=df_dataset.iloc[:, 1], name="timeseries"), 1, 1)
    fig.add_trace(go.Scatter(x=df_dataset.index, y=df_dataset["is_anomaly"], name="label"), 2, 1)
    
    for item in algorithms:
        algo, optim_params = item
        fig.add_trace(go.Scatter(x=df_scores.index, y=df_scores[item], name=f"{algo}={auroc[item]:.4f} ({optim_params})"), 2, 1)
    fig.update_xaxes(matches="x")
    fig.update_layout(
        title=f"Results of {','.join(np.unique([a for a, _ in algorithms]))} on {dataset_name}",
        height=400
    )
    return py.iplot(fig)

## Parameter assessment

In [9]:
sort_by = ("ROC_AUC", "mean")
metric_agg_type = ["mean", "median"]
time_agg_type = "mean"
aggs = {
    "PR_AUC": metric_agg_type,
    "ROC_AUC": metric_agg_type,
    "train_main_time": time_agg_type,
    "execute_main_time": time_agg_type,
    "repetition": "count"
}

df_tmp = df.reset_index()
df_tmp = df_tmp.groupby(by=["algorithm", "optim_param_name", "optim_param_value"]).agg(aggs)
df_tmp = df_tmp.reset_index()
df_tmp = df_tmp.sort_values(by=["algorithm", "optim_param_name", sort_by], ascending=False)
df_tmp = df_tmp.set_index(["algorithm", "optim_param_name", "optim_param_value"])

with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(df_tmp)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PR_AUC,PR_AUC,ROC_AUC,ROC_AUC,train_main_time,execute_main_time,repetition
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,median,mean,median,mean,mean,count
algorithm,optim_param_name,optim_param_value,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
FFT,max_sign_change_distance,5.0,0.404383,0.476016,0.642634,0.581943,,5.148283,168
FFT,max_sign_change_distance,20.0,0.405193,0.50005,0.639064,0.590061,,5.867943,168
FFT,max_sign_change_distance,10.0,0.401553,0.50005,0.636789,0.566659,,5.797005,168
FFT,max_sign_change_distance,30.0,0.404115,0.50005,0.636391,0.585594,,5.407711,168
FFT,local_outlier_threshold,0.78,0.428873,0.505,0.639053,0.575039,,5.780453,168
FFT,local_outlier_threshold,0.6,0.401553,0.50005,0.636789,0.566659,,5.935648,168
FFT,local_outlier_threshold,0.42,0.389398,0.393046,0.636015,0.579356,,6.162488,168
FFT,fft_parameters,1.0,0.401553,0.50005,0.636789,0.566659,,5.533686,168
FFT,fft_parameters,2.0,0.401553,0.50005,0.636789,0.566659,,5.839319,168
FFT,fft_parameters,3.0,0.401553,0.50005,0.636789,0.566659,,6.147953,168


#### Selected parameters

- DeepAnT: `window_size="1.0 dataset period size"`
- DWT-MLEAD: `quantile_epsilon=0.1`
- ...

In [None]:
plot_scores([
    ("DWT-MLEAD", "quantile_epsilon", 0.1),
    ("DWT-MLEAD", "quantile_epsilon", 0.001)
], "sinus-type-frequency")

In [None]:
plot_scores([
    ("DBStream", "window_size", 1),
    ("DBStream", "window_size", 2)
], "sinus-type-frequency")