# TimeEval result analysis

Reads the results from a TimeEval run and compiles a small report. Change the constants and the configuration to compile the report for another TimeEval run.

In [None]:
# Automatically reload packages:
%load_ext autoreload
%autoreload 2

In [None]:
# imports
import warnings
import pandas as pd
import numpy as np
import scipy as sp
import plotly.offline as py
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.express as px
from plotly.subplots import make_subplots
from pathlib import Path
from timeeval import Datasets

## Configuration

In [None]:
# constants and configuration
data_path = Path("../data") / "test-cases"
result_root_path = Path("../results")
# Path.iterdir() yields the entries in arbitrary order. Sort the result folders so
# that the listing is stable and positional access (e.g. `result_paths[-1]`) is
# deterministic.
result_paths = sorted(d for d in result_root_path.iterdir() if d.is_dir())
result_paths

Select a results folder:

In [None]:
# Results folder that is analyzed in this notebook. Swap in the commented line
# below to pick an entry from the `result_paths` listing instead.
result_path = result_root_path.joinpath("2021-09-22_default-params-1&2&3-merged")
#result_path = result_paths[-1]

# load results
print(f"Reading results from {result_path.resolve()}")

df = pd.read_csv(result_path.joinpath("results.csv"))
# The dataset name is everything in front of the first "." of the dataset identifier.
df["dataset_name"] = df["dataset"].str.split(".", n=1).str[0]

def load_scores_df(algorithm_name, dataset_id, repetition=1):
    """Read the anomaly scores of one algorithm execution from the results folder.

    Parameters
    ----------
    algorithm_name : str
        Name of the algorithm as it appears in the ``algorithm`` column of ``df``.
    dataset_id : tuple
        ``(collection, dataset)`` pair identifying the dataset.
    repetition : int, optional
        Repetition number whose scores are loaded (default: 1).

    Returns
    -------
    pandas.DataFrame
        Contents of the ``anomaly_scores.ts`` file, read without a header row.

    Raises
    ------
    ValueError
        If ``df`` does not hold exactly one row for the algorithm and dataset.
    """
    collection = dataset_id[0]
    dataset = dataset_id[1]

    # Look up the hyper-parameter ID; `.item()` enforces that exactly one run matches.
    selector = (
        (df["algorithm"] == algorithm_name)
        & (df["collection"] == collection)
        & (df["dataset"] == dataset)
    )
    params_id = df.loc[selector, "hyper_params_id"].item()

    scores_file = result_path.joinpath(
        algorithm_name,
        params_id,
        collection,
        dataset,
        str(repetition),
        "anomaly_scores.ts",
    )
    return pd.read_csv(scores_file, header=None)

# load dataset metadata
# Dataset manager rooted at `data_path`; the plotting helpers below use it to
# load the time series of a dataset via `dmgr.get_dataset_df(...)`.
dmgr = Datasets(data_path)

def plot_scores(algorithm_name, dataset_name):
    """Plot a GutenTAG dataset together with the anomaly scores of some algorithms.

    The upper subplot shows the time series (all channels for multivariate
    datasets), the lower subplot shows the anomaly labels and one score trace
    per algorithm. Algorithms without a metric entry or without a readable
    scores file are reported with a warning and left out of the plot.

    Parameters
    ----------
    algorithm_name : str or iterable of str
        A single algorithm name or several algorithm names.
    dataset_name : str
        Dataset name without the training-type suffix.

    Returns
    -------
    Whatever ``py.iplot`` returns for the created figure.
    """
    algorithms = [algorithm_name] if isinstance(algorithm_name, str) else algorithm_name

    # The displayed time series is always taken from the unsupervised dataset variant.
    df_dataset = dmgr.get_dataset_df(("GutenTAG", f"{dataset_name}.unsupervised"))

    # Input dimensionality of the dataset (lower-cased), used to decide how many channels to draw.
    dimensionality = df.loc[df["dataset_name"] == dataset_name, "dataset_input_dimensionality"]
    dataset_dim = dimensionality.unique().item().lower()

    auroc = {}
    df_scores = pd.DataFrame(index=df_dataset.index)
    plotted = []
    for algo in algorithms:
        # Metric lookup: `.item()` raises a ValueError unless exactly one result row matches.
        try:
            auroc[algo] = df.loc[(df["algorithm"] == algo) & (df["dataset_name"] == dataset_name), "ROC_AUC"].item()
        except ValueError:
            warnings.warn(f"No scores found! Probably {algo} was not executed on {dataset_name}.")
            auroc[algo] = -1
            continue

        # The scores live under the dataset variant matching the algorithm's training type.
        training_type = df.loc[df["algorithm"] == algo, "algo_training_type"].values[0].lower().replace("_", "-")
        try:
            df_scores[algo] = load_scores_df(algo, ("GutenTAG", f"{dataset_name}.{training_type}")).iloc[:, 0]
        except (ValueError, FileNotFoundError):
            warnings.warn(f"No scores found! Probably {algo} was not executed on {dataset_name}.")
            df_scores[algo] = np.nan
        else:
            plotted.append(algo)

    # Column positions and legend names of the time series traces in the upper subplot.
    if dataset_dim == "multivariate":
        channels = [(col, f"channel-{col}") for col in range(1, df_dataset.shape[1]-1)]
    else:
        channels = [(1, "timeseries")]

    fig = make_subplots(2, 1)
    for col, label in channels:
        fig.add_trace(go.Scatter(x=df_dataset.index, y=df_dataset.iloc[:, col], name=label), 1, 1)

    # Lower subplot: ground truth labels followed by the scores of every remaining algorithm.
    fig.add_trace(go.Scatter(x=df_dataset.index, y=df_dataset["is_anomaly"], name="label"), 2, 1)
    for algo in plotted:
        fig.add_trace(go.Scatter(x=df_scores.index, y=df_scores[algo], name=f"{algo}={auroc[algo]:.4f}"), 2, 1)

    fig.update_xaxes(matches="x")
    fig.update_layout(
        title=f"Results of {','.join(plotted)} on {dataset_name}",
        height=400
    )
    return py.iplot(fig)

def _anomaly_regions(labels):
    """Locate the contiguous anomalous runs in a sequence of 0/1 anomaly labels.

    Returns a list of ``(start, end)`` position pairs. ``start`` is the position of
    the first anomalous point of a run and ``end`` the position directly after its
    last anomalous point (``len(labels)`` if the run reaches the end of the series).
    """
    flags = (np.asarray(labels) > 0).astype(np.int8)
    # Pad with a normal point on both sides so that runs touching the first or the
    # last point still produce a rising and a falling edge.
    edges = np.diff(np.concatenate(([0], flags, [0])))
    return list(zip(np.flatnonzero(edges == 1), np.flatnonzero(edges == -1)))


def plot_datasets(datasets, max_channels = 20):
    """Plot GutenTAG datasets below each other and highlight their anomalies.

    Every dataset gets its own subplot row that shows its channels and a red
    rectangle for each anomalous region. The median ROC_AUC over all results of
    the dataset is appended to the trace names. Datasets that cannot be loaded
    are reported with a warning and skipped (their row stays empty).

    Parameters
    ----------
    datasets : str or sequence of str
        A single dataset name or several dataset names (without the
        training-type suffix).
    max_channels : int, optional
        Maximum number of channels drawn per dataset (default: 20).

    Returns
    -------
    Whatever ``py.iplot`` returns for the created figure.
    """
    if isinstance(datasets, str):
        datasets = [datasets]
    n_datasets = len(datasets)

    # Create plot
    fig = make_subplots(n_datasets, 1)
    for i, d in enumerate(datasets):
        # construct dataset ID
        dataset_id = ("GutenTAG", f"{d}.unsupervised")

        # load dataset details
        try:
            df_dataset = dmgr.get_dataset_df(dataset_id)
        except Exception as e:
            warnings.warn(f"Could not load dataset {d}, because {repr(e)}")
            continue

        # get algorithm metric results
        # `median()` does not raise for a missing dataset, it returns NaN. Check
        # for that explicitly; the dataset itself is still worth plotting.
        auroc = df.loc[df["dataset_name"] == d, "ROC_AUC"].median()
        if pd.isna(auroc):
            warnings.warn(f"No scores found for dataset {d}!")

        # First column is skipped and the last column is the label, so the
        # channels are the columns in between.
        for j in range(1, min(df_dataset.shape[1]-1, max_channels+1)):
            fig.add_trace(go.Scatter(
                x=df_dataset.index,
                y=df_dataset.iloc[:, j],
                name=f"{d} channel {j} ({auroc:.4f})",
            ), i+1, 1)

        # mark anomaly regions
        x = df_dataset.index
        for start, end in _anomaly_regions(df_dataset["is_anomaly"]):
            # Span from the point before the region to the first point after it,
            # clamped to the bounds of the series.
            fig.add_vrect(x0=x[max(start - 1, 0)], x1=x[min(end, len(x) - 1)],
                          exclude_empty_subplots=True,
                          line_width=0,
                          fillcolor="red",
                          opacity=0.3,
                          annotation_text="anomaly",
                          annotation_position="top left",
                          row=i+1,
                          col=1)

#     fig.update_xaxes(matches="x")
    fig.update_layout(
        title=f"Datasets and ground truth of {','.join(datasets)} datasets",
        height=200*n_datasets if n_datasets > 1 else 400
    )
    return py.iplot(fig)

Only consider the best run for each `algorithm`-`dataset`-combination (over all `hyper_params` and `repetition`s) for the analysis in this notebook:

In [None]:
def filter_groups(group):
    """Reduce a group of result rows to the single row with the highest ROC_AUC.

    Groups holding at most one row are returned as they are (as a slice of at
    most one row); larger groups are ordered by descending ``ROC_AUC`` first.
    """
    if len(group) <= 1:
        return group.iloc[:1]
    ranked = group.sort_values(by="ROC_AUC", ascending=False)
    return ranked.iloc[:1]

# Keep only the best run of every algorithm-dataset combination and flatten the
# group index that `apply` puts on top of the rows.
df_grouped = (
    df.groupby(by=["algorithm", "collection", "dataset"])
      .apply(filter_groups)
      .reset_index(drop=True)
)
# From here on, `df` holds the filtered results ordered by algorithm and dataset.
df = df_grouped.sort_values(by=["algorithm", "dataset"])

## Analyze TimeEval results

In [None]:
df[["algorithm", "dataset_name", "status", "AVERAGE_PRECISION", "PR_AUC", "RANGE_PR_AUC", "ROC_AUC", "execute_main_time", "hyper_params"]]

### Errors

In [None]:
# Number of executions per algorithm (grouped by training type) and status.
df_error_counts = df.pivot_table(index=["algo_training_type", "algorithm"], columns=["status"], values="repetition", aggfunc="count")
df_error_counts = df_error_counts.fillna(value=0).astype(np.int64)
# A status that never occurred in this run has no column after pivoting. Add it
# with zero counts so that the cells below can always reference all three statuses.
for status_name in ["Status.OK", "Status.ERROR", "Status.TIMEOUT"]:
    if status_name not in df_error_counts.columns:
        df_error_counts[status_name] = 0

#### Aggregation of errors per algorithm grouped by algorithm training type

In [None]:
# Show one table per training type. A training type without any algorithm in the
# results has no entry in the index; report it instead of failing with a KeyError.
available_types = set(df_error_counts.index.get_level_values("algo_training_type"))
for tpe in ["SEMI_SUPERVISED", "SUPERVISED", "UNSUPERVISED"]:
    if tpe not in available_types:
        print(f"{tpe}: no algorithms of this training type in the results")
        continue
    print(tpe)
    py.iplot(ff.create_table(df_error_counts.loc[tpe], index=True))

#### Slow algorithms

Algorithms for which more than 50% of all executions ran into the timeout.

In [None]:
df_error_counts[df_error_counts["Status.TIMEOUT"] > (df_error_counts["Status.ERROR"] + df_error_counts["Status.OK"])]

#### Broken algorithms

Algorithms that failed for more than 50% of the executions (see `error_threshold` below).

In [None]:
error_threshold = 0.5
df_error_counts[df_error_counts["Status.ERROR"] > error_threshold*(
    df_error_counts["Status.TIMEOUT"] + df_error_counts["Status.ERROR"] + df_error_counts["Status.OK"]
)]

### Algorithm quality assessment

#### Overall algorithm performance based on ROC_AUC

In [None]:
# Aggregate the ROC_AUC scores of every algorithm over all datasets.
aggregations = ["min", "mean", "median", "max"]
df_overall_scores = df.pivot_table(index="algorithm", values="ROC_AUC", aggfunc=aggregations)
# Replace the column labels produced by pivot_table with the plain aggregation names.
# NOTE(review): this positional rename assumes that pivot_table emits the columns
# in the same order as `aggregations` — confirm when changing the list.
df_overall_scores.columns = aggregations
# Best algorithms (highest median ROC_AUC) first.
df_overall_scores = df_overall_scores.sort_values(by="median", ascending=False)

df_overall_scores.head()

In [None]:
# ROC_AUC matrix (algorithms x datasets) without rows/columns that are entirely empty.
df_asl = (
    df.pivot(index="algorithm", columns="dataset_name", values="ROC_AUC")
      .dropna(axis=0, how="all")
      .dropna(axis=1, how="all")
)
# Order the algorithms by ascending median score using a temporary helper column,
# then transpose so that every algorithm becomes a column.
df_asl = (
    df_asl.assign(median=df_asl.median(axis=1))
          .sort_values(by="median", ascending=True)
          .drop(columns="median")
          .T
)

In [None]:
# One violin (with mean line and inner box plot) per algorithm column of `df_asl`.
fig = go.Figure()
for c in df_asl.columns:
    violin = go.Violin(y=df_asl[c], name=c, meanline_visible=True, box_visible=True)
    fig.add_trace(violin)
plot_title = {"text":"AUC_ROC violin plots", "xanchor": "center", "x": 0.5}
fig.update_layout(
    title=plot_title,
    yaxis_title="AUC_ROC score",
    legend_title="Algorithms",
    violingap=0
)
py.iplot(fig)

In [None]:
# Box plots for all algorithms; only the first and the last `n_show` columns of
# `df_asl` are drawn initially, the others can be enabled via the legend.
n_show = 10
n_show //= 2
fig = go.Figure()
for i, c in enumerate(df_asl.columns):
    at_either_end = i < n_show or i >= len(df_asl.columns) - n_show
    fig.add_trace(go.Box(
        x=df_asl[c],
        name=c,
        boxpoints=False,
        visible=None if at_either_end else "legendonly"
    ))
fig.update_layout(
    title={"text":"AUC_ROC box plots", "xanchor": "center", "x": 0.5},
    xaxis_title="AUC_ROC score",
    legend_title="Algorithms"
)
py.iplot(fig)

#### Scores of best algorithms

Please select a dataset (and algorithm if needed):

In [None]:
dataset_name = "sinus-combined-diff-2"
algorithm_name = None

plot_scores(algorithm_name if algorithm_name else df_asl.columns[-4:], dataset_name)

### Dataset inspection

#### Datasets based on the achieved AUC_ROC scores

In [None]:
# ROC_AUC matrix with one row per dataset and one column per algorithm.
df_datasets = df.pivot(index="dataset_name", columns="algorithm", values="ROC_AUC")
#df_datasets = df_datasets.dropna(axis=0, how="all").dropna(axis=1, how="all")
# Order the datasets by ascending median score using a temporary helper column,
# then transpose so that every dataset becomes a column.
df_datasets = (
    df_datasets.assign(median=df_datasets.median(axis=1))
               .sort_values(by="median", ascending=True)
               .drop(columns="median")
               .T
)

def plot_dataset_boxplot(characteristic):
    """Show box plots of the ROC_AUC scores for all datasets of one characteristic.

    Dataset names are split at ``-``: the first part is the base oscillation and
    the second part the characteristic. Only datasets whose second part equals
    ``characteristic`` are plotted; names without a second part never match.
    Traces are grouped in the legend by base oscillation, and only the ``sinus``
    datasets are visible initially.

    Parameters
    ----------
    characteristic : str
        Second name component of the datasets to show, e.g. ``"position"``.

    Returns
    -------
    Whatever ``py.iplot`` returns for the created figure.
    """
    selected = []
    for name in df_datasets.columns:
        parts = name.split("-")
        # Guard against names without a hyphen instead of failing with an IndexError.
        if len(parts) > 1 and parts[1] == characteristic:
            selected.append(name)
    df_c = df_datasets.loc[:, selected]

    fig = go.Figure()
    for c in df_c.columns:
        base_osci = c.split("-")[0]
        fig.add_trace(go.Box(
            x=df_c[c],
            name=c,
            boxpoints=False,
            legendgroup=base_osci,
            visible="legendonly" if base_osci != "sinus" else None
        ))
    fig.update_layout(
        title={"text": f"Dataset scores by characteristic '{characteristic}'", "xanchor": "center", "x": 0.5},
        xaxis_title="AUC_ROC score",
        legend_title="Datasets"
    )
    return py.iplot(fig)

In [None]:
# Box plots for all datasets; only the first and the last `n_show` columns of
# `df_datasets` are drawn initially, the others can be enabled via the legend.
n_show = 10
n_show //= 2
fig = go.Figure()
for i, c in enumerate(df_datasets.columns):
    at_either_end = i < n_show or i >= len(df_datasets.columns) - n_show
    fig.add_trace(go.Box(
        x=df_datasets[c],
        name=c,
        boxpoints=False,
        visible=None if at_either_end else "legendonly"
    ))
fig.update_layout(
    title={"text":"AUC_ROC box plots", "xanchor": "center", "x": 0.5},
    xaxis_title="AUC_ROC score",
    legend_title="Datasets"
)
py.iplot(fig)

Easiest (best performing) datasets:

In [None]:
plot_datasets(df_datasets.columns[-4:][::-1])

Hardest (lowest performing) datasets:

In [None]:
plot_datasets(df_datasets.columns[:4])

#### Scores depending on anomaly position

In [None]:
plot_dataset_boxplot("position")

In [None]:
plot_datasets(["sinus-position-beginning", "sinus-position-middle", "sinus-position-end"])

#### Scores depending on anomaly characteristics

In [None]:
plot_dataset_boxplot("noise")

In [None]:
plot_dataset_boxplot("length")

In [None]:
plot_dataset_boxplot("type")

In [None]:
plot_datasets(["sinus-type-amplitude", "sinus-type-trend"])

In [None]:
plot_dataset_boxplot("trend")

In [None]:
plot_dataset_boxplot("channels")

## Baselines

In [None]:
# Restrict the results to the baseline algorithms and show their mean metric scores.
baselines = ["normal", "increasing", "Random"]
df_baselines = df[df["algorithm"].isin(baselines)]
print("Mean scores of the baselines algorithms")
# Select the metric columns before aggregating: taking the mean over all columns
# fails on the non-numeric ones (e.g. dataset names) in recent pandas versions.
df_baselines.groupby(by="algorithm")[["AVERAGE_PRECISION", "PR_AUC", "RANGE_PR_AUC", "ROC_AUC"]].mean()

In [None]:
# ROC_AUC matrix (baselines x datasets) without rows/columns that are entirely empty.
df_base = (
    df_baselines.pivot(index="algorithm", columns="dataset_name", values="ROC_AUC")
                .dropna(axis=0, how="all")
                .dropna(axis=1, how="all")
)
# Order the baselines by ascending median score using a temporary helper column,
# then transpose so that every baseline becomes a column.
df_base = (
    df_base.assign(median=df_base.median(axis=1))
           .sort_values(by="median", ascending=True)
           .drop(columns="median")
           .T
)

In [None]:
# One violin (with mean line and inner box plot) per baseline column of `df_base`.
fig = go.Figure()
for c in df_base.columns:
    violin = go.Violin(y=df_base[c], name=c, meanline_visible=True, box_visible=True)
    fig.add_trace(violin)
plot_title = {"text":"AUC_ROC for the baseline algorithms", "xanchor": "center", "x": 0.5}
fig.update_layout(
    title=plot_title,
    yaxis_title="AUC_ROC score",
    legend_title="Baseline algorithms",
    violingap=0
)
py.iplot(fig)

In [None]:
plot_scores(baselines, "sinus-diff-count-5")

## Experimentation

In [None]:
plot_scores(df_asl.columns[-10:], "rw-channels-single-of-5")

In [None]:
# Box plots for all algorithms; only the first and the last `n_show` columns of
# `df_asl` are drawn initially, the others can be enabled via the legend.
n_show = 10
n_show = n_show // 2
fig = go.Figure()
for i, c in enumerate(df_asl.columns):
    fig.add_trace(go.Box(
        x=df_asl[c],
        name=c,
        boxpoints=False,
        # `<=` on the lower bound hides the trace at position `n_show` as well, so
        # exactly `n_show` traces are visible at each end (like the other box plots).
        visible="legendonly" if n_show <= i < len(df_asl.columns)-n_show else None
    ))
fig.update_layout(
    title={"text":"AUC_ROC box plots", "xanchor": "center", "x": 0.5},
    xaxis_title="AUC_ROC score",
    legend_title="Algorithms"
)
py.iplot(fig)

In [None]:
plot_scores("Hybrid-KNN", "ecg-diff-count-4")

In [None]:
def get_path(algorithm_name, dataset_id, repetition=1):
    """Build the path of the results folder of one algorithm execution.

    Parameters
    ----------
    algorithm_name : str
        Name of the algorithm as it appears in the ``algorithm`` column of ``df``.
    dataset_id : tuple
        ``(collection, dataset)`` pair identifying the dataset.
    repetition : int, optional
        Repetition number (default: 1).

    Returns
    -------
    pathlib.Path
        ``result_path / algorithm / hyper_params_id / collection / dataset / repetition``.

    Raises
    ------
    ValueError
        If ``df`` does not hold exactly one row for the algorithm and dataset.
    """
    collection = dataset_id[0]
    dataset = dataset_id[1]

    # `.item()` enforces that exactly one run matches the algorithm and dataset.
    selector = (
        (df["algorithm"] == algorithm_name)
        & (df["collection"] == collection)
        & (df["dataset"] == dataset)
    )
    params_id = df.loc[selector, "hyper_params_id"].item()

    return result_path.joinpath(
        algorithm_name,
        params_id,
        collection,
        dataset,
        str(repetition),
    )
# Plot the content of the `docker-algorithm-scores.csv` file that DBStream wrote
# for one dataset (first column over the row index).
dd = pd.read_csv(get_path("DBStream", ("GutenTAG", "ecg-diff-count-6.unsupervised")) / "docker-algorithm-scores.csv", header=None)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=dd.index,
    y=dd.iloc[:, 0],
    name="scores",
))
fig.update_layout(
    title={"text":"DBStream original scores", "xanchor": "center", "x": 0.5},
    # The x axis is the row index of the scores file and the y axis the score itself
    # (the previous x-axis label "AUC_ROC score" was copied from the box plots).
    xaxis_title="Index",
    yaxis_title="Score",
    legend_title="Algorithms"
)
py.iplot(fig)

In [None]:
# Compare the algorithms by their ROC_AUC weighted with a runtime factor.
df_tmp = df.copy()
# Total runtime = execution time + training time; missing measurements count as 0.
df_tmp["overall_time"] = df["execute_main_time"].fillna(0) + df["train_main_time"].fillna(0)
# Median ROC_AUC and median total runtime per algorithm.
df_tmp = df_tmp.pivot_table(index="algorithm", values=["ROC_AUC", "overall_time"], aggfunc="median")

from sklearn.preprocessing import MinMaxScaler
# Scale the runtimes into the range (1e-6, 1) and invert them (1 - x), so that a
# larger factor means a shorter runtime.
df_tmp["overall_time"] = 1 - MinMaxScaler(feature_range=(1e-6, 1)).fit_transform(df_tmp["overall_time"].values.reshape(-1, 1)).reshape(-1)
# NOTE(review): this replaces every exact 0 in the whole frame with NaN, i.e. also
# a median ROC_AUC of 0 and presumably the inverted runtime of the slowest
# algorithm (scaled to 1 before the inversion) — confirm that this is intended.
df_tmp = df_tmp.replace(0, np.nan)
df_tmp["weighted ROC_AUC"] = df_tmp["ROC_AUC"] * df_tmp["overall_time"]

# Ascending order; algorithms without a weighted score (NaN) come first.
df_tmp = df_tmp.sort_values(by="weighted ROC_AUC", ascending=True, na_position="first")

# Turn the algorithm index back into a column so that it can be used as x axis.
df_tmp.reset_index(drop=False, inplace=True)
fig = px.bar(df_tmp, x="algorithm", y=["ROC_AUC", "weighted ROC_AUC"], hover_data=["overall_time"], barmode="group")
py.iplot(fig)