# Dataset analysis

In [None]:
# imports
import pandas as pd
import numpy as np
import scipy as sp
import plotly.offline as py
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.express as px
from plotly.subplots import make_subplots
from pathlib import Path
from timeeval import Datasets

In [None]:
# Location of the "test-cases" folder, relative to this notebook
data_path = Path("../data", "test-cases")
# Dataset manager used below to look up dataset names and load their data
dmgr = Datasets(data_path)

In [None]:
def _anomaly_regions(labels):
    """Return ``(start, end)`` position pairs for every run of anomalous points.

    ``start`` is the position of the first anomalous point of a run and
    ``end`` is the position just past its last anomalous point, so ``end``
    equals ``len(labels)`` for a run that reaches the end of the series.
    Values greater than 0 count as anomalous.
    """
    flags = np.asarray(labels) > 0
    # Pad with a normal point on both sides so that runs touching the first or
    # last row still yield a matching rising (+1) and falling (-1) edge.
    padded = np.concatenate(([False], flags, [False])).astype(np.int8)
    edges = np.diff(padded)
    starts = np.flatnonzero(edges == 1)
    ends = np.flatnonzero(edges == -1)
    return list(zip(starts.tolist(), ends.tolist()))


def plot_datasets(datasets, max_channels=20):
    """Plot one or more datasets with their anomaly regions highlighted.

    Every dataset gets its own subplot row. Its channels are drawn as line
    traces and each run of rows with ``is_anomaly`` set is shaded in red.

    Parameters
    ----------
    datasets : str or sequence of str
        A single dataset name or several names. Each name is looked up as
        ``("GutenTAG", f"{name}.unsupervised")`` in the module-level ``dmgr``.
    max_channels : int, default 20
        Upper bound on the number of channels drawn per dataset.

    Returns
    -------
    The result of ``plotly.offline.iplot`` for the assembled figure.

    Notes
    -----
    A dataset that cannot be loaded is reported with a warning and skipped;
    its subplot row stays empty.
    """
    # Local import: the notebook's import cell does not provide ``warnings``.
    import warnings

    if isinstance(datasets, str):
        datasets = [datasets]
    n_datasets = len(datasets)

    # Create plot
    fig = make_subplots(n_datasets, 1)
    for row, name in enumerate(datasets, start=1):
        # construct dataset ID
        dataset_id = ("GutenTAG", f"{name}.unsupervised")

        # load dataset details (best effort: warn and continue on failure)
        try:
            df_dataset = dmgr.get_dataset_df(dataset_id)
        except Exception as err:
            warnings.warn(f"Could not load dataset {name}, because {repr(err)}")
            continue

        # Column 0 and the last column are not plotted as channels
        # (presumably the timestamp and the "is_anomaly" label - verify
        # against the dataset format).
        n_columns = df_dataset.shape[1]
        for channel in range(1, min(n_columns - 1, max_channels + 1)):
            fig.add_trace(go.Scatter(
                x=df_dataset.index,
                y=df_dataset.iloc[:, channel],
                name=f"{name} channel {channel}",
            ), row, 1)

        # mark anomaly regions
        x_values = df_dataset.index
        last_pos = len(x_values) - 1
        for start, end in _anomaly_regions(df_dataset["is_anomaly"]):
            # The shaded area spans from the last normal point before the run
            # to the first normal point after it, clamped to the data range
            # for runs that touch the first or last row.
            fig.add_vrect(x0=x_values[max(start - 1, 0)],
                          x1=x_values[min(end, last_pos)],
                          exclude_empty_subplots=True,
                          line_width=0,
                          fillcolor="red",
                          opacity=0.3,
                          annotation_text="anomaly",
                          annotation_position="top left",
                          row=row,
                          col=1)

    # Optional: link the x-axes of all subplots.
    # fig.update_xaxes(matches="x")
    fig.update_layout(
        title=f"Datasets and ground truth of {','.join(datasets)} datasets",
        height=200*n_datasets if n_datasets > 1 else 400
    )
    return py.iplot(fig)

## Look up all datasets and plot them

In [None]:
# Base name (text before the first ".") of every dataset whose name does not
# start with "cbf"; np.unique drops the duplicates.
datasets = np.unique([
    name.split(".")[0]
    for name in dmgr.get_dataset_names()
    if not name.startswith("cbf")
])
datasets.sort()
# Cursor into `datasets` for stepping through them one at a time
i = 0
# Show how many distinct datasets were found
len(datasets)

In [None]:
# Each run of this cell plots the dataset at cursor `i` and advances the
# cursor; once every dataset has been shown, the cursor wraps back to 0.
if i < len(datasets):
    print(f"Dataset {i}: {datasets[i]}")
    plot_datasets(datasets[i])
    i += 1
else:
    print("FINISHED!")
    i = 0

In [None]:
# Plot three hand-picked datasets together, one subplot row each
plot_datasets([
    "sinus-type-mean",
    "ecg-noise-01%",
    "poly-type-trend",
])