# Exathlon (VLDB21) datasets

In [1]:
from typing import List, Optional
import matplotlib
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from config import data_raw_folder, data_processed_folder
from timeeval import Datasets
from timeeval.datasets import DatasetAnalyzer, DatasetRecord
from IPython.display import display, Markdown
import warnings

In [2]:
# Use a large default canvas so that the multi-column time series plots stay readable.
plt.rc("figure", figsize=(20, 10))

In [3]:
def find_datasets(folder):
    """Return the sorted list of all files located one level below ``folder``.

    Only regular files that sit directly inside an immediate sub-directory of
    ``folder`` are returned; files directly in ``folder`` and deeper nested
    directories are ignored.

    Parameters
    ----------
    folder : str or pathlib.Path
        Root directory whose sub-directories are scanned.

    Returns
    -------
    list of pathlib.Path
        Paths of the files found, in sorted order.
    """
    # Scan the folder that was passed in (previously a notebook-level global was
    # used here, which made the parameter meaningless).
    folder = Path(folder)
    return sorted(
        f
        for d in folder.iterdir() if d.is_dir()
        for f in d.iterdir() if f.is_file()
    )

def plot_dataset(f, start_column_idx=0, end_column_idx=10):
    """Plot a slice of the columns of a raw trace and highlight its labelled anomalies.

    Parameters
    ----------
    f : pathlib.Path
        CSV file of the trace; its stem is used to look up the labels in the
        notebook-level ``df_labels`` frame and as the plot title.
    start_column_idx, end_column_idx : int
        The columns at positions ``start_column_idx + 1`` up to (excluding)
        ``end_column_idx`` are plotted; column 0 is always read and must be
        the time column ``"t"``, which becomes the x-axis.

    Returns
    -------
    pandas.DataFrame
        The plotted data, indexed by ``"t"``.
    """
    # Function-scope import so that ``matplotlib.patches`` is guaranteed to be loaded.
    from matplotlib.patches import Rectangle

    name = f.stem
    column_idxs = [0] + list(range(start_column_idx+1, end_column_idx))
    data = pd.read_csv(f).iloc[:, column_idxs].set_index("t")
    ax = data.plot()

    # add anomaly labels
    # A boolean mask always yields a (possibly empty) DataFrame, whereas
    # ``df_labels.loc[name]`` returns a Series for traces with a single anomaly.
    anomalies = df_labels[df_labels.index == name]
    # The value range is the same for every patch, so compute it once.
    y_min = data.min().min()
    y_max = data.max().max()
    # The loop variable must not be called ``name``: that would overwrite the
    # dataset name that is used for the title below.
    for _, row in anomalies.iterrows():
        begin = int(row["begin"])
        end = int(row["end"])
        ax.add_patch(Rectangle(
            (begin, y_min),
            end - begin,
            y_max - y_min,
            color="yellow", alpha=0.75
        ))
    ax.legend()
    ax.set_title(name)
    return data

In [4]:
# Name of the dataset collection and the locations of the raw input and processed output.
dataset_collection_name = "Exathlon"
source_folder = Path(data_raw_folder).joinpath("exathlon", "data", "raw")
target_folder = Path(data_processed_folder)

print(
    f"Looking for source datasets in {Path(source_folder).absolute()} and\nsaving processed datasets in {Path(target_folder).absolute()}"
)

Looking for source datasets in /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw and
saving processed datasets in /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed


In [5]:
# Properties that are identical for every dataset of this collection
dataset_type = "real"
input_type = "multivariate"
datetime_index = True
split_at = None

# Processed files are stored below <target_folder>/<input_type>/<collection name>;
# make sure that this directory exists.
dataset_subfolder = Path(input_type, dataset_collection_name)
target_subfolder = target_folder.joinpath(dataset_subfolder)
target_subfolder.mkdir(parents=True, exist_ok=True)
print(f"Created directories {target_subfolder}")

# Dataset registry rooted at the target folder; records are added to it further below.
dm = Datasets(target_folder)

Created directories /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon


In [6]:
# load ground truth
df_labels = pd.read_csv(source_folder / "ground_truth.csv")

# we use extended end marker for anomalies; where it is missing, fall back to the root cause end.
# The filled column is assigned back explicitly: a chained ``df[col].fillna(..., inplace=True)``
# operates on an intermediate object and is not guaranteed to modify ``df_labels``.
df_labels["extended_effect_end"] = df_labels["extended_effect_end"].fillna(df_labels["root_cause_end"])
df_labels = df_labels.drop(columns=["root_cause_end", "trace_type", "anomaly_type", "anomaly_details"])
# The three remaining columns are renamed by position.
df_labels.columns = ["trace", "begin", "end"]
df_labels = df_labels.set_index("trace")
df_labels

Unnamed: 0_level_0,begin,end
trace,Unnamed: 1_level_1,Unnamed: 2_level_1
2_1_100000_60,1.527535e+09,1.527536e+09
2_1_100000_60,1.527550e+09,1.527551e+09
2_1_100000_60,1.527564e+09,1.527565e+09
4_1_100000_61,1.527276e+09,1.527278e+09
4_1_100000_61,1.527291e+09,1.527293e+09
...,...,...
4_5_1000000_90,1.528967e+09,1.528967e+09
5_4_1000000_82,1.528983e+09,1.528983e+09
5_4_1000000_82,1.528987e+09,1.528987e+09
5_5_1000000_92,1.528967e+09,1.528967e+09


In [7]:
# load datasets and their metadata
# Maps the file stem (trace name) to the path of the raw file.
datasets = {d.stem: d for d in find_datasets(source_folder)}

# The first three "_"-separated parts of a file name are parsed as the integers
# app, tpe and rate. Parsing them directly yields proper integer columns; the
# dtype no longer depends on how pandas casts values that are assigned into
# pre-existing all-NaN columns.
df_datasets = pd.DataFrame(
    [(filename, *map(int, filename.split("_")[:3])) for filename in datasets],
    columns=["filename", "app", "tpe", "rate"],
)
df_datasets = df_datasets.set_index(["app", "rate"]).sort_index()
df_datasets

Unnamed: 0_level_0,Unnamed: 1_level_0,filename,tpe
app,rate,Unnamed: 2_level_1,Unnamed: 3_level_1
1,10000,1_0_10000_17,0
1,100000,1_0_100000_15,0
1,100000,1_0_100000_16,0
1,100000,1_2_100000_68,2
1,500000,1_0_500000_18,0
...,...,...,...
10,1000000,10_2_1000000_67,2
10,1000000,10_3_1000000_75,3
10,1000000,10_4_1000000_79,4
10,1000000,10_5_1000000_85,5


In [12]:
# define preprocessing methods
def preprocess(dataset_name: Path, corr_threshold: float = 0.95, log_prefix: str = "  ", preselected_columns: Optional[List[str]] = None, label_corr_threshold: float = 0.5):
    """Load a raw trace, attach anomaly labels and reduce it to a subset of its columns.

    The CSV file is read, its ``"t"`` column (seconds) is converted into a
    datetime index named ``"timestamp"`` and an ``"is_anomaly"`` column is added
    that is 1 inside every ``begin``-``end`` range listed for this trace in the
    notebook-level ``df_labels`` frame and 0 elsewhere.

    Column selection works in one of two modes:

    * ``preselected_columns`` given: exactly these columns are kept, in the
      given order (``"timestamp"`` and ``"is_anomaly"`` entries are handled
      separately). Requested columns that the dataset does not contain are
      skipped with a warning.
    * otherwise: columns that are constant or contain only 0/-1 are removed,
      then only columns whose absolute correlation with the label exceeds
      ``label_corr_threshold`` are kept, and finally columns whose absolute
      correlation with an earlier remaining column exceeds ``corr_threshold``
      are removed.

    Parameters
    ----------
    dataset_name : pathlib.Path
        Path of the raw CSV file; its stem is the key into ``df_labels``.
    corr_threshold : float
        Threshold for dropping mutually correlated columns.
    log_prefix : str
        Prefix for the progress messages printed by this function.
    preselected_columns : list of str, optional
        Column names to keep (e.g. the columns of an already processed test dataset).
    label_corr_threshold : float
        Minimum absolute correlation with the label for a column to be kept.

    Returns
    -------
    pandas.DataFrame
        Frame with ``"timestamp"`` as first and ``"is_anomaly"`` as last column.
    """
    exclude_columns = ["timestamp", "is_anomaly"]
    print(f"{log_prefix}reading dataset {dataset_name.name}")
    df = pd.read_csv(dataset_name)

    print(f"{log_prefix}adding timestamp index")
    df.index = pd.Index(pd.to_datetime(df["t"], unit="s"), name="timestamp")
    df = df.drop(columns=["t"])

    # add labels
    print(f"{log_prefix}adding anomaly labels")
    df["is_anomaly"] = 0
    # A boolean mask always returns a (possibly empty) DataFrame. A label slice
    # can raise a KeyError for duplicated labels in an unsorted index, which
    # would silently result in a dataset without any anomalies.
    anomalies = df_labels[df_labels.index == dataset_name.stem]
    for _, anomaly in anomalies.iterrows():
        begin = pd.to_datetime(anomaly["begin"], unit="s")
        end = pd.to_datetime(anomaly["end"], unit="s")
        df.loc[begin:end, "is_anomaly"] = 1

    if preselected_columns is not None:
        # use columns from test dataset (preselected)
        print(f"{log_prefix}reducing columns")
        # Keep the requested order (a set would shuffle the columns between runs).
        # "timestamp" is the index at this point and "is_anomaly" is appended
        # last, so neither of them takes part in the availability check.
        requested = [c for c in dict.fromkeys(preselected_columns) if c not in exclude_columns]
        unavailable_columns = {c for c in requested if c not in df.columns}
        if len(unavailable_columns) > 0:
            warnings.warn(f"Preselected columns contain column names that are not in the dataset ({unavailable_columns})")
        df = df[[c for c in requested if c not in unavailable_columns] + ["is_anomaly"]]
    else:
        # columns with no real value
        print(f"{log_prefix}removing constant and empty columns")
        to_drop = [c for c in df.columns if c not in exclude_columns and (np.all(df[c].isin([0, -1])) or df[c].unique().shape[0] == 1)]
        df = df.drop(columns=to_drop)

        # only select correlated values
        print(f"{log_prefix}selecting with label correlated columns")
        s_corr = df.drop(columns=["is_anomaly"]).corrwith(df["is_anomaly"], drop=True).abs()
        cols = list(s_corr[s_corr > label_corr_threshold].index.values)
        df = df[cols + ["is_anomaly"]]

        # drop highly correlated columns
        print(f"{log_prefix}removing highly correlated columns")
        df_corr = df.drop(columns=["is_anomaly"]).corr().abs()
        # Keep only the strict upper triangle, so that each pair is looked at once
        # and the later column of a correlated pair is the one that is removed.
        df_corr = df_corr.where(np.triu(np.ones(df_corr.shape), k=1).astype(np.bool_))
        to_drop = [c for c in df_corr.columns if any(df_corr[c] > corr_threshold)]
        # Not in place: ``df`` is a column selection of the previous frame.
        df = df.drop(columns=to_drop)

    # make timestamp index a column
    df = df.reset_index(drop=False)
    print(f"{log_prefix}final dataset shape: {df.shape}")

    return df


def transform_and_record(s_dataset: pd.Series,
                         s_partner: Optional[pd.Series] = None,
                         train_is_normal: bool = False,
                         train_type: str = "unsupervised"):
    """Preprocess a test trace (and optionally a training trace), store them and register the dataset.

    The trace referenced by ``s_dataset["filename"]`` is preprocessed and written
    as ``<name>.test.csv`` together with its metadata file. If ``s_partner`` is
    given, its trace is reduced to the columns of the test dataset and written as
    ``<name>.train.csv``; the dataset name then gets the last "_"-separated part
    of the partner's file name appended. Finally a ``DatasetRecord`` is added to
    the notebook-level ``dm`` registry.

    If preprocessing leaves only the ``timestamp`` and ``is_anomaly`` columns, a
    warning is issued and nothing is written or registered.

    Parameters
    ----------
    s_dataset : pandas.Series
        Row describing the test trace; only its ``"filename"`` entry is read.
    s_partner : pandas.Series, optional
        Row describing the training trace; only its ``"filename"`` entry is read.
    train_is_normal : bool
        Passed through to the dataset record.
    train_type : str
        Passed through to the dataset record.
    """
    train = s_partner is not None
    source_name = s_dataset["filename"]
    if train:
        dataset_name = source_name + "-" + s_partner["filename"].split("_")[-1]
    else:
        dataset_name = source_name

    # Paths relative to the registry root (stored in the record) and absolute targets.
    test_filename = f"{dataset_name}.test.csv"
    test_path = dataset_subfolder / test_filename
    target_test_filepath = target_subfolder / test_filename
    target_meta_filepath = target_test_filepath.parent / f"{dataset_name}.{Datasets.METADATA_FILENAME_PREFIX}"

    train_path = ""
    if train:
        train_filename = f"{dataset_name}.train.csv"
        train_path = dataset_subfolder / train_filename
        target_train_filepath = target_subfolder / train_filename

    # Prepare test dataset
    print("  Preparing test dataset")
    df_test = preprocess(datasets[source_name], log_prefix="  > ")
    test_columns = df_test.columns
    if set(test_columns) == {"timestamp", "is_anomaly"}:
        # No feature column survived the selection, so there is nothing to store.
        warnings.warn(f"Found no columns that correlate with anomaly labels. Manual inspection needed for dataset {dataset_name}!")
        return
    df_test.to_csv(target_test_filepath, index=False)

    print("  Analyzing test metadata")
    analyzer = DatasetAnalyzer((dataset_collection_name, dataset_name), is_train=False, df=df_test)
    analyzer.save_to_json(target_meta_filepath, overwrite=True)
    meta = analyzer.metadata

    # Prepare train dataset
    if train:
        print("  Preparing train dataset")
        df_train = preprocess(
            datasets[s_partner["filename"]],
            log_prefix="  > ",
            preselected_columns=test_columns,
        )
        df_train.to_csv(target_train_filepath, index=False)

        print("  Analyzing train metadata")
        train_analyzer = DatasetAnalyzer((dataset_collection_name, dataset_name), is_train=True, df=df_train)
        # The metadata file was just written for the test part, hence no overwrite here.
        train_analyzer.save_to_json(target_meta_filepath, overwrite=False)

    # The statistics stored in the record are those of the test dataset.
    record_fields = dict(
        collection_name=dataset_collection_name,
        dataset_name=dataset_name,
        train_path=train_path,
        test_path=test_path,
        dataset_type=dataset_type,
        datetime_index=datetime_index,
        split_at=split_at,
        train_type=train_type,
        train_is_normal=train_is_normal,
        input_type=input_type,
        length=meta.length,
        dimensions=meta.dimensions,
        contamination=meta.contamination,
        num_anomalies=meta.num_anomalies,
        min_anomaly_length=meta.anomaly_length.min,
        median_anomaly_length=meta.anomaly_length.median,
        max_anomaly_length=meta.anomaly_length.max,
        mean=meta.mean,
        stddev=meta.stddev,
        trend=meta.trend,
        stationarity=meta.get_stationarity_name(),
        period_size=np.nan,
    )
    dm.add_dataset(DatasetRecord(**record_fields))
    print(f"Processed dataset {dataset_name}: {datasets[s_dataset['filename']]} -> {target_test_filepath}")

In [13]:
# Every trace with a non-zero type becomes a test dataset. It is combined with each
# other trace of the same (app, rate) group that has either type 0 or the same type.
anomalous_traces = df_datasets[df_datasets["tpe"] != 0]
for _, s_dataset in anomalous_traces.iterrows():
    dataset_name = s_dataset["filename"]
    # ``s_dataset.name`` is the (app, rate) index entry of the current row.
    df_partners = df_datasets.loc[s_dataset.name]
    is_other_trace = df_partners["filename"] != dataset_name
    has_usable_type = df_partners["tpe"].isin([s_dataset["tpe"], 0])
    df_partners = df_partners[is_other_trace & has_usable_type]

    if len(df_partners) == 0:
        # Without any partner there is no training data at all.
        print(f"  {dataset_name} is unsupervised!")
        transform_and_record(s_dataset, train_is_normal=False, train_type="unsupervised")
        continue

    for _, s_partner in df_partners.iterrows():
        dataset_name = s_dataset["filename"] + "-" + s_partner["filename"].split("_")[-1]
        print(f"\n## Processing {dataset_name}")

        partner_tpe = s_partner["tpe"]
        if partner_tpe == 0:
            # Training on a type-0 trace.
            print(f"  {dataset_name} is semi-supervised!")
            train_is_normal, train_type = True, "semi-supervised"
        elif partner_tpe == s_dataset["tpe"]:
            # Training on another trace of the same type.
            print(f"  {dataset_name} is supervised!")
            train_is_normal, train_type = False, "supervised"
        else:
            print(f"Found uncompatible partners:\ndataset={s_dataset}\npartner={s_partner}")
            continue

        transform_and_record(s_dataset, s_partner, train_is_normal=train_is_normal, train_type=train_type)

# Persist the dataset registry once all records were added.
dm.save()


## Processing 1_2_100000_68-15
  1_2_100000_68-15 is semi-supervised!
  Preparing test dataset
  > reading dataset 1_2_100000_68.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (2936, 46)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 1_0_100000_15.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (2682, 45)
  Analyzing train metadata


[('Exathlon', '1_2_100000_68-15') (train)] KPSS trend stationarity test for 3_NettyBlockTransfer_shuffle-server_usedDirectMemory_value encountered an error: cannot convert float NaN to integer
[('Exathlon', '1_2_100000_68-15') (train)] KPSS trend stationarity test for 3_executor_bytesWritten_count encountered an error: cannot convert float NaN to integer
[('Exathlon', '1_2_100000_68-15') (train)] KPSS trend stationarity test for 3_jvm_heap_committed_value encountered an error: cannot convert float NaN to integer
[('Exathlon', '1_2_100000_68-15') (train)] KPSS trend stationarity test for 3_jvm_heap_used_value encountered an error: cannot convert float NaN to integer
[('Exathlon', '1_2_100000_68-15') (train)] KPSS trend stationarity test for 3_jvm_pools_PS-Eden-Space_committed_value encountered an error: cannot convert float NaN to integer
[('Exathlon', '1_2_100000_68-15') (train)] KPSS trend stationarity test for 3_jvm_pools_PS-Eden-Space_used_value encountered an error: cannot convert 

Processed dataset 1_2_100000_68-15: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app1/1_2_100000_68.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/1_2_100000_68-15.test.csv

## Processing 1_2_100000_68-16
  1_2_100000_68-16 is semi-supervised!
  Preparing test dataset
  > reading dataset 1_2_100000_68.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (2936, 46)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 1_0_100000_16.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (3582, 45)
  Analyzing train metadata


[('Exathlon', '1_2_100000_68-16') (train)] KPSS trend stationarity test for 3_NettyBlockTransfer_shuffle-server_usedDirectMemory_value encountered an error: cannot convert float NaN to integer
[('Exathlon', '1_2_100000_68-16') (train)] KPSS trend stationarity test for 3_executor_bytesWritten_count encountered an error: cannot convert float NaN to integer
[('Exathlon', '1_2_100000_68-16') (train)] KPSS trend stationarity test for 3_jvm_heap_committed_value encountered an error: cannot convert float NaN to integer
[('Exathlon', '1_2_100000_68-16') (train)] KPSS trend stationarity test for 3_jvm_heap_used_value encountered an error: cannot convert float NaN to integer
[('Exathlon', '1_2_100000_68-16') (train)] KPSS trend stationarity test for 3_jvm_pools_PS-Eden-Space_committed_value encountered an error: cannot convert float NaN to integer
[('Exathlon', '1_2_100000_68-16') (train)] KPSS trend stationarity test for 3_jvm_pools_PS-Eden-Space_used_value encountered an error: cannot convert 

Processed dataset 1_2_100000_68-16: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app1/1_2_100000_68.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/1_2_100000_68-16.test.csv

## Processing 1_4_1000000_80-14
  1_4_1000000_80-14 is semi-supervised!
  Preparing test dataset
  > reading dataset 1_4_1000000_80.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (43087, 6)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 1_0_1000000_14.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (14347, 5)
  Analyzing train metadata


[('Exathlon', '1_4_1000000_80-14') (train)] KPSS trend stationarity test for is_anomaly encountered an error: cannot convert float NaN to integer


Processed dataset 1_4_1000000_80-14: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app1/1_4_1000000_80.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/1_4_1000000_80-14.test.csv

## Processing 1_5_1000000_86-14
  1_5_1000000_86-14 is semi-supervised!
  Preparing test dataset
  > reading dataset 1_5_1000000_86.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (3618, 2)

## Processing 2_1_100000_60-20
  2_1_100000_60-20 is semi-supervised!
  Preparing test dataset
  > reading dataset 2_1_100000_60.csv




  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (46655, 16)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 2_0_100000_20.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (28636, 14)




  Analyzing train metadata


[('Exathlon', '2_1_100000_60-20') (train)] KPSS trend stationarity test for is_anomaly encountered an error: cannot convert float NaN to integer


Processed dataset 2_1_100000_60-20: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app2/2_1_100000_60.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/2_1_100000_60-20.test.csv

## Processing 2_1_100000_60-22
  2_1_100000_60-22 is semi-supervised!
  Preparing test dataset
  > reading dataset 2_1_100000_60.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (46655, 16)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 2_0_100000_22.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (4257, 14)
  Analyzing train metadata


[('Exathlon', '2_1_100000_60-22') (train)] KPSS trend stationarity test for driver_DAGScheduler_stage_waitingStages_value encountered an error: cannot convert float infinity to integer
[('Exathlon', '2_1_100000_60-22') (train)] KPSS trend stationarity test for is_anomaly encountered an error: cannot convert float NaN to integer


Processed dataset 2_1_100000_60-22: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app2/2_1_100000_60.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/2_1_100000_60-22.test.csv
  2_2_200000_69 is unsupervised!
  Preparing test dataset
  > reading dataset 2_2_200000_69.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (2874, 132)
  Analyzing test metadata
Processed dataset 2_2_200000_69: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app2/2_2_200000_69.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/2_2_200000_69.test.csv

## Processing 2_5_1000000_87-88
  2_5_1000000_87-88 is supervised!
  Preparing test dataset
  > reading dataset 2_5_100000



  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (3621, 2)
  3_2_500000_70 is unsupervised!
  Preparing test dataset
  > reading dataset 3_2_500000_70.csv




  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (2611, 210)
  Analyzing test metadata
Processed dataset 3_2_500000_70: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app3/3_2_500000_70.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/3_2_500000_70.test.csv
  3_2_1000000_71 is unsupervised!
  Preparing test dataset
  > reading dataset 3_2_1000000_71.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (2474, 196)
  Analyzing test metadata
Processed dataset 3_2_1000000_71: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app3/3_2_1000000_71.csv -> /home/s



  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (5916, 2)

## Processing 4_1_100000_61-27
  4_1_100000_61-27 is semi-supervised!
  Preparing test dataset
  > reading dataset 4_1_100000_61.csv




  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (129197, 22)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 4_0_100000_27.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (28718, 21)




  Analyzing train metadata


[('Exathlon', '4_1_100000_61-27') (train)] KPSS trend stationarity test for 4_jvm_pools_PS-Survivor-Space_committed_value encountered an error: cannot convert float NaN to integer
[('Exathlon', '4_1_100000_61-27') (train)] KPSS trend stationarity test for is_anomaly encountered an error: cannot convert float NaN to integer


Processed dataset 4_1_100000_61-27: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app4/4_1_100000_61.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/4_1_100000_61-27.test.csv

## Processing 4_1_100000_61-28
  4_1_100000_61-28 is semi-supervised!
  Preparing test dataset
  > reading dataset 4_1_100000_61.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (129197, 22)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 4_0_100000_28.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (28696, 21)




  Analyzing train metadata


[('Exathlon', '4_1_100000_61-28') (train)] KPSS trend stationarity test for 4_jvm_pools_PS-Survivor-Space_committed_value encountered an error: cannot convert float NaN to integer
[('Exathlon', '4_1_100000_61-28') (train)] KPSS trend stationarity test for is_anomaly encountered an error: cannot convert float NaN to integer


Processed dataset 4_1_100000_61-28: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app4/4_1_100000_61.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/4_1_100000_61-28.test.csv

## Processing 4_1_100000_61-29
  4_1_100000_61-29 is semi-supervised!
  Preparing test dataset
  > reading dataset 4_1_100000_61.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (129197, 22)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 4_0_100000_29.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (28703, 21)




  Analyzing train metadata


[('Exathlon', '4_1_100000_61-29') (train)] KPSS trend stationarity test for 4_jvm_pools_PS-Survivor-Space_committed_value encountered an error: cannot convert float NaN to integer
[('Exathlon', '4_1_100000_61-29') (train)] KPSS trend stationarity test for is_anomaly encountered an error: cannot convert float NaN to integer


Processed dataset 4_1_100000_61-29: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app4/4_1_100000_61.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/4_1_100000_61-29.test.csv

## Processing 4_1_100000_61-30
  4_1_100000_61-30 is semi-supervised!
  Preparing test dataset
  > reading dataset 4_1_100000_61.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (129197, 22)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 4_0_100000_30.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (28707, 21)




  Analyzing train metadata


[('Exathlon', '4_1_100000_61-30') (train)] KPSS trend stationarity test for 4_jvm_pools_PS-Survivor-Space_committed_value encountered an error: cannot convert float NaN to integer
[('Exathlon', '4_1_100000_61-30') (train)] KPSS trend stationarity test for is_anomaly encountered an error: cannot convert float NaN to integer


Processed dataset 4_1_100000_61-30: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app4/4_1_100000_61.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/4_1_100000_61-30.test.csv

## Processing 4_1_100000_61-32
  4_1_100000_61-32 is semi-supervised!
  Preparing test dataset
  > reading dataset 4_1_100000_61.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (129197, 22)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 4_0_100000_32.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (86182, 21)




  Analyzing train metadata


[('Exathlon', '4_1_100000_61-32') (train)] KPSS trend stationarity test for 4_jvm_pools_PS-Survivor-Space_committed_value encountered an error: cannot convert float NaN to integer
[('Exathlon', '4_1_100000_61-32') (train)] KPSS trend stationarity test for is_anomaly encountered an error: cannot convert float NaN to integer


Processed dataset 4_1_100000_61-32: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app4/4_1_100000_61.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/4_1_100000_61-32.test.csv

## Processing 4_5_1000000_90-31
  4_5_1000000_90-31 is semi-supervised!
  Preparing test dataset
  > reading dataset 4_5_1000000_90.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (3621, 2)

## Processing 5_1_100000_63-33
  5_1_100000_63-33 is semi-supervised!
  Preparing test dataset
  > reading dataset 5_1_100000_63.csv




  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (43066, 35)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 5_0_100000_33.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (28704, 33)




  Analyzing train metadata


[('Exathlon', '5_1_100000_63-33') (train)] KPSS trend stationarity test for is_anomaly encountered an error: cannot convert float NaN to integer


Processed dataset 5_1_100000_63-33: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app5/5_1_100000_63.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/5_1_100000_63-33.test.csv

## Processing 5_1_100000_63-34
  5_1_100000_63-34 is semi-supervised!
  Preparing test dataset
  > reading dataset 5_1_100000_63.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (43066, 35)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 5_0_100000_34.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (28702, 33)




  Analyzing train metadata


[('Exathlon', '5_1_100000_63-34') (train)] KPSS trend stationarity test for is_anomaly encountered an error: cannot convert float NaN to integer


Processed dataset 5_1_100000_63-34: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app5/5_1_100000_63.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/5_1_100000_63-34.test.csv

## Processing 5_1_100000_63-35
  5_1_100000_63-35 is semi-supervised!
  Preparing test dataset
  > reading dataset 5_1_100000_63.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (43066, 35)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 5_0_100000_35.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (28702, 33)




  Analyzing train metadata


[('Exathlon', '5_1_100000_63-35') (train)] KPSS trend stationarity test for is_anomaly encountered an error: cannot convert float NaN to integer


Processed dataset 5_1_100000_63-35: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app5/5_1_100000_63.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/5_1_100000_63-35.test.csv

## Processing 5_1_100000_63-36
  5_1_100000_63-36 is semi-supervised!
  Preparing test dataset
  > reading dataset 5_1_100000_63.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (43066, 35)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 5_0_100000_36.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (4724, 33)
  Analyzing train metadata


[('Exathlon', '5_1_100000_63-36') (train)] KPSS trend stationarity test for is_anomaly encountered an error: cannot convert float NaN to integer


Processed dataset 5_1_100000_63-36: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app5/5_1_100000_63.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/5_1_100000_63-36.test.csv

## Processing 5_1_100000_63-37
  5_1_100000_63-37 is semi-supervised!
  Preparing test dataset
  > reading dataset 5_1_100000_63.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (43066, 35)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 5_0_100000_37.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (3581, 33)
  Analyzing train metadata


[('Exathlon', '5_1_100000_63-37') (train)] KPSS trend stationarity test for is_anomaly encountered an error: cannot convert float NaN to integer


Processed dataset 5_1_100000_63-37: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app5/5_1_100000_63.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/5_1_100000_63-37.test.csv

## Processing 5_1_100000_63-40
  5_1_100000_63-40 is semi-supervised!
  Preparing test dataset
  > reading dataset 5_1_100000_63.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (43066, 35)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 5_0_100000_40.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (28699, 33)




  Analyzing train metadata


[('Exathlon', '5_1_100000_63-40') (train)] KPSS trend stationarity test for is_anomaly encountered an error: cannot convert float NaN to integer


Processed dataset 5_1_100000_63-40: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app5/5_1_100000_63.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/5_1_100000_63-40.test.csv

## Processing 5_1_100000_63-64
  5_1_100000_63-64 is supervised!
  Preparing test dataset
  > reading dataset 5_1_100000_63.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (43066, 35)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 5_1_100000_64.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (46660, 33)




  Analyzing train metadata
Processed dataset 5_1_100000_63-64: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app5/5_1_100000_63.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/5_1_100000_63-64.test.csv

## Processing 5_1_100000_64-33
  5_1_100000_64-33 is semi-supervised!
  Preparing test dataset
  > reading dataset 5_1_100000_64.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (46660, 23)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 5_0_100000_33.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (28704, 20)




  Analyzing train metadata


[('Exathlon', '5_1_100000_64-33') (train)] KPSS trend stationarity test for is_anomaly encountered an error: cannot convert float NaN to integer


Processed dataset 5_1_100000_64-33: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app5/5_1_100000_64.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/5_1_100000_64-33.test.csv

## Processing 5_1_100000_64-34
  5_1_100000_64-34 is semi-supervised!
  Preparing test dataset
  > reading dataset 5_1_100000_64.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (46660, 23)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 5_0_100000_34.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (28702, 20)




  Analyzing train metadata


[('Exathlon', '5_1_100000_64-34') (train)] KPSS trend stationarity test for is_anomaly encountered an error: cannot convert float NaN to integer


Processed dataset 5_1_100000_64-34: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app5/5_1_100000_64.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/5_1_100000_64-34.test.csv

## Processing 5_1_100000_64-35
  5_1_100000_64-35 is semi-supervised!
  Preparing test dataset
  > reading dataset 5_1_100000_64.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (46660, 23)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 5_0_100000_35.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (28702, 20)




  Analyzing train metadata


[('Exathlon', '5_1_100000_64-35') (train)] KPSS trend stationarity test for is_anomaly encountered an error: cannot convert float NaN to integer


Processed dataset 5_1_100000_64-35: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app5/5_1_100000_64.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/5_1_100000_64-35.test.csv

## Processing 5_1_100000_64-36
  5_1_100000_64-36 is semi-supervised!
  Preparing test dataset
  > reading dataset 5_1_100000_64.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (46660, 23)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 5_0_100000_36.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (4724, 20)
  Analyzing train metadata


[('Exathlon', '5_1_100000_64-36') (train)] KPSS trend stationarity test for is_anomaly encountered an error: cannot convert float NaN to integer


Processed dataset 5_1_100000_64-36: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app5/5_1_100000_64.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/5_1_100000_64-36.test.csv

## Processing 5_1_100000_64-37
  5_1_100000_64-37 is semi-supervised!
  Preparing test dataset
  > reading dataset 5_1_100000_64.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (46660, 23)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 5_0_100000_37.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (3581, 20)
  Analyzing train metadata


[('Exathlon', '5_1_100000_64-37') (train)] KPSS trend stationarity test for is_anomaly encountered an error: cannot convert float NaN to integer


Processed dataset 5_1_100000_64-37: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app5/5_1_100000_64.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/5_1_100000_64-37.test.csv

## Processing 5_1_100000_64-40
  5_1_100000_64-40 is semi-supervised!
  Preparing test dataset
  > reading dataset 5_1_100000_64.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (46660, 23)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 5_0_100000_40.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (28699, 20)




  Analyzing train metadata


[('Exathlon', '5_1_100000_64-40') (train)] KPSS trend stationarity test for is_anomaly encountered an error: cannot convert float NaN to integer


Processed dataset 5_1_100000_64-40: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app5/5_1_100000_64.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/5_1_100000_64-40.test.csv

## Processing 5_1_100000_64-63
  5_1_100000_64-63 is supervised!
  Preparing test dataset
  > reading dataset 5_1_100000_64.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (46660, 23)
  Analyzing test metadata
  Preparing train dataset
  > reading dataset 5_1_100000_63.csv
  > adding timestamp index
  > adding anomaly labels
  > reducing columns
  > final dataset shape: (43066, 20)




  Analyzing train metadata
Processed dataset 5_1_100000_64-63: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app5/5_1_100000_64.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/5_1_100000_64-63.test.csv
  5_1_500000_62 is unsupervised!
  Preparing test dataset
  > reading dataset 5_1_500000_62.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (46660, 18)
  Analyzing test metadata
Processed dataset 5_1_500000_62: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app5/5_1_500000_62.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/5_1_500000_62.test.csv
  5_2_1000000_72 is unsupervised!
  Preparing test dataset
  > reading dataset 5_2_1000000_72.cs



  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (43075, 2)

## Processing 5_5_1000000_92-91
  5_5_1000000_92-91 is supervised!
  Preparing test dataset
  > reading dataset 5_5_1000000_92.csv




  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (3614, 2)
  6_3_200000_76 is unsupervised!
  Preparing test dataset
  > reading dataset 6_3_200000_76.csv




  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (46654, 9)
  Analyzing test metadata
Processed dataset 6_3_200000_76: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app6/6_3_200000_76.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/6_3_200000_76.test.csv
  6_1_500000_65 is unsupervised!
  Preparing test dataset
  > reading dataset 6_1_500000_65.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (46649, 14)
  Analyzing test metadata
Processed dataset 6_1_500000_65: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app6/6_1_500000_65.csv -> /home/sebast



  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (46641, 6)
  Analyzing test metadata
Processed dataset 8_3_200000_73: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app8/8_3_200000_73.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/8_3_200000_73.test.csv
  8_4_1000000_77 is unsupervised!
  Preparing test dataset
  > reading dataset 8_4_1000000_77.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (43078, 4)
  Analyzing test metadata
Processed dataset 8_4_1000000_77: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app8/8_4_1000000_77.csv -> /home/seb



  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (46650, 9)
  Analyzing test metadata
Processed dataset 9_3_500000_74: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app9/9_3_500000_74.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/9_3_500000_74.test.csv
  9_2_1000000_66 is unsupervised!
  Preparing test dataset
  > reading dataset 9_2_1000000_66.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (7481, 241)
  Analyzing test metadata
Processed dataset 9_2_1000000_66: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app9/9_2_1000000_66.csv -> /home/se



  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (10250, 167)
  Analyzing test metadata
Processed dataset 10_2_1000000_67: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app10/10_2_1000000_67.csv -> /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-processed/multivariate/Exathlon/10_2_1000000_67.test.csv
  10_3_1000000_75 is unsupervised!
  Preparing test dataset
  > reading dataset 10_3_1000000_75.csv
  > adding timestamp index
  > adding anomaly labels
  > removing constant and empty columns
  > selecting with label correlated columns
  > removing highly correlated columns
  > final dataset shape: (46656, 10)
  Analyzing test metadata
Processed dataset 10_3_1000000_75: /home/sebastian/Documents/Projects/akita/data/benchmark-data/data-raw/exathlon/data/raw/app10/10_3_1000000_75.c



In [16]:
# Refresh the dataset manager, then display only the index rows whose first
# level (collection name) equals this notebook's collection; all columns are kept.
dm.refresh()
dm.df().loc[dataset_collection_name:dataset_collection_name, :]

Unnamed: 0_level_0,Unnamed: 1_level_0,train_path,test_path,dataset_type,datetime_index,split_at,train_type,train_is_normal,input_type,length,dimensions,contamination,num_anomalies,min_anomaly_length,median_anomaly_length,max_anomaly_length,mean,stddev,trend,stationarity,period_size
collection_name,dataset_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Exathlon,10_2_1000000_67,,multivariate/Exathlon/10_2_1000000_67.test.csv,real,True,,unsupervised,False,multivariate,10250,165,0.510146,1,5229,5229,5229,3475919000.0,4479414000.0,kubic trend,trend_stationary,
Exathlon,10_3_1000000_75,,multivariate/Exathlon/10_3_1000000_75.test.csv,real,True,,unsupervised,False,multivariate,46656,8,0.081169,4,885,965,971,615865.9,178713.8,no trend,difference_stationary,
Exathlon,10_4_1000000_79,,multivariate/Exathlon/10_4_1000000_79.test.csv,real,True,,unsupervised,False,multivariate,43086,3,0.128766,6,474,831,1581,1689.696,271.663,no trend,difference_stationary,
Exathlon,1_2_100000_68-15,multivariate/Exathlon/1_2_100000_68-15.train.csv,multivariate/Exathlon/1_2_100000_68-15.test.csv,real,True,,semi-supervised,True,multivariate,2936,44,0.788147,1,2314,2314,2314,11697110000.0,16855650000.0,kubic trend,trend_stationary,
Exathlon,1_2_100000_68-16,multivariate/Exathlon/1_2_100000_68-16.train.csv,multivariate/Exathlon/1_2_100000_68-16.test.csv,real,True,,semi-supervised,True,multivariate,2936,44,0.788147,1,2314,2314,2314,11697110000.0,16855650000.0,kubic trend,trend_stationary,
Exathlon,1_4_1000000_80-14,multivariate/Exathlon/1_4_1000000_80-14.train.csv,multivariate/Exathlon/1_4_1000000_80-14.test.csv,real,True,,semi-supervised,True,multivariate,43087,4,0.128299,6,464,828,1538,798.356,156.3176,no trend,difference_stationary,
Exathlon,2_1_100000_60-20,multivariate/Exathlon/2_1_100000_60-20.train.csv,multivariate/Exathlon/2_1_100000_60-20.test.csv,real,True,,semi-supervised,True,multivariate,46655,14,0.063187,3,978,982,988,1050236000.0,124610700.0,no trend,trend_stationary,
Exathlon,2_1_100000_60-22,multivariate/Exathlon/2_1_100000_60-22.train.csv,multivariate/Exathlon/2_1_100000_60-22.test.csv,real,True,,semi-supervised,True,multivariate,46655,14,0.063187,3,978,982,988,1050236000.0,124610700.0,no trend,trend_stationary,
Exathlon,2_2_200000_69,,multivariate/Exathlon/2_2_200000_69.test.csv,real,True,,unsupervised,False,multivariate,2874,130,0.575505,1,1654,1654,1654,18880700000.0,21363420000.0,kubic trend,trend_stationary,
Exathlon,3_2_1000000_71,,multivariate/Exathlon/3_2_1000000_71.test.csv,real,True,,unsupervised,False,multivariate,2474,194,0.183104,1,453,453,453,6744530000.0,2986543000.0,kubic trend,trend_stationary,


Datasets for which our automatic extraction of relevant columns (dimensions/channels/variates) did not work properly:

- 1_5_1000000_86
- 2_5_1000000_87
- 2_5_1000000_88
- 3_4_1000000_81
- 3_5_1000000_89
- 4_5_1000000_90
- 5_4_1000000_82
- 5_5_1000000_91
- 5_5_1000000_92
- 6_5_1000000_93
- 8_5_1000000_83
- 9_5_1000000_84
- 10_5_1000000_85

These datasets require manual inspection and preprocessing!

## Exploration

In [None]:
# Display the anomaly label table; plot_dataset and preprocess both look up
# the "begin"/"end" anomaly ranges of a trace in this frame.
df_labels

In [None]:
# Plot a window of columns of the trace stored at position 23 of the
# `datasets` mapping, together with its anomaly ranges.
plot_dataset(list(datasets.values())[23], start_column_idx=110, end_column_idx=140)
plt.show()

In [None]:
# `datasets` maps file stems to paths, so it has no integer key 23; select the
# trace by position instead (the same way the neighbouring cells do).
df = pd.read_csv(list(datasets.values())[23])
print(f"Columns = {df.shape[1]}")
print(f"Rows = {df.shape[0]}")
df.describe()

In [None]:
# List every column name of the loaded trace.
list(df.columns)

In [None]:
# Absolute pairwise column correlations, reduced to the strict upper triangle
# (everything on or below the diagonal becomes NaN) so that each column pair
# appears exactly once.
df_corr = (
    df.corr()
    .abs()
    .pipe(lambda corr: corr.where(np.triu(np.ones(corr.shape), k=1).astype(np.bool_)))
)
df_corr

In [None]:
# Drop every column whose correlation with at least one other column in the
# upper-triangle matrix exceeds the threshold.
threshold = 0.95
to_drop = [
    column
    for column, correlations in df_corr.items()
    if (correlations > threshold).any()
]
df2 = df.drop(columns=to_drop)
df2

In [None]:
# Collect the raw trace files and key them by file stem.
datasets = find_datasets(source_folder)
datasets = dict((d.stem, d) for d in datasets)
# Overview table: one row per file; "app", "tpe" and "rate" start out empty.
df_datasets = pd.DataFrame({"filename": list(datasets.keys())}, columns=["filename", "app", "tpe", "rate"])
# The first three "_"-separated parts of each file name are written
# positionally into the three columns after "filename" (app, tpe, rate).
df_datasets.iloc[:, 1:] = df_datasets["filename"].str.split("_", expand=True).iloc[:, :3]
# Convert the extracted parts with astype(np.int_) and write them back.
df_datasets.loc[:, ["app", "tpe", "rate"]] = df_datasets[["app", "tpe", "rate"]].astype(np.int_)
# Number of files per (app, rate, tpe) combination; this is the value displayed by the cell.
res = df_datasets.groupby(by=["app", "rate", "tpe"]).count()
# Re-index the overview by (app, rate) and sort it; must happen after the
# groupby above, which still needs "app" and "rate" as regular columns.
df_datasets = df_datasets.set_index(["app", "rate"]).sort_index()
res

In [None]:
def preprocess(
    dataset_name: Path,
    corr_threshold: float = 0.95,
    log_prefix: str = "  ",
    preselected_columns: Optional[List[str]] = None,
    label_corr_threshold: float = 0.5,
):
    """Load one raw trace CSV, label its anomalies and reduce its columns.

    The CSV column ``t`` is interpreted as seconds since the epoch and becomes
    the ``timestamp`` index while processing. Anomaly ranges are looked up in
    the notebook-level ``df_labels`` frame under the file stem; every row of
    the trace between a range's ``begin`` and ``end`` (both converted with
    ``unit="s"``) gets ``is_anomaly = 1``, all other rows ``0``.

    Column reduction then works in one of two modes:

    * ``preselected_columns`` given: keep exactly those columns (plus
      ``is_anomaly``), in sorted order, and warn about names that the trace
      does not contain.
    * otherwise: drop columns that are constant or contain only ``0``/``-1``,
      keep the columns whose absolute correlation with ``is_anomaly`` exceeds
      ``label_corr_threshold``, and finally drop columns whose absolute
      correlation with an earlier column exceeds ``corr_threshold``.

    Progress messages are printed, each prefixed with ``log_prefix``.

    Args:
        dataset_name: Path of the trace CSV file.
        corr_threshold: Upper bound for the absolute pairwise correlation
            between two retained feature columns.
        log_prefix: Prefix for every progress message.
        preselected_columns: Column names to keep instead of running the
            automatic selection.
        label_corr_threshold: Minimum absolute correlation with the label
            that a column needs in order to be kept by the automatic selection.

    Returns:
        A new DataFrame with the ``timestamp`` index turned back into a
        regular column, the selected feature columns and ``is_anomaly``.
    """
    print(f"{log_prefix}reading dataset {dataset_name.name}")
    df = pd.read_csv(dataset_name)
    s_index = pd.Index(pd.to_datetime(df["t"], unit="s"), name="timestamp")
    df = df.drop(columns=["t"])

    print(f"{log_prefix}adding timestamp index")
    df.index = s_index

    # add labels
    print(f"{log_prefix}adding anomaly labels")
    df["is_anomaly"] = 0
    try:
        # slice lookup keeps the result a DataFrame even for a single match
        anomalies = df_labels.loc[dataset_name.stem:dataset_name.stem]
    except KeyError:
        anomalies = pd.DataFrame(columns=df_labels.columns, index=pd.Index([], name="trace"))
    for _, anomaly in anomalies.iterrows():
        begin = pd.to_datetime(anomaly["begin"], unit="s")
        end = pd.to_datetime(anomaly["end"], unit="s")
        df.loc[begin:end, "is_anomaly"] = 1

    if preselected_columns is not None:
        # use columns from test dataset (preselected)
        print(f"{log_prefix}reducing columns")
        columns = list(np.unique(list(preselected_columns) + ["is_anomaly"]))
        unavailable_columns = set(columns) - set(df.columns)
        if len(unavailable_columns) > 0:
            warnings.warn(f"Preselected columns contain column names that are not in the dataset ({unavailable_columns})")
        df = df[[c for c in columns if c not in unavailable_columns]]
    else:
        # columns with no real value
        print(f"{log_prefix}removing constant and empty columns")
        to_drop = [
            c for c in df.columns
            if c != "is_anomaly" and (np.all(df[c].isin([0, -1])) or df[c].unique().shape[0] == 1)
        ]
        df = df.drop(columns=to_drop)

        # only select correlated values
        print(f"{log_prefix}selecting with label correlated columns")
        s_corr = df.drop(columns=["is_anomaly"]).corrwith(df["is_anomaly"], drop=True).abs()
        cols = list(s_corr[s_corr > label_corr_threshold].index.values)
        df = df[cols + ["is_anomaly"]]

        # drop highly correlated columns; only the strict upper triangle is
        # inspected, so of two correlated columns the later one is removed
        print(f"{log_prefix}removing highly correlated columns")
        df_corr = df.drop(columns=["is_anomaly"]).corr().abs()
        df_corr = df_corr.where(np.triu(np.ones(df_corr.shape), k=1).astype(np.bool_))
        to_drop = [c for c in df_corr.columns if any(df_corr[c] > corr_threshold)]
        # reassign instead of mutating in place: df is a column selection of
        # the previous frame at this point
        df = df.drop(columns=to_drop)

    # make timestamp index a column
    df = df.reset_index(drop=False)
    print(f"{log_prefix}final dataset shape: {df.shape}")

    return df
    

# Run the preprocessing on the trace stored at position 24 of the `datasets`
# mapping and display the resulting frame.
df = preprocess(list(datasets.values())[24])
df

In [None]:
# Absolute correlation of every remaining column with the anomaly label;
# show the ones above 0.5 in ascending order.
df_corr = (
    df.drop(columns=["is_anomaly"])
    .corrwith(df["is_anomaly"], drop=True)
    .abs()
)
corr_cols = df_corr.loc[df_corr > 0.5]
corr_cols.sort_values()

In [None]:
# Plot a single metric without its legend. The commented lines below can be
# re-enabled to zoom into a value range or a time window.
df[["driver_BlockManager_memory_memUsed_MB_value"]].plot().get_legend().remove()
# plt.gca().set_ylim(0, 20)
# plt.gca().set_xlim(pd.to_datetime(1.527534e+09, unit="s"), pd.to_datetime(1.527537e+09, unit="s"))
plt.show()