# IOPS AI Challenge Dataset

In [10]:
import os
from collections.abc import Callable
from pathlib import Path
from typing import Final

import matplotlib.pyplot as plt
import pandas as pd
from timeeval import Datasets

from config import data_raw_folder, data_processed_folder

In [11]:
# Set the default matplotlib figure size used by the plots in this notebook.
plt.rcParams['figure.figsize'] = (20, 10)

In [12]:
# Collection name; used below as the output subfolder name and as the first
# part of the (collection, dataset) key passed to the metadata index.
dataset_collection_name = "IOPS"
# Folder containing the raw challenge files, and the root folder that receives
# the processed CSV files.
source_folder = os.path.join(data_raw_folder, "IOPS AI Challenge/KPI Anomaly Detection Final Data Set")
target_folder = data_processed_folder

# Show the resolved absolute locations so a wrong configuration is spotted early.
print(f"Looking for source datasets in {Path(source_folder).absolute()} and\nsaving processed datasets in {Path(target_folder).absolute()}")

Looking for source datasets in /home/projects/akita/data/benchmark-data/data-raw/IOPS AI Challenge/KPI Anomaly Detection Final Data Set and
saving processed datasets in /home/projects/akita/data/benchmark-data/data-processed


In [13]:
def transform(df: pd.DataFrame, kpi: str) -> pd.DataFrame:
    """Extract the time series of a single KPI in the target column layout.

    Args:
        df: Frame with at least the columns ``KPI ID``, ``timestamp``,
            ``value`` and ``label``; ``timestamp`` is parsed as seconds
            since the Unix epoch.
        kpi: Value of ``KPI ID`` whose rows are selected.

    Returns:
        A new frame with the columns ``timestamp`` (datetime), ``value`` and
        ``is_anomaly`` (the former ``label``). The row index of the selected
        rows is kept and the input frame is not modified.
    """
    wanted_columns = ["timestamp", "value", "label"]
    # Work on a copy so the timestamp conversion never touches the caller's frame.
    series = df.loc[df["KPI ID"] == kpi, wanted_columns].copy()
    series["timestamp"] = pd.to_datetime(series["timestamp"], unit="s")
    return series.rename(columns={"label": "is_anomaly"})

## Transformation

In [14]:
# Metadata values recorded for every dataset of this collection.
dataset_type = "real"
input_type = "univariate"
datetime_index = True
train_type = "supervised"
train_is_normal = False

# Processed files go to <target_folder>/<input_type>/<collection name>.
dataset_subfolder = os.path.join(input_type, dataset_collection_name)
target_subfolder = os.path.join(target_folder, dataset_subfolder)
try:
    os.makedirs(target_subfolder)
except FileExistsError:
    # An existing directory is fine; only report it.
    print(f"Directories {target_subfolder} already exist")
else:
    print(f"Created directories {target_subfolder}")

# Metadata index rooted at the processed-data folder.
dm = Datasets(target_folder)

Directories /home/projects/akita/data/benchmark-data/data-processed/univariate/IOPS already exist


In [15]:
print("Reading source datasets...")
df_train = pd.read_csv(os.path.join(source_folder, "phase2_train.csv"))
df_test = pd.read_hdf(os.path.join(source_folder, "phase2_ground_truth.hdf"))
# Cast the test KPI IDs to plain strings before they are compared and used in file names.
df_test["KPI ID"] = df_test["KPI ID"].map(str)
dfs = {"train": df_train, "test": df_test}
print("...done.\n\nStarting processing...")

# Only KPIs that occur in the test data are processed.
for kpi in df_test["KPI ID"].unique():
    paths = {}
    dataset_length = 0

    # Write one CSV per split ("train" first, then "test").
    for t_type, df_split in dfs.items():
        filename = f"KPI-{kpi}.{t_type}.csv"
        target_filepath = os.path.join(target_subfolder, filename)
        # The metadata stores the path relative to the processed-data folder.
        paths[t_type] = os.path.join(dataset_subfolder, filename)

        df_target = transform(df_split, kpi)
        df_target.to_csv(target_filepath, index=False)
        print(f"Processed KPI {kpi} -> {target_filepath}")

        # The recorded dataset length is the length of the test split.
        if t_type == "test":
            dataset_length = len(df_target)

    dataset_name = kpi

    # Register the dataset in the metadata index.
    dm.add_dataset(
        (dataset_collection_name, dataset_name),
        train_path=paths["train"],
        test_path=paths["test"],
        dataset_type=dataset_type,
        datetime_index=datetime_index,
        split_at=None,
        train_type=train_type,
        train_is_normal=train_is_normal,
        input_type=input_type,
        dataset_length=dataset_length,
    )

print("...done.")
dm.save()
print("Metadata saved.")

Reading source datasets...
...done.

Starting processing...
Processed KPI da10a69f-d836-3baa-ad40-3e548ecf1fbd -> /home/projects/akita/data/benchmark-data/data-processed/univariate/IOPS/KPI-da10a69f-d836-3baa-ad40-3e548ecf1fbd.train.csv
Processed KPI da10a69f-d836-3baa-ad40-3e548ecf1fbd -> /home/projects/akita/data/benchmark-data/data-processed/univariate/IOPS/KPI-da10a69f-d836-3baa-ad40-3e548ecf1fbd.test.csv
Processed KPI e0747cad-8dc8-38a9-a9ab-855b61f5551d -> /home/projects/akita/data/benchmark-data/data-processed/univariate/IOPS/KPI-e0747cad-8dc8-38a9-a9ab-855b61f5551d.train.csv
Processed KPI e0747cad-8dc8-38a9-a9ab-855b61f5551d -> /home/projects/akita/data/benchmark-data/data-processed/univariate/IOPS/KPI-e0747cad-8dc8-38a9-a9ab-855b61f5551d.test.csv
Processed KPI ab216663-dcc2-3a24-b1ee-2c3e550e06c9 -> /home/projects/akita/data/benchmark-data/data-processed/univariate/IOPS/KPI-ab216663-dcc2-3a24-b1ee-2c3e550e06c9.train.csv
Processed KPI ab216663-dcc2-3a24-b1ee-2c3e550e06c9 -> /ho

In [17]:
# Refresh the metadata index (presumably re-reads the saved state - confirm
# against the timeeval docs), then show only the rows of this collection.
dm.refresh()
collection_rows = pd.IndexSlice[dataset_collection_name:dataset_collection_name, :]
dm.df().loc[collection_rows]

Unnamed: 0_level_0,Unnamed: 1_level_0,train_path,test_path,dataset_type,datetime_index,split_at,train_type,train_is_normal,input_type,length
collection_name,dataset_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
IOPS,05f10d3a-239c-3bef-9bdc-a2feeb0037aa,univariate/IOPS/KPI-05f10d3a-239c-3bef-9bdc-a2...,univariate/IOPS/KPI-05f10d3a-239c-3bef-9bdc-a2...,real,True,,supervised,False,univariate,149130
IOPS,0efb375b-b902-3661-ab23-9a0bb799f4e3,univariate/IOPS/KPI-0efb375b-b902-3661-ab23-9a...,univariate/IOPS/KPI-0efb375b-b902-3661-ab23-9a...,real,True,,supervised,False,univariate,8784
IOPS,1c6d7a26-1f1a-3321-bb4d-7a9d969ec8f0,univariate/IOPS/KPI-1c6d7a26-1f1a-3321-bb4d-7a...,univariate/IOPS/KPI-1c6d7a26-1f1a-3321-bb4d-7a...,real,True,,supervised,False,univariate,149156
IOPS,301c70d8-1630-35ac-8f96-bc1b6f4359ea,univariate/IOPS/KPI-301c70d8-1630-35ac-8f96-bc...,univariate/IOPS/KPI-301c70d8-1630-35ac-8f96-bc...,real,True,,supervised,False,univariate,8784
IOPS,42d6616d-c9c5-370a-a8ba-17ead74f3114,univariate/IOPS/KPI-42d6616d-c9c5-370a-a8ba-17...,univariate/IOPS/KPI-42d6616d-c9c5-370a-a8ba-17...,real,True,,supervised,False,univariate,149161
IOPS,43115f2a-baeb-3b01-96f7-4ea14188343c,univariate/IOPS/KPI-43115f2a-baeb-3b01-96f7-4e...,univariate/IOPS/KPI-43115f2a-baeb-3b01-96f7-4e...,real,True,,supervised,False,univariate,110629
IOPS,431a8542-c468-3988-a508-3afd06a218da,univariate/IOPS/KPI-431a8542-c468-3988-a508-3a...,univariate/IOPS/KPI-431a8542-c468-3988-a508-3a...,real,True,,supervised,False,univariate,111566
IOPS,4d2af31a-9916-3d9f-8a8e-8a268a48c095,univariate/IOPS/KPI-4d2af31a-9916-3d9f-8a8e-8a...,univariate/IOPS/KPI-4d2af31a-9916-3d9f-8a8e-8a...,real,True,,supervised,False,univariate,111370
IOPS,54350a12-7a9d-3ca8-b81f-f886b9d156fd,univariate/IOPS/KPI-54350a12-7a9d-3ca8-b81f-f8...,univariate/IOPS/KPI-54350a12-7a9d-3ca8-b81f-f8...,real,True,,supervised,False,univariate,7616
IOPS,55f8b8b8-b659-38df-b3df-e4a5a8a54bc9,univariate/IOPS/KPI-55f8b8b8-b659-38df-b3df-e4...,univariate/IOPS/KPI-55f8b8b8-b659-38df-b3df-e4...,real,True,,supervised,False,univariate,149133


## Experimentation

In [None]:
# List the entries of the raw source folder.
os.listdir(source_folder)

In [None]:
# Load the training data from the configured source folder (same location the
# transformation cell reads from) instead of a path relative to the working directory.
train = pd.read_csv(os.path.join(source_folder, "phase2_train.csv"))

In [None]:
# Load the test data (ground truth) from the configured source folder (same
# location the transformation cell reads from) instead of a path relative to
# the working directory.
test = pd.read_hdf(os.path.join(source_folder, "phase2_ground_truth.hdf"))

In [None]:
# Cast every KPI ID of the test frame to a plain string, then display the frame.
test["KPI ID"] = test["KPI ID"].map(str)
test

In [None]:
# Distinct KPI IDs present in the test data.
test["KPI ID"].unique()

In [None]:
# Preview the transformed training series of a single KPI.
transform(train, "da10a69f-d836-3baa-ad40-3e548ecf1fbd")

In [None]:
# Preview the transformed test series of the same KPI.
transform(test, "da10a69f-d836-3baa-ad40-3e548ecf1fbd")

In [None]:
# Split the training data into one frame per KPI ID and show the first one.
train_dfs = [kpi_frame for _kpi_id, kpi_frame in train.groupby("KPI ID")]
train_dfs[0]

In [None]:
# Split the test data into one frame per KPI ID and show the first one.
test_dfs = [kpi_frame for _kpi_id, kpi_frame in test.groupby("KPI ID")]
test_dfs[0]

In [None]:
# KPI IDs that occur in the test data but not in the training data.
train_kpis = train["KPI ID"].unique()
test_kpis = test["KPI ID"].unique()
# Compare the KPI ID values themselves: building a set from a DataFrame
# iterates its column labels, which would compare column names instead.
set(test_kpis) - set(train_kpis)

In [None]:
# Draw one figure per test KPI (value over timestamp), then render them.
for kpi_frame in test_dfs:
    kpi_frame.plot(x="timestamp", y=["value"])
plt.show()