# Server Machine Dataset (SMD) from OmniAnomaly

In [1]:
import pandas as pd
import os
from typing import Final
from collections.abc import Callable
from config import data_raw_folder, data_processed_folder
from timeeval import Datasets

In [2]:
dataset_collection_name = "SMD"
source_folder = os.path.join(data_raw_folder, "Server Machine Dataset")
target_folder = data_processed_folder

from pathlib import Path
print(f"Looking for source datasets in {Path(source_folder).absolute()} and\nsaving processed datasets in {Path(target_folder).absolute()}")

Looking for source datasets in /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset and
saving processed datasets in /home/projects/akita/data/benchmark-data/data-processed


file handling and transformations

In [3]:
def list_regular_files(path: str) -> list[str]:
    return [f for f in os.listdir(path)if os.path.isfile(os.path.join(path, f))]

def get_source_path(file, tpe="train"):
    return os.path.join(source_folder, tpe, file)

def calc_size(filename: str) -> int:
    with open(filename, 'r') as f:
        c = 0
        for line in f:
            c += 1
    return c

In [4]:
def transform_and_label(source_file: str, target: str, tpe: str) -> int:
    df = pd.read_csv(get_source_path(source_file, tpe), header=None)
    df.index.name = "timestamp"
    df.columns = list(map(lambda v: f"value-{v}", df.columns))

    if tpe == "test":
        df_label = pd.read_csv(get_source_path(source_file, "test_label"), header=None)
        df_label.columns=["is_anomaly"]
        df = pd.merge(df, df_label, left_index=True, right_index=True, how="inner")
    else:
        df["is_anomaly"] = 0

    df.to_csv(target)
    return len(df)

In [5]:
# shared by all datasets
dataset_type = "real"
train_is_normal = True
train_type = "semi-supervised"
input_type = "multivariate"
datetime_index = False

# create target directory
dataset_subfolder = os.path.join(input_type, dataset_collection_name)
target_subfolder = os.path.join(target_folder, dataset_subfolder)
try:
    os.makedirs(target_subfolder)
    print(f"Created directories {target_subfolder}")
except FileExistsError:
    print(f"Directories {target_subfolder} already exist")
    pass

dm = Datasets(target_folder)

Directories /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD already exist


In [6]:
# dataset transformation
transform_file: Callable[[str, str, str], int] = transform_and_label

for f in list_regular_files(get_source_path(".")):
    paths = {}
    for t_type in ["train", "test"]:
        dataset_name = os.path.splitext(f)[0]
        source_file = get_source_path(f, t_type)
        filename = f"{dataset_name}.{t_type}.csv"
        path = os.path.join(dataset_subfolder, filename)
        target_filepath = os.path.join(target_subfolder, filename)
        paths[t_type] = path
        
        # transform file
        dataset_length = transform_file(f, target_filepath, t_type)
        print(f"Processed source dataset {source_file} -> {target_filepath}")

    # save metadata
    dm.add_dataset((dataset_collection_name, dataset_name),
        train_path = paths["train"],
        test_path = paths["test"],
        dataset_type = dataset_type,
        datetime_index = datetime_index,
        split_at = None,
        train_type = train_type,
        train_is_normal = train_is_normal,
        input_type = input_type,
        dataset_length = dataset_length
    )

# save metadata of benchmark
dm.save()

Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-3-6.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-6.train.csv
Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-3-6.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-6.test.csv
Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-1-8.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-1-8.train.csv
Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-1-8.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-1-8.test.csv
Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-1-1.txt -> /home/pro

In [7]:
dm.refresh()
dm.df().loc[(slice(dataset_collection_name,dataset_collection_name), slice(None))]

Unnamed: 0_level_0,Unnamed: 1_level_0,train_path,test_path,dataset_type,datetime_index,split_at,train_type,train_is_normal,input_type,length
collection_name,dataset_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
SMD,machine-1-1,multivariate/SMD/machine-1-1.train.csv,multivariate/SMD/machine-1-1.test.csv,real,False,,semi-supervised,True,multivariate,28479
SMD,machine-1-2,multivariate/SMD/machine-1-2.train.csv,multivariate/SMD/machine-1-2.test.csv,real,False,,semi-supervised,True,multivariate,23694
SMD,machine-1-3,multivariate/SMD/machine-1-3.train.csv,multivariate/SMD/machine-1-3.test.csv,real,False,,semi-supervised,True,multivariate,23703
SMD,machine-1-4,multivariate/SMD/machine-1-4.train.csv,multivariate/SMD/machine-1-4.test.csv,real,False,,semi-supervised,True,multivariate,23707
SMD,machine-1-5,multivariate/SMD/machine-1-5.train.csv,multivariate/SMD/machine-1-5.test.csv,real,False,,semi-supervised,True,multivariate,23706
SMD,machine-1-6,multivariate/SMD/machine-1-6.train.csv,multivariate/SMD/machine-1-6.test.csv,real,False,,semi-supervised,True,multivariate,23689
SMD,machine-1-7,multivariate/SMD/machine-1-7.train.csv,multivariate/SMD/machine-1-7.test.csv,real,False,,semi-supervised,True,multivariate,23697
SMD,machine-1-8,multivariate/SMD/machine-1-8.train.csv,multivariate/SMD/machine-1-8.test.csv,real,False,,semi-supervised,True,multivariate,23699
SMD,machine-2-1,multivariate/SMD/machine-2-1.train.csv,multivariate/SMD/machine-2-1.test.csv,real,False,,semi-supervised,True,multivariate,23694
SMD,machine-2-2,multivariate/SMD/machine-2-2.train.csv,multivariate/SMD/machine-2-2.test.csv,real,False,,semi-supervised,True,multivariate,23700


## Experimentation

In [None]:
train_folder = os.path.join(source_folder, "train")
for f in list_regular_files(get_source_path(".")):
    for p in ["train", "test"]:
        file = get_source_path(f, p)
        filename = f"{os.path.splitext(f)[0]}.{p}.csv"
        if p == "test":
            file = file + " & " + get_source_path(f, "test_label")
        print(p, ":", file, "->", os.path.join(dataset_subfolder, filename))

In [None]:
df = pd.read_csv(get_source_path("machine-1-1.txt", "test"), header=None)
df.index.name = "timestamp"
df.columns = list(map(lambda v: f"value-{v}", df.columns))
df

df_label = pd.read_csv(get_source_path("machine-1-1.txt", "test_label"), header=None)
df_label.columns=["is_anomaly"]
df = pd.merge(df, df_label, left_index=True, right_index=True, how="inner")
#df.to_csv("test.csv")
df

In [None]:
pd.read_csv(os.path.join(target_folder, input_type, dataset_collection_name, "machine-3-11.train.csv"))