# Server Machine Dataset (SMD) from OmniAnomaly

In [43]:
import os
from collections.abc import Callable
from typing import Final, TypedDict

import pandas as pd

from config import data_raw_folder, data_processed_folder

In [44]:
# Name of this dataset collection; it is passed to the metadata as the collection name
# and used as the name of the output subfolder.
dataset_collection_name = "SMD"
# Folder holding the raw Server Machine Dataset files, below the raw-data folder from `config`.
source_folder = os.path.join(data_raw_folder, "Server Machine Dataset")
# Root folder for all processed output (transformed CSV files and the metadata file).
target_folder = data_processed_folder

metadata handling

In [45]:
# type
class DatasetMetadataRecord(TypedDict):
    """
    Shape of a single metadata record, i.e. one element of the list passed to
    `DatasetMetadata.add_datasets`.

    Declared as a `TypedDict` so that it is a real type (usable in annotations such as
    `list[DatasetMetadataRecord]`) while instances remain plain dictionaries.
    """

    collection_name: str
    dataset_name: str
    train_path: str
    test_path: str
    dataset_type: str
    datetime_index: bool
    split_at: int
    train_type: str
    train_is_normal: bool
    input_type: str
    length: int

class DatasetMetadata:
    """
    In-memory view of the dataset metadata table stored in `datasets.csv`.

    The table is a pandas DataFrame indexed by `(collection_name, dataset_name)`. Changes made
    through `add_dataset` / `add_datasets` only live in memory until `save()` is called. When used
    as a context manager, the table is saved on exit; exceptions raised inside the `with` block are
    NOT suppressed.

    ATTENTION: Not thread-safe! There is no check for changes to the underlying `datasets.csv` file while this class is loaded.
    """

    FILENAME: Final[str] = "datasets.csv"
    # Names of the two index levels that identify a dataset in the metadata table.
    _INDEX_NAMES = ["collection_name", "dataset_name"]

    _filepath: str
    _df: pd.DataFrame
    _dirty: bool

    def __init__(self, target_folder: str):
        """
        Load the metadata file from `target_folder`, creating an empty one if it does not exist.

        :param target_folder: folder that contains (or will contain) `datasets.csv`
        """
        self._filepath = os.path.join(target_folder, self.FILENAME)
        self._dirty = False
        if not os.path.isfile(self._filepath):
            self._df = self._create_metadata_file()
        else:
            self.refresh(force = True)

    def __enter__(self) -> 'DatasetMetadata':
        return self

    def __exit__(self, exception_type, exception_value, exception_traceback) -> None:
        # Persist the in-memory table. Nothing is returned on purpose: a truthy return value
        # from `__exit__` would swallow any exception raised inside the `with` block.
        self.save()

    def __repr__(self) -> str:
        return repr(self._df)

    def __str__(self) -> str:
        return str(self._df)

    def _create_metadata_file(self) -> pd.DataFrame:
        """Write an empty metadata table (header only) to disk and return it."""
        df_temp = pd.DataFrame(columns=["dataset_name", "collection_name", "train_path", "test_path", "type", "datetime_index", "split_at", "train_type", "train_is_normal", "input_type", "length"])
        df_temp.set_index(self._INDEX_NAMES, inplace=True)
        df_temp.to_csv(self._filepath)
        return df_temp

    def _merge(self, df_new: pd.DataFrame) -> None:
        """Append `df_new` to the table; for duplicated index entries the newest row wins."""
        df = pd.concat([self._df, df_new], axis=0)
        df = df[~df.index.duplicated(keep = "last")]
        self._df = df
        self._dirty = True

    def add_dataset(self,
        dataset_name: str,
        collection_name: str,
        train_path: str,
        test_path: str,
        dataset_type: str,
        datetime_index: bool,
        split_at: int,
        train_type: str,
        train_is_normal: bool,
        input_type: str,
        dataset_length: int
    ) -> 'DatasetMetadata':
        """
        Add (or replace) the metadata entry of a single dataset in memory.

        The entry is keyed by `(collection_name, dataset_name)`; an existing entry with the same
        key is overwritten. `dataset_type` is stored in the `type` column and `dataset_length`
        in the `length` column. Call `save()` to persist the change.

        :return: this object, to allow call chaining
        """
        # Use the `collection_name` argument (not a notebook global) and name the index levels,
        # so that the level names are kept when concatenating with the existing table.
        index = pd.MultiIndex.from_tuples([(collection_name, dataset_name)], names=self._INDEX_NAMES)
        df_new = pd.DataFrame({
            "train_path": train_path,
            "test_path": test_path,
            "type": dataset_type,
            "datetime_index": datetime_index,
            "split_at": split_at,
            "train_type": train_type,
            "train_is_normal": train_is_normal,
            "input_type": input_type,
            "length": dataset_length
        }, index=index)
        self._merge(df_new)
        return self

    def add_datasets(self, datasets: 'list[DatasetMetadataRecord]') -> 'DatasetMetadata':
        """
        Add (or replace) the metadata entries of several datasets in memory.

        Each record must contain at least the keys `collection_name` and `dataset_name`. A
        `dataset_type` key is stored in the `type` column, consistent with `add_dataset`.
        An empty list leaves the table untouched.

        :return: this object, to allow call chaining
        """
        if not datasets:
            return self
        df_new = pd.DataFrame(datasets).rename(columns={"dataset_type": "type"})
        df_new.set_index(self._INDEX_NAMES, inplace = True)
        self._merge(df_new)
        return self

    def refresh(self, force: bool = False) -> None:
        """
        Re-read the metadata table from disk, replacing the in-memory table.

        :param force: discard unsaved in-memory changes instead of raising
        :raises Exception: if there are unsaved changes and `force` is not set
        """
        if not force and self._dirty:
            raise Exception("There are unsaved changes in memory that would get lost by reading from disk again!")
        else:
            self._df = pd.read_csv(self._filepath, index_col=self._INDEX_NAMES)

    def save(self) -> None:
        """Write the in-memory table to disk and mark it as clean."""
        self._df.to_csv(self._filepath)
        self._dirty = False

file handling and transformations

In [46]:
def list_regular_files(path: str) -> list[str]:
    """Return the names (not the full paths) of all regular files directly inside `path`."""
    regular_files = []
    for entry in os.listdir(path):
        # directories and other non-file entries are skipped
        if os.path.isfile(os.path.join(path, entry)):
            regular_files.append(entry)
    return regular_files

def get_source_path(file, tpe="train"):
    """Build the path of `file` inside the `tpe` subfolder (default "train") of `source_folder`."""
    split_folder = os.path.join(source_folder, tpe)
    return os.path.join(split_folder, file)

def calc_size(filename: str) -> int:
    """Return the number of lines in the text file `filename`."""
    with open(filename, 'r') as handle:
        # iterate lazily so the file is never loaded into memory as a whole
        return sum(1 for _ in handle)

In [53]:
def transform_and_label(source_file: str, target: str, tpe: str) -> int:
    """
    Convert one header-less source file into a labelled CSV file.

    The source file is looked up in the `tpe` subfolder via `get_source_path`. Its columns are
    renamed to `value-<n>`, the row index is named `timestamp`, and an `is_anomaly` column is
    appended: for `tpe == "test"` it is read from the file of the same name in the `test_label`
    subfolder, otherwise it is a constant 0.

    :param source_file: file name of the source file (without folder)
    :param target: path of the CSV file to write
    :param tpe: name of the source subfolder, e.g. "train" or "test"
    :return: number of rows written
    """
    data = pd.read_csv(get_source_path(source_file, tpe), header=None)
    data.index.name = "timestamp"
    data.columns = [f"value-{col}" for col in data.columns]

    if tpe != "test":
        # every split other than "test" gets a constant 0 label
        data["is_anomaly"] = 0
    else:
        labels = pd.read_csv(get_source_path(source_file, "test_label"), header=None)
        labels.columns = ["is_anomaly"]
        # inner join on the row index: only rows present in both files are kept
        data = pd.merge(data, labels, left_index=True, right_index=True, how="inner")

    data.to_csv(target)
    return len(data)

In [54]:
# shared by all datasets
dataset_type = "real"
train_is_normal = True
train_type = "semi-supervised"
input_type = "multivariate"
datetime_index = False

# create target directory
# This happens BEFORE the metadata object is created: `DatasetMetadata` writes a fresh
# `datasets.csv` into `target_folder` if none exists, which fails when that folder is missing.
# `os.makedirs` creates `target_folder` as a parent of the dataset subfolder.
dataset_subfolder = os.path.join(target_folder, input_type, dataset_collection_name)
try:
    os.makedirs(dataset_subfolder)
    print(f"Created directories {dataset_subfolder}")
except FileExistsError:
    print(f"Directories {dataset_subfolder} already exist")

dm = DatasetMetadata(target_folder)

Directories data-processed/multivariate/SMD already exist


In [55]:
# dataset transformation
transform_file: Callable[[str, str, str], int] = transform_and_label

# the file names found in the "train" source subfolder drive the whole conversion
for f in list_regular_files(get_source_path(".")):
    # the dataset is named after the source file without its extension
    dataset_name = os.path.splitext(f)[0]
    paths = {}
    lengths = {}
    for t_type in ["train", "test"]:
        source_file = get_source_path(f, t_type)
        filename = f"{dataset_name}.{t_type}.csv"
        path = os.path.join(dataset_subfolder, filename)
        paths[t_type] = path

        # transform file
        lengths[t_type] = transform_file(f, path, t_type)
        print(f"Processed source dataset {source_file} -> {path}")

    # the length recorded in the metadata is the row count of the test split
    dataset_length = lengths["test"]

    # save metadata
    dm.add_dataset(
        dataset_name = dataset_name,
        collection_name = dataset_collection_name,
        train_path = paths["train"],
        test_path = paths["test"],
        dataset_type = dataset_type,
        datetime_index = datetime_index,
        split_at = None,
        train_type = train_type,
        train_is_normal = train_is_normal,
        input_type = input_type,
        dataset_length = dataset_length
    )

# save metadata of benchmark
dm.save()

Processed source dataset data-raw/Server Machine Dataset/train/machine-3-6.txt -> data-processed/multivariate/SMD/machine-3-6.train.csv
Processed source dataset data-raw/Server Machine Dataset/test/machine-3-6.txt -> data-processed/multivariate/SMD/machine-3-6.test.csv
Processed source dataset data-raw/Server Machine Dataset/train/machine-1-8.txt -> data-processed/multivariate/SMD/machine-1-8.train.csv
Processed source dataset data-raw/Server Machine Dataset/test/machine-1-8.txt -> data-processed/multivariate/SMD/machine-1-8.test.csv
Processed source dataset data-raw/Server Machine Dataset/train/machine-1-1.txt -> data-processed/multivariate/SMD/machine-1-1.train.csv
Processed source dataset data-raw/Server Machine Dataset/test/machine-1-1.txt -> data-processed/multivariate/SMD/machine-1-1.test.csv
Processed source dataset data-raw/Server Machine Dataset/train/machine-3-9.txt -> data-processed/multivariate/SMD/machine-3-9.train.csv
Processed source dataset data-raw/Server Machine Datas

In [59]:
# Re-read the metadata file from disk and display the entries of the "SMD" collection.
# `refresh()` raises if there are unsaved in-memory changes.
dm.refresh()
dm._df.loc["SMD"]

Unnamed: 0_level_0,train_path,test_path,type,datetime_index,split_at,train_type,train_is_normal,input_type,length
dataset_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
machine-3-6,data-processed/multivariate/SMD/machine-3-6.tr...,data-processed/multivariate/SMD/machine-3-6.te...,real,False,,semi-supervised,True,multivariate,28726
machine-1-8,data-processed/multivariate/SMD/machine-1-8.tr...,data-processed/multivariate/SMD/machine-1-8.te...,real,False,,semi-supervised,True,multivariate,23699
machine-1-1,data-processed/multivariate/SMD/machine-1-1.tr...,data-processed/multivariate/SMD/machine-1-1.te...,real,False,,semi-supervised,True,multivariate,28479
machine-3-9,data-processed/multivariate/SMD/machine-3-9.tr...,data-processed/multivariate/SMD/machine-3-9.te...,real,False,,semi-supervised,True,multivariate,28713
machine-3-10,data-processed/multivariate/SMD/machine-3-10.t...,data-processed/multivariate/SMD/machine-3-10.t...,real,False,,semi-supervised,True,multivariate,23693
machine-3-7,data-processed/multivariate/SMD/machine-3-7.tr...,data-processed/multivariate/SMD/machine-3-7.te...,real,False,,semi-supervised,True,multivariate,28705
machine-1-2,data-processed/multivariate/SMD/machine-1-2.tr...,data-processed/multivariate/SMD/machine-1-2.te...,real,False,,semi-supervised,True,multivariate,23694
machine-2-1,data-processed/multivariate/SMD/machine-2-1.tr...,data-processed/multivariate/SMD/machine-2-1.te...,real,False,,semi-supervised,True,multivariate,23694
machine-2-5,data-processed/multivariate/SMD/machine-2-5.tr...,data-processed/multivariate/SMD/machine-2-5.te...,real,False,,semi-supervised,True,multivariate,23689
machine-2-7,data-processed/multivariate/SMD/machine-2-7.tr...,data-processed/multivariate/SMD/machine-2-7.te...,real,False,,semi-supervised,True,multivariate,23696


## Experimentation

In [12]:
# Dry run: print which source file(s) would be written to which target file, without converting anything.
train_folder = os.path.join(source_folder, "train")
for f in list_regular_files(get_source_path(".")):
    stem = os.path.splitext(f)[0]
    for p in ["train", "test"]:
        sources = [get_source_path(f, p)]
        if p == "test":
            # the test split is built from the data file plus its label file
            sources.append(get_source_path(f, "test_label"))
        file = " & ".join(sources)
        filename = f"{stem}.{p}.csv"
        print(p, ":", file, "->", os.path.join(dataset_subfolder, filename))

train : data-raw/Server Machine Dataset/train/machine-3-6.txt -> data-processed/multivariate/SMD/machine-3-6.train.csv
test : data-raw/Server Machine Dataset/test/machine-3-6.txt & data-raw/Server Machine Dataset/test_label/machine-3-6.txt -> data-processed/multivariate/SMD/machine-3-6.test.csv
train : data-raw/Server Machine Dataset/train/machine-1-8.txt -> data-processed/multivariate/SMD/machine-1-8.train.csv
test : data-raw/Server Machine Dataset/test/machine-1-8.txt & data-raw/Server Machine Dataset/test_label/machine-1-8.txt -> data-processed/multivariate/SMD/machine-1-8.test.csv
train : data-raw/Server Machine Dataset/train/machine-1-1.txt -> data-processed/multivariate/SMD/machine-1-1.train.csv
test : data-raw/Server Machine Dataset/test/machine-1-1.txt & data-raw/Server Machine Dataset/test_label/machine-1-1.txt -> data-processed/multivariate/SMD/machine-1-1.test.csv
train : data-raw/Server Machine Dataset/train/machine-3-9.txt -> data-processed/multivariate/SMD/machine-3-9.tra

In [41]:
# Manual walk-through of the test-split transformation for a single file ("machine-1-1.txt").
# Read the header-less test data, name the index "timestamp" and the columns "value-<n>".
df = pd.read_csv(get_source_path("machine-1-1.txt", "test"), header=None)
df.index.name = "timestamp"
df.columns = list(map(lambda v: f"value-{v}", df.columns))
df

# Read the label file of the same name and attach it as the "is_anomaly" column
# (inner join on the row index).
df_label = pd.read_csv(get_source_path("machine-1-1.txt", "test_label"), header=None)
df_label.columns=["is_anomaly"]
df = pd.merge(df, df_label, left_index=True, right_index=True, how="inner")
#df.to_csv("test.csv")
df

Unnamed: 0_level_0,value-0,value-1,value-2,value-3,value-4,value-5,value-6,value-7,value-8,value-9,...,value-29,value-30,value-31,value-32,value-33,value-34,value-35,value-36,value-37,is_anomaly
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.075269,0.065678,0.070234,0.074332,0.0,0.933333,0.274011,0.0,0.031081,0.000000,...,0.008596,0.068036,0.048893,0.000386,0.000034,0.064432,0.064500,0.0,0.0,0
1,0.086022,0.080508,0.075808,0.076655,0.0,0.930769,0.274953,0.0,0.031081,0.000122,...,0.008596,0.070020,0.050437,0.000386,0.000022,0.065228,0.065224,0.0,0.0,0
2,0.075269,0.064619,0.071349,0.074332,0.0,0.928205,0.274953,0.0,0.030940,0.000366,...,0.008596,0.069684,0.055069,0.000386,0.000045,0.067111,0.067178,0.0,0.0,0
3,0.086022,0.048729,0.063545,0.070848,0.0,0.928205,0.273070,0.0,0.027250,0.000244,...,0.010029,0.073253,0.051467,0.000000,0.000034,0.066676,0.066744,0.0,0.0,0
4,0.086022,0.051907,0.062430,0.070848,0.0,0.933333,0.274011,0.0,0.030940,0.000244,...,0.008596,0.070932,0.051467,0.000386,0.000022,0.066604,0.066671,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28474,0.075269,0.051907,0.047938,0.047619,0.0,0.907692,0.257062,0.0,0.043571,0.000244,...,0.031519,0.046733,0.040144,0.000000,0.000022,0.042931,0.043000,0.0,0.0,0
28475,0.064516,0.025424,0.039019,0.044135,0.0,0.905128,0.257062,0.0,0.032501,0.000000,...,0.031519,0.047438,0.048893,0.000000,0.000056,0.046550,0.046619,0.0,0.0,0
28476,0.064516,0.080508,0.050167,0.047619,0.0,0.907692,0.258004,0.0,0.026114,0.000611,...,0.031519,0.046797,0.040144,0.000386,0.000045,0.043003,0.043000,0.0,0.0,0
28477,0.064516,0.056144,0.047938,0.046458,0.0,0.902564,0.257062,0.0,0.033210,0.000122,...,0.031519,0.041884,0.043232,0.000000,0.000045,0.039890,0.039959,0.0,0.0,0


In [66]:
# Read one of the processed training files back in to inspect the result.
pd.read_csv(os.path.join(target_folder, input_type, dataset_collection_name, "machine-3-11.train.csv"))

Unnamed: 0,timestamp,value-0,value-1,value-2,value-3,value-4,value-5,value-6,value-7,value-8,...,value-29,value-30,value-31,value-32,value-33,value-34,value-35,value-36,value-37,is_anomaly
0,0,0.19,0.096790,0.137413,0.203953,0.981326,0.981494,0.957620,0.0,0.0,...,0.25,0.404039,0.0,0.105263,0.318182,0.428481,0.426371,0.147589,0.0,0
1,1,0.18,0.054364,0.118182,0.193953,0.981326,0.981452,0.957480,0.0,0.0,...,0.25,0.399984,0.0,0.052632,0.227273,0.421157,0.419060,0.125140,0.0,0
2,2,0.18,0.114051,0.126399,0.195116,0.981326,0.981452,0.957480,0.0,0.0,...,0.25,0.390323,0.0,0.052632,0.318182,0.411348,0.410116,0.124626,0.0,0
3,3,0.18,0.095338,0.122378,0.192093,0.981326,0.981480,0.957573,0.0,0.0,...,0.25,0.382118,0.0,0.052632,0.318182,0.404542,0.402622,0.115367,0.0,0
4,4,0.17,0.114857,0.127273,0.192791,0.981326,0.981424,0.957386,0.0,0.0,...,0.25,0.372457,0.0,0.105263,0.318182,0.390266,0.389100,0.118398,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28690,28690,0.35,0.266817,0.286014,0.383721,0.988796,0.991431,0.978388,0.0,0.0,...,0.25,0.687340,0.0,0.052632,0.318182,0.697593,0.698776,0.265013,0.0,0
28691,28691,0.35,0.227940,0.277622,0.380000,0.988796,0.991445,0.978435,0.0,0.0,...,0.25,0.682489,0.0,0.105263,0.363636,0.694040,0.694807,0.356168,0.0,0
28692,28692,0.35,0.211002,0.264161,0.373023,0.988796,0.991473,0.978529,0.0,0.0,...,0.25,0.684713,0.0,0.105263,0.272727,0.696881,0.697398,0.241205,0.0,0
28693,28693,0.34,0.222617,0.257343,0.368372,0.988796,0.991487,0.978529,0.0,0.0,...,0.25,0.683729,0.0,0.105263,0.409091,0.693166,0.693880,0.295306,0.0,0
