# Genesis Demonstrator

In [6]:
import pandas as pd
import os
from typing import Final
from config import data_raw_folder, data_processed_folder

In [7]:
# Name of the dataset collection; used as metadata key and as output sub-directory name.
dataset_collection_name = "Genesis"
# Folder holding the raw source files, located below the configured raw-data folder.
source_folder = os.path.join(data_raw_folder, "genesis-demonstrator/data")
# Processed datasets and the metadata file are written below this folder.
target_folder = data_processed_folder

## Metadata handling

In [10]:
# Schema of one metadata record: maps each field name to the Python type of its value.
# Used as the element "type" of the list accepted by `DatasetMetadata.add_datasets`.
# Note: `DatasetMetadata.add_dataset` stores the `dataset_type` value in the CSV column `type`.
DatasetMetadataRecord = dict(
    collection_name=str,
    dataset_name=str,
    train_path=str,
    test_path=str,
    dataset_type=str,
    datetime_index=bool,
    split_at=int,
    train_type=str,
    train_is_normal=bool,
    input_type=str,
    length=int,
)

class DatasetMetadata:
    """
    In-memory view of the `datasets.csv` metadata file that lists all processed datasets.

    Each row describes one dataset and is identified by the index
    `(collection_name, dataset_name)`. Changes are kept in memory until `save()`
    is called. The class can be used as a context manager, in which case the
    metadata is saved when the `with` block is left.

    ATTENTION: Not thread-safe! There is no check for changes to the underlying `datasets.csv` file while this class is loaded.
    """

    FILENAME: Final[str] = "datasets.csv"

    # Names of the two index levels identifying a dataset, in index order.
    _INDEX_NAMES: Final[list] = ["collection_name", "dataset_name"]
    # Non-index columns of the metadata file, in file order.
    _DATA_COLUMNS: Final[list] = [
        "train_path", "test_path", "type", "datetime_index", "split_at",
        "train_type", "train_is_normal", "input_type", "length",
    ]

    _filepath: str
    _df: pd.DataFrame
    _dirty: bool

    def __init__(self, target_folder: str):
        """
        Load the metadata file from `target_folder`, creating an empty one if it does not exist.

        :param target_folder: folder that contains (or will contain) `datasets.csv`
        """
        self._filepath = os.path.join(target_folder, self.FILENAME)
        self._dirty = False
        if not os.path.isfile(self._filepath):
            self._df = self._create_metadata_file()
        else:
            self.refresh(force = True)

    def __enter__(self) -> 'DatasetMetadata':
        return self

    def __exit__(self, exception_type, exception_value, exception_traceback) -> bool:
        """
        Persist the metadata when leaving the `with` block.

        Returns `False` so that an exception raised inside the block propagates
        to the caller; a truthy return value would silently suppress it.
        """
        self.save()
        return False

    def __repr__(self) -> str:
        return repr(self._df)

    def __str__(self) -> str:
        return str(self._df)

    def _create_metadata_file(self) -> pd.DataFrame:
        """
        Write an empty metadata file (header only) and return the matching empty frame.

        Missing parent directories are created, including nested ones.
        """
        df_temp = pd.DataFrame(
            columns=self._INDEX_NAMES + self._DATA_COLUMNS
        ).set_index(list(self._INDEX_NAMES))
        directory = os.path.dirname(self._filepath)
        # `directory` is empty when the file lives in the current working directory
        if directory and not os.path.exists(directory):
            print(f"Directory {directory} does not exist, creating it!")
            os.makedirs(directory, exist_ok=True)
        df_temp.to_csv(self._filepath)
        return df_temp

    def _merge(self, df_new: pd.DataFrame) -> None:
        """Append `df_new` to the metadata; rows with an already known index replace the old ones."""
        df = pd.concat([self._df, df_new], axis=0)
        df = df[~df.index.duplicated(keep = "last")]
        self._df = df
        self._dirty = True

    def add_dataset(self,
        dataset_name: str,
        collection_name: str,
        train_path: str,
        test_path: str,
        dataset_type: str,
        datetime_index: bool,
        split_at: int,
        train_type: str,
        train_is_normal: bool,
        input_type: str,
        dataset_length: int
    ) -> None:
        """
        Add (or replace) the metadata entry of a single dataset in memory.

        The entry is identified by `(collection_name, dataset_name)`; an existing
        entry with the same key is overwritten. `dataset_type` is stored in the
        column `type` and `dataset_length` in the column `length`.
        Call `save()` to persist the change.
        """
        # Named index levels keep the index compatible with the on-disk format,
        # which `refresh()` reads back by level name.
        index = pd.MultiIndex.from_tuples(
            [(collection_name, dataset_name)], names=self._INDEX_NAMES
        )
        df_new = pd.DataFrame({
            "train_path": train_path,
            "test_path": test_path,
            "type": dataset_type,
            "datetime_index": datetime_index,
            "split_at": split_at,
            "train_type": train_type,
            "train_is_normal": train_is_normal,
            "input_type": input_type,
            "length": dataset_length
        }, index=index)
        self._merge(df_new)

    def add_datasets(self, datasets: list[dict]) -> None:
        """
        Add (or replace) the metadata entries of several datasets in memory.

        :param datasets: records following the `DatasetMetadataRecord` schema; each
            record must contain `collection_name` and `dataset_name`. The record
            key `dataset_type` is stored in the column `type`, consistent with
            `add_dataset`. An empty list is a no-op.
        :raises KeyError: if the records lack `collection_name` or `dataset_name`
        """
        if not datasets:
            return
        df_new = pd.DataFrame(datasets)
        if "dataset_type" in df_new.columns:
            if "type" in df_new.columns:
                # both spellings present: prefer `type`, fall back to `dataset_type`
                df_new["type"] = df_new["type"].fillna(df_new["dataset_type"])
                df_new = df_new.drop(columns="dataset_type")
            else:
                df_new = df_new.rename(columns={"dataset_type": "type"})
        df_new = df_new.set_index(list(self._INDEX_NAMES))
        self._merge(df_new)

    def refresh(self, force: bool = False) -> None:
        """
        Re-read the metadata from disk, replacing the in-memory state.

        :param force: discard unsaved in-memory changes instead of raising
        :raises RuntimeError: if there are unsaved changes and `force` is not set
        """
        if not force and self._dirty:
            raise RuntimeError("There are unsaved changes in memory that would get lost by reading from disk again!")
        self._df = pd.read_csv(self._filepath, index_col=list(self._INDEX_NAMES))
        # memory and disk are in sync again, also after a forced reload
        self._dirty = False

    def save(self) -> None:
        """Write the in-memory metadata to disk and clear the dirty flag."""
        self._df.to_csv(self._filepath)
        self._dirty = False

## Dataset transformation and pre-processing

In [12]:
# dataset characteristics that are recorded in the metadata file
dataset_type = "real"
input_type = "multivariate"
datetime_index = True
train_type = "unsupervised"
train_is_normal = False

# make sure the output directory <target>/<input type>/<collection> exists
dataset_subfolder = os.path.join(target_folder, input_type, dataset_collection_name)
try:
    os.makedirs(dataset_subfolder)
except FileExistsError:
    print(f"Directories {dataset_subfolder} already exist")
else:
    print(f"Created directories {dataset_subfolder}")

# open (or create) the metadata file in the target folder
dm = DatasetMetadata(target_folder)

Directories ../../../data/benchmark-data/data-processed/multivariate/Genesis already exist


In [13]:
# source file and output location of the processed dataset
dataset_name = "genesis-anomalies"
filename = f"{dataset_name}.test.csv"
path = os.path.join(dataset_subfolder, filename)
source_file = os.path.join(source_folder, "Genesis_AnomalyLabels.csv")

# transform: copy `Label` into a trailing `is_anomaly` column, parse the
# `Timestamp` column (seconds) into a `timestamp` column, drop both source columns
df = pd.read_csv(source_file)
df.insert(df.shape[1], "is_anomaly", df["Label"])
df.insert(1, "timestamp", pd.to_datetime(df["Timestamp"], unit="s"))
df = df.drop(columns=["Timestamp", "Label"])
df.to_csv(path, index=False)
print(f"Processed source dataset {source_file} -> {path}")

dataset_length = df.shape[0]

# register the dataset in the metadata file (test file only, no train file) and persist it
dataset_metadata = dict(
    dataset_name=dataset_name,
    collection_name=dataset_collection_name,
    train_path=None,
    test_path=path,
    dataset_type=dataset_type,
    datetime_index=datetime_index,
    split_at=None,
    train_type=train_type,
    train_is_normal=train_is_normal,
    input_type=input_type,
    dataset_length=dataset_length,
)
dm.add_dataset(**dataset_metadata)

dm.save()

Processed source dataset ../../../data/benchmark-data/data-raw/genesis-demonstrator/data/Genesis_AnomalyLabels.csv -> ../../../data/benchmark-data/data-processed/multivariate/Genesis/genesis-anomalies.test.csv


In [14]:
# Re-read the metadata file from disk; without `force` this raises if unsaved in-memory changes exist.
dm.refresh()
# Display all entries whose first index level (`collection_name`) is "Genesis".
dm._df.loc["Genesis"]

Unnamed: 0_level_0,train_path,test_path,type,datetime_index,split_at,train_type,train_is_normal,input_type,length
dataset_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
genesis-anomalies,,../../../data/benchmark-data/data-processed/mu...,real,True,,unsupervised,False,multivariate,16220
