# Gasoil heating loop (GHL)

Use the `DANGER` annotation for anomalies as described in the paper:

> In the test set the additional variables labeled as ATTACK, DANGER and FAULT are introduced.
> They determine different parts of attack evolution.
> We will use the DANGER series to compare results with the fault-detection algorithm.

In [10]:
import pandas as pd
import os
from typing import List
from pathlib import Path
import matplotlib.pyplot as plt
from config import data_raw_folder, data_processed_folder
from timeeval import Datasets

In [11]:
# Default size (width, height in inches) for every figure created below.
plt.rcParams.update({"figure.figsize": (20, 10)})

In [12]:
# Collection name; reused below for the output sub-directory and the metadata key.
dataset_collection_name = "GHL"
# Raw CSV files are read from <data_raw_folder>/Gasoil Heating Loop/data.
source_folder = Path(data_raw_folder).joinpath("Gasoil Heating Loop", "data")
# Root folder that receives the processed datasets.
target_folder = data_processed_folder

print(
    f"Looking for source datasets in {Path(source_folder).absolute()} and\n"
    f"saving processed datasets in {Path(target_folder).absolute()}"
)

Looking for source datasets in /home/projects/akita/data/benchmark-data/data-raw/Gasoil Heating Loop/data and
saving processed datasets in /home/projects/akita/data/benchmark-data/data-processed


## File handling and transformations

In [13]:
def list_regular_files(path: Path) -> List[Path]:
    """Return the regular files located directly inside ``path``, sorted by path.

    Entries for which ``is_file()`` is false (e.g. sub-directories) are
    skipped and the directory is not searched recursively.

    ``Path.iterdir()`` yields entries in arbitrary order, so the result is
    sorted to make positional access (``files[0]``) and the processing order
    reproducible across runs and machines.

    Parameters
    ----------
    path : Path
        Directory to list.

    Returns
    -------
    List[Path]
        Sorted paths of all regular files in ``path``; empty if there are none.
    """
    return sorted(entry for entry in path.iterdir() if entry.is_file())

def process(source: Path, target: Path, is_train: bool = False) -> int:
    """Convert one raw CSV file into the processed layout and write it to ``target``.

    Steps, in order:

    1. Truncate the ``Time`` column to integers and store it as the first
       column ``timestamp``; ``Time`` itself is removed.
    2. Keep only the first row for each ``timestamp`` value.
    3. Drop every column whose name contains ``"rand"``.
    4. Append the ``is_anomaly`` label column: all zeros when ``is_train`` is
       true, otherwise ``1`` where ``DANGER == 1`` and ``0`` elsewhere. For
       non-training files the ``DANGER``, ``FAULT`` and ``ATTACK`` columns are
       removed afterwards.

    Parameters
    ----------
    source : Path
        Raw CSV file to read.
    target : Path
        Destination CSV file (written without the index column).
    is_train : bool, default False
        Whether ``source`` is the training file.

    Returns
    -------
    int
        Number of rows written.
    """
    raw = pd.read_csv(source)

    # Fix to a constant sampling frequency: truncate the timestamp to an
    # integer and keep only the first value seen for each timestamp.
    timestamps = raw["Time"].astype(int)
    frame = raw.drop(columns="Time")
    frame.insert(0, "timestamp", timestamps)
    frame = frame.drop_duplicates(subset="timestamp", keep="first", ignore_index=True)

    # Remove the "disturbances" random value columns.
    noise_columns = [name for name in frame.columns if "rand" in name]
    frame = frame.drop(columns=noise_columns)

    # Add the anomaly label.
    if is_train:
        frame["is_anomaly"] = 0
    else:
        frame["is_anomaly"] = (frame["DANGER"] == 1).astype(int)
        frame = frame.drop(columns=["DANGER", "FAULT", "ATTACK"])

    frame.to_csv(target, index=False)
    return len(frame)

In [14]:
# Metadata values shared by every dataset of this collection.
dataset_type = "synthetic"
train_is_normal = True
train_type = "semi-supervised"
input_type = "multivariate"
datetime_index = False

# Processed files are stored in <target_folder>/<input_type>/<collection name>.
dataset_subfolder = Path(input_type) / dataset_collection_name
target_subfolder = target_folder / dataset_subfolder
try:
    os.makedirs(target_subfolder)
except FileExistsError:
    print(f"Directories {target_subfolder} already exist")
else:
    print(f"Created directories {target_subfolder}")

# Handle used below to register the datasets and save their metadata.
dm = Datasets(target_folder)

Directories /home/projects/akita/data/benchmark-data/data-processed/multivariate/GHL already exist


In [15]:
# Split the raw files into the training file and the test files.
# The match is done on the file name only: testing the full path would
# classify every file as "train" as soon as any parent directory contains
# that substring, leaving no test files to process.
files = list_regular_files(source_folder)
train_candidates = [f for f in files if "train" in f.name]
if not train_candidates:
    raise ValueError("Training file not found!")
# If several files match, the first one listed is used.
train_file = train_candidates[0]
files = [f for f in files if "train" not in f.name]

# Process training file
filename = f"{train_file.stem}.train.csv"
# Path relative to the processed-data root (stored in the metadata) ...
train_path = dataset_subfolder / filename
# ... and the location the file is actually written to.
train_target_filepath = target_subfolder / filename

process(train_file, train_target_filepath, is_train=True)
print(f"Processed training dataset {train_file} -> {train_target_filepath}")

# Process every test file and register it together with the shared training file.
for f in files:
    dataset_name = f.stem
    filename = f"{dataset_name}.test.csv"
    path = dataset_subfolder / filename
    target_filepath = target_subfolder / filename

    dataset_length = process(f, target_filepath)

    # save metadata
    dm.add_dataset((dataset_collection_name, dataset_name),
        train_path = train_path,
        test_path = path,
        dataset_type = dataset_type,
        datetime_index = datetime_index,
        split_at = None,
        train_type = train_type,
        train_is_normal = train_is_normal,
        input_type = input_type,
        dataset_length = dataset_length
    )
    print(f"Processed source dataset {f} -> {target_filepath}")

# save metadata of benchmark
dm.save()

Processed training dataset /home/projects/akita/data/benchmark-data/data-raw/Gasoil Heating Loop/data/train_1500000_seed_11_vars_23.csv -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/GHL/train_1500000_seed_11_vars_23.train.csv
Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Gasoil Heating Loop/data/45_Lev_corr_Temp_fault_seed_193_vars_23.csv -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/GHL/45_Lev_corr_Temp_fault_seed_193_vars_23.test.csv
Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Gasoil Heating Loop/data/34_Lev_corr_Temp_fault_seed_151_vars_23.csv -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/GHL/34_Lev_corr_Temp_fault_seed_151_vars_23.test.csv
Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Gasoil Heating Loop/data/23_Lev_fault_Temp_corr_seed_79_vars_23.csv -> /home/projects/akita/data/benchmark-data/data-processed/multivar

In [16]:
# Refresh the metadata handle, then display only the rows whose first index
# level (collection name) equals this collection.
dm.refresh()
collection_rows = pd.IndexSlice[dataset_collection_name:dataset_collection_name, :]
dm.df().loc[collection_rows]

Unnamed: 0_level_0,Unnamed: 1_level_0,train_path,test_path,dataset_type,datetime_index,split_at,train_type,train_is_normal,input_type,length
collection_name,dataset_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
GHL,01_Lev_fault_Temp_corr_seed_11_vars_23,multivariate/GHL/train_1500000_seed_11_vars_23...,multivariate/GHL/01_Lev_fault_Temp_corr_seed_1...,synthetic,False,,semi-supervised,True,multivariate,200001
GHL,02_Lev_fault_Temp_corr_seed_17_vars_23,multivariate/GHL/train_1500000_seed_11_vars_23...,multivariate/GHL/02_Lev_fault_Temp_corr_seed_1...,synthetic,False,,semi-supervised,True,multivariate,200001
GHL,03_Lev_fault_Temp_corr_seed_19_vars_23,multivariate/GHL/train_1500000_seed_11_vars_23...,multivariate/GHL/03_Lev_fault_Temp_corr_seed_1...,synthetic,False,,semi-supervised,True,multivariate,200001
GHL,04_Lev_fault_Temp_corr_seed_23_vars_23,multivariate/GHL/train_1500000_seed_11_vars_23...,multivariate/GHL/04_Lev_fault_Temp_corr_seed_2...,synthetic,False,,semi-supervised,True,multivariate,200001
GHL,05_Lev_fault_Temp_corr_seed_27_vars_23,multivariate/GHL/train_1500000_seed_11_vars_23...,multivariate/GHL/05_Lev_fault_Temp_corr_seed_2...,synthetic,False,,semi-supervised,True,multivariate,200001
GHL,06_Lev_fault_Temp_corr_seed_29_vars_23,multivariate/GHL/train_1500000_seed_11_vars_23...,multivariate/GHL/06_Lev_fault_Temp_corr_seed_2...,synthetic,False,,semi-supervised,True,multivariate,200001
GHL,07_Lev_fault_Temp_corr_seed_31_vars_23,multivariate/GHL/train_1500000_seed_11_vars_23...,multivariate/GHL/07_Lev_fault_Temp_corr_seed_3...,synthetic,False,,semi-supervised,True,multivariate,200001
GHL,08_Lev_fault_Temp_corr_seed_33_vars_23,multivariate/GHL/train_1500000_seed_11_vars_23...,multivariate/GHL/08_Lev_fault_Temp_corr_seed_3...,synthetic,False,,semi-supervised,True,multivariate,200001
GHL,09_Lev_fault_Temp_corr_seed_37_vars_23,multivariate/GHL/train_1500000_seed_11_vars_23...,multivariate/GHL/09_Lev_fault_Temp_corr_seed_3...,synthetic,False,,semi-supervised,True,multivariate,200001
GHL,10_Lev_fault_Temp_corr_seed_39_vars_23,multivariate/GHL/train_1500000_seed_11_vars_23...,multivariate/GHL/10_Lev_fault_Temp_corr_seed_3...,synthetic,False,,semi-supervised,True,multivariate,200001


## Experimentation

In [None]:
# All raw files in the source folder (the training file is included here).
files = list(list_regular_files(source_folder))
files

In [None]:
# Preview the test-file transformation on one example file.
# `files` also contains the training file and is not guaranteed to be in a
# fixed order, so `files[0]` may be the training file, which has no
# DANGER/FAULT/ATTACK columns. Pick a test file explicitly instead.
sample_file = next((f for f in files if "train" not in f.name), None)
if sample_file is None:
    raise ValueError("Test file not found!")

df = pd.read_csv(sample_file)
# fix to constant sampling frequency (floor timestamp and remove duplicates taking first value)
df.insert(0, "timestamp", df["Time"].astype(int))
df = df.drop(["Time"], axis=1)
df = df.drop_duplicates(subset="timestamp", keep="first", ignore_index=True)
# remove "disturbances" random value columns
rand_column_names = [c for c in df.columns if "rand" in c]
df = df.drop(rand_column_names, axis=1)
# add anomaly label: 1 where DANGER is set, 0 otherwise
df["is_anomaly"] = (df["DANGER"] == 1).astype(int)
df = df.drop(["DANGER", "FAULT", "ATTACK"], axis=1)
df

In [None]:
# Plot the three label series of one test file for rows 135000-150000.
# `files` may start with the training file, which has no label columns, so a
# test file is selected explicitly.
plot_file = next((f for f in files if "train" not in f.name), None)
if plot_file is None:
    raise ValueError("Test file not found!")

# Draw on an explicit Axes so that the figure being shown is the one plotted into.
fig, ax = plt.subplots()
pd.read_csv(plot_file).iloc[135000:150000].plot(x="Time", y=["DANGER", "FAULT", "ATTACK"], ax=ax)
# NOTE: an earlier draft listed "RT_level", "RT_temperature.T",
# "HT_temperature.T", "inj_valve_act" and "heater_act" as optional extra
# series for `y` (together with a logarithmic y-scale); add them for context if needed.
ax.set_title(f"Label series of {plot_file.name}")
plt.show()

In [None]:
# Locate the training file among the raw files.
# Match on the file name only; matching the full path would also hit any
# parent directory whose name contains "train".
train_candidates = [f for f in files if "train" in f.name]
if not train_candidates:
    raise ValueError("Training file not found!")
# If several files match, the first one listed is used.
train_file = train_candidates[0]
train_file

In [None]:
# Preview the training-file transformation.
train_raw = pd.read_csv(train_file)

# Fix to a constant sampling frequency: truncate the timestamp to an integer
# and keep only the first value seen for each timestamp.
train_timestamps = train_raw["Time"].astype(int)
df_train = train_raw.drop(columns="Time")
df_train.insert(0, "timestamp", train_timestamps)
df_train = df_train.drop_duplicates(subset="timestamp", keep="first", ignore_index=True)

# Remove the "disturbances" random value columns.
rand_column_names = [name for name in df_train.columns if "rand" in name]
df_train = df_train.drop(columns=rand_column_names)

# Every training row is labelled as normal.
df_train["is_anomaly"] = 0
df_train