# NASA Spacecraft Telemetry Data

In [22]:
import numpy as np
import pandas as pd
import json
import os
from typing import Final
from collections.abc import Callable
from config import data_raw_folder, data_processed_folder

In [23]:
dataset_collection_prefix = "NASA"
source_folder = os.path.join(data_raw_folder, "NASA Spacecraft Telemetry Data")
target_folder = data_processed_folder

Metadata handling

In [30]:
# type
DatasetMetadataRecord = {
    "dataset_name": str,
    "collection_name": str,
    "train_path": str,
    "test_path": str,
    "dataset_type": str,
    "datetime_index": bool,
    "split_at": int,
    "train_type": str,
    "train_is_normal": bool,
    "input_type": str,
    "length": int
}

class DatasetMetadata:
    """
    ATTENTION: Not thread-safe! There is no check for changes to the underlying `dataset.csv` file while this class is loaded.
    """
    
    FILENAME: Final[str] = "datasets.csv"
    
    _filepath: str
    _df: pd.DataFrame
    _dirty: bool

    def __init__(self, target_folder: str):
        self._filepath = os.path.join(target_folder, self.FILENAME)
        self._dirty = False
        if not os.path.isfile(self._filepath):
            self._df = self._create_metadata_file()
        else:
            self.refresh(force = True)
    
    def __enter__(self) -> 'DatasetMetadata':
        return self
    
    def __exit__(self, exception_type, exception_value, exception_traceback) -> 'DatasetMetadata':
        self.save()
        return self
    
    def __repr__(self) -> str:
        return repr(self._df)
    
    def __str__(self) -> str:
        return str(self._df)
        
    def _create_metadata_file(self) -> pd.DataFrame:
        df_temp = pd.DataFrame(columns=["dataset_name", "collection_name", "train_path", "test_path", "dataset_type", "datetime_index", "split_at", "train_type", "train_is_normal", "input_type", "length"])
        df_temp.set_index(["dataset_name", "collection_name"], inplace=True)
        df_temp.to_csv(self._filepath)
        return df_temp
    
    def add_dataset(self,
        dataset_name: str,
        collection_name: str,
        train_path: str,
        test_path: str,
        dataset_type: str,
        datetime_index: bool,
        split_at: int,
        train_type: str,
        train_is_normal: bool,
        input_type: str,
        dataset_length: int
    ) -> 'DatasetMetadata':
        df_new = pd.DataFrame({
            "train_path": train_path,
            "test_path": test_path,
            "dataset_type": dataset_type,
            "datetime_index": datetime_index,
            "split_at": split_at,
            "train_type": train_type,
            "train_is_normal": train_is_normal,
            "input_type": input_type,
            "length": dataset_length
        }, index=[(dataset_name, collection_name)])
        df = pd.concat([self._df, df_new], axis=0)
        df = df[~df.index.duplicated(keep = "last")]
        self._df = df
        self._dirty = True
        return self
    
    def add_datasets(self, datasets: list[DatasetMetadataRecord]) -> 'DatasetMetadata':
        df_new = pd.DataFrame(datasets)
        df_new.set_index(["dataset_name", "collection_name"], inplace = True)
        df = pd.concat([self._df, df_new], axis=0)
        df = df[~df.index.duplicated(keep = "last")]
        self._df = df
        self._dirty = True
        return self
    
    def refresh(self, force: bool = False) -> None:
        if not force and self._dirty:
            raise Exception("There are unsaved changes in memory that would get lost by reading from disk again!")
        else:
            self._df = pd.read_csv(self._filepath, index_col=["dataset_name", "collection_name"])
    
    def save(self) -> None:
        self._df.to_csv(self._filepath)
        self._dirty = False

In [31]:
def create_target_subfolder(input_type: str, dataset_collection_name: str) -> str:
    dataset_subfolder = os.path.join(target_folder, input_type, dataset_collection_name)
    try:
        os.makedirs(dataset_subfolder)
        print(f"Created directories {dataset_subfolder}")
    except FileExistsError:
        print(f"Directories {dataset_subfolder} already exist")
        pass
    return dataset_subfolder

def transform_and_label(source: str, target: str, anomaly_windows: list[str], force_all_normal: bool = False) -> None:
    df = pd.DataFrame(np.load(source)[:,1], columns=["value"])
    df.index.name = "timestamp"
    df = df[["value"]]
    df["is_anomaly"] = 0

    if not force_all_normal:
        for t1, t2 in anomaly_windows:
            tmp = df[df.index >= t1]
            tmp = tmp[tmp.index <= t2]
            df["is_anomaly"].values[tmp.index] = 1

    df.to_csv(target)

In [32]:
# shared by all datasets
dataset_type = "real"
input_type = "univariate"
datetime_index = False
train_type = "semi-supervised"
train_is_normal = True

dm = DatasetMetadata(target_folder)

In [33]:
# dataset transformation
transform_file: Callable[[str, str, list[str], bool], None] = transform_and_label

meta = pd.read_csv(os.path.join(source_folder, "labeled_anomalies.csv"))
json.loads(meta["anomaly_sequences"][0])

for _, dataset in meta.iterrows():
    
    dataset_name = dataset["chan_id"]
    collection_name = dataset_collection_prefix + "-" + dataset["spacecraft"]
    dataset_length = dataset["num_values"]
    dataset_subfolder = create_target_subfolder(input_type, collection_name)
    
    windows = json.loads(dataset["anomaly_sequences"])
    
    paths = {}
    for t_type in ["train", "test"]:
        source_file = os.path.join(source_folder, t_type, dataset_name + ".npy")
        filename = f"{dataset_name}.{t_type}.csv"
        path = os.path.join(dataset_subfolder, filename)
        paths[t_type] = path
        
        # transform file
        transform_file(source_file, path, windows, force_all_normal=(t_type == "train"))
        print(f"Processed source dataset {source_file} -> {path}")

    # save metadata
    dm.add_dataset(
        dataset_name = dataset_name,
        collection_name = collection_name,
        train_path = paths["train"],
        test_path = paths["test"],
        dataset_type = dataset_type,
        datetime_index = datetime_index,
        split_at = None,
        train_type = train_type,
        train_is_normal = train_is_normal,
        input_type = input_type,
        dataset_length = dataset_length
    )

# save metadata of benchmark
dm.save()

Directories data-processed/univariate/NASA-SMAP already exist
Processed source dataset data-raw/NASA Spacecraft Telemetry Data/train/P-1.npy -> data-processed/univariate/NASA-SMAP/P-1.train.csv
Processed source dataset data-raw/NASA Spacecraft Telemetry Data/test/P-1.npy -> data-processed/univariate/NASA-SMAP/P-1.test.csv
Directories data-processed/univariate/NASA-SMAP already exist
Processed source dataset data-raw/NASA Spacecraft Telemetry Data/train/S-1.npy -> data-processed/univariate/NASA-SMAP/S-1.train.csv
Processed source dataset data-raw/NASA Spacecraft Telemetry Data/test/S-1.npy -> data-processed/univariate/NASA-SMAP/S-1.test.csv
Directories data-processed/univariate/NASA-SMAP already exist
Processed source dataset data-raw/NASA Spacecraft Telemetry Data/train/E-1.npy -> data-processed/univariate/NASA-SMAP/E-1.train.csv
Processed source dataset data-raw/NASA Spacecraft Telemetry Data/test/E-1.npy -> data-processed/univariate/NASA-SMAP/E-1.test.csv
Directories data-processed/u

In [34]:
dm.refresh()
dm._df

Unnamed: 0_level_0,Unnamed: 1_level_0,train_path,test_path,type,datetime_index,split_at,train_type,train_is_normal,input_type,length
dataset_name,collection_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A1Benchmark-6,WebscopeS5,,data-processed/univariate/WebscopeS5/A1Benchma...,real,True,,unsupervised,False,univariate,1439
A1Benchmark-3,WebscopeS5,,data-processed/univariate/WebscopeS5/A1Benchma...,real,True,,unsupervised,False,univariate,1461
A1Benchmark-40,WebscopeS5,,data-processed/univariate/WebscopeS5/A1Benchma...,real,True,,unsupervised,False,univariate,1427
A1Benchmark-20,WebscopeS5,,data-processed/univariate/WebscopeS5/A1Benchma...,real,True,,unsupervised,False,univariate,1422
A1Benchmark-4,WebscopeS5,,data-processed/univariate/WebscopeS5/A1Benchma...,real,True,,unsupervised,False,univariate,1423
...,...,...,...,...,...,...,...,...,...,...
P-11,NASA-MSL,data-processed/univariate/NASA-MSL/P-11.train.csv,data-processed/univariate/NASA-MSL/P-11.test.csv,real,False,,semi-supervised,True,univariate,3535
D-15,NASA-MSL,data-processed/univariate/NASA-MSL/D-15.train.csv,data-processed/univariate/NASA-MSL/D-15.test.csv,real,False,,semi-supervised,True,univariate,2158
D-16,NASA-MSL,data-processed/univariate/NASA-MSL/D-16.train.csv,data-processed/univariate/NASA-MSL/D-16.test.csv,real,False,,semi-supervised,True,univariate,2191
M-7,NASA-MSL,data-processed/univariate/NASA-MSL/M-7.train.csv,data-processed/univariate/NASA-MSL/M-7.test.csv,real,False,,semi-supervised,True,univariate,2156


## Experimentation

In [7]:
meta = pd.read_csv(os.path.join(source_folder, "labeled_anomalies.csv"))
json.loads(meta["anomaly_sequences"][0])
dataset = next(meta.iterrows())[1]
print(dataset)

chan_id                                                     P-1
spacecraft                                                 SMAP
anomaly_sequences    [[2149, 2349], [4536, 4844], [3539, 3779]]
class                      [contextual, contextual, contextual]
num_values                                                 8505
Name: 0, dtype: object


In [10]:
columns = ["value"] + ["C" + str(i) for i in range(24)]
filename = os.path.join(source_folder, "test", dataset["chan_id"] + ".npy")
print(f"loading {filename}")
dd = np.load(filename)
df = pd.DataFrame(dd, columns=columns)
df.index.name = "timestamp"
df = df[["value"]]
df["is_anomaly"] = 0

windows = json.loads(dataset["anomaly_sequences"])

for t1, t2 in windows:
    tmp = df[df.index >= t1]
    tmp = tmp[tmp.index <= t2]
    df["is_anomaly"].values[tmp.index] = 1

print(windows)
df

loading data-raw/NASA Spacecraft Telemetry Data/test/P-1.npy
[[2149, 2349], [4536, 4844], [3539, 3779]]


Unnamed: 0_level_0,value,is_anomaly
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
0,-0.695162,0
1,-0.685704,0
2,-0.725719,0
3,-0.761368,0
4,-0.745362,0
...,...,...
8500,0.293561,0
8501,0.341579,0
8502,-0.316115,0
8503,-0.297199,0


In [19]:
trainfile = os.path.join(source_folder, "train", dataset["chan_id"] + ".npy")
testfile = os.path.join(source_folder, "test", dataset["chan_id"] + ".npy")
windows = json.loads(dataset["anomaly_sequences"])

transform_and_label(trainfile, "P-1.train.csv", windows, force_all_normal=True)
transform_and_label(testfile, "P-1.test.csv", windows, force_all_normal=False)

df = pd.read_csv("P-1.train.csv", index_col="timestamp")
anomalies_in_train = len(df[df["is_anomaly"] == 1])
df = pd.read_csv("P-1.test.csv", index_col="timestamp")
anomalies_in_test = len(df[df["is_anomaly"] == 1])
(anomalies_in_train, anomalies_in_test)

(0, 751)

In [38]:
df = pd.read_csv("data-processed/univariate/NASA-MSL/C-1.train.csv")
df

Unnamed: 0,timestamp,value,is_anomaly
0,0,0.0,0
1,1,0.0,0
2,2,0.0,0
3,3,0.0,0
4,4,0.0,0
...,...,...,...
2153,2153,0.0,0
2154,2154,0.0,0
2155,2155,0.0,0
2156,2156,0.0,0


In [17]:
np.load("data-raw/NASA Spacecraft Telemetry Data/train/P-1.npy")[:,1]

array([0., 0., 0., ..., 0., 0., 0.])