# Proposed workflow for training models and comparing them

Each model has been trained on the same training data in separate workbooks, and the trained models have been saved.

In this workbook, we import the three trained models along with their thresholds, which have been determined on the validation set. We have test data from six sensors containing periods of high moisture level, which can be viewed as anomalous. This data is converted into a test set that is passed to each model. The three models are then compared graphically by inspecting where they are able to detect anomalies and where they fail.


# Initialization

In [1]:
!pip install einops

Collecting einops
  Downloading https://files.pythonhosted.org/packages/5d/a0/9935e030634bf60ecd572c775f64ace82ceddf2f504a5fd3902438f07090/einops-0.3.0-py2.py3-none-any.whl
Installing collected packages: einops
Successfully installed einops-0.3.0


In [2]:
import io
import json
import os

import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import math

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import random

import copy

from einops import rearrange

import seaborn as sns
%pylab inline

##Ignoring warnings for now
import warnings
warnings.filterwarnings("ignore")

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [3]:
%%capture
!pip install pandas matplotlib google-cloud-storage

# Mount Google Drive so that the files under /content/drive can be read
# by the cells below.
import os
from google.colab import drive
drive.mount('/content/drive')

# Point environment variable `GOOGLE_APPLICATION_CREDENTIALS` to 
# location of service account file 'dtu-course-02456-students.json'.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "/content/drive/My Drive/Woodsense/Tech/Software/Deep Learning Course DTU/Students Folder/gcp-service-accounts/dtu-course-02456-students.json"
# Alternative location of the same service account file (kept for reference):
#os.environ['GOOGLE_APPLICATION_CREDENTIALS']  = "/content/drive/My Drive/Colab Notebooks/02456-deep-learning-with-PyTorch/WoodSense/gcp-service-accounts/dtu-course-02456-students.json"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.activity.readonly&response_type=code

Enter your authorization code:
4/1AY0e-g4DkhmRAhLeMFVUiVOq6ssX0O1qjRDuUNPj1QBFuCK25Mnmg79JLE0


# Defining functions for preprocessing

The following functions are used to convert the test data set into a format that can be used by the models. This includes encoding of the timestamp, normalization, and conversion into samples holding data for 24-hour sequences.

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

The GRU Auto-Encoder takes 24 hour sequences and reconstructs the same sequences. The periods are not overlapping, and they always start at midnight.

In [5]:
def create_inout_sequences_GRU(input_data, tw, target_size = 1, 
                                step_size = 1,moisture=True,
                                start_from_midnight=True,
                                mean_and_std = None):
    '''Cut the data of every sensor into windows for the GRU auto-encoder.

    input_data = The unstructured data from Woodsense. Must contain the
                 columns 'sensor_id' and 'timestamp'. NOTE: the measurement
                 columns of this DataFrame are standardised IN PLACE.
    tw = length (number of rows) of every window
    target_size = only used to limit how far the window start may move;
                  no target sequence is returned
    step_size = number of rows between the starts of two consecutive windows
    moisture = currently unused; kept so existing calls keep working
    start_from_midnight = if True, remove_before_00 trims every sensor first
    mean_and_std = table with the columns 'measure', 'mean' and 'std' used
                   for standardising; computed from input_data when None

    Returns (train, timestamps, sensor_list, mean_and_std, features):
    train is a list of float tensors with tw rows each, timestamps the
    matching integer timestamps, sensor_list the sensor id of every window,
    mean_and_std the table that was used, and features the column names of
    the tensors (an empty list when input_data contains no sensors).

    ! Note: 
    mean_and_std is only applicable if data is standardised with global mean and standard deviation. 
    '''
    train = []
    timestamps = []
    sensor_list = []
    # Bound before the loop so the return statement also works when
    # input_data contains no sensors.
    features = []

    # standardise() rescales input_data itself, which is why the loop below
    # can keep reading from input_data.
    _, mean_and_std = standardise(input_data, mean_and_std)

    for sensor in input_data['sensor_id'].unique():
        data = input_data.loc[input_data['sensor_id'] == sensor]

        if start_from_midnight:
            data = remove_before_00(data)

        timestamp = data.timestamp.astype(int).to_numpy()

        data = data.drop(columns=['timestamp', 'sensor_id'])
        features = data.columns.tolist()
        n_rows = len(data)
        data = torch.tensor(data.to_numpy(dtype=np.float32))

        # NOTE(review): the first window starts at row 24, so the first 24
        # rows of every sensor are never used - confirm this is intended.
        for i in range(24, n_rows - tw - target_size, step_size):
            train.append(data[i:i+tw])
            timestamps.append(timestamp[i:i+tw])
            sensor_list.append(sensor)
    return train, timestamps, sensor_list, mean_and_std, features

For the DeepAnT model, we create samples for every 25 hour sequence (24 hours as input and 1 hour as target value), and these sequences do overlap.

In [6]:
def create_inout_sequences_deepant(input_data, tw, target_size = 1, step_size = 1, 
                                   keep_sensor_id = False, 
                                   mean_and_std = None):
    '''Build overlapping (input window, target) samples for DeepAnT.

    input_data = The unstructured data from Woodsense
    tw = number of rows in every input window
    target_size = number of rows that follow the window and form the target
    step_size = number of rows between the starts of consecutive windows
    keep_sensor_id = if False the samples are (window, target) tuples,
                     otherwise (sensor id, target timestamps, window, target)
    mean_and_std = statistics handed on to standardise for every sensor

    Returns (train, label, inout_seq): the input windows, the targets and
    the list of sample tuples described above.
    '''
    # Columns that are inputs only; the target keeps whatever remains.
    input_only_columns = ['tod_sin', 'tod_cos', 'doy_sin', 'doy_cos', 'weather_humidity', 'weather_pressure','weather_temp_dew', 'weather_temp_dry','weather_precip_past10min', 'weather_wind_max', 'weather_wind_speed']

    windows = []
    targets = []
    samples = []
    for sensor_id in input_data['sensor_id'].unique():
        frame = remove_before_00(input_data[input_data['sensor_id'] == sensor_id])
        n_rows = len(frame)
        stamps = frame['timestamp']

        frame = frame.drop(['timestamp', 'sensor_id'], axis=1)
        frame, mean_and_std = standardise(frame, mean_and_std)
        target_frame = frame.drop(input_only_columns, axis=1)

        inputs = torch.FloatTensor(frame.astype(np.float32).to_numpy().tolist())
        outputs = torch.FloatTensor(target_frame.astype(np.float32).to_numpy().tolist())

        for start in range(0, n_rows - tw - target_size, step_size):
            stop = start + tw
            window = inputs[start:stop]
            target = outputs[stop:stop + target_size]
            target_stamps = stamps[stop:stop + target_size]

            targets.append(target)
            windows.append(window)
            if keep_sensor_id == False:
                sample = (window, target)
            else:
                sample = (sensor_id, target_stamps, window, target)
            samples.append(sample)
    return windows, targets, samples

For the Transformer model, we create samples for every 25 hour sequence (24 hours as input and 1 hour as target value), and these sequences do overlap.

In [7]:
def create_inout_sequences_transformer(input_data, tw, target_size = 1, 
                            step_size = 1,moisture=True,
                            start_from_midnight=True,std_on_sensor=False,
                            mean_and_std = None):
    '''Build overlapping source / target / truth windows for the Transformer.

    input_data = The unstructured data from Woodsense. Must contain the
                 columns 'sensor_id' and 'timestamp'. NOTE: unless
                 std_on_sensor is True, the measurement columns of this
                 DataFrame are standardised IN PLACE.
    tw = timewindow; every source sequence holds tw-1 rows
    target_size = how many hours we want to predict // For anomaly detection this would most likely be 1
    step_size = How many steps the window takes over the time series
    moisture = currently unused; kept so existing calls keep working
    start_from_midnight = if True, remove_before_00 trims every sensor first
    std_on_sensor = if True, standardise is called per sensor instead of
                    once on the whole data set
    mean_and_std = table with the columns 'measure', 'mean' and 'std' used
                   for standardising; computed from the data when None

    Returns (train, tgt, true, timestamps, sensor_list, mean_and_std,
    features). For a window starting at row i:
      train sample      = rows i .. i+tw-2
      tgt sample        = rows i+tw-2 .. i+tw-2+target_size
      true sample       = rows i+tw-1 .. i+tw-1+target_size
      timestamps sample = timestamps of rows i .. i+tw+target_size-1
    features holds the column names of the tensors (an empty list when
    input_data contains no sensors).

    ! Note: 
    mean_and_std is only applicable if data is standardised with global mean and standard deviation. 
    '''
    train = []
    tgt = []
    true = []
    timestamps = []
    sensor_list = []
    # Bound before the loop so the return statement also works when
    # input_data contains no sensors.
    features = []

    sensor_ids = input_data['sensor_id'].unique()
    print('Number of sensors: ',len(sensor_ids))

    if std_on_sensor is False:
        # standardise() rescales input_data itself, which is why the loop
        # below can keep reading from input_data.
        _, mean_and_std = standardise(input_data, mean_and_std)

    for sensor in sensor_ids:
        data = input_data.loc[input_data['sensor_id'] == sensor]

        if start_from_midnight:
            data = remove_before_00(data)

        timestamp = data.timestamp.astype(int).to_numpy()

        data = data.drop(columns=['timestamp'])
        n_rows = len(data)
        if std_on_sensor:
            data, mean_and_std = standardise(data, mean_and_std)
        data = data.drop(columns=['sensor_id'])
        features = data.columns.tolist()
        data = torch.tensor(data.to_numpy(dtype=np.float32))

        for i in range(0, n_rows - tw - target_size, step_size):

            train_seq = data[i:i+tw-1]
            tgt_seq = data[i+tw-2:i+tw-1+target_size]
            true_seq = data[i+tw-1:i+tw+target_size]

            # Sanity checks on the third feature column: the windows must
            # overlap exactly as described in the docstring.
            assert train_seq[-1,2] == tgt_seq[0,2]
            assert tgt_seq[-1,2] == true_seq[-2,2]

            train.append(train_seq)
            tgt.append(tgt_seq)
            true.append(true_seq)
            timestamps.append(timestamp[i:i+tw+target_size])
            sensor_list.append(sensor)
    return train, tgt, true, timestamps,  sensor_list, mean_and_std, features

Standardise data using the mean and standard deviation, such that the data will have zero mean and a standard deviation of 1 after standardizing.

In [8]:
def standardise(data,mean_and_stds=None):
  """
  Standardises each measurement column in the data: (value - mean) / std.

  The columns are replaced on the DataFrame that is passed in, so the
  caller's object is modified; the same object is also returned.

  Parameters:
  data (DataFrame): Data object to standardize. The columns 'timestamp',
      'sensor_id', 'tod_sin', 'tod_cos', 'doy_sin' and 'doy_cos' are left
      untouched when the statistics are computed from the data.
  mean_and_stds (DataFrame or None): Table with the columns 'measure',
      'mean' and 'std'. When None, the statistics are computed from data.
      When given, every listed measure must be a column of data
      (KeyError otherwise); if a measure is listed more than once, the
      first row is used.

  Returns:
  (data, mean_and_stds): the standardised DataFrame and the table of
  statistics that was applied.
  """
  excluded = ('timestamp','sensor_id','tod_sin','tod_cos','doy_sin','doy_cos')

  if mean_and_stds is None:
    measures = [col for col in data.columns if col not in excluded]
    mean_and_stds = pd.DataFrame({
        'measure': measures,
        'mean': [data[col].mean() for col in measures],
        'std': [data[col].std() for col in measures],
    })

  stats = mean_and_stds.drop_duplicates(subset='measure')
  for measure, mean, std in zip(stats['measure'], stats['mean'], stats['std']):
    # A constant column has std 0 (and a single row gives NaN); dividing by
    # it would fill the column with NaN/inf, so only centre such columns.
    scale = std if std > 0 else 1.0
    # Replacing the column (instead of writing through .loc) also works for
    # integer columns, which have to become float.
    data[measure] = (data[measure] - mean) / scale

  return data, mean_and_stds

For the GRU Auto-Encoder, each period will start at midnight. This function removes all timestamps prior to midnight at the beginning of the time series for each sensor. That is, if sensor 20 begins at 15:00, then the data from 15:00 to 23:00 will be removed.

In [9]:
def remove_before_00(df):
  """
  Trims incomplete calendar days from both ends of a sensor's time series.

  The rows are sorted by timestamp. If the first row is not in hour 0, all
  rows of that first calendar day are removed. If the (remaining) last row
  is not in hour 0, all rows of that last calendar day are removed. The
  function is meant to work in concurrence with the create_inout_sequences
  functions per sensor to ensure the time-windows start from 00 AM.

  The comparison uses full calendar dates, so series that cross a year
  boundary are handled correctly, and the DataFrame passed in is not
  modified.

  Parameters:
  df (DataFrame): DataFrame grouped on a specific sensor, with a datetime
      column 'timestamp'.

  Returns:
  DataFrame: sorted and trimmed copy with the same columns as df.
  """
  df = df.sort_values(['timestamp'])
  if df.empty:
    return df

  stamps = df['timestamp']
  if stamps.iloc[0].hour != 0:
    first_day = stamps.iloc[0].normalize()
    df = df[(stamps.dt.normalize() > first_day).to_numpy()]
    if df.empty:
      return df
    stamps = df['timestamp']

  if stamps.iloc[-1].hour != 0:
    last_day = stamps.iloc[-1].normalize()
    df = df[(stamps.dt.normalize() != last_day).to_numpy()]

  return df


In [10]:
class DatasetAE(torch.utils.data.Dataset):
  """
  Dataset for the auto-encoder: one item per window.

  Holds three parallel sequences (windows, their timestamps and their
  sensor ids) that are indexed together.
  """

  def __init__(self,data,timestamps,sensors):
    super().__init__()
    self.data, self.timestamps, self.sensors = data, timestamps, sensors

  def __len__(self):
    """Number of windows."""
    return len(self.data)

  def __getitem__(self,idx):
    """Return (window, timestamps, sensor id wrapped in a 1-element array)."""
    return self.data[idx], self.timestamps[idx], np.array([self.sensors[idx]])

We define classes for each model, which can be used with a dataloader.

In [11]:
class Dataset_deepant(torch.utils.data.Dataset):
  """
  Dataset for DeepAnT: pairs every input window with its label.
  """
  def __init__(self, data, labels):
    # Parallel sequences: labels[i] belongs to data[i].
    self.labels = labels
    self.data = data

  def __len__(self):
    """Total number of samples."""
    return len(self.data)

  def __getitem__(self, index):
    """Return the (input window, label) pair at position index."""
    return self.data[index], self.labels[index]

In [12]:
class DatasetTransformer(torch.utils.data.Dataset):
  """
  Dataset for the Transformer: one item per window.

  Holds five parallel sequences (source windows, decoder inputs, true
  targets, timestamps and sensor ids) that are indexed together.
  """

  def __init__(self,data,tgt,true,timestamps,sensors):
    super().__init__()
    self.data, self.tgt, self.true = data, tgt, true
    self.timestamps, self.sensors = timestamps, sensors

  def __len__(self):
    """Number of windows."""
    return len(self.data)

  def __getitem__(self,idx):
    """Return (source, tgt, true, timestamps, sensor id in a 1-element array)."""
    return (self.data[idx], self.tgt[idx], self.true[idx],
            self.timestamps[idx], np.array([self.sensors[idx]]))

# Preprocessing data for each model

Load table with mean and standard deviation of training set, and load the test data set.

In [13]:
# Loading mean and std. dev. for each feature in training set to standardise test set
# (semicolon-separated table; standardise() reads its columns 'measure', 'mean' and 'std').

mean_and_std = pd.read_csv('/content/drive/MyDrive/WoodSense/notebooks/Final models and comparison/mean_and_std_train.csv',index_col=False,delimiter=';')

# Loading test data.
# NOTE(review): this comment originally said "5 different sensors" while the
# introduction mentions six sensors - confirm the actual number.
df = pd.read_csv('/content/drive/MyDrive/WoodSense/notebooks/Final models and comparison/model_eval_data.csv',index_col=False,delimiter=';')
# Parse the timestamp column into datetimes; remove_before_00 uses the .dt accessor on it.
df['timestamp'] = pd.to_datetime(df['timestamp'],infer_datetime_format=True)

## GRU Auto-Encoder Data loader 

Create dataset and data loader to be used with GRU auto-encoder

In [14]:
# Window length and stride in rows, and DataLoader settings.
tw = 24
step_size = 24
batch_size = 64
shuffle = False

# Build the windows for the GRU auto-encoder.
# Note: create_inout_sequences_GRU standardises `df` in place, and the call
# passes the literal 24 as step_size rather than the variable defined above.
test_ae, timestamps_ae, sensors_ae, mean_and_std, features = create_inout_sequences_GRU(input_data = df, tw = tw, target_size = 1, 
                                                                                      step_size = 24, moisture=True,
                                                                                      start_from_midnight=True,
                                                                                       mean_and_std = mean_and_std)

# test_ae is rebound from the list of windows to the Dataset wrapping it.
test_ae = DatasetAE(test_ae,timestamps_ae,sensors_ae)

AE_loader = DataLoader(test_ae,batch_size=batch_size, shuffle=shuffle, num_workers=8)

## DeepAnT data loader

Create dataset and data loader to be used with CNN DeepAnT model

In [None]:
# Reload the training statistics and the raw test data: the GRU cell above
# standardised `df` in place, so a fresh, unscaled copy is read here.
mean_and_std = pd.read_csv('/content/drive/MyDrive/WoodSense/notebooks/Final models and comparison/mean_and_std_train.csv',index_col=False,delimiter=';')

# Loading test data.
# NOTE(review): this comment originally said "5 different sensors" while the
# introduction mentions six sensors - confirm the actual number.
df = pd.read_csv('/content/drive/MyDrive/WoodSense/notebooks/Final models and comparison/model_eval_data.csv',index_col=False,delimiter=';')
df['timestamp'] = pd.to_datetime(df['timestamp'],infer_datetime_format=True)

# Sequences for the DeepAnT model: 24 rows of input followed by 1 target row.
# keep_sensor_id = True makes every sample a
# (sensor id, target timestamps, window, target) tuple.
window = 24
test_seq_deepant, test_label_deepant, test_inout_seq_deepant = create_inout_sequences_deepant(input_data = df, 
                                                                                              tw=window, 
                                                                                              target_size = 1, 
                                                                                              step_size = 1, 
                                                                                              keep_sensor_id = True, 
                                                                                              mean_and_std = mean_and_std)


## Transformer data loader


Create data set and data loader to be used with transformer model

In [None]:
# Window length, stride and DataLoader settings for the Transformer.
# Note: `target_size` is defined here, but the call below passes the literal 1.
tw = 24
step_size = 1
target_size = 1
batch_size = 128
shuffle=False


# Build source / decoder-input / truth windows. With the default
# std_on_sensor=False the function standardises `df` in place.
src_tf, tgt_tf, true_tf, timestamps_tf, sensors_tf, mean_and_std, features = create_inout_sequences_transformer(df, tw = tw, target_size = 1, 
                                                                                      step_size = step_size, 
                                                                                      start_from_midnight=True,
                                                                                       mean_and_std = mean_and_std)

test_tf = DatasetTransformer(src_tf,tgt_tf,true_tf,timestamps_tf,sensors_tf)

TF_loader = DataLoader(test_tf,batch_size=batch_size, shuffle=shuffle, num_workers=8)

# Loading models

## Load GRU Auto-Encoder

The GRU auto-encoder consists of three parts: an encoder, a decoder, and a RecurrentAutoEncoder, which combines the encoder and decoder.  All the classes must be defined before loading the model.



In [None]:
class Encoder(nn.Module):
  """
  Two stacked GRUs that compress a sequence into the final hidden state of
  the second GRU.

  The `dropout` argument is only printed; both GRUs are built with
  dropout = 0.
  """

  def __init__(self, seq_len, n_features_in, embedding_dim=64,dropout=0.99,num_layers=1,bidirectional=True):
    super().__init__()
    print(dropout)
    self.num_directions = 2 if bidirectional else 1

    self.seq_len = seq_len
    self.n_features = n_features_in
    self.embedding_dim = embedding_dim
    self.hidden_dim1 = 2 * embedding_dim
    # Width of the first GRU's output, which feeds the second GRU.
    self.hidden_dim2 = int(self.hidden_dim1 * self.num_directions)

    gru_options = dict(num_layers=num_layers, batch_first=True,
                       dropout=0, bidirectional=bidirectional)

    self.rnn1 = nn.GRU(input_size=self.n_features,
                       hidden_size=self.hidden_dim1,
                       **gru_options)
    self.rnn2 = nn.GRU(input_size=self.hidden_dim2,
                       hidden_size=embedding_dim,
                       **gru_options)

  def forward(self, x):
    """Return the final hidden state of the second GRU, batch dimension first."""
    sequence, _ = self.rnn1(x)
    _, final_hidden = self.rnn2(sequence)
    # The hidden state comes out with the batch on axis 1; move it to axis 0.
    return final_hidden.permute(1,0,2)

In [None]:
class Decoder(nn.Module):
  """
  Two stacked GRUs plus a linear layer that expand an embedding back into a
  sequence of n_features_out values per time step.

  The `dropout` argument is only printed; both GRUs are built with
  dropout = 0.
  """

  def __init__(self, seq_len, input_dim=64, n_features_out=3,dropout=0.99,num_layers=1,bidirectional=False):
    super().__init__()
    print(dropout)
    self.num_directions = 2 if bidirectional else 1

    self.seq_len = seq_len
    self.input_dim1 = input_dim
    self.hidden_size1 = input_dim

    self.input_dim2 = self.hidden_size1 * self.num_directions
    self.hidden_size2 = int(2*self.input_dim2/self.num_directions)

    self.n_features = n_features_out
    self.num_layers = num_layers

    gru_options = dict(num_layers=num_layers, batch_first=True,
                       dropout=0, bidirectional=bidirectional)

    self.rnn1 = nn.GRU(input_size=input_dim,
                       hidden_size=self.hidden_size1,
                       **gru_options)
    self.rnn2 = nn.GRU(input_size=self.input_dim2,
                       hidden_size=self.hidden_size2,
                       **gru_options)

    self.output_layer = nn.Linear(self.hidden_size2*self.num_directions, n_features_out)

    # How often the input is tiled along axis 1 in forward().
    self.repetitions = int(self.seq_len/(self.num_layers*self.num_directions))

  def forward(self, x):
    """Tile x along axis 1, run both GRUs and project every step."""
    tiled = x.repeat(1,self.repetitions, 1)
    hidden_sequence, _ = self.rnn1(tiled)
    hidden_sequence, _ = self.rnn2(hidden_sequence)
    return self.output_layer(hidden_sequence)

In [None]:
class RecurrentAutoencoder(nn.Module):
  """
  GRU auto-encoder: an Encoder followed by a Decoder, both moved to the
  notebook-wide `device`.
  """

  def __init__(self, seq_len,n_features_in, n_features_out, embedding_dim=64,dropout=0,num_layers=1,bidirectional=False):
    super().__init__()

    # Settings shared by both halves of the network.
    shared = dict(seq_len=seq_len, dropout=dropout,
                  num_layers=num_layers, bidirectional=bidirectional)

    encoder = Encoder(n_features_in=n_features_in, embedding_dim=embedding_dim, **shared)
    self.encoder = encoder.to(device)
    decoder = Decoder(input_dim=embedding_dim, n_features_out=n_features_out, **shared)
    self.decoder = decoder.to(device)

  def forward(self, x):
    """Encode x and decode the result again."""
    return self.decoder(self.encoder(x))

The optimized model is loaded from file.

In [None]:
# Load the saved GRU auto-encoder onto `device`. The classes defined above
# must exist before this call (see the note at the start of this section).
# NOTE: torch.load unpickles the file, which can execute arbitrary code -
# only load model files from a trusted source.
gru_ae = torch.load('/content/drive/MyDrive/WoodSense/notebooks/Final models and comparison/GRU_AE_epoch_1000.pt', map_location=torch.device(device))
# Switch to evaluation mode.
gru_ae.eval()

## Load DeepAnT model

The DeepAnT model is defined as class Network_2layers, which is first defined:

In [None]:
class Network_2layers(nn.Module):
    """
    DeepAnT network: two Conv1d / MaxPool1d / ReLU stages followed by one
    linear layer that maps the flattened result to target_features values.
    """
    def __init__(self, features=14, tw=24, target_features=3, filters1=32, filters2=32, kernel1=3, kernel2=3):
        super().__init__()
        # Each of the two pooling layers halves the sequence length.
        pooled_length = int(tw / (2 * 2))

        conv_stage1 = [
            nn.Conv1d(features, filters1, kernel1, stride=1, padding=int(kernel1 / 2)),
            nn.MaxPool1d(2, stride=2),
            nn.ReLU(),
        ]
        conv_stage2 = [
            nn.Conv1d(filters1, filters2, kernel2, stride=1, padding=int(kernel2 / 2)),
            nn.MaxPool1d(2, stride=2),
            nn.ReLU(),
        ]
        self.convolutional = nn.Sequential(*conv_stage1, *conv_stage2)
        self.fully_connected = nn.Sequential(
            nn.Linear(filters2 * pooled_length, target_features),
        )

    def forward(self, x):
        """Map x of shape (batch, tw, features) to (batch, 1, target_features)."""
        # Conv1d expects the feature axis before the time axis.
        channels_first = torch.transpose(x, 2, 1)
        convolved = self.convolutional(channels_first)
        # Flatten everything except the minibatch dimension.
        flat = convolved.view(convolved.size(0), -1)
        prediction = self.fully_connected(flat)
        # Insert a length-1 time axis between batch and features.
        return prediction.unsqueeze(1)

The optimized DeepAnT model is loaded along with the found thresholds.

In [None]:
# Load model
# NOTE: torch.load unpickles the file, which can execute arbitrary code -
# only load model files from a trusted source.
path = '/content/drive/My Drive/WoodSense/notebooks/Stine/'
name = 'DeepANT_final_version3_epoch_100.pt_epoch_100.pt'
deepant_model = torch.load(path+name, map_location=torch.device(device))

# define networks and parameters
# The saved object is indexed as [0] = network and [1] = thresholds;
# presumably [1][0] is the overall threshold and [1][1] holds the
# per-parameter thresholds - TODO confirm against the training notebook.
net_deepant = deepant_model[0]
deepant_threshold = deepant_model[1][0]
deepant_param_thresholds = deepant_model[1][1]
# Last expression of the cell: displays the network in the notebook output.
net_deepant

## Load Transformer model

The transformer model consists of two classes: PositionalEncoding and OutlierTransformer. These are defined before loading the model.

In [None]:
#Create original positional encoding
#Source: https://pytorch.org/tutorials/beginner/transformer_tutorial
class PositionalEncoding(nn.Module):
    """
    Sinusoidal positional encoding (after the PyTorch transformer tutorial).

    Precomputes a table `pe` of shape (max_len, 1, d_model), registered as a
    buffer, with sine values in the even columns and cosine values in the
    odd columns. Both even and odd d_model are supported.

    Parameters:
    d_model: size of the last dimension of the inputs.
    dropout: dropout probability applied after adding the encoding.
    max_len: number of positions in the table; inputs passed to forward
        must not be longer than this along their first axis.
    """
    def __init__(self, d_model, dropout=0.1, max_len=100):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For an odd d_model there is one cosine column fewer than sine
        # columns, so only the first d_model // 2 frequencies are used.
        pe[:, 1::2] = torch.cos(position * div_term[:d_model // 2])
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encoding of the first x.size(0) positions to x, then apply dropout.

        The position is taken from axis 0 of x, so x is expected with the
        sequence axis first.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [None]:
class OutlierTransformer(nn.Module):
    """
    Encoder-decoder Transformer that predicts sensor values.

    Source and target sequences are linearly embedded into d_model
    dimensions, optionally given a positional encoding, passed through
    nn.Transformer and projected to n_features_out values per target step.

    Parameters:
    n_features_in / n_features_out: feature count of the inputs / outputs.
    d_model, nhead, num_encoder_layers, num_decoder_layers,
    dim_feedforward: passed on to nn.Transformer.
    max_seq_length: number of positions in the positional encoding table.
    pos_dropout: dropout inside the positional encoding.
    trans_dropout: dropout inside nn.Transformer and before the final
        linear layer.
    pos_encoding: set to False to skip the positional encoding.
    """
    def __init__(self, n_features_in,n_features_out, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, 
                 max_seq_length, pos_dropout, trans_dropout,pos_encoding=True):
        super().__init__()
        self.d_model = d_model
        self.pos_encoding = pos_encoding
        self.embed_src = nn.Linear(n_features_in,d_model)
        self.embed_tgt = nn.Linear(n_features_in,d_model) # Needs to be if we need to predict more than 1 timepoint
        # Not used in forward(); kept so that the set of parameters stays
        # the same as in previously saved models.
        self.embed = nn.Linear(n_features_in,d_model)

        self.pos_enc = PositionalEncoding(d_model, pos_dropout, max_seq_length)

        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, trans_dropout)

        self.linear_dropout = nn.Dropout(trans_dropout)

        self.fc = nn.Linear(d_model, n_features_out)

    def forward(self, src, tgt,tgt_mask=None):
        """Return predictions of shape (batch, target length, n_features_out).

        src and tgt are given batch-first: (batch, length, n_features_in).
        tgt_mask is handed to nn.Transformer unchanged.
        """
        # nn.Transformer is used here with the sequence axis first.
        src = rearrange(src, 'n s e -> s n e')
        tgt = rearrange(tgt, 'n t e -> t n e')

        src = self.embed_src(src) * math.sqrt(self.d_model)
        tgt = self.embed_tgt(tgt) * math.sqrt(self.d_model)

        if self.pos_encoding:
          src = self.pos_enc(src)
          tgt = self.pos_enc(tgt)

        output = self.transformer(src, tgt, tgt_mask=tgt_mask)

        output = rearrange(output, 't n e -> n t e')
        # nn.Dropout returns a new tensor; the result has to be assigned,
        # otherwise the dropout has no effect at all. In eval mode this is
        # the identity, so inference results are unchanged.
        output = self.linear_dropout(output)

        output = self.fc(output)
        return output

The optimized transformer model is loaded from file.

In [None]:
# Load the saved Transformer model onto `device`. PositionalEncoding and
# OutlierTransformer must be defined before this call (see the note at the
# start of this section).
# NOTE: torch.load unpickles the file, which can execute arbitrary code -
# only load model files from a trusted source.
transformer = torch.load('/content/drive/MyDrive/WoodSense/notebooks/Final models and comparison/Transformer_final_weather_epoch_100.pt', map_location=torch.device(device))
# Switch to evaluation mode.
transformer.eval()

# Running test data through models

To compare the models, we run the test data through each of the models to compute a prediction or reconstruction. For each model, we make a data frame with the actual sensor data along with the predicted/reconstructed data, the associated errors on each parameter, and the overall Euclidean distance.

## Functions to create dataframes


First, we define a couple of functions, which will be used to easily compute the predictions and losses, and to create the data frames needed to make plots.

First we define a function to compute reconstructions for the GRU auto-encoder.

In [None]:
def test_loss_calculator_AE(model,test_loader,Loss_type = 'L2',only_sensor_labels=True):
  """
  Run the auto-encoder over test_loader and collect its reconstructions.

  Parameters:
  model: the auto-encoder; it is switched to eval mode.
  test_loader: yields (sequence, timestamps, sensor) batches.
  Loss_type: 'L2' (MSELoss) or 'L1' (L1Loss). For any other value a
      message is printed and None is returned.
  only_sensor_labels: if True, only the first three features of the input
      are compared with the reconstruction.

  Returns:
  (mean of the batch losses, list of true batches, list of reconstructed
  batches, list of timestamp batches, list of sensor batches), the lists
  holding numpy arrays.
  """
  if Loss_type == 'L2':
    loss_class = nn.MSELoss
  elif Loss_type == 'L1':
    loss_class = nn.L1Loss
  else:
    print('Choose either L1 or L2 in Loss_type')
    return
  criterion = loss_class(reduction='mean').to(device)

  batch_losses = []
  true_batches = []
  pred_batches = []
  time_batches = []
  sensor_batches = []

  model = model.eval()
  with torch.no_grad():
    for seq_true, timestep, sensor in test_loader:
      seq_true = seq_true.to(device)
      seq_pred = model(seq_true)

      if only_sensor_labels:
        seq_true = seq_true[:,:,0:3]

      batch_losses.append(criterion(seq_pred, seq_true).item())

      true_batches.append(seq_true.cpu().numpy())
      pred_batches.append(seq_pred.cpu().numpy())
      time_batches.append(timestep.numpy())
      sensor_batches.append(sensor.numpy())

  return np.mean(batch_losses), true_batches, pred_batches, time_batches, sensor_batches


Next, we make a function to compute the predictions and losses for the transformer model.

In [None]:
def test_loss_calculator_transformer(model,test_loader,Loss_type = 'L2',only_sensor_labels=True):
  """
  Run the Transformer over test_loader and collect its last-step predictions.

  Parameters:
  model: the Transformer model; it is switched to eval mode. Must expose
      model.transformer.generate_square_subsequent_mask.
  test_loader: yields (src, tgt, true_tgt, timestamps, sensor) batches.
  Loss_type: 'L2' (MSELoss) or 'L1' (L1Loss). For any other value a
      message is printed and None is returned.
  only_sensor_labels: accepted for symmetry with test_loss_calculator_AE;
      not used, the first three features are always compared.

  Returns:
  (mean of the batch losses, list of true batches, list of predicted
  batches, list of 1-D arrays with the last timestamp of every sample,
  list of 1-D arrays with the sensor id of every sample).
  """
  if Loss_type == 'L2':
    criterion = nn.MSELoss(reduction='mean').to(device)
  elif Loss_type == 'L1':
    criterion = nn.L1Loss(reduction='mean').to(device)
  else:
    print('Choose either L1 or L2 in Loss_type')
    return

  val_losses = []

  true_seqs = []
  pred_seqs = []
  timesteps = []
  sensors = []

  model = model.eval()
  with torch.no_grad():
    for src, tgt, true_tgt, stamps, sensor in test_loader:

      src = src.to(device)
      tgt = tgt.to(device)
      true_tgt = true_tgt[:,-1:,:3].to(device) # First 3 as targets to predict

      tgt_mask = model.transformer.generate_square_subsequent_mask(tgt.size(1)).to(device)

      pred_tgt = model(src,tgt,tgt_mask)
      pred_tgt = pred_tgt[:,-1:,:3]

      loss = criterion(pred_tgt, true_tgt)

      true_seqs.append(true_tgt.cpu().numpy())
      pred_seqs.append(pred_tgt.cpu().numpy())
      timesteps.append(stamps.numpy()[:,-1])
      # reshape(-1) instead of squeeze(): a batch holding a single sample
      # must stay a 1-D array of length 1 rather than become a 0-d array.
      sensors.append(sensor.numpy().reshape(-1))
      val_losses.append(loss.item())

  return np.mean(val_losses), true_seqs, pred_seqs, timesteps, sensors


The next function converts the sequences of reconstructions, true values, timestamps, and sensor ids into a data frame for the GRU auto-encoder.

In [None]:
#Creating dataframe holding both L1 and L2 loss for each sample

def create_dataframe_AE(true_seqs,pred_seqs,timestamps,sensors,time_step = 1):

  """
  Creating dataframe holding both L1 and L2 loss for each sample for 
  both temperature, humidity and moisture for the GRU auto-encoder

  Parameters:
  true_seqs, pred_seqs: lists of arrays (batch, 24, features); feature
      0 = temperature, 1 = humidity, 2 = moisture.
  timestamps: list of integer arrays (batch, 24), one timestamp per step.
  sensors: list of arrays with one sensor id per sample of the batch.
  time_step: number of consecutive steps that share one 'time-period'
      label. It does not have to divide 24; the last period is then
      shorter.

  Returns:
  DataFrame with one row per sample, step and measure and the columns
  'time-period', 'timestamp' (datetime), 'sensor' (int), 'true', 'pred',
  'L1Loss', 'L2Loss' (float64) and 'measure_type'. The index restarts at 0
  for every chunk. Empty inputs give an empty frame with these columns.
  """
  columns = ['time-period','timestamp','sensor','true','pred',
             'L1Loss','L2Loss','measure_type']
  measure_dict = {'temperature':0,'humidity':1,'moisture':2}

  # The chunks are collected and concatenated once at the end:
  # DataFrame.append copied the whole frame on every call and no longer
  # exists in pandas 2.
  frames = []
  for m_type, m_idx in measure_dict.items():
    for true_batch, pred_batch, timestamp, sensor in zip(true_seqs,pred_seqs,timestamps, sensors):
      sensor_ids = np.asarray(sensor).reshape(-1)

      for start in range(0,24,time_step):
        stop = start + time_step

        true_chunk = true_batch[:,start:stop,m_idx]
        # Real number of steps in this chunk (smaller than time_step for a
        # trailing partial chunk).
        width = true_chunk.shape[1]

        true_values = true_chunk.reshape(-1)
        pred_values = pred_batch[:,start:stop,m_idx].reshape(-1)
        error = true_values - pred_values

        frames.append(pd.DataFrame({
            'time-period': [str(start) + ' - ' + str(stop)] * len(true_values),
            'timestamp': timestamp[:,start:stop].reshape(-1),
            'sensor': np.repeat(sensor_ids, width),
            'true': true_values,
            'pred': pred_values,
            'L1Loss': np.absolute(error),
            'L2Loss': error**2,
            'measure_type': m_type,
        }))

  if frames:
    result = pd.concat(frames)
  else:
    result = pd.DataFrame({col: [] for col in columns})

  result = result.astype({'true': 'float64', 'pred': 'float64',
                          'L1Loss': 'float64', 'L2Loss': 'float64'})
  result['timestamp'] = pd.to_datetime(result['timestamp'])
  result['sensor'] = result['sensor'].astype(int)
  return result

The next function converts the sequences of predictions, true values, timestamps, and sensor ids into a data frame for the Transformer model.

In [None]:
def create_dataframe_transformer(true_seqs,pred_seqs,timesteps,sensors,time_step = 4,m_type = 'moisture',
                                 thresholds_path = '/content/drive/MyDrive/WoodSense/notebooks/Final models and comparison/Transformer_threshold_final.csv'):
  """
  Creating dataframe holding the true values, the reconstructions and the
  L1 loss of temperature, humidity and moisture for the Transformer model,
  together with the euclidean distance and the detection thresholds.

  Parameters
  ----------
  true_seqs, pred_seqs : iterables of arrays indexed as [batch, time, feature],
      where feature 0, 1, 2 is temperature, humidity, moisture.
  timesteps : iterable with, per batch, the timestamps of its samples
      (anything accepted by pd.to_datetime).
  sensors : iterable with, per batch, the sensor id(s) of its samples.
  time_step : width in hours of the time-of-day periods used as labels.
  m_type : not used by the computation; kept so existing calls keep working.
  thresholds_path : CSV file whose first row holds the columns
      euclidian_threshold, temperature_threshold, humidity_threshold and
      moisture_threshold.

  Returns
  -------
  pd.DataFrame with one row per sample. The per-batch row index is kept in
  the 'index' column.
  """

  # One period label per hour of the day, e.g. hour 5 -> '4 - 8' for time_step = 4
  period_labels = [str(start) + ' - ' + str(start+time_step) for start in range(0,24,time_step)]
  cols = [label for label in period_labels for _ in range(time_step)]

  column_order = ['time_periods','sensor_id','timestamp','loss','euclidean_dist',
                  'loss_temp','loss_humid','loss_moist',
                  'true_temp','true_humid','true_moist',
                  'pred_temp','pred_humid','pred_moist']

  thresholds = pd.read_csv(thresholds_path)

  # Collect the per-batch frames and concatenate once; DataFrame.append inside
  # the loop copies the whole frame every iteration and was removed in pandas 2.0
  frames = []
  for true_batch, pred_batch, time, sensor in zip(true_seqs,pred_seqs,timesteps,sensors):

    # The timestamps are the same for all three measures, so convert them once
    time_df = pd.DataFrame()
    time_df['time'] = pd.to_datetime(time)
    time_df['hour'] = time_df['time'].dt.hour
    col_timestep = [cols[hour] for hour in time_df['hour']]

    # Flattened in the same (row-major) order as the per-measure values below
    euclidian_distance = np.linalg.norm(true_batch - pred_batch,ord=2,axis=2).reshape(-1)

    measures_true = [true_batch[:,:,m_idx].reshape(-1) for m_idx in range(3)]
    measures_pred = [pred_batch[:,:,m_idx].reshape(-1) for m_idx in range(3)]
    measures_loss = [np.absolute(true_values - pred_values)
                     for true_values, pred_values in zip(measures_true, measures_pred)]

    frames.append(pd.DataFrame({'time_periods':col_timestep,
                                'sensor_id':sensor,
                                'timestamp': time_df['time'],
                                'euclidean_dist':euclidian_distance,
                                'loss_temp':measures_loss[0],
                                'loss_humid':measures_loss[1],
                                'loss_moist':measures_loss[2],
                                'true_temp':measures_true[0],
                                'true_humid':measures_true[1],
                                'true_moist':measures_true[2],
                                'pred_temp':measures_pred[0],
                                'pred_humid':measures_pred[1],
                                'pred_moist':measures_pred[2]
                                }))

  if frames:
    df = pd.concat(frames)
  else:
    # No batches: return an empty frame with the usual columns
    df = pd.DataFrame({name: [] for name in column_order})

  df['loss'] = df[['loss_temp','loss_humid','loss_moist']].mean(1)
  df = df[column_order]

  # The same (first-row) thresholds apply to every sample
  df['threshold_euc'] = thresholds['euclidian_threshold'].to_numpy()[0]
  df['threshold_temp'] = thresholds['temperature_threshold'].to_numpy()[0]
  df['threshold_humid'] = thresholds['humidity_threshold'].to_numpy()[0]
  df['threshold_moist'] = thresholds['moisture_threshold'].to_numpy()[0]
  df['sensor_id'] = df['sensor_id'].astype(int)

  df = df.reset_index()

  return df

In [None]:
def plot_scatter(true_seqs,pred_seqs,timesteps,sensors,time_step=4,m_type='moisture',figsize=(20,10),alpha=0.8):
  """
  Scatter plot of the predicted/reconstructed values against the true values
  for a single measure type, coloured by time-of-day period.
  """
  losses = create_dataframe_AE(true_seqs,pred_seqs,timesteps,sensors,time_step)
  # Keep only the rows of the requested measure (temperature, humidity or moisture)
  selected = losses.loc[losses['measure_type'] == m_type]

  plt.figure(figsize=figsize)
  sns.set_style('whitegrid')
  sns.scatterplot(x="true", y="pred", hue="time-period", data=selected, alpha=alpha)
  plt.show()

The dataframe created by create_dataframe_AE differs in format from the dataframe created for the DeepAnT model, the transformer model, and the format used in plots. The next function can therefore be used to convert the dataframe to the right format.

In [None]:
# Convert to dataframe layout used in plots
def convert_dataframe(df_data, df_thresholds):
  '''
  This function converts the long dataframe from create_dataframe_AE (one row
  per sample and measure type) into the wide dataframe used in the plotting
  functions (one row per sample, with true/pred/loss columns per measure).

  df_data must hold the columns 'time-period', 'timestamp', 'sensor', 'true',
  'pred', 'L1Loss', 'L2Loss' and 'measure_type'; 'timestamp' must be a
  datetime column.
  df_thresholds must hold the columns 'timesteps' (hour of day),
  'euclidian_threshold', 'temperature_threshold', 'humidity_threshold' and
  'moisture_threshold'. Hours without a threshold row get threshold 0.

  Returns the wide dataframe sorted by sensor and timestamp; the pre-sort row
  index is kept in the 'index' column.
  '''
  merge_keys = ["sensor_id", "timestamp", "time-period"]
  value_cols = ['time-period', 'timestamp', 'sensor', 'true', 'pred', 'L1Loss']

  df_data = df_data.drop(columns = ['L2Loss'])

  # One frame per measure type, renamed by column name rather than by position
  # so the result does not depend on the column order of df_data
  parts = []
  for measure_type, suffix in (('temperature','temp'), ('moisture','moist'), ('humidity','humid')):
    part = df_data.loc[df_data['measure_type'] == measure_type, value_cols]
    part = part.rename(columns = {'sensor': 'sensor_id',
                                  'true': 'true_' + suffix,
                                  'pred': 'pred_' + suffix,
                                  'L1Loss': 'loss_' + suffix})
    parts.append(part)

  new_df = parts[0]
  for part in parts[1:]:
    new_df = pd.merge(new_df, part, on=merge_keys)

  # np.sqrt instead of the bare sqrt that only exists after %pylab
  new_df['euclidean_dist'] = np.sqrt(new_df['loss_temp']**2 + new_df['loss_humid']**2 + new_df['loss_moist']**2)
  new_df['loss'] = (new_df['loss_temp'] + new_df['loss_humid'] + new_df['loss_moist'])/3
  new_df = new_df.sort_values(by=['sensor_id','timestamp'])

  # Add thresholds to dataframe; they are looked up by hour of day
  threshold_sources = {'threshold_euc': 'euclidian_threshold',
                       'threshold_temp': 'temperature_threshold',
                       'threshold_humid': 'humidity_threshold',
                       'threshold_moist': 'moisture_threshold'}
  for column in threshold_sources:
    new_df[column] = 0.0  # float from the start, the thresholds are floats

  hours = new_df['timestamp'].dt.hour
  for index, row in df_thresholds.iterrows():
    in_hour = hours == row['timesteps']
    for column, source in threshold_sources.items():
      new_df.loc[in_hour, column] = row[source]

  new_df = new_df.reset_index()

  return new_df



## GRU Auto-Encoder create dataframe

The dataframe is made using the defined functions:

In [None]:
# Evaluate the GRU auto-encoder on the AE test loader and unpack the five
# results returned by test_loss_calculator_AE.
gru_test_loss, true_seqs, pred_seqs, timesteps, sensors = test_loss_calculator_AE(gru_ae,AE_loader)

In [None]:
# Build the long-format loss dataframe for the GRU auto-encoder, labelling the
# samples with 4-hour time-of-day periods.
df_gru_loss = create_dataframe_AE(true_seqs,pred_seqs,timesteps, sensors, time_step=4)

In [None]:
# Show the first four rows of the GRU loss dataframe
df_gru_loss[0:4 ]

To get an overview of how the reconstructions compare to the true values, we make a scatter plot. For low moisture levels, the points are close to the diagonal (corresponding to good reconstructions), but for higher moisture levels, the points are more scattered, which indicates that the model is not as good at recreating the data, which could result in detected anomalies.

In [None]:
# Scatter the GRU reconstructions against the true moisture values
plot_scatter(true_seqs,pred_seqs,timesteps,sensors,time_step=4,m_type='moisture')

The thresholds for the GRU auto-encoder are loaded.

In [None]:
# Read the GRU auto-encoder thresholds from the comma-delimited CSV on Drive;
# index_col=False keeps the first column as data instead of using it as index.
gru_thresholds = pd.read_csv('/content/drive/MyDrive/WoodSense/notebooks/Final models and comparison/GRU_AE_threshold.csv',index_col=False,delimiter=',')

We convert the dataframe to the wanted format for the model comparison using the defined function.

In [None]:
# Convert the long GRU loss dataframe to the wide format used in the plots and
# attach the thresholds, then show the first rows of the result
df_gru_loss_plot = convert_dataframe(df_gru_loss, gru_thresholds) # wide layout, one row per sample
df_gru_loss_plot.head()

## DeepAnT create dataframe

Next we create the dataframe for the DeepAnT model. This dataframe will have the same format as the one created for the GRU auto-encoder model.

In [None]:
def euclidean_dist(x, y): # Calculates euclidean dist
  """
  Return the euclidean distance between x and y.

  For scalars (Python numbers or NumPy scalars) this is the absolute
  difference; for arrays it is the square root of the sum of squared
  differences over all elements.
  """
  diff = np.asarray(y) - np.asarray(x)
  if diff.ndim == 0:
    # Scalar input of any numeric type, not only the built-in float
    return abs(y - x)
  # np.sum / np.sqrt explicitly, so the result does not depend on %pylab
  # having replaced the built-in sum
  return np.sqrt(np.sum(diff**2))

In [None]:
# Run the trained DeepAnT network over every test sequence and store, per
# sample, the sensor id, timestamp, losses, true values and predictions in a
# dataframe together with the thresholds.

net_deepant = net_deepant.to(device)
criterion = nn.L1Loss()
parameters_predicted = ['temp', 'humid', 'moist']
combined_data = []

for i in range(len(test_inout_seq_deepant)):
  sensor, timestamp, X, y = test_inout_seq_deepant[i]
  timestamp = timestamp.iloc[0]
  X = X.to(device)
  y = y.to(device)
  with torch.no_grad():
    X = X.reshape(1, tw, 14)
    output = net_deepant(X)
  y = y.reshape(output.shape)

  # Same loss as used in training, as a plain Python float
  loss = criterion(y, output).item()
  output = np.squeeze(output.cpu().detach().numpy())
  y = np.squeeze(y.cpu().detach().numpy())

  if y.size != 1:
    # Several predicted features: overall distance, then per-feature
    # absolute errors, true values and predictions
    euc_dist = euclidean_dist(y,output)
    temp_list = [sensor, timestamp, loss, euc_dist]
    temp_list += [abs(output_i-y_i) for y_i, output_i in zip(y, output)]
    temp_list += list(y)
    temp_list += list(output)
  else:
    # A single predicted feature: distance and feature loss coincide
    euc_dist = abs(y-output)
    temp_list = [sensor, timestamp, loss, euc_dist, euc_dist, y, output]

  combined_data.append(temp_list)

# Column names in the same order as the entries of each row
column_names = ['sensor_id', 'timestamp', 'loss', 'euclidean_dist']
for prefix in ('loss_', 'true_', 'pred_'):
  for param in parameters_predicted:
    column_names.append(prefix + param)

df_test_loss_deepant = pd.DataFrame(combined_data)
df_test_loss_deepant.columns = column_names

# Add thresholds to dataframe
df_test_loss_deepant['threshold_euc'] = deepant_threshold
df_test_loss_deepant['threshold_temp'] = deepant_param_thresholds[0]
df_test_loss_deepant['threshold_humid'] = deepant_param_thresholds[1]
df_test_loss_deepant['threshold_moist'] = deepant_param_thresholds[2]

# Show top of dataframe
df_test_loss_deepant.head()

We make a scatter plot of the predicted versus true values. We see that moisture levels around 1 result in the best predictions. For lower values and higher values, the predictions seem to be further from the diagonal line.

In [None]:
# Scatter the DeepAnT predictions against the true values of one measure
m_type = 'moist'
figsize = (20,10)
alpha = 0.8

true_column = "true_" + m_type
pred_column = "pred_" + m_type

plt.figure(figsize=figsize)
sns.set_style('whitegrid')
sns.scatterplot(x=true_column, y=pred_column, data=df_test_loss_deepant, alpha=alpha)
plt.show()

## Transformer create dataframe

We create a dataframe for the transformer model and make a scatter plot of predicted versus true values of moisture.

In [None]:
# Evaluate the transformer on the TF test loader and unpack the five results
# returned by test_loss_calculator_transformer.
tf_test_loss, tf_true_seqs, tf_pred_seqs, tf_timesteps, tf_sensors = test_loss_calculator_transformer(transformer,TF_loader)

In [None]:
# Build the transformer dataframe (values, losses, thresholds) with 4-hour
# time-of-day period labels.
df_transformer_loss = create_dataframe_transformer(tf_true_seqs,tf_pred_seqs,tf_timesteps,tf_sensors,time_step = 4)

In [None]:
# Show the first five rows of the transformer dataframe
df_transformer_loss.head(5)

In [None]:
  # Scatter the transformer reconstructions of moisture against the true
  # values, coloured by time-of-day period
  plt.figure(figsize=(20,10))
  sns.set_style('whitegrid')
  sns.scatterplot(data=df_transformer_loss, x="true_moist", y="pred_moist", hue="time_periods",alpha=0.8)
  plt.show()

# Plots and model comparison

First we convert the data back to the original units using the mean and standard deviation.

In [None]:
# Display the table of means and standard deviations used for the conversion
mean_and_std

In [None]:
# Convert the true and predicted values of every model back to the original
# units: value * std + mean. The losses and thresholds are left untouched.
# NOTE: the frames are modified in place, so running this cell twice rescales
# the values twice.
parameters = ['temp', 'humid', 'moist']
df_models = [df_transformer_loss, df_gru_loss_plot, df_test_loss_deepant]
for df in df_models:
  # Pair the parameters with the first three rows of mean_and_std by position,
  # so the lookup does not depend on the index labels being 0, 1, 2.
  # Assumes the rows are ordered temperature, humidity, moisture and that the
  # mean and std are the second and third column - TODO confirm.
  for param, (index, row) in zip(parameters, mean_and_std.iloc[0:3].iterrows()):
    mean = row.iloc[1]
    std = row.iloc[2]
    df['true_'+param] = df['true_'+param]*std + mean
    df['pred_'+param] = df['pred_'+param]*std + mean

The true value of moisture and the predictions/reconstructions are shown below for sensor 50.

In [None]:
import plotly.graph_objects as go

sensor = 50

# One line per (dataframe, column, legend name); the true values are taken
# from the transformer dataframe
trace_specs = [
    (df_transformer_loss, 'true_moist', 'True'),
    (df_transformer_loss, 'pred_moist', 'Transformer'),
    (df_test_loss_deepant, 'pred_moist', 'DeepANT'),
    (df_gru_loss_plot, 'pred_moist', 'GRU'),
]

# Create traces
fig = go.Figure()
for frame, column, label in trace_specs:
  fig.add_trace(go.Scatter(y=frame[column][frame['sensor_id'] == sensor],
                           mode='lines',
                           name=label))

#style layout
layout = go.Layout(xaxis=dict(title="t"),
                   yaxis=dict(title="Moisture [%]"))
fig.layout = layout

fig.show()

# Outlier dataframe

Lasse Regin from Woodsense has provided time intervals for a number of sensors where the moisture levels appear to be abnormally high. These periods are defined below. We test whether the models are able to detect the outliers in these periods.

In [None]:
# Anomalous periods per sensor: the keys are sensor ids (as strings) and each
# value is a list of [start, end] timestamps in ISO 8601 format.
outlier_time = {
"20": [["2020-09-15T00:00:00", "2020-09-22T00:00:00"], ["2020-10-07T00:00:00", "2020-10-16T00:00:00"]], 
"25": [["2020-10-21T00:00:00", "2020-11-30T00:00:00"]], 
"26": [["2020-10-20T00:00:00", "2020-11-30T00:00:00"]], 
"27": [["2020-10-20T00:00:00", "2020-11-30T00:00:00"]], 
"50": [["2020-10-21T00:00:00", "2020-11-30T00:00:00"]], 
"51": [["2020-10-19T00:00:00", "2020-11-19T00:00:00"]]
}

We want to find out how many of the datapoints in these periods are detected as anomalies by each model. We make a dataframe that holds each period, the length of the period, and the number of outliers in the period. The column pct_outliers is the percentage of datapoints in the period that are detected as outliers. The mean_above_thres column is the average of the difference $euc\_dist - threshold$ over the detected outliers.

In [None]:
# For each sensor and anomalous period, collect the samples of every model that
# fall inside the period and flag the ones detected as outliers
# (euclidean distance above the model's threshold)
outlier_columns = ['Model','sensor_id','timestamp','outlier_period','length_of_period','Outlier','thres_euc_dif','euclidean_dist','threshold_euc','loss',
                   'loss_temp','loss_humid','loss_moist','true_temp','true_humid','true_moist',
                   'pred_temp','pred_humid','pred_moist','threshold_temp','threshold_humid','threshold_moist']
models = [(df_gru_loss_plot,'GRU'),(df_test_loss_deepant,'CNN'),(df_transformer_loss,'Transformer')]

# The frames are collected and concatenated once: DataFrame.append copies the
# accumulated frame on every call and was removed in pandas 2.0
period_frames = []
for key, time_period in outlier_time.items():

  for model, name in models:
    for t in time_period:
      df_time = model[model.sensor_id == int(key)].copy()
      # Label-slice on the timestamp index; both period ends are included
      df_time = df_time.set_index(['timestamp']).loc[t[0]:t[1]].reset_index(['timestamp'])

      # e.g. 'sensor 20:  2020-09-15 - 2020-09-22' (dates only)
      period_label = str(t[0])[:10] + ' - ' + str(t[1])[:10]
      df_time['outlier_period'] = 'sensor ' + df_time['sensor_id'].astype(str) + ':  ' + period_label

      df_time['Outlier'] = df_time['euclidean_dist'] > df_time['threshold_euc']

      df_time['Model'] = name
      df_time['length_of_period'] = int(len(df_time))

      period_frames.append(df_time)

if period_frames:
  df_outliers = pd.concat(period_frames)
else:
  df_outliers = pd.DataFrame(columns=outlier_columns)

df_outliers['thres_euc_dif'] = df_outliers['euclidean_dist'] - df_outliers['threshold_euc']

# Keep only the columns shared by all models; this also discards the
# model-specific 'index', 'time_periods' and 'time-period' columns
df_outliers = df_outliers[outlier_columns]

In [None]:
# Summarise the flagged samples per model and anomalous period
df_outliers2 = df_outliers.copy()
summary_columns = ['Model','outlier_period','Outlier','euclidean_dist','length_of_period']
per_period = df_outliers2.loc[:,summary_columns].groupby(['Model','outlier_period'])
df_outlier_table = per_period.agg(
    num_outliers = ('Outlier','sum'),
    length_of_period = ('length_of_period','mean'),
    mean_euclidian_dist = ('euclidean_dist','mean'),
)

# Percentage of the samples in the period that were flagged as outliers
detected_fraction = df_outlier_table['num_outliers']/df_outlier_table['length_of_period']
df_outlier_table['pct_outliers'] = detected_fraction*100

In [None]:
# Per model and period: the average amount by which the euclidean distance
# exceeds the threshold, computed over the detected outliers only
df_outliers3 = df_outliers[df_outliers.Outlier == True].copy()
df_outliers_table_app =     df_outliers3.loc[:,['Model','outlier_period','thres_euc_dif']].groupby(['Model','outlier_period'])\
                                            .agg(mean_above_thres = pd.NamedAgg(column='thres_euc_dif',aggfunc='mean')
                                            )

In [None]:
# Merge tables. A left merge keeps the (model, period) groups without any
# detected outlier; their mean_above_thres is NaN.
df_outlier_table = df_outlier_table.merge(df_outliers_table_app,on=['Model','outlier_period'],how='left')

In [None]:
# Display the summary table per model and anomalous period
df_outlier_table

We can plot the true values of moisture along with the predictions/reconstructions from each model. The figure also shows the outliers detected by each model.

In [None]:
import plotly.graph_objects as go

# Create traces
fig = go.Figure()

sensor = 25
measure = 'moist'
true_col = 'true_'+measure
pred_col = 'pred_'+measure

# Rows of the selected sensor for each model
gru_sensor = df_gru_loss_plot[df_gru_loss_plot['sensor_id'] == sensor]
deepant_sensor = df_test_loss_deepant[df_test_loss_deepant['sensor_id'] == sensor]
transformer_sensor = df_transformer_loss[df_transformer_loss['sensor_id'] == sensor]

## GRU model (the true values are taken from the GRU dataframe)
fig.add_trace(go.Scatter(x=gru_sensor['timestamp'], y=gru_sensor[true_col],
                    mode='lines',
                    name='True moisture'))

fig.add_trace(go.Scatter(x=gru_sensor['timestamp'], y=gru_sensor[pred_col],
                    mode='lines',
                    name='GRU AE reconst.'))

#GRU outliers: samples whose euclidean distance reaches the threshold
df_period_outliers_gru = gru_sensor.loc[gru_sensor.euclidean_dist >= gru_sensor.threshold_euc,
                                        ['timestamp',pred_col]]

fig.add_trace(go.Scatter(mode="markers", x=df_period_outliers_gru['timestamp'],
                         y=df_period_outliers_gru[pred_col],
                         name="GRU AE outliers"))

#DeepAnt model
fig.add_trace(go.Scatter(x=deepant_sensor['timestamp'], y=deepant_sensor[pred_col],
                    mode='lines',
                    name='DeepAnt Pred.'))

#DeepAnt outliers
df_period_outliers_deepant = deepant_sensor.loc[deepant_sensor.euclidean_dist >= deepant_sensor.threshold_euc,
                                                ['timestamp',pred_col]]

fig.add_trace(go.Scatter(mode="markers", x=df_period_outliers_deepant['timestamp'],
                         y=df_period_outliers_deepant[pred_col],
                         name="DeepAnt outliers"))

# Transformer model
fig.add_trace(go.Scatter(x=transformer_sensor['timestamp'], y=transformer_sensor[pred_col],
                    mode='lines',
                    name='Transformer Pred.'))

#Transformer outliers
df_period_outliers_transformer = transformer_sensor.loc[transformer_sensor.euclidean_dist >= transformer_sensor.threshold_euc,
                                                        ['timestamp',pred_col]]

fig.add_trace(go.Scatter(mode="markers", x=df_period_outliers_transformer['timestamp'],
                         y=df_period_outliers_transformer[pred_col],
                         name="Transformer outliers"))

#style layout
# The layout is assigned BEFORE the shapes and annotations are added: they are
# stored in fig.layout, so replacing the layout afterwards would discard them
layout = go.Layout(
    xaxis=dict(
        title="Date"
    ),
    yaxis=dict(
        title="Moisture [%]"
    ),
    font = dict(size = 18))
fig.layout = layout

# Mark the anomalous periods of the selected sensor (taken from outlier_time,
# so the markers always belong to the sensor that is plotted)
for period_no, (period_start, period_end) in enumerate(outlier_time.get(str(sensor), []), start=1):
  for boundary in (period_start, period_end):
    fig.add_shape(type="line",
        x0=boundary, y0=5, x1=boundary, y1=30,
        line=dict(color="black", width=1, dash="dashdot")
    )
  fig.add_annotation(x=period_start, y=25,
              text="Anomaly period " + str(period_no),
              showarrow=True,
              arrowhead=1,
              ax=50 + 10*period_no,  # stagger the labels horizontally
              ay=-50)

fig.show()

In [None]:
import plotly.graph_objects as go

# Create traces
fig = go.Figure()

sensor = 20
measure = 'moist'
true_col = 'true_'+measure
pred_col = 'pred_'+measure

# Rows of the selected sensor for each model
gru_sensor = df_gru_loss_plot[df_gru_loss_plot['sensor_id'] == sensor]
deepant_sensor = df_test_loss_deepant[df_test_loss_deepant['sensor_id'] == sensor]
transformer_sensor = df_transformer_loss[df_transformer_loss['sensor_id'] == sensor]

# Outliers detected inside the anomalous periods of this sensor (df_outliers
# only holds samples that fall inside those periods)
detected = df_outliers.loc[(df_outliers.Outlier == True) &
                           (df_outliers.sensor_id == sensor)]

## GRU model (the true values are taken from the GRU dataframe)
fig.add_trace(go.Scatter(x=gru_sensor['timestamp'], y=gru_sensor[true_col],
                    mode='lines',
                    name='True '+measure))

fig.add_trace(go.Scatter(x=gru_sensor['timestamp'], y=gru_sensor[pred_col],
                    mode='lines',
                    name='GRU Auto-Encoder Reconstrution. '+measure))

#GRU outliers
df_period_outliers_gru = detected.loc[detected.Model == 'GRU'][['timestamp',pred_col]]

fig.add_trace(go.Scatter(mode="markers", x=df_period_outliers_gru['timestamp'], y=df_period_outliers_gru[pred_col], name="GRU outliers"))

#DeepAnt model
fig.add_trace(go.Scatter(x=deepant_sensor['timestamp'], y=deepant_sensor[pred_col],
                    mode='lines',
                    name='DeepAnt Pred .'+measure))

#DeepAnt outliers (stored under the model name 'CNN' in df_outliers)
df_period_outliers_deepant = detected.loc[detected.Model == 'CNN'][['timestamp',pred_col]]

fig.add_trace(go.Scatter(mode="markers", x=df_period_outliers_deepant['timestamp'], y=df_period_outliers_deepant[pred_col], name="DeepAnt outliers"))

# Transformer model
fig.add_trace(go.Scatter(x=transformer_sensor['timestamp'], y=transformer_sensor[pred_col],
                    mode='lines',
                    name='Transformer Pred. '+measure))

#Transformer outliers
df_period_outliers_transformer = detected.loc[detected.Model == 'Transformer'][['timestamp',pred_col]]

fig.add_trace(go.Scatter(mode="markers", x=df_period_outliers_transformer['timestamp'], y=df_period_outliers_transformer[pred_col], name="Transformer outliers"))

#style layout (assigned before the shapes and annotations, which live in fig.layout)
layout = go.Layout(
    xaxis=dict(
        title="Date"
    ),
    yaxis=dict(
        title="Moisture [%]"
    ) )
fig.layout = layout

# Mark the anomalous periods of the selected sensor. The boundaries come from
# outlier_time, so the markers agree with the periods used to build df_outliers
for period_no, (period_start, period_end) in enumerate(outlier_time.get(str(sensor), []), start=1):
  for boundary in (period_start, period_end):
    fig.add_shape(type="line",
        x0=boundary, y0=5, x1=boundary, y1=30,
        line=dict(color="black", width=1, dash="dashdot")
    )
  fig.add_annotation(x=period_start, y=25,
              text="Anomaly period " + str(period_no),
              showarrow=True,
              arrowhead=1,
              ax=50 + 10*period_no,  # stagger the labels horizontally
              ay=-50)

fig.show()

In [None]:
# Display `df`. NOTE(review): presumably still bound to the last frame of the
# rescaling loop (df_test_loss_deepant); looks like a leftover debugging cell.
df

In [None]:
# Heatmap of the number of detected outliers per day and model for the sensor
# currently selected in `sensor`. Here the whole series of each sensor is
# used, not only the anomalous periods.
import datetime
import numpy as np
import plotly.graph_objects as go

heatmap_columns = ['Model','sensor_id','timestamp','length_of_period','Outlier','thres_euc_dif','euclidean_dist','threshold_euc','loss',
                   'loss_temp','loss_humid','loss_moist','true_temp','true_humid','true_moist',
                   'pred_temp','pred_humid','pred_moist','threshold_temp','threshold_humid','threshold_moist']

# Collected and concatenated once: DataFrame.append was removed in pandas 2.0
sensor_frames = []
for key in outlier_time.keys():
  time_period = outlier_time[key]

  for model, name in [(df_gru_loss_plot,'GRU AE'),(df_test_loss_deepant,'DeepAnT'),(df_transformer_loss,'Transformer')]:
      # .copy() so the new columns are added to an independent frame and not
      # to a slice of the model dataframe
      df_time = model[model.sensor_id == int(key)].copy()

      df_time['Outlier'] = df_time['euclidean_dist'] > df_time['threshold_euc']

      df_time['Model'] = name
      df_time['length_of_period'] = int(len(df_time))

      sensor_frames.append(df_time)

if sensor_frames:
  df_outliers = pd.concat(sensor_frames)
else:
  df_outliers = pd.DataFrame(columns=heatmap_columns)

df_outliers['thres_euc_dif'] = df_outliers['euclidean_dist'] - df_outliers['threshold_euc']

# Keep only the columns shared by all models; this also discards the
# model-specific 'index', 'time_periods' and 'time-period' columns
df_outliers = df_outliers[heatmap_columns]

sub_df_outliers = df_outliers[df_outliers['sensor_id']==sensor].copy()

# Count the outliers per model and calendar day
sub_df_outliers['Outlier'] = sub_df_outliers['Outlier'].astype(int)
sub_df_outliers['timestamp'] = pd.to_datetime(sub_df_outliers['timestamp'],infer_datetime_format=True,utc=True).dt.date
SUM = sub_df_outliers[['timestamp','Model','Outlier']].groupby(['Model','timestamp']).sum().reset_index()

# NOTE: this rebinds the notebook-level name `sensors`
sensors = sub_df_outliers['sensor_id'].unique()

z = SUM['Outlier'].values
date = SUM['timestamp'].values

fig = go.Figure(data=go.Heatmap(
        z=z,
        x=date,
        y=SUM['Model'].values,
        colorscale='OrRd'))

fig.update_layout(
    title='Anomalies per day on selected sensor',
    xaxis_nticks=36,
    #height = 700,
    width = 1920
    )

fig.update_layout(plot_bgcolor='rgb(255,247,236)')
fig.update_layout(xaxis_showgrid=False, yaxis_showgrid=False)
fig.show()