**Based on the main model (which predicts the occurrence of anomalies in the industrial plant), another prediction model will be created to predict how long the anomaly will last**
  - This model is used after the main model prediction
  - This anomaly time prediction happens 63 minutes (previously determined) before the detection of a possible anomaly

**Import of Modules**

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, ReLU, Activation
from tensorflow.python.client import device_lib

**For Import of Custom Modules in Google Colab**
  - Set the variable path_of_PS_IndustriALL (str) below to the directory that contains the PS_IndustriALL folder

In [2]:
# Directory that contains the PS_IndustriALL project folder (adjust to your own Drive layout).
path_of_PS_IndustriALL = "/content/drive/MyDrive"

# Put the project's utilities folder first on the module search path so the
# custom modules below can be imported by their bare module names.
custom_modules_dir = os.path.join(path_of_PS_IndustriALL, "PS_IndustriALL/src/ultlities_modules")
sys.path.insert(0, custom_modules_dir)

from note_taker import NoteTaker
from ultlities import test

**Object to Write Important Information to a Text File:**

In [3]:
# Collector for the run's notes: entries are added with note_taker.write_line(name, value)
# throughout the notebook and written to a text file by note_taker.save(...) in the last cell.
note_taker = NoteTaker()

**Load the data from the csv file and set some hyperparameters:**

In [4]:
# Read the anomaly-time dataset from the project's data folder
csv_path = os.path.join(path_of_PS_IndustriALL, "PS_IndustriALL/src/data/anomaly_time_data.csv")
dataframe_data = pd.read_csv(csv_path)

# Record the number of rows and show the raw table before any column is removed
data_size = dataframe_data.shape[0]
note_taker.write_line("Total amount data", data_size)
display(dataframe_data)

# The timestamp column is discarded; every remaining column is fed to the model as an input
dataframe_data.drop(columns=["timestamp"], inplace=True)
number_input = dataframe_data.shape[1]
note_taker.write_line("Number of Inputs", number_input)

# Hyperparameters
analysis_period = 10  # number of consecutive rows that form one input window
predict_minute = 63  # minute to make the prediction, selected in data_preprocessing.ipynb

Total amount data: 119103


Unnamed: 0,timestamp,anomaly_time,target_iALL_PS,TAG_iALL_PS_00,TAG_iALL_PS_04,TAG_iALL_PS_05,TAG_iALL_PS_06,TAG_iALL_PS_08,TAG_iALL_PS_09,TAG_iALL_PS_10,TAG_iALL_PS_11,TAG_iALL_PS_12,TAG_iALL_PS_13,TAG_iALL_PS_28,TAG_iALL_PS_37,TAG_iALL_PS_48,TAG_iALL_PS_50
0,2018-04-01 00:00:00,0,0.0,4.548754,1011.733181,97.284889,33.248746,29.262578,43.896138,71.130001,123.338718,58.828461,9.271792,1463.647280,156.908398,310.022461,426.651658
1,2018-04-01 00:01:00,0,0.0,7.887998,1358.466600,202.583688,32.494870,61.038519,26.967553,98.697513,74.897785,63.746416,7.989979,1437.274642,160.845276,306.084796,375.316113
2,2018-04-01 00:02:00,0,0.0,4.975919,1056.489015,147.098428,36.402837,30.304666,45.272021,39.329786,62.800193,65.154836,8.771413,1565.113515,157.841402,353.863854,444.809188
3,2018-04-01 00:03:00,0,0.0,6.304142,1619.924847,307.722320,34.283344,44.859311,11.692485,126.751846,96.756731,86.551891,9.234399,1966.270851,156.106507,301.563110,414.052496
4,2018-04-01 00:04:00,0,0.0,1.671733,591.648283,-7.684779,30.980682,1.229287,43.643912,41.526627,124.592349,83.649076,9.101509,1396.339609,163.462727,298.957820,431.548430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119098,2018-07-09 12:39:00,0,0.0,5.191826,1963.160736,236.208078,28.848354,12.315209,16.505811,80.550983,101.501932,36.386412,6.794132,1479.371410,86.868269,301.394243,1170.190231
119099,2018-07-09 12:40:00,0,0.0,6.849015,1136.882850,110.471072,23.773084,44.446469,31.571878,135.584829,107.572776,47.251239,7.719693,2012.456761,110.493838,261.988969,1249.646796
119100,2018-07-09 12:41:00,0,0.0,2.832383,383.606596,151.194093,41.133065,28.270642,36.124331,55.090741,114.515366,69.833234,7.779601,1886.055740,92.372019,344.363066,1186.408225
119101,2018-07-09 12:42:00,0,0.0,6.284776,1081.530595,106.541216,31.526591,6.449825,17.299298,131.685119,3.078141,76.481285,7.962738,1699.671456,106.665885,392.541203,1247.186205


Number of Inputs: 16


**Split Dataset, According to Predict Minutes:**
  - 80% Train
  - 10% Validation
  - 10% Test

In [5]:
# Input and output data.
# shift(-predict_minute) moves every row up by predict_minute positions, so row i
# of output_data holds the values recorded predict_minute rows after row i of
# input_data. The trailing rows that have no future value become NaN and are dropped.
# NOTE(review): this assumes the CSV itself has no NaN, so that dropna() removes
# only those trailing rows and both frames stay aligned row by row - confirm.
input_data = dataframe_data.iloc[:-predict_minute]
output_data = dataframe_data.shift(-predict_minute).dropna()

# Size of each split
train_size = int(0.8 * data_size)
val_size = int(0.1 * data_size)
test_size = data_size - train_size - val_size

# Split boundaries, as row positions
val_end = train_size + val_size
# The test split starts analysis_period rows early so that the first test
# target already has a full input window in front of it.
test_start = val_end - analysis_period

# Split the dataset.
# Positional half-open slices (.iloc) are used on purpose: label slices (.loc)
# include the end label, which gave the train and validation splits one extra
# row each and placed row `train_size` in both of them.
input_train_data = input_data.iloc[:train_size].to_numpy()
input_val_data = input_data.iloc[train_size:val_end].to_numpy()
input_test_data = input_data.iloc[test_start:].to_numpy()

# Only the anomaly duration column is predicted
output_train_data = output_data["anomaly_time"].iloc[:train_size].to_numpy()
output_val_data = output_data["anomaly_time"].iloc[train_size:val_end].to_numpy()
output_test_data = output_data["anomaly_time"].iloc[test_start:].to_numpy()


**Dataset Normalization:**

In [6]:
# Min-max scalers: the smallest training value maps to 0 and the largest to 1
input_scaler = MinMaxScaler(feature_range=(0, 1))
output_scaler = MinMaxScaler(feature_range=(0, 1))

# Inputs: learn the scaling from the training split only, then apply that same
# scaling to the three splits.
input_scaler.fit(input_train_data)
input_train_data, input_val_data, input_test_data = (
  input_scaler.transform(split)
  for split in (input_train_data, input_val_data, input_test_data)
)

# Outputs: the targets are 1-D, so each split is first reshaped into a single
# column (second dimension equal to 1) before being handed to the scaler.
output_train_data = output_train_data.reshape(-1, 1)
output_val_data = output_val_data.reshape(-1, 1)
output_test_data = output_test_data.reshape(-1, 1)

output_scaler.fit(output_train_data)
output_train_data, output_val_data, output_test_data = (
  output_scaler.transform(split)
  for split in (output_train_data, output_val_data, output_test_data)
)


**Generating Input and Output**

In [7]:
def _make_windows(features, targets, stop):
  """Build the model samples of one split.

  For every i in range(analysis_period, stop) the sample is the block of rows
  features[i - analysis_period:i] and its label is targets[i].
  Returns (samples, labels), both converted to numpy arrays.
  """
  steps = range(analysis_period, stop)
  samples = np.array([features[i - analysis_period:i] for i in steps])
  labels = np.array([targets[i] for i in steps])
  return samples, labels


# train:
input_train, output_train = _make_windows(input_train_data, output_train_data, train_size)

# validation:
input_val, output_val = _make_windows(input_val_data, output_val_data, val_size)

# test: uses every available target of the test split
input_test, output_test = _make_windows(input_test_data, output_test_data, len(output_test_data))

**Construction of the Predictive Anomaly Time Model Architecture**

In [8]:
# Sequential regressor: two stacked LSTM layers of 100 units each, then one
# dense output unit passed through a ReLU.
#   - return_sequences=True: the first LSTM emits an output for every step of
#     the analysis_period window, which the second LSTM consumes
#   - return_sequences=False: the second LSTM emits a single output per window
anomaly_time_model = Sequential([
  LSTM(100, return_sequences=True, input_shape=(analysis_period, number_input)),
  LSTM(100, return_sequences=False),
  Dense(1),
  Activation('relu'),
])

# Mean squared error as the loss function, Adam as the optimizer
anomaly_time_model.compile(loss='mse', optimizer='adam')

anomaly_time_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 10, 100)           46800     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
 activation (Activation)     (None, 1)                 0         
                                                                 
Total params: 127301 (497.27 KB)
Trainable params: 127301 (497.27 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


**Checking if GPU is Available**

  - In Google Colab access: Runtime -> Notebook Setting

In [9]:
# Ask TensorFlow for the GPU device name; anything other than the first GPU
# makes the notebook fall back to the CPU device.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print(f'Found GPU at: {device_name}')
else:
  device_name = "/device:CPU:0"
  print(f'Not Found GPU, device at in CPU: {device_name}')

Found GPU at: /device:GPU:0


**Model Training:**

In [None]:
with tf.device(device_name): # train on the device selected above (GPU when found, otherwise CPU)

  # weights folder
  path_of_weights = os.path.join(path_of_PS_IndustriALL, "PS_IndustriALL/src/weights")
  # Create the folder up front: otherwise a missing folder would only be
  # noticed when saving, after the whole training run has already been spent.
  os.makedirs(path_of_weights, exist_ok=True)

  # hyperparameters of training
  epochs = 3
  batch_size = 32

  # model training, evaluated on the validation split after every epoch
  train_result = anomaly_time_model.fit(input_train, output_train, validation_data = (input_val, output_val), epochs=epochs, batch_size=batch_size)

  # save the trained weights in the weights folder
  anomaly_time_model.save_weights(os.path.join(path_of_weights, "anomaly_time_weights.h5"))

  # Plot the training and validation loss curves
  plt.plot(train_result.history["loss"], label='train_loss')
  plt.plot(train_result.history["val_loss"], label='val_loss')
  plt.legend()
  plt.show()

**Or Load an Already Trained Model**

In [10]:
# Folder where the trained weights are stored
path_of_weights = os.path.join(path_of_PS_IndustriALL, "PS_IndustriALL/src/weights")

# Restore the saved weights into the model built above
# (these weights refer to a training of 72 epochs)
weights_file = os.path.join(path_of_weights, "anomaly_time_weights.h5")
anomaly_time_model.load_weights(weights_file)

**Test of Model with Test Dataset**
  - The metric reported by `test` is `mean_error` (the model is evaluated with `binary_classification=False`, so precision does not apply here)

In [11]:
# Evaluate the model on the test samples, on the same device used for training.
# binary_classification=False: the model is not evaluated as a binary classifier.
with tf.device(device_name):
  test_result = test(
    anomaly_time_model,
    input_test,
    output_test,
    analysis_period,
    number_input,
    binary_classification=False,
  )

**Results**

In [12]:
# Copy every entry of the test result into the notes
for metric_name, metric_value in test_result.items():
  note_taker.write_line(metric_name, metric_value)

total_amount: 11848
mean_error: [1.]


**Save the Annotations**

In [13]:
# Write the collected notes to train_anomaly_time_model.txt in the project's annotations folder
annotations_dir = os.path.join(path_of_PS_IndustriALL, "PS_IndustriALL/src/annotations")
note_taker.save("train_anomaly_time_model.txt", annotations_dir)