In [None]:

import torch

In [None]:
class SimpleTemporalFusionTransformer(torch.nn.Module):
    """A deliberately small Temporal-Fusion-Transformer-style forecaster.

    Data flows through the network in this order:

    1. A linear layer projects the static inputs into the hidden dimension.
       The projection is used as the initial hidden *and* cell state of the
       encoder LSTM.
    2. The encoder LSTM reads the past observed and past known inputs.
    3. The decoder LSTM reads the future known inputs, starting from the
       final state of the encoder.
    4. Multi-head self-attention is applied over the concatenated encoder and
       decoder outputs (no attention mask is supplied).
    5. A linear output layer maps the attention outputs that belong to the
       prediction horizon onto the target dimension.

    All sequence tensors are expected batch-first.
    """

    def __init__(self, known_input_dimension, observed_input_dimension,
                 static_input_dimension, hidden_dimension, target_dimension,
                 num_heads=1):
        """Build the layers of the model.

        Args:
            known_input_dimension: number of features of the known inputs
                (available for both the past and the future).
            observed_input_dimension: number of features of the observed
                inputs (available for the past only).
            static_input_dimension: number of features of the static inputs.
            hidden_dimension: width of the model (LSTM hidden size and
                attention embedding size).
            target_dimension: number of target variables predicted per step.
            num_heads: number of attention heads. Defaults to 1, the value
                that was previously hard-coded. It must divide
                ``hidden_dimension``; ``torch.nn.MultiheadAttention`` rejects
                other values.
        """
        super().__init__()

        # Static inputs are projected into the hidden dimension so they can
        # seed the state of the encoder LSTM.
        self.static_input_embedding = torch.nn.Linear(
            static_input_dimension, hidden_dimension
        )

        # The encoder sees everything that is available for the past:
        # known and observed inputs concatenated along the feature axis.
        self.encoder_lstm = torch.nn.LSTM(
            input_size=known_input_dimension + observed_input_dimension,
            hidden_size=hidden_dimension,
            batch_first=True,
        )

        # The decoder only sees the inputs that are known for the future.
        self.decoder_lstm = torch.nn.LSTM(
            input_size=known_input_dimension,
            hidden_size=hidden_dimension,
            batch_first=True,
        )

        # Trainable weighting of the time steps against each other.
        self.multi_head_attention = torch.nn.MultiheadAttention(
            embed_dim=hidden_dimension,
            num_heads=num_heads,
            batch_first=True,
        )

        # Final projection from the hidden dimension onto the targets.
        self.output_layer = torch.nn.Linear(hidden_dimension, target_dimension)

        # Attention weights of the most recent forward pass; populated by
        # ``forward`` and ``None`` until the model has been called once.
        self.attention_weights = None

    def forward(self,
                past_observed_inputs,
                past_known_inputs,
                future_known_inputs, static_inputs):
        """Run one forward pass.

        Args:
            past_observed_inputs: batch size x lookback x observed features.
            past_known_inputs: batch size x lookback x known features.
            future_known_inputs: batch size x prediction horizon x known
                features.
            static_inputs: batch size x static features.

        Returns:
            Tensor of shape batch size x prediction horizon x target
            dimension.

        Side effects:
            Stores the attention weights returned by the attention layer in
            ``self.attention_weights``.
        """
        # Everything available for the past, joined along the feature axis
        # (observed features first, then known features).
        full_past_inputs = torch.cat(
            (past_observed_inputs, past_known_inputs), dim=-1
        )

        # Embed the static inputs into the model dimension and add the
        # leading "layers" axis the LSTM expects for its state:
        # (batch, hidden) -> (1, batch, hidden).
        embedded_static_inputs = self.static_input_embedding(static_inputs)
        initial_state = embedded_static_inputs.unsqueeze(0)

        # torch.nn.LSTM documents its initial state as an (h_0, c_0) tuple;
        # the static embedding is used for both.
        encoder_init_hidden_state = (initial_state, initial_state)

        # Temporal embedding of the past, conditioned on the static inputs.
        past_input_temporal_embedding, encoder_hidden_state = self.encoder_lstm(
            full_past_inputs,
            encoder_init_hidden_state,
        )

        # The decoder continues from the final encoder state.
        future_input_temporal_embedding, decoder_hidden_state = self.decoder_lstm(
            future_known_inputs,
            encoder_hidden_state,
        )

        # Join past and future embeddings along the time axis. The result has
        # shape batch size x (lookback + prediction horizon) x hidden dimension.
        combined_temporal_input_embeddings = torch.cat(
            (past_input_temporal_embedding, future_input_temporal_embedding),
            dim=1,
        )

        # Self-attention: the combined embeddings act as query, key and value.
        multi_head_attention_output, self.attention_weights = self.multi_head_attention(
            combined_temporal_input_embeddings,
            combined_temporal_input_embeddings,
            combined_temporal_input_embeddings,
        )

        # The prediction (forecast) horizon is the number of future time
        # steps; it is read off the time axis of the future known inputs.
        prediction_horizon = future_known_inputs.shape[1]

        # Keep only the attention outputs that belong to the future steps,
        # i.e. the last `prediction_horizon` positions of the sequence.
        future_attention_output = multi_head_attention_output[:, -prediction_horizon:, :]

        # Map the hidden representation of each future step onto the targets.
        final_output = self.output_layer(future_attention_output)

        return final_output

In [None]:
# Generate some fake data to test this basic neural network.

# Seed the global RNG so that a fresh "Restart & Run All" reproduces the same
# tensors (and the same random draws in every cell that follows).
SEED = 42
torch.manual_seed(SEED)

# Dimensions of the (fake) data set
LOOKBACK_HORIZON = 365  # how many days we look back for our observed input variables

PREDICTION_HORIZON = 7  # the number of future predictions to make

KNOWN_INPUT_DIMENSION = 2  # the dimension of the known inputs

OBSERVED_INPUT_DIMENSION = 2  # the dimension of the observed (unknown) inputs

STATIC_INPUT_DIMENSION = 3  # the dimension of the static inputs

TARGET_DIMENSION = 1  # the dimension of the target variables

BATCH_SIZE = 100  # we choose a batch size

# Constructing the data: a standard normal distribution is used for the sake of ease.

# Past inputs
past_observed_inputs_data = torch.normal(0, 1, (BATCH_SIZE, LOOKBACK_HORIZON, OBSERVED_INPUT_DIMENSION))
past_known_inputs_data = torch.normal(0, 1, (BATCH_SIZE, LOOKBACK_HORIZON, KNOWN_INPUT_DIMENSION))
static_inputs_data = torch.normal(0, 1, (BATCH_SIZE, STATIC_INPUT_DIMENSION))

# Future inputs
future_known_inputs_data = torch.normal(0, 1, (BATCH_SIZE, PREDICTION_HORIZON, KNOWN_INPUT_DIMENSION))

# Future targets
future_targets_data = torch.normal(0, 1, (BATCH_SIZE, PREDICTION_HORIZON, TARGET_DIMENSION))

In [None]:
# Constructing the model from the class, together with its optimizer and loss.
HIDDEN_DIMENSION = 30

# The input dimensions are read off the last (feature) axis of the data tensors.
model = SimpleTemporalFusionTransformer(
    known_input_dimension=past_known_inputs_data.size(-1),
    observed_input_dimension=past_observed_inputs_data.size(-1),
    static_input_dimension=static_inputs_data.size(-1),
    hidden_dimension=HIDDEN_DIMENSION,
    target_dimension=TARGET_DIMENSION,
)

# Optimizer (torch.optim)
# -----------------------
# The optimizer object holds the current optimisation state and updates the
# parameters based on the computed gradients.
#   * parameters: the trainable variables of the model, i.e. model.parameters()
#   * optimizer-specific options: learning rate (0.01 here), weight decay, ...
# General example: optimizer = optim.Adam([var1, var2], lr=0.0001)
#
# Adam is an extension of stochastic gradient descent, see
# https://machinelearningmastery.com/adam-optimization-algorithm-for-deep-learning/
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Loss function
# -------------
# A loss function gauges the error between the prediction output and the
# provided target value, i.e. how far the model is from the expected outcome.
# If the deviation is small or the values are nearly identical, the loss is
# very low, so the chosen loss must penalise the model properly while it is
# training on the provided data set.
# We may need to build our own loss function in the future.
loss_function = torch.nn.MSELoss()

In [None]:
"""One training Epoch"""

"""What is an Epoch:
An epoch is one complete pass of the machine learning algorithm through the training data.
One epoch means that each sample in the training dataset has had an opportunity
to update the internal model parameters. An epoch consists of one or more batches.

The number of epochs is a hyperparameter that defines the number times that the 
learning algorithm will work through the entire training dataset. """

#What is a batch

"""

The batch size is a hyperparameter that defines the number of samples to work
 through before updating the internal model parameters.

Think of a batch as a for-loop iterating over one or more samples and making 
predictions. At the end of the batch, the predictions are compared to the 
expected output variables and an error is calculated. From this error, the 
update algorithm is used to improve the model, e.g. move down along the error
 gradient.

A training dataset can be divided into one or more batches.

When all training samples are used to create one batch, the learning algorithm 
is called batch gradient descent. When the batch is the size of one sample,
 the learning algorithm is called stochastic gradient descent. When the batch 
 size is more than one sample and less than the size of the training dataset, 
 the learning algorithm is called mini-batch gradient descent.
"""

# One training step: run the data through the model, measure the error and
# update the parameters.

# Call the module itself rather than `model.forward(...)`: calling the module
# runs any registered hooks, whereas a direct `forward` call skips them.
model_output = model(
    past_observed_inputs=past_observed_inputs_data,
    past_known_inputs=past_known_inputs_data,
    future_known_inputs=future_known_inputs_data,
    static_inputs=static_inputs_data,
)

# MSE loss between the predictions and the future targets.
loss = loss_function(model_output, future_targets_data)

# Clear the gradients left over from any previous step. The gradient is the
# change in the loss with respect to a change in each weight.
optimizer.zero_grad()

# Backpropagation: propagate the loss backwards to compute the gradients.
loss.backward()

# Gradient-descent step: the optimizer moves the parameters towards a local
# minimum of the loss.
optimizer.step()








                            


tensor([[[ 0.4802, -0.4375, -0.3822,  ..., -0.9489,  0.4556, -0.8693],
         [-0.1278, -0.7987, -0.0592,  ..., -0.6485,  0.2933, -0.6113],
         [ 1.1337,  0.1820,  0.0309,  ..., -1.2703,  0.6236, -0.5927],
         ...,
         [-0.4725, -0.2154, -0.5835,  ..., -0.3452,  0.3333, -0.3850],
         [-1.7562, -0.6300,  0.2834,  ...,  0.3286,  0.0258,  0.4676],
         [-0.4070, -0.5647,  0.6057,  ..., -0.4770,  0.2449,  0.0276]]],
       grad_fn=<UnsqueezeBackward0>)


In [None]:
# Showing the training of the data - visually.

# tqdm is a Python library for adding a progress bar; it lets you configure
# and display a progress bar with the metrics you want to track.
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
# IPython is an interactive shell built with Python; it provides a more useful
# environment to execute Python code in a REPL (Read Eval Print Loop).
from IPython import display

k = 200  # number of epochs shown on the progress bar

# NOTE(review): the loop body performs no training step yet; iterating over
# the bar only advances it by one per epoch.
with tqdm(range(k), desc="Training Epoch") as progress:
    for epoch in progress:
        pass


Training Epoch: 100%|██████████| 200/200 [00:00<00:00, 479623.10it/s]


In [None]:
# Display the simulated past observed inputs as the cell output; the tensor
# repr abbreviates the omitted entries with "...".
past_observed_inputs_data


tensor([[[ 0.6981,  0.6630],
         [-0.9645,  0.3983],
         [-0.2106,  0.6476],
         ...,
         [ 0.8924, -0.7843],
         [ 0.9008, -0.9925],
         [ 0.3089,  0.7778]],

        [[-1.4222,  0.3189],
         [ 0.7543, -0.3861],
         [ 2.2447, -1.0362],
         ...,
         [-0.3972, -1.2914],
         [-0.6044, -0.4934],
         [ 0.9957,  0.7323]],

        [[ 0.9141,  1.2325],
         [ 0.9302, -0.1430],
         [-1.0859, -0.5657],
         ...,
         [-1.2451,  0.3230],
         [ 1.5205,  0.8938],
         [ 0.3455,  0.2253]],

        ...,

        [[-0.6999, -0.2559],
         [-0.8430, -0.5155],
         [-0.1755,  0.5163],
         ...,
         [ 0.0453,  1.0072],
         [-0.4293,  1.3060],
         [-0.1528, -0.3547]],

        [[ 0.6810, -0.5732],
         [ 0.7327, -0.0349],
         [ 1.4500,  1.1789],
         ...,
         [ 0.6896, -0.7359],
         [ 1.0039, -1.6875],
         [ 0.4078,  0.3587]],

        [[ 0.6747, -0.3173],
       