In [3]:
# Implements a basic attention mechanism without using PyTorch

import numpy as np 

def linear_transform( X, W, b = None ):
  """
  Performs a linear transformation of the data: Y = XW + b
  :param X: Input data, NumPy array of shape (batch_size, ..., input_dim)
  :param W: Weight matrix, NumPy array of shape (input_dim, output_dim)
  :param b: Bias vector, NumPy array of shape (output_dim,) or None
  :return: NumPy array with the transformed data
  """
  Y = np.dot( X, W )
  if b is not None:
    Y += b
  return Y

def basic_attention( inputs, W1, b1, W2 ):
  """
  Implements a basic attention mechanism without using PyTorch.
  :param inputs: Input data, NumPy array of shape (batch_size, sequence_length, input_dim)
  :param W1: Weight matrix for the attention_weights_layer layer, of shape (input_dim, attention_dim)
  :param b1: Bias vector for the attention_weights_layer layer, of shape (attention_dim,)
  :param W2: Weight matrix for the context_vector_layer, of shape (attention_dim, 1)
  :return: Tuple of context_vector and attention_weights
  """
  # Step 1: Compute attention weights
  # Transform inputs to attention space 
  attention_scores = linear_transform( inputs, W1, b1 )
  # Apply non-linearity to attention scores ( tanh )
  attention_scores = np.tanh( attention_scores )
  # Compute attention scores for each input 
  attention_weights = linear_transform( attention_scores, W2 )
  # Apply softmax to obtain normalized attention weights
  attention_weights = np.exp( attention_weights - np.max( attention_weights, axis = 1, keepdims = True ) )
  attention_weights /= np.sum( attention_weights, axis = 1, keepdims = True )

  # Step 2: Apply attention weights to inputs to obtain context vector
  context_vector = np.sum( attention_weights * inputs, axis = 1 )

  return context_vector, attention_weights.squeeze( -1 )

# Dimension and parameters ( for testing )
batch_size = 1
sequence_length = 5
input_dim = 10
attention_dim = 20

# Initialization of parameters
W1 = np.random.rand( input_dim, attention_dim )
b1 = np.random.rand( attention_dim )
W2 = np.random.rand( attention_dim, 1 )

# Example input data ( random )
inputs = np.random.rand( batch_size, sequence_length, input_dim )

context_vector, attention_weights = basic_attention( inputs, W1, b1, W2 )

print( "Context Vector: ", context_vector ) 
print("")
print( "Attention Weights: ", attention_weights )

Context Vector:  [[0.36984988 0.61671825 0.31822509 0.60165173 0.5588019  0.35451691
  0.63199326 0.21645852 0.80540425 0.67249008]]

Attention Weights:  [[0.19937268 0.19023509 0.19817685 0.20136077 0.21085462]]


In [4]:
# Implementation of the basic attention mechanism without using PyTorch
import numpy as np

# Softmax function
def softmax( x ):
  # Subtract the max for numerical stability
  e_x = np.exp( x - np.max( x, axis = -1, keepdims = True ) )
  # Normalize
  return e_x / e_x.sum( axis = -1, keepdims = True )

class BasicAttentionManual:
  def __init__( self, input_dim, attention_dim ):
    # Random initialization of the weight matrices
    self.W_att = np.random.rand( input_dim, attention_dim )
    self.W_ctx = np.random.rand( attention_dim, 1 )

  def forward( self, inputs ):
    # Step 1: Compute attention weights
    # Transform inputs to attention space
    attention_scores = np.tanh( np.dot( inputs, self.W_att ) )
    # Compute attention scores for each input 
    attention_scores = np.dot( attention_scores, self.W_ctx )
    # Apply softmax to obtain normalized attention weights
    attention_weights = softmax( attention_scores.squeeze( -1 ) ).reshape( attention_scores.shape )

    # Step 2: Apply attention weights to inputs to obtain context vector
    context_vector = np.sum( attention_weights * inputs, axis = 1 )

    return context_vector, attention_weights.squeeze( -1 )
  
input_dim = 128 # Dimension of the input data
attention_dim = 64 # Dimension of the attention space
batch_size = 32 # Batch size
sequence_length = 10 # Sequence length

# example input data ( random ) 
inputs = np.random.rand( batch_size, sequence_length, input_dim )

attention = BasicAttentionManual( input_dim, attention_dim )
context_vector, attention_weights = attention.forward( inputs )

print( "Context Vector: ", context_vector )
print("")
print( "Attention Weights: ", attention_weights )

Context Vector:  [[0.47855559 0.50177232 0.28462834 ... 0.61515627 0.63151555 0.52124213]
 [0.54984241 0.50824661 0.452511   ... 0.44746394 0.39663317 0.48986191]
 [0.47734344 0.46148235 0.58379601 ... 0.50083603 0.43596871 0.49653411]
 ...
 [0.37177787 0.37541966 0.42256784 ... 0.7021436  0.62209681 0.38691504]
 [0.5438249  0.43593613 0.55316309 ... 0.63541769 0.4919232  0.62900256]
 [0.51594188 0.44686249 0.46423829 ... 0.48957873 0.4263816  0.54310004]]

Attention Weights:  [[0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]
 [0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]
 [0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]
 [0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]
 [0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]
 [0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]
 [0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]
 [0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]
 [0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]
 [0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]
 [0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]
 [0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]
 [