In [2]:
import numpy as np
from numpy import ndarray,exp,dot
from typing import Dict

In [3]:
def Softm(S:ndarray)->ndarray:
    assert S.ndim == 1
                                      # The softmax function.
    Z = np.zeros([S.shape[0]])
    
    k = np.sum(exp(S))
    for l in range(len(S)):
        Z[l] = (exp(S[l]))/(k)
        
    return Z

In [4]:
def Sigmoid(S:ndarray)->ndarray:   # The sigmoid function.
    return (1/(1+np.exp(S)))

In [5]:
Softm(np.array([1,2,3]))

array([0.09003057, 0.24472847, 0.66524096])

In [19]:
# The attention Operation with trainable parameters. S is a matrix, each row represents a vector in the sequence.
# The attention operation works on a sequence of vectors and outputs a sequence of vectors of the same dimension.

def Attention(S:ndarray)->ndarray:                          
    
    Weights = Weights_attent(S)
    
    W_q = Weights['W_q']  # The query operation weight tensor. square matrix of order dimension of the vector.
    W_k = Weights['W_k']  # The key operation weight tensor.
    W_v = Weights['W_v']   # The value operation weight tensor.
                                  
           # Transforming the input tensor for the respective operations.
    
    Q = dot(S,W_q.transpose())  # Query 
    K = dot(S,W_k.transpose()) # The key  ->The input tensor * the transpose of the weight matrix
                                                    # to preserve the shape of the input tensor.
    V = dot(S,W_v.transpose()) #The value 
    
    # Weight matrix Creation.
    
    U = dot(Q,K.transpose())
    Z = np.zeros([U.shape[0],U.shape[1]])
    
    for l in range(U.shape[0]):    # Applying the Softmax function row wise.
        Z[l,:]  = Softm(U[l,:])
        
    W = Z/(S.shape[0]**.5)  # Scaling the Weight components.
        
    Output = dot(W,V)
    
    return Output

In [20]:
# the function creates the matrices for the attention operation.
def Weights_attent(S:ndarray)->Dict[str,ndarray]:
    dim_v = S.shape[1]  # the dimension of the vectors in the sequence.
    
    # Weight tensors are weight tensors of shape (dimension of the vectors, dimension of the vectors)
    W_q = np.random.randn(dim_v,dim_v)  # Weight matrix for the query operation. The matrix of transformation.
    W_k = np.random.randn(dim_v,dim_v) # Weight matrix for the key operation. matrix of transformation.
    W_v = np.random.randn(dim_v,dim_v)  # value operation. matrix operation.
    
    Weights:Dict[str,ndarray] = {'W_q':W_q,'W_k':W_k,'W_v':W_v}
        
    return Weights

In [21]:
S = np.random.random((4,3))

In [22]:
Attention(S)

array([[ 0.95543098, -0.43590239, -0.0254599 ],
       [ 0.92790575, -0.42375402, -0.02528112],
       [ 0.94025067, -0.43104245, -0.02707601],
       [ 0.95870903, -0.436121  , -0.02417617]])

In [41]:
# Attention Operation used With Sigmoid activation.
# I tried the attention operation with Sigmoid function.
def Attention_1(S:ndarray)->ndarray:
    Weights = Weights_attent(S)   # Weight tensors for the attention operation.
    
    W_q = Weights['W_q']   # Weights for the query operation.
    W_k = Weights['W_k']   # Weights for the key operation
    W_v = Weights['W_v']   # Weight matrix for the value operation
    
    # Transforming the input for the three operations.
    Q = dot(W_q,S.transpose())  # The query 
    K = dot(W_k,S.transpose())  # The key 
    V = dot(W_v,S.transpose())  # The value ->  The 3 linear transformations applied to the sequence of vectors.
    
    # Creating the weight matrix for the final transformation. 
    
    U = dot(Q,K.transpose())
            
    Z = np.zeros([U.shape[0],U.shape[1]])
    
    for n in range(Z.shape[0]):
        Z[n,:] = Sigmoid(U[n,:])   # Applying the sigmoid function row wise.
        
    W = Z/(S.shape[0]**.5)
    
    Output = dot(W,V)
    
    return Output
    
    
    
    
    

In [42]:
Attention_1(S)

array([[-0.10197852, -0.14918442, -0.08936732, -0.05045058],
       [-0.31142592, -0.58182358, -0.36499999, -0.22930438],
       [ 0.12356017,  0.30046345,  0.19568016,  0.13236449]])