# Import modules

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F



# Encoder Decoder Attention Explained

- As we saw in the previous notebooks, there are Encoder-only Transformers and Decoder-only Transformers. <br>
Before those, the first Transformer ever made had one part called the Encoder, which used Self-Attention, and one part called the Decoder, which used Masked Self-Attention.<br>
These two parts were connected to each other so that they could calculate something called Encoder-Decoder Attention. <br>
Encoder-Decoder Attention uses the output of the Encoder to calculate the Keys and Values, while the Queries are calculated from the output of the Masked Self-Attention (Decoder). <br>
Once Q, K and V are calculated, Encoder-Decoder Attention is computed just like Self-Attention, using every similarity (i.e., without a mask).<br>
This first Transformer is based on something called Seq2Seq, or an Encoder-Decoder model. Seq2Seq models were designed to translate text from one language into another.<br>
Encoder-Decoder Attention is also called Cross-Attention.

- We can apply attention to the encoded values multiple times simultaneously when we want to work with longer sequences and understand how the words are related in these long sequences of words/tokens. Each attention unit is called a head and has its own set of weights for calculating Q, K and V. <br>
When we have multiple heads, we call it Multi-Head Attention.




# Code Attention

In [2]:
class Attention(nn.Module):
    """Single-head scaled dot-product attention.

    Depending on which encodings are supplied for the queries, keys and
    values, and on whether a mask is given, the same module acts as
    self-attention, masked self-attention or encoder-decoder (cross) attention.

    Args:
        d_model: Size of the last dimension of the input encodings; the
            query/key/value projections map ``d_model -> d_model`` without bias.
        row_dim_idx: Dimension swapped with ``col_dim_idx`` when the keys are
            transposed for the similarity product.
        col_dim_idx: Dimension along which the softmax is taken; the size of
            the keys along this dimension is also used for scaling.
    """

    def __init__(self, d_model=2, row_dim_idx=0, col_dim_idx=1):
        super().__init__()

        # Separate bias-free projections for queries, keys and values.
        self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)

        self.row_dim_idx = row_dim_idx
        self.col_dim_idx = col_dim_idx

    def forward(self, encodings_for_q, encodings_for_k, encodings_for_v, mask=None):
        """Compute attention output for the given encodings.

        Args:
            encodings_for_q: Encodings projected into queries.
            encodings_for_k: Encodings projected into keys.
            encodings_for_v: Encodings projected into values.
            mask: Optional boolean tensor broadcastable to the similarity
                matrix. Positions where it is ``True`` are filled with a large
                negative number before the softmax, so they receive
                (effectively) zero attention weight.

        Returns:
            The attention-weighted combination of the projected values.
        """
        q = self.W_q(encodings_for_q)
        k = self.W_k(encodings_for_k)
        v = self.W_v(encodings_for_v)

        # Similarity between every query and every key.
        sims = torch.matmul(q, k.transpose(self.row_dim_idx, self.col_dim_idx))

        # Scale by the square root of the key size along col_dim_idx.
        sims_scaled = sims / (k.size(self.col_dim_idx) ** 0.5)

        if mask is not None:
            # The keyword is `value` (singular); `values` raises a TypeError.
            sims_scaled = sims_scaled.masked_fill(mask, value=-1e9)

        attention_percents = F.softmax(sims_scaled, dim=self.col_dim_idx)

        return torch.matmul(attention_percents, v)
        
        
        


# Test Attention class

In [3]:
# Three tokens, each encoded as a 2-dimensional vector. The queries, keys and
# values all start from the same numbers, held in three separate tensors.
encodings_for_q = torch.tensor([
    [1.16, 0.23],
    [0.57, 1.36],
    [4.41, -2.16],
])
encodings_for_k = encodings_for_q.clone()
encodings_for_v = encodings_for_q.clone()

# Seed the random number generator before building the module so that its
# weights are reproducible.
torch.manual_seed(42)

# Build the attention module
attention = Attention(d_model=2, row_dim_idx=0, col_dim_idx=1)

# Run attention; Q, K and V come from identical encodings here.
attention(encodings_for_q, encodings_for_k, encodings_for_v)

tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<MmBackward0>)

# Multi-head Attention

In [4]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention built from independent ``Attention`` heads.

    Each head owns its own set of weights for Q, K and V. All heads receive
    the same inputs and their outputs are concatenated along ``col_dim_idx``.

    Args:
        d_model: Encoding size passed to every head.
        row_dim_idx: Row dimension index passed to every head.
        col_dim_idx: Column dimension index passed to every head; also the
            dimension along which the head outputs are concatenated.
        num_heads: Number of attention heads (default 1). Must be at least 1.

    Raises:
        ValueError: If ``num_heads`` is smaller than 1.
    """

    def __init__(self, d_model=2, row_dim_idx=0, col_dim_idx=1, num_heads=1):
        super().__init__()

        # With no heads there is nothing to concatenate in forward().
        if num_heads < 1:
            raise ValueError(f"num_heads must be >= 1, got {num_heads}")

        self.heads = nn.ModuleList(
            [Attention(d_model, row_dim_idx, col_dim_idx) for _ in range(num_heads)]
        )
        self.row_dim_idx = row_dim_idx
        self.col_dim_idx = col_dim_idx

    def forward(self, encodings_for_q, encodings_for_k, encodingds_for_v, mask=None):
        """Run every head on the same inputs and concatenate the results.

        Args:
            encodings_for_q: Encodings used by each head for the queries.
            encodings_for_k: Encodings used by each head for the keys.
            encodingds_for_v: Encodings used by each head for the values
                (the parameter name is kept as-is for backward compatibility).
            mask: Optional mask forwarded unchanged to every head; ``None``
                (the default) applies no masking.

        Returns:
            The head outputs concatenated along ``col_dim_idx``.
        """
        head_outputs = [
            head(encodings_for_q, encodings_for_k, encodingds_for_v, mask=mask)
            for head in self.heads
        ]
        return torch.cat(head_outputs, dim=self.col_dim_idx)
        

# Test MultiHead Attention class

In [5]:
# Fix the RNG seed so the head's weights are reproducible
torch.manual_seed(42)

# Build a multi-head attention module that contains a single head
multiHeadAttention = MultiHeadAttention(
    d_model=2,
    row_dim_idx=0,
    col_dim_idx=1,
    num_heads=1,
)

# Apply it to the token encodings created in the earlier test cell
multiHeadAttention(encodings_for_q, encodings_for_k, encodings_for_v)

tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<CatBackward0>)

In [6]:
# Fix the RNG seed so the weights of both heads are reproducible
torch.manual_seed(42)

# Build a multi-head attention module with two heads
multiHeadAttention = MultiHeadAttention(
    d_model=2,
    row_dim_idx=0,
    col_dim_idx=1,
    num_heads=2,
)

# Apply it to the same token encodings; the two head outputs are concatenated
# along the column dimension.
multiHeadAttention(encodings_for_q, encodings_for_k, encodings_for_v)

tensor([[ 1.0100,  1.0641, -0.7081, -0.8268],
        [ 0.2040,  0.7057, -0.7417, -0.9193],
        [ 3.4989,  2.2427, -0.7190, -0.8447]], grad_fn=<CatBackward0>)