- An attention mechanism lets a model focus on the most relevant parts of the input when processing the data.   

#### Example task: translate the French sentence 'le chat est noir' into the English sentence 'the cat is black'    

- Here, the translation takes a total of five time steps (one per output word, plus the end-of-sentence token), and at each time step attention is applied by assigning a weight to every input word.
- The more important an input word is for the current step, the higher the weight it receives.

- The main reason for using an attention mechanism is to preserve the information from the first encoder cells, which would otherwise be lost by the time decoding takes place. To handle this, an attention weight is applied to every encoder output, so the decoder can draw on all of them rather than only on the final encoder state.

- *How important is the validity of the weights in the attention mechanism?* - Normally, it is OK to initialise them with random values, as gradient backpropagation takes care of correcting them during training.

**Luong attention**

*Calculating encoder hidden  state*

In [1]:
import torch 
import torch.nn as nn

In [2]:
from torch.nn.modules import dropout
class Encoder_LSTM(nn.Module):
  """Embed a sequence of token ids and encode it with a batch-first LSTM.

  Args:
    input_size: Number of rows in the embedding table (source vocabulary size).
    hidden_size: Width of both the embedding vectors and the LSTM hidden state.
    n_layers: Number of stacked LSTM layers.
    drop_prob: Dropout probability handed to ``nn.LSTM`` (PyTorch only
      applies it between layers, i.e. when ``n_layers > 1``).
  """

  def __init__(self, input_size, hidden_size, n_layers=1, drop_prob=0):
    # Zero-argument super() avoids repeating the class name; the previous
    # call referenced an undefined name and raised NameError on construction.
    super().__init__()
    self.hidden_size = hidden_size
    self.n_layers = n_layers
    self.embedding = nn.Embedding(input_size, hidden_size)
    self.lstm = nn.LSTM(hidden_size, hidden_size, n_layers, dropout=drop_prob, batch_first=True)

  def forward(self, inputs, hidden=None):
    """Encode ``inputs``.

    Args:
      inputs: Integer tensor of token ids, shape ``(batch, seq_len)``
        because the LSTM is created with ``batch_first=True``.
      hidden: Optional ``(h_0, c_0)`` tuple, each of shape
        ``(n_layers, batch, hidden_size)``. ``None`` lets ``nn.LSTM`` start
        from an all-zero state.

    Returns:
      ``(output, hidden)`` where ``output`` has shape
      ``(batch, seq_len, hidden_size)`` and ``hidden`` is the final
      ``(h_n, c_n)`` tuple.
    """
    embedded = self.embedding(inputs)
    output, hidden = self.lstm(embedded, hidden)
    return output, hidden

**Calculating alignment score** - alignment scores are calculated from the new decoder hidden state and the encoder hidden states.  
In Luong attention, alignment score can be calculated in three different ways:  
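1. Dot function - the dot product of the encoder hidden state and the decoder hidden state: $score = H_{decoder}^{\top} H_{encoder}$
2. General function - similar to the dot function, but a weight matrix is added to the equation: $score = H_{decoder}^{\top} W H_{encoder}$
3. Concat function - $score = W_2 \tanh(W_1 [H_{decoder}; H_{encoder}])$, where $[\,\cdot\,;\,\cdot\,]$ denotes concatenation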

In [3]:
class Loung_attention_layer(nn.Module):
  """Luong attention: turn a decoder state into weights over encoder outputs.

  Tensors follow the batch-first layout used by the encoder in this file:
  ``hidden`` is ``(batch, 1, hidden_size)`` and ``encoder_outputs`` is
  ``(batch, seq_len, hidden_size)``.

  Args:
    method: Scoring function, one of ``'dot'``, ``'general'`` or ``'concat'``.
    hidden_size: Width of the encoder/decoder hidden states.

  Raises:
    ValueError: If ``method`` is not one of the supported scoring functions.
  """

  _METHODS = ('dot', 'general', 'concat')

  def __init__(self, method, hidden_size):
    super().__init__()
    if method not in self._METHODS:
      raise ValueError(
          f"{method!r} is not the correct attention method; "
          f"expected one of {self._METHODS}")
    self.method = method
    self.hidden_size = hidden_size

    if method == 'general':
      self.attn = nn.Linear(hidden_size, hidden_size)
    elif method == 'concat':
      self.attn = nn.Linear(hidden_size * 2, hidden_size)
      # torch.FloatTensor(n) returns uninitialised memory (possibly NaN/inf),
      # so initialise explicitly with the same bound nn.Linear uses.
      self.weight = nn.Parameter(torch.empty(hidden_size))
      bound = hidden_size ** -0.5
      nn.init.uniform_(self.weight, -bound, bound)

  def get_dot_score(self, hidden, encoder_outputs):
    """Dot-product score; ``hidden`` broadcasts along the sequence axis."""
    return torch.sum(hidden * encoder_outputs, dim=2)

  def get_general_score(self, hidden, encoder_outputs):
    """Dot-product score after a learned linear map of the encoder outputs."""
    energy = self.attn(encoder_outputs)
    return torch.sum(hidden * energy, dim=2)

  def get_concat_score(self, hidden, encoder_outputs):
    """Score ``weight . tanh(attn([hidden ; encoder_output]))`` per position."""
    # expand_as repeats the single decoder state for every encoder position
    # so the two tensors can be concatenated along the feature axis.
    concat = torch.cat((hidden.expand_as(encoder_outputs), encoder_outputs), dim=2)
    energy = torch.tanh(self.attn(concat))
    return torch.sum(self.weight * energy, dim=2)

  def forward(self, hidden, encoder_outputs):
    """Return attention weights of shape ``(batch, 1, seq_len)``.

    The weights are softmax-normalised over the sequence axis, so each row
    sums to one and can be fed straight into ``torch.bmm`` with the encoder
    outputs to build the context vector.
    """
    if self.method == 'dot':
      attn_energy = self.get_dot_score(hidden, encoder_outputs)
    elif self.method == 'general':
      attn_energy = self.get_general_score(hidden, encoder_outputs)
    else:  # 'concat' -- the constructor rejects anything else
      attn_energy = self.get_concat_score(hidden, encoder_outputs)

    # Scores are already (batch, seq_len) for batch-first inputs, so no
    # transpose is needed before normalising over the sequence axis.
    return torch.softmax(attn_energy, dim=1).unsqueeze(1)

In [4]:
class Luong_Decoder(nn.Module):
  """Single-step LSTM decoder that attends over the encoder outputs.

  Args:
    hidden_size: Width of the embeddings and of the LSTM hidden state.
    output_size: Target vocabulary size (embedding rows and classifier width).
    attention: Callable ``attention(decoder_state, encoder_outputs)`` that
      returns already-normalised attention weights with ``seq_len`` elements
      (e.g. shape ``(1, 1, seq_len)``).
    n_layers: Number of stacked LSTM layers.
    drop_prob: Dropout probability applied to the embedded input token.
  """

  def __init__(self, hidden_size, output_size, attention, n_layers=1, drop_prob=0.1):
    # Zero-argument super() avoids repeating the class name; the previous
    # call referenced an undefined name and raised NameError on construction.
    super().__init__()
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.n_layers = n_layers
    self.drop_prob = drop_prob

    self.attention = attention
    self.embedding = nn.Embedding(self.output_size, self.hidden_size)
    self.dropout = nn.Dropout(self.drop_prob)
    # Honour n_layers so a multi-layer encoder state can be passed in as the
    # initial decoder state (the default of 1 is unchanged).
    self.lstm = nn.LSTM(self.hidden_size, self.hidden_size, self.n_layers)
    # The classifier sees the LSTM output concatenated with the context vector.
    self.classifier = nn.Linear(self.hidden_size * 2, self.output_size)

  def forward(self, inputs, hidden, encoder_outputs):
    """Decode one token.

    Args:
      inputs: Tensor holding a single token id (reshaped to ``(1, 1, -1)``
        after embedding, so batch size and step count are both one).
      hidden: ``(h, c)`` LSTM state, each ``(n_layers, 1, hidden_size)``.
      encoder_outputs: Encoder outputs, shape ``(1, seq_len, hidden_size)``.

    Returns:
      ``(output, hidden, attn_weights)``: log-probabilities over the target
      vocabulary with shape ``(1, output_size)``, the updated LSTM state, and
      the attention weights with shape ``(1, seq_len)``.
    """
    embedded = self.dropout(self.embedding(inputs).view(1, 1, -1))

    # New decoder hidden state for this step.
    lstm_out, hidden = self.lstm(embedded, hidden)

    # The attention module already returns softmax-normalised weights;
    # normalising them a second time would flatten the distribution.
    attn_weights = self.attention(lstm_out, encoder_outputs).view(1, -1)

    # Context vector: attention-weighted sum of the encoder outputs.
    context_vector = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs)

    # Final decoder output: LSTM output concatenated with the context vector,
    # projected onto the vocabulary.
    output = torch.cat((lstm_out, context_vector), -1)
    output = torch.log_softmax(self.classifier(output[0]), dim=1)

    return output, hidden, attn_weights

Reference: https://www.kaggle.com/code/tientd95/understanding-attention-in-neural-network

https://blog.floydhub.com/attention-mechanism/