<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Attention_Mechanism.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random  # Import the random module

# Define the encoder
class Encoder(nn.Module):
    """Embed source token ids and encode them with a stacked LSTM.

    Args:
        input_dim: Number of entries in the embedding table (source
            vocabulary size).
        embed_dim: Width of each token embedding.
        hidden_dim: Width of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_dim, embed_dim, hidden_dim, num_layers):
        super().__init__()
        # Embedding is created before the LSTM so parameter initialisation
        # consumes the global RNG in a fixed order.
        self.embedding = nn.Embedding(
            num_embeddings=input_dim,
            embedding_dim=embed_dim,
        )
        self.rnn = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, x):
        """Encode a batch of token-id sequences.

        Args:
            x: Integer token ids. Because the LSTM is built with
                ``batch_first=True`` the batch axis comes first.

        Returns:
            A tuple ``(outputs, hidden, cell)``: the LSTM output for every
            time step plus the final hidden and cell states.
        """
        outputs, final_state = self.rnn(self.embedding(x))
        hidden, cell = final_state
        return outputs, hidden, cell

# Define the attention mechanism
class Attention(nn.Module):
    """Additive attention over encoder outputs.

    Each encoder time step is scored against the top-layer decoder hidden
    state, and the scores are normalised with a softmax over the source
    positions.

    Args:
        hidden_dim: Output width of the scoring projection and length of the
            learned vector ``v``. The concatenated (hidden, encoder output)
            features fed to the projection must have width ``2 * hidden_dim``.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(2 * hidden_dim, hidden_dim)
        # Learned vector that collapses each energy vector to a scalar score.
        self.v = nn.Parameter(torch.rand(hidden_dim))

    def forward(self, encoder_outputs, hidden):
        """Return attention weights for every source position.

        Args:
            encoder_outputs: 3-D tensor whose second axis indexes source
                positions (batch first).
            hidden: Stacked hidden states; only the last entry along the
                first axis (``hidden[-1]``) is used.

        Returns:
            Tensor of shape ``(batch, src_len)`` whose rows sum to 1.
        """
        num_positions = encoder_outputs.size(1)
        top_layer_state = hidden[-1]
        # Copy the decoder state once per source position so it can be
        # concatenated with the encoder outputs along the feature axis.
        tiled_state = top_layer_state.unsqueeze(1).repeat(1, num_positions, 1)
        features = torch.cat((tiled_state, encoder_outputs), dim=2)
        energy = torch.tanh(self.attn(features))
        scores = (self.v * energy).sum(dim=2)
        return torch.softmax(scores, dim=1)

# Define the decoder with attention
class Decoder(nn.Module):
    """Single-step LSTM decoder that attends over the encoder outputs.

    Args:
        output_dim: Target vocabulary size (embedding rows and number of
            output logits).
        embed_dim: Width of each target token embedding.
        hidden_dim: Width of the LSTM state and of the attention context.
        num_layers: Number of stacked LSTM layers.
        attention: Callable invoked as ``attention(encoder_outputs, hidden)``
            that returns one weight per source position.
    """

    def __init__(self, output_dim, embed_dim, hidden_dim, num_layers, attention):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, embed_dim)
        # The LSTM consumes the token embedding concatenated with the context.
        self.rnn = nn.LSTM(
            input_size=embed_dim + hidden_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        # The output layer sees LSTM output, context and embedding together.
        self.fc_out = nn.Linear(embed_dim + 2 * hidden_dim, output_dim)
        self.attention = attention

    def forward(self, x, encoder_outputs, hidden, cell):
        """Decode one time step.

        Args:
            x: Token ids for the current step, one per batch element.
            encoder_outputs: Encoder outputs to attend over (batch first).
            hidden: Current LSTM hidden state.
            cell: Current LSTM cell state.

        Returns:
            A tuple ``(prediction, hidden, cell)``: logits over the target
            vocabulary and the updated LSTM states.
        """
        # Add a length-1 time axis so the step can go through the LSTM.
        token_embedding = self.embedding(x.unsqueeze(1))
        weights = self.attention(encoder_outputs, hidden).unsqueeze(1)
        # Weighted sum of encoder outputs: one context vector per batch row.
        context = torch.bmm(weights, encoder_outputs)
        step_input = torch.cat((token_embedding, context), dim=2)
        rnn_output, (hidden, cell) = self.rnn(step_input, (hidden, cell))
        features = torch.cat((rnn_output, context, token_embedding), dim=2)
        return self.fc_out(features.squeeze(1)), hidden, cell

# Training loop with teacher forcing
def train(model, encoder, decoder, criterion, optimizer, src, trg, teacher_forcing_ratio=0.5):
    """Run one optimisation step of the seq2seq model with teacher forcing.

    The first target token seeds the decoder; every later target position is
    predicted one step at a time and scored with ``criterion``.

    Args:
        model: Module holding the trainable parameters; only used to switch
            to training mode.
        encoder: Callable returning ``(encoder_outputs, hidden, cell)`` for
            ``src``.
        decoder: Callable invoked as
            ``decoder(token, encoder_outputs, hidden, cell)`` and returning
            ``(logits, hidden, cell)``.
        criterion: Loss applied to the logits and the target ids of a step.
        optimizer: Optimizer whose gradients are reset and stepped here.
        src: Source batch passed to ``encoder``.
        trg: Target token ids, batch first; column 0 is the start token.
        teacher_forcing_ratio: Probability, per step, of feeding the
            ground-truth token instead of the model's own prediction.

    Returns:
        The loss as a Python float, averaged over the decoding steps.

    Raises:
        ValueError: If ``trg`` has fewer than two time steps, so there is
            nothing to predict.
    """
    num_steps = trg.size(1) - 1
    if num_steps < 1:
        # Fail before touching optimizer state; otherwise the loop body never
        # runs and ``loss`` would still be a plain int at ``backward()``.
        raise ValueError(
            "trg must contain at least two time steps "
            "(a start token and one token to predict)"
        )

    model.train()
    optimizer.zero_grad()

    encoder_outputs, hidden, cell = encoder(src)
    decoder_input = trg[:, 0]

    loss = 0
    for t in range(1, trg.size(1)):
        output, hidden, cell = decoder(decoder_input, encoder_outputs, hidden, cell)
        loss = loss + criterion(output, trg[:, t])
        # Randomly choose between the ground truth and the greedy prediction
        # as the next decoder input.
        teacher_force = random.random() < teacher_forcing_ratio
        decoder_input = trg[:, t] if teacher_force else output.argmax(1)

    loss.backward()
    optimizer.step()
    # Average over the steps that were actually scored (positions 1..T-1).
    return loss.item() / num_steps

# Example usage
# Toy hyperparameters for a quick smoke test of the model.
input_dim = 10  # Vocabulary size of source language
output_dim = 10  # Vocabulary size of target language
embed_dim = 16  # Token embedding width, shared by encoder and decoder
hidden_dim = 32  # LSTM state width, shared by encoder, attention and decoder
num_layers = 2  # Stacked LSTM layers, same in encoder and decoder

# Build the modules. The attention module is handed to the decoder, which
# stores it as an attribute.
encoder = Encoder(input_dim, embed_dim, hidden_dim, num_layers)
attention = Attention(hidden_dim)
decoder = Decoder(output_dim, embed_dim, hidden_dim, num_layers, attention)
# Group encoder and decoder so one object exposes all parameters to the
# optimizer and can be switched to training mode in a single call.
model = nn.ModuleList([encoder, decoder])

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Example input: random token ids standing in for real data.
src = torch.randint(0, input_dim, (2, 5))  # Batch of 2 sequences, each of length 5
trg = torch.randint(0, output_dim, (2, 5))  # Target batch with the same shape

# Single training step using the default teacher_forcing_ratio.
# No RNG seed is set in this cell, so the printed value varies between runs.
loss = train(model, encoder, decoder, criterion, optimizer, src, trg)
print(f"Loss: {loss:.4f}")