<a href="https://colab.research.google.com/github/S-VATS31/Deep_Learning_Models/blob/main/Simple_Attention_Mechanism.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [9]:
class AttentionMechanism(torch.nn.Module):
    """Single-head scaled dot-product attention with learned Q/K/V projections.

    Each input is multiplied by its own ``(d_model, d_keys)`` weight matrix,
    so the last dimension of ``queries``, ``keys`` and ``values`` must be
    ``d_model`` and the projected tensors have last dimension ``d_keys``.

    Args:
        query_weight_matrix: Unused; kept only so existing call sites that
            pass five positional arguments keep working. The weights are
            always freshly initialised.
        key_weight_matrix: Unused; see ``query_weight_matrix``.
        value_weight_matrix: Unused; see ``query_weight_matrix``.
        d_model: Size of the last dimension of the inputs.
        d_keys: Size of the last dimension of the projected
            queries/keys/values (and therefore of the output).
    """

    def __init__(self, query_weight_matrix, key_weight_matrix, value_weight_matrix, d_model, d_keys):
        super().__init__()
        self.d_model = d_model  # Input feature size
        self.d_keys = d_keys  # Projected (query/key/value) feature size

        # Trainable parameters.
        # The scaling must happen *before* wrapping in nn.Parameter:
        # `Parameter(...) * c` returns a plain non-leaf Tensor, which the
        # module would not register, so it would never be trained, moved by
        # `.to(device)` or saved in `state_dict()`.
        self.query_weight_matrix = self._init_weight(d_model, d_keys)
        self.key_weight_matrix = self._init_weight(d_model, d_keys)
        self.value_weight_matrix = self._init_weight(d_model, d_keys)

    @staticmethod
    def _init_weight(d_model, d_keys):
        """Return a ``(d_model, d_keys)`` Parameter drawn from N(0, 1) / d_keys."""
        return torch.nn.Parameter(torch.randn(d_model, d_keys) * (1.00 / d_keys))

    def forward(self, queries, keys, values):
        """Apply attention of ``queries`` over ``keys``/``values``.

        Args:
            queries: Tensor of shape ``(..., q_len, d_model)``.
            keys: Tensor of shape ``(..., kv_len, d_model)``.
            values: Tensor of shape ``(..., kv_len, d_model)``.

        Returns:
            A tuple ``(attention_probability_distribution, attention_output)``
            with shapes ``(..., q_len, kv_len)`` and ``(..., q_len, d_keys)``.
            Each row of the first tensor sums to 1 over the last dimension.
        """
        # Linear Projections
        Q = torch.matmul(queries, self.query_weight_matrix)  # Shape: (..., q_len, d_keys)
        K = torch.matmul(keys, self.key_weight_matrix)  # Shape: (..., kv_len, d_keys)
        V = torch.matmul(values, self.value_weight_matrix)  # Shape: (..., kv_len, d_keys)

        # Compute Attention Scores
        attention_scores = torch.matmul(Q, K.transpose(-2, -1))  # Shape: (..., q_len, kv_len)

        # Scaled Dot Product Attention: divide by sqrt of the key dimension,
        # i.e. the size of the axis the dot product was taken over (d_keys).
        scaled_attention_scores = attention_scores / (self.d_keys ** 0.5)

        # Softmax Attention
        attention_probability_distribution = F.softmax(scaled_attention_scores, dim=-1)

        # Compute Output
        attention_output = torch.matmul(attention_probability_distribution, V)

        # Return probability distribution and output matrix
        return attention_probability_distribution, attention_output

