<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Attention_Mechanisms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch import nn

class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: ``softmax((q / temperature) @ k^T) @ v``.

    Args:
        temperature: Divisor applied to the queries before they are
            multiplied with the transposed keys.
        attn_dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)

    def forward(self, q, k, v, mask=None):
        """Attend over ``v`` using the similarity between ``q`` and ``k``.

        Args:
            q: Query tensor of shape ``(..., len_q, d_k)``.
            k: Key tensor of shape ``(..., len_k, d_k)``.
            v: Value tensor of shape ``(..., len_k, d_v)``.
            mask: Optional tensor broadcastable to ``(..., len_q, len_k)``.
                Positions where ``mask == 0`` are excluded from attention.
                A row whose positions are all masked ends up with uniform
                weights, because every score in it is equal.

        Returns:
            A tuple ``(output, attn)``: the weighted values of shape
            ``(..., len_q, d_v)`` and the attention weights of shape
            ``(..., len_q, len_k)``. The weights are returned *after*
            dropout has been applied.
        """
        # Scale the queries first, then take Q.K^T over the last two dims.
        scores = torch.matmul(q / self.temperature, k.transpose(-2, -1))

        if mask is not None:
            # Fill with the most negative finite value of the score dtype
            # rather than a hard-coded -1e9: that constant is not
            # representable in float16, and it is not a true lower bound in
            # float32/float64 (an unmasked score below -1e9 would lose the
            # softmax to a masked position).
            fill_value = torch.finfo(scores.dtype).min
            scores = scores.masked_fill(mask == 0, fill_value)

        attn = torch.softmax(scores, dim=-1)  # Normalize over the keys
        attn = self.dropout(attn)  # Regularize the attention weights
        output = torch.matmul(attn, v)  # Weighted sum of the values

        return output, attn

# Demo: run the attention module once on random inputs
if __name__ == "__main__":
    # Demo dimensions: 5 sequences per batch, 8 positions each, 16 features
    batch_size, seq_len, embed_dim = 5, 8, 16
    qkv_shape = (batch_size, seq_len, embed_dim)

    # Draw query, key and value (in that order) from U[0, 1)
    q = torch.rand(qkv_shape)
    k = torch.rand(qkv_shape)
    v = torch.rand(qkv_shape)

    # Random 0/1 mask with one entry per (query position, key position) pair
    mask = torch.randint(low=0, high=2, size=(batch_size, seq_len, seq_len))

    # Build the module with temperature sqrt(embed_dim) and run it
    attention = ScaledDotProductAttention(temperature=embed_dim ** 0.5)
    output, attn_weights = attention(q, k, v, mask)

    # Expected shapes: [5, 8, 16] for the output, [5, 8, 8] for the weights
    print("Attention Output Shape:", output.shape)
    print("Attention Weights Shape:", attn_weights.shape)