In [ ]:
from torch import nn


class ECAttention(nn.Module):
    def __init__(self):
        super(ECAttention, self).__init__()

        self.avg_pool = nn.AdaptiveAvgPool1d(1)
        self.conv = nn.Conv1d(1, 1, kernel_size=3, padding=1, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # batch, channel, dim -> batch, channel, 1
        y = self.avg_pool(x)

        # batch, channel, 1 -> batch, 1, channel
        y = y.transpose(-1, -2)
        y = self.conv(y)
        y = y.transpose(-1, -2)
        y = self.sigmoid(y)
        return x * y.expand_as(x)

# x = torch.randn((1, 16, 2))
# eca = ECAttention()
# y = eca(x)
# print(y.shape)
# print(sum(p.numel() for p in eca.parameters() if p.requires_grad))

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

In [4]:
class MaskedMultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super(MaskedMultiHeadAttention, self).__init__()
        assert d_model % n_heads == 0
        
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        
        # Define the projection layers for query, key, value
        self.query_projection = nn.Linear(d_model, d_model)
        self.key_projection = nn.Linear(d_model, d_model)
        self.value_projection = nn.Linear(d_model, d_model)
        
        # Output projection layer
        self.out_projection = nn.Linear(d_model, d_model)
        
    def forward(self, x, mask=None):
        batch_size, seq_len, _ = x.size()
        
        # Project input to query, key, and value
        query = self.query_projection(x).view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        key = self.key_projection(x).view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        value = self.value_projection(x).view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        
        # Compute attention scores
        scores = torch.matmul(query, key.transpose(-2, -1)) / (self.head_dim ** 0.5)
        
        if mask is not None:
            mask = mask.unsqueeze(1)
            scores = scores.masked_fill(mask == 0, float('-inf'))
        
        # Apply softmax to get attention weights
        attn_weights = F.softmax(scores, dim=-1)
        
        # Apply attention weights to values
        attn_output = torch.matmul(attn_weights, value)
        
        # Reshape and project output
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
        attn_output = self.out_projection(attn_output)
        
        return attn_output

In [5]:
# Example usage
batch_size = 32
seq_len = 16
d_model = 256
n_heads = 8

# Generate random input
x = torch.randn(batch_size, seq_len, d_model)

# Create the model
model = MaskedMultiHeadAttention(d_model, n_heads)

# Generate a mask
mask = generate_square_subsequent_mask(seq_len)

# Apply masked attention
output = model(x, mask)

RuntimeError: The size of tensor a (16) must match the size of tensor b (8) at non-singleton dimension 1