In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

In [3]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()

        assert d_model%num_heads == 0 ,"d_model must be divisible by num_heads"
        # Initialization dimension

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.W_q = nn.Linear(d_model , d_model)
        self.W_k = nn.Linear(d_model , d_model)
        self.W_v = nn.Linear(d_model , d_model)
        self.W_o = nn.Linear(d_model , d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask = None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) 
        print("Shape of energy si ")
        print(attn_scores.shape )
        
        attn_scores=attn_scores/ math.sqrt(self.d_k)

        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask==0, -1e9)

        attn_probs = torch.softmax(attn_scores, dim = -1)
        output = torch.matmul(attn_probs, V)
        return output

    def split_head(self, x):
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_head(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask = None):
        Q = self.split_head(self.W_q(Q))
        K = self.split_head(self.W_k(K))
        V = self.split_head(self.W_v(V))
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)

        output = self.W_o(self.combine_head(attn_output))

        return output

In [4]:
import torch
import torch.nn as nn
import math

# Define dimensions
batch_size = 2
seq_length = 10
d_model = 512
num_heads = 8

# Create random tensors for Q, K, V
Q = torch.randn(batch_size, seq_length, d_model)  # (2, 10, 512)
K = torch.randn(batch_size, seq_length, d_model)  # (2, 10, 512)
V = torch.randn(batch_size, seq_length, d_model)  # (2, 10, 512)

# Initialize the multi-head attention module
mha = MultiHeadAttention(d_model, num_heads)

# Pass through the attention layer
output = mha(Q, K, V)

# Output shape should be (batch_size, seq_length, d_model)
print(output.shape)  # Expected: torch.Size([2, 10, 512])


Shape of energy si 
torch.Size([2, 8, 10, 10])
torch.Size([2, 10, 512])


In [5]:

class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads
        
        # One word converted into 512 dims and 512 distrubute into 8 heads so one head will get 512/8= 64 dims
        # 64 dims of q,k,v will go through the linear transformation which is a neural network 
        # after this each 64 dims query will go dot product with 64 dims of keys 64 dot 64 = attention value 
        # Example attention scores for first head might look like:
        
            #         .....
            # Query shape is torch.Size([1, 10, 8, 64])
            # Key shape is torch.Size([1, 10, 8, 64])
            # Attention Score will be torch.Size([1, 10, 8, 8])
            
            # for every Q there is 64 dims
            # for every k there is 64 dims
            
            # so when we apply dot product we get a 64.dot(64) we get a single value 
            # so output attention will be  torch.Size([1, 10, 8, 8])
            # bwqd->bwkd 
            # after transpose bwqd->bwdk
            # after dot product 
            # bwqk
        
            #        i*64    am*64    a*64    cat*64
            # i*64    [0.2  0.3  0.1  0.4]
            # am*64   [0.1  0.4  0.2  0.3]
            # a*64    [0.1  0.2  0.3  0.4]
            # cat*64  [0.3  0.2  0.1  0.4]
            
        
        
        
        # after this we this we apply softmax to get the probality(sum of all equal to 1) of a query with 
        # respect to keys 

        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(embed_size, embed_size)
        self.keys = nn.Linear(embed_size, embed_size)
        self.queries = nn.Linear(embed_size, embed_size)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, values, keys, query, mask=None):
        # Get number of training examples
        N = query.shape[0] 
        # N No of batches
        # 

        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]
        # Number of charater orr words

        
        values = self.values(values)  # (N, value_len, embed_size)
        keys = self.keys(keys)  # (N, key_len, embed_size)
        queries = self.queries(query)  # (N, query_len, embed_size)

        # Split the embedding into self.heads different pieces
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = queries.reshape(N, query_len, self.heads, self.head_dim)

        # Einsum does matrix mult. for query*keys for each training example
        # with every other training example, don't be confused by einsum
        # it's just how I like doing matrix multiplication & bmm

        Q=queries
        K=keys
        V=values
        
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.embed_size)
        
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask==0, -1e9)
      
      
        attn_probs = torch.softmax(attn_scores, dim = -1)
        output = torch.matmul(attn_probs, V)
        return output

In [10]:
import torch
import torch.nn as nn
import math

# Define dimensions
batch_size = 1
seq_length = 10
d_model = 512
num_heads = 8

# Create random tensors for Q, K, V
Q = torch.randn(batch_size, seq_length, d_model)  # (2, 10, 512)
K = torch.randn(batch_size, seq_length, d_model)  # (2, 10, 512)
V = torch.randn(batch_size, seq_length, d_model)  # (2, 10, 512)

# Initialize the multi-head attention module
mha = SelfAttention(d_model, num_heads)

# Pass through the attention layer
output = mha(Q, K, V)

# Output shape should be (batch_size, seq_length, d_model)
print(output.shape)  # Expected: torch.Size([2, 10, 512])


Shape of Query si 
torch.Size([1, 10, 8, 64])
Shape of energy si 
torch.Size([1, 10, 8, 64])
Shape of energy si 
torch.Size([1, 10, 8, 8])
Shape of 
torch.Size([1, 10, 8, 8])
Values of  
torch.Size([1, 10, 8, 64])
torch.Size([1, 10, 8, 64])


In [9]:
class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(embed_size, embed_size)
        self.keys = nn.Linear(embed_size, embed_size)
        self.queries = nn.Linear(embed_size, embed_size)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, values, keys, query, mask=None):
        # Get number of training examples
        N = query.shape[0]

        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        values = self.values(values)  # (N, value_len, embed_size)
        keys = self.keys(keys)  # (N, key_len, embed_size)
        queries = self.queries(query)  # (N, query_len, embed_size)

        # Split the embedding into self.heads different pieces
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = queries.reshape(N, query_len, self.heads, self.head_dim)

        # Einsum does matrix mult. for query*keys for each training example
        # with every other training example, don't be confused by einsum
        # it's just how I like doing matrix multiplication & bmm


        print("Shape of Query si ")
        print(queries.shape )
        print("Shape of energy si ")
        print(keys.shape )
        # energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
        energy = torch.matmul(queries,keys.transpose(-2, -1))
        print("Shape of energy si ")
        print(energy.shape )
        # queries shape: (N, query_len, heads, heads_dim),
        # keys shape: (N, key_len, heads, heads_dim)
        # energy: (N, words, query_len, key_len)

        # Mask padded indices so their weights become 0
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        # Normalize energy values similarly to seq2seq + attention
        # so that they sum to 1. Also divide by scaling factor for
        # better stability
        
        attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=-1)
        # attention shape: (N, heads, query_len, key_len)
        print("Shape of ")
        print(attention.shape )
        print("Values of  ")
        print(values.shape )
        # out = torch.einsum("nhql,nlhd->nqhd", [attention, values])
        out = torch.matmul(attention, values)
        # attention shape: (N, heads, query_len, key_len)
        # values shape: (N, value_len, heads, heads_dim)
        # out after matrix multiply: (N, query_len, heads, head_dim), then
        # we reshape and flatten the last two dimensions.

        # out = self.fc_out(out)
        # Linear layer doesn't modify the shape, final shape will be
        # (N, query_len, embed_size)

        return out

In [None]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()

#         Initialization
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))