In [3]:
import torch 
import torch.nn as nn
import torch.functional as F
import math

In [5]:
def scaled_dot_product(q,k,v,mask=None):
    """Compute scaled dot-product attention.

    Args:
        q: query tensor; its last dimension is the per-head feature size.
        k: key tensor, matmul-compatible with ``q`` after swapping its last
            two dimensions.
        v: value tensor, matmul-compatible with the attention weights.
        mask: optional additive mask broadcastable to the score tensor. It is
            added to the scores before the softmax, so ``-inf`` entries give
            zero attention weight and ``0`` entries leave scores unchanged.

    Returns:
        Tuple ``(values, attention)``: the attention-weighted sum of ``v`` and
        the softmax-normalised attention weights.
    """
    d_k = q.size(-1)
    # Divide by sqrt(d_k) so score magnitude does not grow with feature size.
    scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
    if mask is not None:
        # Out-of-place add: also works when the mask broadcasts to a shape
        # larger than `scaled`, which an in-place `+=` would reject.
        scaled = scaled + mask
    # torch.softmax is used directly: the notebook binds `F` to
    # `torch.functional`, which does not provide `softmax`.
    attention = torch.softmax(scaled, dim=-1)
    values = torch.matmul(attention, v)
    return values, attention

class MultiHeadAttention(nn.Module):
    """Multi-head self-attention over a ``(batch, sequence, d_model)`` input.

    A single linear layer produces queries, keys and values for all heads at
    once; each head attends independently via ``scaled_dot_product`` and the
    concatenated head outputs go through a final linear projection.

    Args:
        d_model: feature size of the input and output.
        num_heads: number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: if ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        # One projection yields q, k and v for every head in a single matmul.
        self.qkv_layer = nn.Linear(d_model, 3 * d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: tensor of shape ``(batch, sequence, d_model)``.
            mask: optional additive mask forwarded to ``scaled_dot_product``.

        Returns:
            Tensor of shape ``(batch, sequence, d_model)``.
        """
        batch_size, sequence_length, _ = x.size()
        qkv = self.qkv_layer(x)
        qkv = qkv.reshape(batch_size, sequence_length, self.num_heads, 3 * self.head_dim)
        # Move heads ahead of the sequence axis: (batch, heads, seq, 3*head_dim).
        qkv = qkv.permute(0, 2, 1, 3)
        # Three equal chunks along the feature axis: q, k and v.
        q, k, v = qkv.chunk(3, dim=-1)
        values, attention = scaled_dot_product(q, k, v, mask)
        # Restore (batch, seq, heads, head_dim) before flattening the heads;
        # reshaping without this permute would mix positions with heads.
        values = values.permute(0, 2, 1, 3).reshape(
            batch_size, sequence_length, self.num_heads * self.head_dim
        )
        return self.linear_layer(values)




       

In [2]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then a feed-forward network.

    Each sub-layer's output goes through dropout, is added to that sub-layer's
    input (residual connection), and the sum is normalised.

    Args:
        d_model: feature size of the input and output.
        ffn_hidden: hidden size handed to the feed-forward block.
        num_heads: number of attention heads.
        drop_prob: dropout probability for both dropout layers.
    """

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        # Zero-argument super() so nn.Module.__init__ actually runs; without
        # it, assigning the first submodule below fails.
        super().__init__()
        self.attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        # NOTE(review): LayerNormalization is not defined in the visible part
        # of the notebook; assumed to accept `parameters_shape` as a keyword,
        # as the original call spelled it - confirm against its definition.
        self.norm1 = LayerNormalization(parameters_shape=[d_model])
        self.dropout1 = nn.Dropout(p=drop_prob)
        # NOTE(review): PositionWiseFeedForward is not defined in the visible
        # part of the notebook; assumed positional signature
        # (d_model, hidden, drop_prob) - confirm against its definition.
        self.ffn = PositionWiseFeedForward(d_model, ffn_hidden, drop_prob)
        self.norm2 = LayerNormalization(parameters_shape=[d_model])
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Run the attention and feed-forward sub-layers on ``x``.

        Args:
            x: input tensor accepted by ``MultiHeadAttention``.

        Returns:
            Tensor produced by the second normalisation layer.
        """
        # Attention sub-layer (no mask) with residual connection.
        residual_x = x
        x = self.attention(x, mask=None)
        x = self.dropout1(x)
        x = self.norm1(x + residual_x)

        # Feed-forward sub-layer with residual connection.
        residual_x = x
        x = self.ffn(x)
        x = self.dropout2(x)
        x = self.norm2(x + residual_x)
        return x

                                         
        
        

In [None]:
class Encoder(nn.Module):
    """Stack of ``num_layers`` identically configured ``EncoderLayer`` blocks.

    The blocks are held in an ``nn.Sequential`` so that the output of each one
    is passed as the input of the next.
    """

    def __init__(self, d_model,ffn_hidden,num_heads,drop_prob,num_layers):
        super().__init__()
        stack = []
        for _ in range(num_layers):
            stack.append(EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob))
        # Unpack the list so each block becomes its own positional argument.
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Pass ``x`` through every encoder block in order and return the result."""
        return self.layers(x)
                                  

        

The `*` in `nn.Sequential(*[...])` unpacks the list of layers into individual `EncoderLayer` instances, which is equivalent to passing each layer as a separate argument to the `nn.Sequential` constructor. In other words, if `layers_list` is `[layer1, layer2, layer3]`, then `nn.Sequential(*layers_list)` is equivalent to `nn.Sequential(layer1, layer2, layer3)`.