# Building a Transformer with PyTorch's `nn.Transformer` Module

<img src="https://media.geeksforgeeks.org/wp-content/uploads/20250325174552667398/transformer.png">

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

In [3]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch of embeddings.

    The encoding table is computed once at construction time and stored as
    the non-trainable buffer ``pe`` with shape ``(1, max_len, d_model)``.
    Even feature columns hold sines and odd feature columns hold cosines of
    ``position * 10000 ** (-2i / d_model)``.

    Args:
        d_model: Size of the feature (last) dimension of the inputs.
            Both even and odd values are supported.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model: int, max_len: int = 5000) -> None:
        super().__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )
        # Shape (max_len, ceil(d_model / 2)): one angle per position/frequency.
        angles = position * div_term

        pe[:, 0::2] = torch.sin(angles)
        # When d_model is odd there is one fewer odd column than even column,
        # so drop the last frequency for the cosine half instead of crashing
        # on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading batch dimension of 1 so the table broadcasts over the batch.
        # A buffer (not a parameter) follows .to()/.cuda() and is saved in the
        # state dict, but is never updated by the optimizer.
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` with the positional encodings added.

        Args:
            x: Tensor of shape ``(batch, seq_len, d_model)``. ``seq_len``
                must not exceed ``max_len``; otherwise the addition fails
                because the sliced table cannot broadcast against ``x``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        return x + self.pe[:, :x.size(1)]

In [4]:
class SimpleTransformers(nn.Module):
    """Encoder-decoder sequence model built on ``nn.Transformer``.

    Source and target token ids are embedded, combined with sinusoidal
    positional encodings, passed through ``nn.Transformer`` (configured with
    ``batch_first=True``), and projected to target-vocabulary logits.

    Args:
        src_vocab_size: Number of distinct source token ids.
        tgt_vocab_size: Number of distinct target token ids; also the size
            of the last dimension of the output.
        d_model: Embedding / model feature size.
        nhead: Number of attention heads.
        num_layers: Number of layers used for both the encoder and decoder.
        dim_feedforward: Hidden size of the per-layer feed-forward network.
        dropout: Dropout probability inside the transformer layers.
        max_len: Longest sequence length the positional encoding supports.
    """

    def __init__(
        self,
        src_vocab_size,
        tgt_vocab_size,
        d_model=512,
        nhead=8,
        num_layers=4,
        dim_feedforward=2048,
        dropout=0.1,
        max_len=100,
    ):
        super().__init__()

        self.src_embedding = nn.Embedding(
            num_embeddings=src_vocab_size, embedding_dim=d_model
        )
        self.tgt_embedding = nn.Embedding(
            num_embeddings=tgt_vocab_size, embedding_dim=d_model
        )
        # One encoding table is shared by the source and target sides.
        self.pos_encoder = PositionalEncoding(d_model, max_len)

        # PyTorch's built-in encoder-decoder transformer, batch-first layout.
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )

        self.fc_out = nn.Linear(d_model, tgt_vocab_size)

    def forward(
        self,
        src,
        tgt,
        tgt_mask=None,
        src_key_padding_mask=None,
        tgt_key_padding_mask=None,
    ):
        """Compute target-vocabulary logits for each target position.

        Args:
            src: Integer tensor of source token ids, ``(batch, src_len)``.
            tgt: Integer tensor of target token ids, ``(batch, tgt_len)``.
            tgt_mask: Optional ``(tgt_len, tgt_len)`` attention mask for the
                decoder self-attention. When omitted, a causal mask is used
                so position ``i`` can only attend to positions ``<= i``.
                Pass an all-``False`` boolean mask to disable masking.
            src_key_padding_mask: Optional ``(batch, src_len)`` boolean mask,
                ``True`` at padded source positions. It is also applied to
                the encoder memory seen by the decoder.
            tgt_key_padding_mask: Optional ``(batch, tgt_len)`` boolean mask,
                ``True`` at padded target positions.

        Returns:
            Float tensor of shape ``(batch, tgt_len, tgt_vocab_size)``.
        """
        src_emb = self.pos_encoder(self.src_embedding(src))
        tgt_emb = self.pos_encoder(self.tgt_embedding(tgt))

        if tgt_mask is None:
            # Without a causal mask the decoder sees future target tokens,
            # which leaks the answer during teacher-forced training.
            # True above the diagonal marks positions that must be ignored.
            tgt_len = tgt.size(1)
            tgt_mask = torch.triu(
                torch.ones(tgt_len, tgt_len, dtype=torch.bool, device=tgt.device),
                diagonal=1,
            )

        output = self.transformer(
            src_emb,
            tgt_emb,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            # Padded source positions must stay hidden in cross-attention too.
            memory_key_padding_mask=src_key_padding_mask,
        )
        return self.fc_out(output)

In [6]:
# Both vocabularies use the same size in this demo.
src_vocab_size = tgt_vocab_size = 10000

# Build the model with its default hyperparameters.
model = SimpleTransformers(
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
)

# Random token ids stand in for real data: 32 sequences per batch,
# 20 source tokens and 15 target tokens per sequence.
src = torch.randint(low=0, high=src_vocab_size, size=(32, 20))
tgt = torch.randint(low=0, high=tgt_vocab_size, size=(32, 15))

# One forward pass; the result holds one logit vector per target position.
output = model(src, tgt)
print(output.shape)  # (32, 15, tgt_vocab_size)


torch.Size([32, 15, 10000])


In [7]:
# Display the tensor returned by the model call in the previous cell.
output

tensor([[[-0.1232,  0.1974,  0.2311,  ...,  0.7546, -0.0971, -0.5343],
         [-0.5452, -0.2743,  0.5020,  ...,  0.5108, -0.3281, -0.2989],
         [-0.2360, -0.0759, -0.0610,  ..., -0.1975, -0.1219, -0.7713],
         ...,
         [-0.6507, -0.0919,  0.0209,  ...,  0.1554,  0.0329,  0.0339],
         [-0.3224,  0.2080, -0.1527,  ...,  0.5321,  0.4047, -0.5693],
         [-0.9821, -0.3305,  0.4270,  ...,  0.2308, -0.1707, -0.6261]],

        [[-0.6786,  0.2414,  0.4896,  ...,  0.2271,  0.1008, -0.9225],
         [ 0.0511,  0.2614,  0.8016,  ...,  0.1394,  0.0407, -0.8343],
         [-0.4496,  0.0155,  0.4898,  ...,  0.7178,  0.4297, -0.5523],
         ...,
         [-0.4567,  0.4619, -0.2568,  ...,  0.6354,  0.4413, -0.7411],
         [-0.3466,  0.3697, -0.1801,  ...,  0.3141,  0.3780, -1.1125],
         [-0.1956,  0.7709, -0.0984,  ...,  1.0252,  0.2743, -0.3762]],

        [[-0.1776,  0.3541,  0.4787,  ...,  0.2038, -0.0803, -0.4546],
         [-0.0384,  0.4412, -0.3236,  ...,  0