In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
## Learnable Positional Encoding layer
class LearnablePositionalEncoding(nn.Module):
  """Additive positional encoding whose table is learned during training.

  The table is a single ``nn.Parameter`` of shape ``(seq_len, d_model)``
  initialised to zeros, so before any optimizer step the layer returns its
  input unchanged.

  Args:
    seq_len: Maximum number of positions the table holds.
    d_model: Size of the feature dimension.
  """

  def __init__(self, seq_len, d_model):
    super().__init__()
    self.positional_encoding = nn.Parameter(torch.zeros(seq_len, d_model))

  def forward(self, x):
    """Add the positional rows to ``x``.

    Args:
      x: Tensor whose last two dimensions are ``(length, d_model)``, e.g.
        ``(batch, length, d_model)``. ``length`` must not exceed ``seq_len``.

    Returns:
      ``x`` plus the first ``length`` rows of the table, same shape as ``x``.
    """
    # Take only as many rows as x has positions, so inputs shorter than
    # seq_len work instead of failing to broadcast against the full table.
    length = x.size(-2)
    return x + self.positional_encoding[:length]

  def print(self):
    """Print the current positional-encoding parameter to stdout."""
    print(self.positional_encoding)

In [3]:
# Dummy Dataset
class DummyDataset(torch.utils.data.Dataset):
  """Synthetic dataset of random token sequences paired with random binary labels.

  Args:
    num_samples: Number of (sequence, label) pairs to generate.
    seq_len: Number of token ids per sequence.
    vocab_size: Exclusive upper bound for the sampled token ids.
  """

  def __init__(self, num_samples, seq_len, vocab_size):
    self.num_samples = num_samples
    # Token ids in [0, vocab_size), one row per sample
    self.data = torch.randint(low=0, high=vocab_size, size=(num_samples, seq_len))
    # One label per sample, either 0 or 1
    self.labels = torch.randint(low=0, high=2, size=(num_samples,))

  def __len__(self):
    """Return the number of samples requested at construction."""
    return self.num_samples

  def __getitem__(self, idx):
    """Return the ``(sequence, label)`` pair stored at ``idx``."""
    sequence = self.data[idx]
    label = self.labels[idx]
    return sequence, label

In [4]:
# Custom Model
class CustomModel(nn.Module):
  """Sequence classifier: embedding -> learnable positions -> self-attention -> mean pool -> linear.

  Args:
    seq_len: Sequence length handed to the positional-encoding layer.
    d_model: Embedding / attention feature size; must be divisible by ``num_heads``.
    vocab_size: Number of rows in the embedding table.
    num_heads: Number of attention heads. Defaults to 4, the value that was
      previously fixed in the constructor.
  """

  def __init__(self, seq_len, d_model, vocab_size, num_heads=4):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, d_model)
    self.positional_encoding = LearnablePositionalEncoding(seq_len, d_model)
    self.self_attention = nn.MultiheadAttention(d_model, num_heads=num_heads, batch_first=True)
    self.fc = nn.Linear(d_model, 2)  # two output logits, one per class

  def forward(self, x):
    """Compute class logits for a batch of token-id sequences.

    Args:
      x: Integer tensor of token ids, batch first (``batch_first=True`` is set
        on the attention layer).

    Returns:
      Tensor of shape ``(batch, 2)`` holding unnormalised class scores.
    """
    x = self.embedding(x)
    x = self.positional_encoding(x)
    # The attention weights are never used, so skip computing them.
    attn_output, _ = self.self_attention(x, x, x, need_weights=False)
    # Average over the sequence dimension to get one vector per sample.
    x = attn_output.mean(dim=1)
    return self.fc(x)

In [5]:
# hyperparameters
seed = 42
num_samples = 1000
seq_len = 10
vocab_size = 50
batch_size = 32
d_model = 32
num_epochs = 10

# Seed torch's global RNG before anything random is created, so the dummy
# data, the initial weights and the shuffle order are the same on every
# Restart & Run All.
torch.manual_seed(seed)

# dataset and dataloader for the dummy data
dataset = DummyDataset(num_samples=num_samples, seq_len=seq_len, vocab_size=vocab_size)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# CustomModel instance
model = CustomModel(seq_len, d_model, vocab_size)

# cross-entropy loss over the two output logits
criterion = nn.CrossEntropyLoss()

# Adam optimizer over all model parameters (this includes the positional-encoding table)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [6]:
# Show the module tree via the model's repr. LearnablePositionalEncoding is
# listed with empty parentheses because it holds a bare nn.Parameter and no
# child modules.
model

CustomModel(
  (embedding): Embedding(50, 32)
  (positional_encoding): LearnablePositionalEncoding()
  (self_attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
  )
  (fc): Linear(in_features=32, out_features=2, bias=True)
)

In [7]:
# Positional encoding before training: the parameter is created with
# torch.zeros, so every entry is expected to print as 0.
model.positional_encoding.print()

Parameter containing:
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [8]:
# training
# Put the model in training mode explicitly, in case an earlier cell (or a
# previous run of the notebook) switched it to eval.
model.train()
for epoch in range(num_epochs):
    total_loss = 0  # running sum of the per-batch losses for this epoch
    for data, labels in dataloader:

        # clear gradients left over from the previous step
        optimizer.zero_grad()

        # forward pass
        output = model(data)

        loss = criterion(output, labels) # loss

        # backward pass and parameter update
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # The reported value is the sum over batches, not an average.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")

Epoch 1/10, Loss: 22.3550
Epoch 2/10, Loss: 22.1789
Epoch 3/10, Loss: 21.9619
Epoch 4/10, Loss: 21.8773
Epoch 5/10, Loss: 21.7146
Epoch 6/10, Loss: 21.4730
Epoch 7/10, Loss: 21.2953
Epoch 8/10, Loss: 21.2298
Epoch 9/10, Loss: 20.9922
Epoch 10/10, Loss: 20.5630


In [9]:
# Positional encoding after training: the table is one of the parameters
# passed to the optimizer, so its entries are no longer all zero.
model.positional_encoding.print()

Parameter containing:
tensor([[-3.1466e-03,  6.1018e-03,  2.4692e-03, -3.1287e-02,  1.2773e-02,
         -3.0379e-03,  2.0704e-02, -3.4882e-02,  1.6531e-02, -3.0002e-02,
         -1.5794e-02,  1.5690e-02, -1.0167e-01, -6.3978e-02, -2.3232e-02,
          5.2042e-02, -1.8232e-02,  1.3805e-02, -1.4339e-02,  3.5232e-02,
         -1.5111e-02, -5.2531e-03, -2.5525e-02,  4.6537e-02, -1.1457e-04,
         -4.1833e-02,  3.7339e-02, -1.4136e-02,  1.2506e-02,  2.3147e-02,
         -2.2639e-02, -1.7194e-02],
        [-2.4170e-02,  2.7554e-02, -1.2757e-02,  1.8083e-02,  2.8730e-02,
         -3.3117e-02, -2.9872e-02, -1.0636e-02,  6.4208e-03,  2.2384e-02,
          2.6614e-03,  5.8113e-02, -1.6450e-02, -3.8446e-02,  8.2055e-03,
          3.0226e-02,  1.6807e-03, -5.2279e-02, -2.6574e-02,  1.4283e-02,
          2.0021e-02, -2.2082e-02,  3.0174e-02,  1.0478e-02, -2.4284e-02,
         -4.0875e-02,  2.1014e-02, -2.2329e-02,  2.4974e-02,  1.9988e-02,
         -1.0269e-02,  7.1575e-03],
        [-1.3432e-