In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512):
        super(PositionalEncoding, self).__init__()
        self.encoding = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(torch.log(torch.tensor(10000.0)) / d_model))
        self.encoding[:, 0::2] = torch.sin(position * div_term)
        self.encoding[:, 1::2] = torch.cos(position * div_term)
        self.encoding = self.encoding.unsqueeze(0)

    def forward(self, x):
        return x + self.encoding[:, :x.size(1)].detach()

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, nhead):
        super(MultiHeadAttention, self).__init__()
        self.nhead = nhead
        self.d_model = d_model

        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        q = self.q_linear(q)
        k = self.k_linear(k)
        v = self.v_linear(v)

        q = q.view(q.size(0), -1, self.nhead, self.d_model // self.nhead).transpose(1, 2)
        k = k.view(k.size(0), -1, self.nhead, self.d_model // self.nhead).transpose(1, 2)
        v = v.view(v.size(0), -1, self.nhead, self.d_model // self.nhead).transpose(1, 2)

        scores = torch.matmul(q, k.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.d_model // self.nhead, dtype=torch.float32))

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        attention = nn.functional.softmax(scores, dim=-1)
        x = torch.matmul(attention, v)
        x = x.transpose(1, 2).contiguous().view(x.size(0), -1, self.d_model)

        return self.out(x)

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(0.1)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        x = nn.functional.relu(self.linear1(x))
        x = self.dropout(x)
        x = self.linear2(x)
        return x

class TransformerLayer(nn.Module):
    def __init__(self, d_model, nhead, d_ff):
        super(TransformerLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, nhead)
        self.feed_forward = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = x + self.dropout(attn_output)
        x = self.norm1(x)

        ff_output = self.feed_forward(x)
        x = x + self.dropout(ff_output)
        x = self.norm2(x)

        return x

class Transformer(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers, d_ff):
        super(Transformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model)
        self.transformer_layers = nn.ModuleList([TransformerLayer(d_model, nhead, d_ff) for _ in range(num_layers)])
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x, mask):
        x = self.embedding(x)
        x = x + self.positional_encoding(x)

        for layer in self.transformer_layers:
            x = layer(x, mask)

        x = self.fc(x)
        return x

# Example usage:
vocab_size = 10000  # Adjust based on your dataset vocabulary size
d_model = 512
nhead = 8
num_layers = 6
d_ff = 2048

model = Transformer(vocab_size, d_model, nhead, num_layers, d_ff)

# Define your input tensors (adjust shapes based on your dataset)
src = torch.randint(0, vocab_size, (32, 32))  # (sequence length, batch size)
mask = torch.ones_like(src)  # This is a simple example; you may need to create a proper mask based on your task

output = model(src, mask)

In [3]:
output

tensor([[[ 0.3185, -0.6270,  0.0854,  ...,  0.2607, -0.8709, -0.0693],
         [-0.1849,  0.3812,  1.2874,  ...,  0.5064, -0.3868, -0.0705],
         [ 0.5565, -0.1648,  0.4797,  ..., -0.0579, -0.5002, -0.2894],
         ...,
         [-0.3584, -0.2182, -0.4169,  ...,  0.4447,  0.1997, -0.1067],
         [-0.1045, -0.4073, -0.4237,  ..., -0.1433, -0.7178,  0.1841],
         [ 0.5390, -0.9265,  0.1892,  ...,  0.0154, -0.4392, -1.1138]],

        [[-0.0867, -0.8074,  0.8041,  ...,  0.4412, -1.2305,  0.5416],
         [ 1.1327,  0.0483, -0.9722,  ..., -0.5373, -0.2334,  0.8293],
         [ 0.0269, -0.2483, -0.3554,  ..., -0.0077, -0.2522, -0.0726],
         ...,
         [-0.0325, -0.4224,  0.4529,  ..., -0.2559, -0.5969,  0.4381],
         [ 0.0413, -1.1664,  0.6749,  ...,  0.4957, -0.7693, -0.1099],
         [-0.5257, -0.5453,  0.8009,  ..., -0.8668, -1.2717,  0.8335]],

        [[ 0.5855, -0.5652, -0.4818,  ...,  0.7361, -0.6954,  0.8023],
         [-0.2199, -0.1594,  0.6121,  ...,  0

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim

# Dummy dataset
class DummyDataset(torch.utils.data.Dataset):
    def __init__(self, vocab_size, seq_length, num_samples):
        self.data = torch.randint(0, vocab_size, (num_samples, seq_length))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Create a dummy dataset
vocab_size = 1000
seq_length = 32
num_samples = 100
dummy_dataset = DummyDataset(vocab_size, seq_length, num_samples)

# Split the dataset into training and testing sets
train_size = int(0.8 * len(dummy_dataset))
test_size = len(dummy_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dummy_dataset, [train_size, test_size])

# DataLoader for training and testing
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

# Instantiate the model, loss function, and optimizer
vocab_size = 1000
d_model = 512
nhead = 8
num_layers = 6
d_ff = 2048
model = Transformer(vocab_size, d_model, nhead, num_layers, d_ff)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for inputs in train_loader:
        src = inputs[:, :-1]  # Input sequence
        tgt = inputs[:, 1:]   # Target sequence (shifted by one position)
        mask = (src != 0).unsqueeze(1).unsqueeze(2)     # Create a mask to ignore padding tokens

        if mask is not None:
          mask = mask.to(torch.bool)  # Ensure the mask is of boolean type
        optimizer.zero_grad()
        outputs = model(src, mask)
        loss = criterion(outputs.reshape(-1, vocab_size), tgt.reshape(-1))
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}')

Epoch 1/10, Loss: 7.1226372718811035
Epoch 2/10, Loss: 6.9523091316223145
Epoch 3/10, Loss: 6.47813081741333
Epoch 4/10, Loss: 5.6671600341796875
Epoch 5/10, Loss: 4.596777439117432
Epoch 6/10, Loss: 4.534095287322998
Epoch 7/10, Loss: 3.3199002742767334
Epoch 8/10, Loss: 2.3391032218933105
Epoch 9/10, Loss: 1.5963609218597412
Epoch 10/10, Loss: 1.0564284324645996


In [12]:
# Testing loop
model.eval()
total_correct = 0
total_samples = 0
with torch.no_grad():
    for inputs in test_loader:
        src = inputs[:, :-1]  # Input sequence
        tgt = inputs[:, 1:]   # Target sequence (shifted by one position)
        mask = (src != 0).unsqueeze(1).unsqueeze(2)     # Create a mask to ignore padding tokens

        if mask is not None:
          mask = mask.to(torch.bool)  # Ensure the mask is of boolean type

        outputs = model(src, mask)
        _, predictions = torch.max(outputs, dim=-1)

        total_correct += (predictions == tgt).sum().item()
        total_samples += tgt.numel()

accuracy = total_correct / total_samples
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Test Accuracy: 0.32%
