<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install torchtext

In [None]:
!pip show torch
!pip show torchtext

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset

# Define a simple dataset
class TextDataset(Dataset):
    """Map-style dataset of tokenised, numericalised text lines.

    Every line is converted to a 1-D ``torch.long`` tensor of token ids once,
    at construction time, so indexing is a plain list lookup.

    Args:
        text_data: Iterable of raw text lines.
        tokenizer: Callable mapping one line to a sequence of tokens.
        vocab: Callable mapping a sequence of tokens to a list of integer ids.
        pad_token: Integer id used by ``collate_fn`` to pad short sequences.
    """

    def __init__(self, text_data, tokenizer, vocab, pad_token):
        self.data = [
            torch.tensor(vocab(tokenizer(line)), dtype=torch.long)
            for line in text_data
        ]
        # Kept as public attributes; they are not read again inside this class.
        self.tokenizer = tokenizer
        self.vocab = vocab
        self.pad_token = pad_token

    def __len__(self):
        """Return the number of lines in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the 1-D id tensor for line ``idx`` (unpadded)."""
        return self.data[idx]

    def collate_fn(self, batch):
        """Right-pad a list of 1-D id tensors into one ``(batch, max_len)`` tensor.

        Args:
            batch: Non-empty sequence of 1-D ``torch.long`` tensors, as produced
                by ``__getitem__``.

        Returns:
            A ``torch.long`` tensor of shape ``(len(batch), max_len)`` in which
            positions past the end of each sequence hold ``self.pad_token``.
        """
        # pad_sequence does the length scan, padding and stacking in one call
        # instead of building a throw-away pad tensor per item.
        return torch.nn.utils.rnn.pad_sequence(
            list(batch), batch_first=True, padding_value=self.pad_token
        )

# Define the Transformer model
class TransformerModel(nn.Module):
    """Encoder-decoder Transformer over a single shared token embedding.

    Token ids are embedded, passed through ``nn.Transformer`` with causal
    masks on the encoder, the decoder and the encoder-decoder attention, and
    projected back to vocabulary logits.

    NOTE: no positional encoding is added to the embeddings; the only ordering
    signal the network receives comes from the causal masks.

    Args:
        vocab_size: Number of rows in the embedding table and of output logits.
        embed_size: Embedding width; also ``d_model`` of the transformer.
        num_heads: Number of attention heads.
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        dim_feedforward: Hidden width of the per-layer feed-forward network.
        batch_first: If True (default) inputs are ``(batch, seq)`` id tensors,
            which is the layout ``torch.stack``-style collation produces.
            Pass False for ``(seq, batch)`` inputs.
    """

    def __init__(self, vocab_size, embed_size, num_heads, num_encoder_layers,
                 num_decoder_layers, dim_feedforward, batch_first=True):
        super().__init__()
        self.batch_first = batch_first
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.transformer = nn.Transformer(
            d_model=embed_size,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            # Without this, nn.Transformer reads dim 0 as the sequence axis and
            # would attend across the batch for (batch, seq) inputs.
            batch_first=batch_first,
        )
        self.fc_out = nn.Linear(embed_size, vocab_size)

    @staticmethod
    def _causal_mask(num_queries, num_keys, device=None):
        """Return a ``(num_queries, num_keys)`` additive mask: 0 where key <= query, -inf elsewhere."""
        blocked = torch.full((num_queries, num_keys), float('-inf'), device=device)
        # triu(diagonal=1) keeps -inf strictly above the diagonal and zeroes the rest.
        return torch.triu(blocked, diagonal=1)

    def _seq_len(self, tokens):
        """Return the length of the sequence axis of an id tensor."""
        # A 1-D tensor is an unbatched sequence, whose only axis is the sequence.
        if self.batch_first and tokens.dim() > 1:
            return tokens.size(1)
        return tokens.size(0)

    def generate_square_subsequent_mask(self, sz, device=None):
        """Return an ``(sz, sz)`` float mask with 0 on/below the diagonal and -inf above.

        Args:
            sz: Sequence length.
            device: Optional device on which to allocate the mask.
        """
        return self._causal_mask(sz, sz, device=device)

    def forward(self, src, tgt):
        """Compute vocabulary logits for every target position.

        Args:
            src: Integer id tensor fed to the encoder.
            tgt: Integer id tensor fed to the decoder.

        Returns:
            Float tensor shaped like ``tgt`` with a trailing ``vocab_size`` axis.
        """
        src_len = self._seq_len(src)
        tgt_len = self._seq_len(tgt)
        # Build masks on the input's device so the model also works off-CPU.
        device = src.device
        src_mask = self._causal_mask(src_len, src_len, device)
        tgt_mask = self._causal_mask(tgt_len, tgt_len, device)
        # Encoder-decoder attention is (tgt_len, src_len); it is only square
        # when both sequences have the same length.
        memory_mask = self._causal_mask(tgt_len, src_len, device)
        output = self.transformer(
            self.embedding(src),
            self.embedding(tgt),
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=memory_mask,
        )
        return self.fc_out(output)

# Tokenization and vocabulary
tokenizer = get_tokenizer('basic_english')
train_data = ["hello world", "transformer model example", "deep learning with transformers"]
vocab = build_vocab_from_iterator(map(tokenizer, train_data), specials=['<unk>', '<pad>', '<bos>', '<eos>'])
# Tokens missing from the vocabulary map to <unk> instead of raising.
vocab.set_default_index(vocab['<unk>'])

# Create dataset and dataloader
pad_token = vocab['<pad>']
dataset = TextDataset(train_data, tokenizer, vocab, pad_token)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=dataset.collate_fn)

# Model, loss, and optimizer
vocab_size = len(vocab)
model = TransformerModel(vocab_size=vocab_size, embed_size=128, num_heads=2, num_encoder_layers=2, num_decoder_layers=2, dim_feedforward=512)
# Padded target positions carry no signal, so keep them out of the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_token)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: next-token prediction, the input is the sequence without its
# last token and the target is the same sequence shifted left by one.
model.train()
for epoch in range(10):
    total_loss = 0.0
    num_batches = 0
    for batch in dataloader:
        src = batch[:, :-1]
        tgt = batch[:, 1:]
        optimizer.zero_grad()
        output = model(src, src)
        # tgt is a non-contiguous column slice, so .view(-1) raises for any
        # batch with more than one row; reshape copies when it has to.
        loss = criterion(output.reshape(-1, vocab_size), tgt.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    # Report the epoch mean: the last batch alone is noisy under shuffling and
    # would be undefined if the loader yielded nothing.
    print(f'Epoch {epoch+1}, Loss: {total_loss / max(num_batches, 1):.4f}')