<a href="https://colab.research.google.com/github/OneFineStarstuff/State-of-the-Art/blob/main/Transformer_Networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install torchtext

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, BucketIterator
from torchtext.datasets import Multi30k
from torchtext.data.utils import get_tokenizer

# Define the Transformer model components
class Transformer(nn.Module):
    """Encoder-decoder Transformer for sequence-to-sequence prediction.

    Token embeddings are summed with learned position embeddings, passed
    through ``nn.TransformerEncoder`` / ``nn.TransformerDecoder`` stacks, and
    projected to target-vocabulary logits by a final linear layer.

    Args:
        src_vocab_size: Number of entries in the source token embedding table.
        trg_vocab_size: Number of entries in the target token embedding table;
            also the size of the output logits.
        embed_size: Embedding / model dimension.
        num_heads: Number of attention heads per layer.
        num_encoder_layers: Number of stacked encoder layers.
        num_decoder_layers: Number of stacked decoder layers.
        dropout: Dropout probability (embeddings and Transformer layers).
        forward_expansion: Feed-forward width as a multiple of ``embed_size``.
        max_length: Number of rows in the position embedding tables; a
            sequence longer than this cannot be embedded.
    """

    def __init__(self, src_vocab_size, trg_vocab_size, embed_size, num_heads, num_encoder_layers, num_decoder_layers, dropout, forward_expansion, max_length):
        super(Transformer, self).__init__()
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                embed_size, num_heads, forward_expansion * embed_size, dropout
            ),
            num_encoder_layers,
        )
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(
                embed_size, num_heads, forward_expansion * embed_size, dropout
            ),
            num_decoder_layers,
        )
        self.src_word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.src_position_embedding = nn.Embedding(max_length, embed_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.trg_position_embedding = nn.Embedding(max_length, embed_size)
        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, trg):
        """Compute target-vocabulary logits for every target position.

        Args:
            src: Source token indices of shape ``(src_seq_length, N)``.
            trg: Target token indices of shape ``(trg_seq_length, N)``.

        Returns:
            Tensor of shape ``(trg_seq_length, N, trg_vocab_size)``.
        """
        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape

        # Build the position indices directly on the input's device instead of
        # allocating on the CPU and copying afterwards.
        src_positions = (
            torch.arange(src_seq_length, device=src.device)
            .unsqueeze(1)
            .expand(src_seq_length, N)
        )
        trg_positions = (
            torch.arange(trg_seq_length, device=trg.device)
            .unsqueeze(1)
            .expand(trg_seq_length, N)
        )

        embed_src = self.dropout(
            self.src_word_embedding(src) + self.src_position_embedding(src_positions)
        )
        embed_trg = self.dropout(
            self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions)
        )

        # Causal mask: -inf strictly above the diagonal so that position i can
        # only attend to target positions <= i. Without it the decoder sees the
        # very tokens it is asked to predict during teacher-forced training.
        trg_mask = torch.triu(
            torch.full(
                (trg_seq_length, trg_seq_length),
                float("-inf"),
                dtype=embed_trg.dtype,
                device=embed_trg.device,
            ),
            diagonal=1,
        )

        encoder_src = self.encoder(embed_src)
        output = self.decoder(embed_trg, encoder_src, tgt_mask=trg_mask)

        return self.fc_out(output)

# Define the fields and data loaders
# Both fields lowercase their text and wrap every example in <sos> ... <eos>.
SRC = Field(tokenize=get_tokenizer("spacy", language="de"), init_token="<sos>", eos_token="<eos>", lower=True)
TRG = Field(tokenize=get_tokenizer("spacy", language="en"), init_token="<sos>", eos_token="<eos>", lower=True)

train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(SRC, TRG))

SRC.build_vocab(train_data, max_size=10000, min_freq=2)
TRG.build_vocab(train_data, max_size=10000, min_freq=2)

# A single device shared by the batches and the model; they must live on the
# same device or the forward pass fails with a device-mismatch error.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=32,
    device=device
)

# Initialize the model, optimizer, and loss function
src_vocab_size = len(SRC.vocab)
trg_vocab_size = len(TRG.vocab)
embed_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.1
forward_expansion = 4
max_length = 100
learning_rate = 0.0003

# Move the model to the same device the iterators place their batches on.
model = Transformer(src_vocab_size, trg_vocab_size, embed_size, num_heads, num_encoder_layers, num_decoder_layers, dropout, forward_expansion, max_length).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Padding positions in the target are excluded from the loss.
pad_idx = TRG.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for batch in train_iterator:
        src = batch.src
        trg = batch.trg

        optimizer.zero_grad()
        # Teacher forcing: the decoder is fed the target without its last
        # token and is scored against the target shifted left by one.
        output = model(src, trg[:-1])
        output_dim = output.shape[-1]

        # Flatten (seq, batch, vocab) -> (seq * batch, vocab) for the loss.
        output = output.view(-1, output_dim)
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    avg_loss = epoch_loss / len(train_iterator)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss:.4f}")

print("Training complete.")

In [None]:
!pip install --upgrade torchtext torch
!pip uninstall torchtext
!pip install torchtext

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, BucketIterator
from torchtext.datasets import Multi30k
from torchtext.data.utils import get_tokenizer

# Check versions
# The `from torchtext... import ...` statements above do not bind the name
# `torchtext` itself, so import the package explicitly before reading its
# version (otherwise this raises NameError).
import torchtext

print(f"torchtext version: {torchtext.__version__}")
print(f"torch version: {torch.__version__}")

# Verify basic imports work
SRC = Field(tokenize=get_tokenizer("spacy", language="de"), init_token="<sos>", eos_token="<eos>", lower=True)
TRG = Field(tokenize=get_tokenizer("spacy", language="en"), init_token="<sos>", eos_token="<eos>", lower=True)

train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(SRC, TRG))

In [None]:
pip install --upgrade torch torchtext

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy.data import Field, BucketIterator
from torchtext.legacy.datasets import Multi30k
from torchtext.data.utils import get_tokenizer

# Define the Transformer model components
class Transformer(nn.Module):
    """Encoder-decoder Transformer for sequence-to-sequence prediction.

    Token embeddings are summed with learned position embeddings, passed
    through ``nn.TransformerEncoder`` / ``nn.TransformerDecoder`` stacks, and
    projected to target-vocabulary logits by a final linear layer.

    Args:
        src_vocab_size: Number of entries in the source token embedding table.
        trg_vocab_size: Number of entries in the target token embedding table;
            also the size of the output logits.
        embed_size: Embedding / model dimension.
        num_heads: Number of attention heads per layer.
        num_encoder_layers: Number of stacked encoder layers.
        num_decoder_layers: Number of stacked decoder layers.
        dropout: Dropout probability (embeddings and Transformer layers).
        forward_expansion: Feed-forward width as a multiple of ``embed_size``.
        max_length: Number of rows in the position embedding tables; a
            sequence longer than this cannot be embedded.
    """

    def __init__(self, src_vocab_size, trg_vocab_size, embed_size, num_heads, num_encoder_layers, num_decoder_layers, dropout, forward_expansion, max_length):
        super(Transformer, self).__init__()
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                embed_size, num_heads, forward_expansion * embed_size, dropout
            ),
            num_encoder_layers,
        )
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(
                embed_size, num_heads, forward_expansion * embed_size, dropout
            ),
            num_decoder_layers,
        )
        self.src_word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.src_position_embedding = nn.Embedding(max_length, embed_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.trg_position_embedding = nn.Embedding(max_length, embed_size)
        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, trg):
        """Compute target-vocabulary logits for every target position.

        Args:
            src: Source token indices of shape ``(src_seq_length, N)``.
            trg: Target token indices of shape ``(trg_seq_length, N)``.

        Returns:
            Tensor of shape ``(trg_seq_length, N, trg_vocab_size)``.
        """
        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape

        # Build the position indices directly on the input's device instead of
        # allocating on the CPU and copying afterwards.
        src_positions = (
            torch.arange(src_seq_length, device=src.device)
            .unsqueeze(1)
            .expand(src_seq_length, N)
        )
        trg_positions = (
            torch.arange(trg_seq_length, device=trg.device)
            .unsqueeze(1)
            .expand(trg_seq_length, N)
        )

        embed_src = self.dropout(
            self.src_word_embedding(src) + self.src_position_embedding(src_positions)
        )
        embed_trg = self.dropout(
            self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions)
        )

        # Causal mask: -inf strictly above the diagonal so that position i can
        # only attend to target positions <= i. Without it the decoder sees the
        # very tokens it is asked to predict during teacher-forced training.
        trg_mask = torch.triu(
            torch.full(
                (trg_seq_length, trg_seq_length),
                float("-inf"),
                dtype=embed_trg.dtype,
                device=embed_trg.device,
            ),
            diagonal=1,
        )

        encoder_src = self.encoder(embed_src)
        output = self.decoder(embed_trg, encoder_src, tgt_mask=trg_mask)

        return self.fc_out(output)

# Define the fields and data loaders
# Both fields lowercase their text and wrap every example in <sos> ... <eos>.
SRC = Field(tokenize=get_tokenizer("spacy", language="de"), init_token="<sos>", eos_token="<eos>", lower=True)
TRG = Field(tokenize=get_tokenizer("spacy", language="en"), init_token="<sos>", eos_token="<eos>", lower=True)

train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(SRC, TRG))

SRC.build_vocab(train_data, max_size=10000, min_freq=2)
TRG.build_vocab(train_data, max_size=10000, min_freq=2)

# A single device shared by the batches and the model; they must live on the
# same device or the forward pass fails with a device-mismatch error.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=32,
    device=device
)

# Initialize the model, optimizer, and loss function
src_vocab_size = len(SRC.vocab)
trg_vocab_size = len(TRG.vocab)
embed_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.1
forward_expansion = 4
max_length = 100
learning_rate = 0.0003

# Move the model to the same device the iterators place their batches on.
model = Transformer(src_vocab_size, trg_vocab_size, embed_size, num_heads, num_encoder_layers, num_decoder_layers, dropout, forward_expansion, max_length).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Padding positions in the target are excluded from the loss.
pad_idx = TRG.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for batch in train_iterator:
        src = batch.src
        trg = batch.trg

        optimizer.zero_grad()
        # Teacher forcing: the decoder is fed the target without its last
        # token and is scored against the target shifted left by one.
        output = model(src, trg[:-1])
        output_dim = output.shape[-1]

        # Flatten (seq, batch, vocab) -> (seq * batch, vocab) for the loss.
        output = output.view(-1, output_dim)
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    avg_loss = epoch_loss / len(train_iterator)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss:.4f}")

print("Training complete.")