In [None]:
# The install log below shows torchtext 0.17.0 pulling in torch==2.2.0; the preinstalled
# torchvision build is removed first, presumably to avoid a version clash with it - TODO confirm.
!pip uninstall torchvision -y
# Install the pinned torchtext release together with portalocker.
!pip install torchtext==0.17.0 'portalocker>=2.0.0'

Found existing installation: torchvision 0.20.1+cu124
Uninstalling torchvision-0.20.1+cu124:
  Successfully uninstalled torchvision-0.20.1+cu124
Collecting torchtext==0.17.0
  Downloading torchtext-0.17.0-cp311-cp311-manylinux1_x86_64.whl.metadata (7.6 kB)
Collecting portalocker>=2.0.0
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting torch==2.2.0 (from torchtext==0.17.0)
  Downloading torch-2.2.0-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchdata==0.7.1 (from torchtext==0.17.0)
  Downloading torchdata-0.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.0->torchtext==0.17.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.0->torchtext==0.17.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
C

In [None]:
# Download the small German and English spaCy pipelines that the tokenizers are built from.
# The download log notes that a kernel/runtime restart may be needed before they can be loaded.
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm

Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m94.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/

In [None]:
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.datasets import Multi30k
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import os
from torch.optim.lr_scheduler import LambdaLR


In [None]:
# ---------------------------
# 1. Data Preparation
# ---------------------------
# Set before any Multi30k pipeline is created; presumably switches off torchdata's
# on-disk caching - TODO confirm against the torchdata documentation.
os.environ['TORCHDATA_DISABLE_CACHE'] = '1'
# Translation direction: German source -> English target.
SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'
# Number of sentence pairs per DataLoader batch.
BATCH_SIZE = 256

# Tokenizers (ensure the spaCy models are installed)
# No lowercasing is applied here, so the tokens (and the vocabularies built from them)
# are case-sensitive.
tokenizer_src = get_tokenizer('spacy', language='de_core_news_sm')
tokenizer_tgt = get_tokenizer('spacy', language='en_core_web_sm')

def yield_tokens(data_iter, language):
    """Lazily tokenize one side of every (source, target) sentence pair.

    Args:
        data_iter: iterable of ``(src_sample, tgt_sample)`` pairs.
        language: ``SRC_LANGUAGE`` selects the source sentence and ``tokenizer_src``;
            any other value selects the target sentence and ``tokenizer_tgt``.

    Yields:
        The tokenizer output for the selected side of each pair, in input order.
    """
    use_source = language == SRC_LANGUAGE
    tokenize = tokenizer_src if use_source else tokenizer_tgt
    for source_text, target_text in data_iter:
        yield tokenize(source_text if use_source else target_text)


# Load the full training dataset as a list for vocabulary creation
full_train_examples = list(Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE)))

# Special tokens shared by both vocabularies. Using one list for both builds keeps the
# source and target ids of these tokens aligned.
SPECIAL_TOKENS = ["<unk>", "<pad>", "<bos>", "<eos>"]

# Build source vocabulary
vocab_src = build_vocab_from_iterator(yield_tokens(full_train_examples, SRC_LANGUAGE),
                                      min_freq=1,
                                      specials=SPECIAL_TOKENS)
# Out-of-vocabulary tokens map to <unk> instead of raising.
vocab_src.set_default_index(vocab_src["<unk>"])

# Build target vocabulary
vocab_tgt = build_vocab_from_iterator(yield_tokens(full_train_examples, TGT_LANGUAGE),
                                      min_freq=1,
                                      specials=SPECIAL_TOKENS)
vocab_tgt.set_default_index(vocab_tgt["<unk>"])

# Special token indices
PAD_IDX = vocab_src["<pad>"]
BOS_IDX = vocab_src["<bos>"]
EOS_IDX = vocab_src["<eos>"]

# These constants are read from the source vocabulary but are also applied to
# target-side tensors (target padding in collate_fn, ignore_index of the loss, <eos>
# detection in beam search), so fail loudly if the two vocabularies ever disagree.
assert all(vocab_src[tok] == vocab_tgt[tok] for tok in SPECIAL_TOKENS), \
    "special-token ids differ between the source and target vocabularies"

def collate_fn(batch):
    """Turn a list of raw (source, target) sentence pairs into padded id tensors.

    Each sentence is tokenized, mapped to vocabulary ids and wrapped in
    ``<bos>`` ... ``<eos>``. Sequences are then padded with ``PAD_IDX`` and stacked
    sequence-first.

    Args:
        batch: list of ``(src_sample, tgt_sample)`` string pairs.

    Returns:
        ``(src_batch, tgt_batch)``: two ``torch.long`` tensors of shape
        ``(longest_sequence, len(batch))``.
    """
    def _encode(text, tokenizer, vocab):
        ids = [vocab["<bos>"]] + vocab(tokenizer(text)) + [vocab["<eos>"]]
        return torch.tensor(ids, dtype=torch.long)

    # Single pass over the batch, source encoded before target for every pair.
    encoded_pairs = [
        (_encode(src_text, tokenizer_src, vocab_src),
         _encode(tgt_text, tokenizer_tgt, vocab_tgt))
        for src_text, tgt_text in batch
    ]
    src_tensors = [pair[0] for pair in encoded_pairs]
    tgt_tensors = [pair[1] for pair in encoded_pairs]

    padded_src = pad_sequence(src_tensors, padding_value=PAD_IDX)
    padded_tgt = pad_sequence(tgt_tensors, padding_value=PAD_IDX)
    return padded_src, padded_tgt


# Load full examples for both train and valid splits
# The training split is already in memory as `full_train_examples` (materialised for
# vocabulary building), so reuse that list rather than fetching and parsing it again.
train_examples = full_train_examples
valid_examples = list(Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE)))

# Define subset percentage
# Fraction of each split (taken from the front of the list) that is actually used.
subset_percentage = 0.9

# Subsample the examples
train_subset = train_examples[:int(len(train_examples) * subset_percentage)]
valid_subset = valid_examples[:int(len(valid_examples) * subset_percentage)]

# Create DataLoaders from these subsets
# Only the training loader shuffles; validation keeps a fixed order.
train_dataloader = DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_subset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# ---------------------------
# 2. Model Definition
# ---------------------------

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to sequence-first embeddings.

    A ``(maxlen, 1, emb_size)`` table is precomputed once and stored as a buffer:
    even feature columns hold sines and odd columns hold cosines of
    ``position * 10000^(-2i / emb_size)``.

    Args:
        emb_size: embedding width. Odd widths are supported; the last (even-indexed)
            column then carries a sine with no matching cosine.
        dropout: dropout probability applied after the table is added.
        maxlen: number of positions in the table.
    """

    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        super().__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).unsqueeze(1)
        angles = pos * den  # (maxlen, ceil(emb_size / 2))
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # For odd emb_size there is one odd column fewer than even columns, so trim
        # the cosine block to fit. For even emb_size the slice is a no-op.
        pos_embedding[:, 1::2] = torch.cos(angles)[:, :emb_size // 2]
        # Insert a singleton batch axis so the table broadcasts over the batch.
        pos_embedding = pos_embedding.unsqueeze(1)

        self.dropout = nn.Dropout(dropout)
        # Buffer: follows the module across devices but is not a trainable parameter.
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: torch.Tensor):
        """Return ``dropout(token_embedding + positions)``.

        The table is sliced to ``token_embedding.size(0)`` rows, so the first axis of
        ``token_embedding`` is treated as the sequence axis.
        """
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder translation model wrapped around ``nn.Transformer``.

    Source and target token ids are embedded separately, passed through a shared
    ``PositionalEncoding`` and fed to the transformer. A linear ``generator`` projects
    decoder outputs to target-vocabulary logits.

    Args:
        num_encoder_layers: number of encoder layers.
        num_decoder_layers: number of decoder layers.
        emb_size: embedding width, also used as the transformer ``d_model``.
        nhead: number of attention heads.
        src_vocab_size: number of rows in the source embedding table.
        tgt_vocab_size: number of rows in the target embedding table and width of the
            output logits.
        dim_feedforward: hidden width of the transformer feed-forward sublayers.
        dropout: dropout probability for the transformer and the positional encoding.
    """

    def __init__(self, num_encoder_layers: int, num_decoder_layers: int,
                 emb_size: int, nhead: int, src_vocab_size: int, tgt_vocab_size: int,
                 dim_feedforward: int = 512, dropout: float = 0.1):
        super().__init__()
        # Submodules are created in a fixed order so seeded initialisation is reproducible.
        transformer_kwargs = dict(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.transformer = nn.Transformer(**transformer_kwargs)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = nn.Embedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = nn.Embedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def _embed_src(self, src):
        """Embed source ids and add positional information."""
        return self.positional_encoding(self.src_tok_emb(src))

    def _embed_tgt(self, tgt):
        """Embed target ids and add positional information."""
        return self.positional_encoding(self.tgt_tok_emb(tgt))

    def forward(self, src, tgt, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, memory_key_padding_mask):
        """Run the full encoder-decoder pass and return target-vocabulary logits.

        No memory (cross-attention) mask is used; the three padding masks are passed
        through to ``nn.Transformer`` unchanged.
        """
        hidden = self.transformer(
            self._embed_src(src),
            self._embed_tgt(tgt),
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(hidden)

    def encode(self, src, src_mask):
        """Return the encoder output ("memory") for ``src``; no padding mask is applied."""
        return self.transformer.encoder(self._embed_src(src), mask=src_mask)

    def decode(self, tgt, memory, tgt_mask):
        """Return decoder hidden states for ``tgt`` given ``memory`` (before the generator)."""
        return self.transformer.decoder(self._embed_tgt(tgt), memory, tgt_mask=tgt_mask)

def generate_square_subsequent_mask(sz: int):
    """Build the additive causal mask for a sequence of length ``sz``.

    Returns a float ``(sz, sz)`` tensor on the module-level ``device``: entry
    ``[i, j]`` is ``-inf`` where ``j > i`` (a future position) and ``0.0`` elsewhere.
    """
    blocked = torch.full((sz, sz), float('-inf'), device=device)
    # Keep -inf strictly above the diagonal; everything else becomes 0.
    return torch.triu(blocked, diagonal=1)

def create_mask(src, tgt):
    """Create the attention and padding masks for one sequence-first batch.

    Args:
        src: source id tensor whose first axis is the sequence axis.
        tgt: target (decoder input) id tensor whose first axis is the sequence axis.

    Returns:
        ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)`` where
        ``src_mask`` is an all-False boolean square matrix (nothing blocked),
        ``tgt_mask`` is the causal mask from ``generate_square_subsequent_mask``,
        and the padding masks are True wherever the id equals ``PAD_IDX``, with the
        first two axes swapped relative to the inputs.
    """
    src_len, tgt_len = src.size(0), tgt.size(0)

    causal_tgt_mask = generate_square_subsequent_mask(tgt_len)
    open_src_mask = torch.zeros((src_len, src_len), dtype=torch.bool, device=device)

    src_is_pad = torch.transpose(src == PAD_IDX, 0, 1)
    tgt_is_pad = torch.transpose(tgt == PAD_IDX, 0, 1)
    return open_src_mask, causal_tgt_mask, src_is_pad, tgt_is_pad

In [None]:
# ---------------------------
# 3. Training Setup
# ---------------------------

# Use the GPU when PyTorch can see one, otherwise the CPU. The mask helpers,
# the train/eval functions and beam search all read this module-level name at call time.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters (feel free to experiment)
EMB_SIZE = 256  # embedding width; passed to Seq2SeqTransformer as emb_size
NUM_ENCODER_LAYERS = 5
NUM_DECODER_LAYERS = 5
FFN_HID_DIM = 1024  # passed to Seq2SeqTransformer as dim_feedforward
NHEAD = 8  # number of attention heads

SRC_VOCAB_SIZE = len(vocab_src)
TGT_VOCAB_SIZE = len(vocab_tgt)

# dropout is not passed, so Seq2SeqTransformer's default (0.1) applies.
model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                           NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM).to(device)

# Target positions equal to PAD_IDX are excluded from the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=0.0005)

# Optionally, use a learning rate scheduler (here using a StepLR for simplicity)
# Configured with step_size=5 and gamma=0.6; scheduler.step() is called once per
# epoch by the training loop.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.6)

def train_epoch(model, dataloader):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Uses teacher forcing: the decoder sees the target without its last position and
    is scored against the target without its first position. Relies on the
    module-level ``device``, ``loss_fn``, ``optimizer`` and ``create_mask``.

    Args:
        model: called as ``model(src, tgt_input, src_mask, tgt_mask,
            src_padding_mask, tgt_padding_mask, memory_key_padding_mask)``.
        dataloader: sized iterable of sequence-first ``(src, tgt)`` id tensors.

    Returns:
        Sum of per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0
    for src_batch, tgt_batch in dataloader:
        src_batch = src_batch.to(device)
        tgt_batch = tgt_batch.to(device)

        # Shifted views of the target: decoder input vs. prediction labels.
        decoder_in = tgt_batch[:-1, :]
        labels = tgt_batch[1:, :]

        src_mask, tgt_mask, src_pad_mask, tgt_pad_mask = create_mask(src_batch, decoder_in)
        # The source padding mask doubles as the memory padding mask.
        logits = model(src_batch, decoder_in, src_mask, tgt_mask,
                       src_pad_mask, tgt_pad_mask, src_pad_mask)

        vocab_dim = logits.shape[-1]
        batch_loss = loss_fn(logits.reshape(-1, vocab_dim), labels.reshape(-1))

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(dataloader)

def evaluate(model, dataloader):
    """Return the mean batch loss of ``model`` over ``dataloader`` without updating it.

    The model is switched to eval mode and gradients are disabled. Inputs are
    shifted exactly as in training (decoder input drops the last position, labels
    drop the first). Relies on the module-level ``device``, ``loss_fn`` and
    ``create_mask``.

    Args:
        model: called with the same seven positional arguments as during training.
        dataloader: sized iterable of sequence-first ``(src, tgt)`` id tensors.

    Returns:
        Sum of per-batch losses divided by ``len(dataloader)``.
    """
    model.eval()
    loss_sum = 0
    with torch.no_grad():
        for src_batch, tgt_batch in dataloader:
            src_batch = src_batch.to(device)
            tgt_batch = tgt_batch.to(device)
            decoder_in, labels = tgt_batch[:-1, :], tgt_batch[1:, :]

            src_mask, tgt_mask, src_pad_mask, tgt_pad_mask = create_mask(src_batch, decoder_in)
            # The source padding mask doubles as the memory padding mask.
            logits = model(src_batch, decoder_in, src_mask, tgt_mask,
                           src_pad_mask, tgt_pad_mask, src_pad_mask)

            flat_logits = logits.reshape(-1, logits.shape[-1])
            loss_sum += loss_fn(flat_logits, labels.reshape(-1)).item()
    return loss_sum / len(dataloader)

# Training Loop
NUM_EPOCHS = 28  # Increase the number of epochs for better convergence

# Each epoch: one optimisation pass over the training loader, one loss measurement on
# the validation loader, then one scheduler step. Both losses are printed per epoch.
for epoch in range(1, NUM_EPOCHS+1):
    train_loss = train_epoch(model, train_dataloader)
    val_loss = evaluate(model, valid_dataloader)
    scheduler.step()  # Update learning rate
    print(f"Epoch: {epoch}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

# ---------------------------
# 4. Inference with Beam Search
# ---------------------------

def beam_search_decode(model, src, src_mask, max_len, start_symbol, beam_size=10):
    """Decode one source sentence with beam search and return the best hypothesis.

    A hypothesis that has produced ``EOS_IDX`` is treated as finished: it is carried
    forward unchanged and competes with its frozen score, so tokens generated after
    ``<eos>`` can no longer change the ranking. Scores are plain sums of token
    log-probabilities with no length normalisation.

    Args:
        model: object exposing ``encode(src, src_mask)``,
            ``decode(tgt, memory, tgt_mask)`` and ``generator(hidden)``.
        src: source ids, shape ``(src_len, 1)``.
        src_mask: attention mask passed to ``model.encode``.
        max_len: maximum hypothesis length, counting the start symbol.
        start_symbol: id that every hypothesis starts with.
        beam_size: number of hypotheses kept per step. It is capped by the
            vocabulary size when choosing continuations.

    Returns:
        Integer tensor of shape ``(length, 1)`` holding the highest-scoring
        hypothesis, including the start symbol and, if produced, the ``<eos>`` id.
    """
    src = src.to(device)
    # Pure inference: avoid building an autograd graph for every decoder call.
    with torch.no_grad():
        memory = model.encode(src, src_mask)
        # Initialize beam with a tensor of shape (1, 1)
        beam = [(torch.tensor([[start_symbol]], device=device), 0)]

        for _ in range(max_len - 1):
            candidates = []
            for seq, score in beam:
                # Finished hypotheses are kept as-is. The length guard ensures the
                # bare start symbol is always expanded.
                if seq.size(0) > 1 and seq[-1].item() == EOS_IDX:
                    candidates.append((seq, score))
                    continue

                tgt_mask = generate_square_subsequent_mask(seq.size(0)).to(device)
                out = model.decode(seq, memory, tgt_mask)
                out = out.transpose(0, 1)  # shape: (batch_size, seq_len, emb_size)
                # Log-probabilities of the next token given the last position.
                log_probs = torch.log_softmax(model.generator(out[:, -1]), dim=1)
                # topk raises if asked for more entries than the vocabulary holds.
                k = min(beam_size, log_probs.size(1))
                topk_log_probs, topk_indices = torch.topk(log_probs, k, dim=1)
                for log_prob, token in zip(topk_log_probs[0], topk_indices[0]):
                    # Reshape token to (1, 1) so that dimensions match for concatenation
                    new_seq = torch.cat([seq, token.view(1, 1)], dim=0)
                    candidates.append((new_seq, score + log_prob.item()))

            # Keep the top beam_size candidates
            beam = sorted(candidates, key=lambda x: x[1], reverse=True)[:beam_size]
            # Stop if all sequences in the beam have generated the <eos> token
            if all(seq[-1].item() == EOS_IDX for seq, _ in beam):
                break
    return beam[0][0]


def translate(model, sentence, max_len=50, beam_size=10):
    """Translate one source sentence to a space-joined target string.

    The sentence is tokenized with ``tokenizer_src``, mapped through ``vocab_src``,
    wrapped in ``<bos>``/``<eos>`` and decoded with ``beam_search_decode``. Output
    ids are mapped back through ``vocab_tgt``; everything from the first ``<eos>``
    onward is dropped, and so is the leading start token.

    Args:
        model: translation model; it is switched to eval mode.
        sentence: raw source-language string.
        max_len: maximum decoded length, passed to beam search.
        beam_size: beam width, passed to beam search.

    Returns:
        The decoded target tokens joined by single spaces.
    """
    model.eval()
    tokens = [vocab_src["<bos>"]] + vocab_src(tokenizer_src(sentence)) + [vocab_src["<eos>"]]
    src = torch.tensor(tokens).unsqueeze(1).to(device)  # shape: [seq_len, 1]
    src_mask = torch.zeros((src.size(0), src.size(0)), device=device).type(torch.bool)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        tgt_tokens = beam_search_decode(model, src, src_mask, max_len, vocab_tgt["<bos>"], beam_size)
    tgt_ids = tgt_tokens.flatten().tolist()

    # Fetch the id -> token table once instead of once per decoded token.
    itos = vocab_tgt.get_itos()
    translated_tokens = []
    for tok in tgt_ids:
        token = itos[tok]
        if token == "<eos>":
            break
        translated_tokens.append(token)
    # Remove <bos> from the final translation
    return " ".join(translated_tokens[1:])




Epoch: 1, Train Loss: 5.4066, Val Loss: 4.2808
Epoch: 2, Train Loss: 4.0365, Val Loss: 3.7683
Epoch: 3, Train Loss: 3.6103, Val Loss: 3.4233
Epoch: 4, Train Loss: 3.3222, Val Loss: 3.2211
Epoch: 5, Train Loss: 3.0794, Val Loss: 3.0434
Epoch: 6, Train Loss: 2.8507, Val Loss: 2.9074
Epoch: 7, Train Loss: 2.7086, Val Loss: 2.7772
Epoch: 8, Train Loss: 2.5579, Val Loss: 2.6713
Epoch: 9, Train Loss: 2.4132, Val Loss: 2.5422
Epoch: 10, Train Loss: 2.2743, Val Loss: 2.4559
Epoch: 11, Train Loss: 2.1295, Val Loss: 2.3708
Epoch: 12, Train Loss: 2.0503, Val Loss: 2.3247
Epoch: 13, Train Loss: 1.9781, Val Loss: 2.2884
Epoch: 14, Train Loss: 1.9090, Val Loss: 2.2462
Epoch: 15, Train Loss: 1.8471, Val Loss: 2.2147
Epoch: 16, Train Loss: 1.7687, Val Loss: 2.1833
Epoch: 17, Train Loss: 1.7259, Val Loss: 2.1720
Epoch: 18, Train Loss: 1.6930, Val Loss: 2.1612
Epoch: 19, Train Loss: 1.6557, Val Loss: 2.1374
Epoch: 20, Train Loss: 1.6208, Val Loss: 2.1254
Epoch: 21, Train Loss: 1.5775, Val Loss: 2.1186
E

In [None]:
# ---------------------------
# 5. Translation Examples
# ---------------------------

# NOTE(review): these inputs are entirely lowercase, while the tokenizers and
# vocabularies apply no lowercasing. If the Multi30k training text is cased, words such
# as "mann" or "hemd" would fall back to <unk> - confirm, and consider cased inputs.
example_sentences = [
    "ein mann in einem blauen hemd steht auf der seite eines gebäudes .",
    "zwei männer in einem restaurant unterhalten sich .",
    "ein kind spielt mit einem ball auf einem großen feld ."
]

print("\n--- Improved Translation Examples ---")
# Translate each example with a beam width of 4 (translate's default is 10) and print
# the source sentence next to the model output.
for i, sentence in enumerate(example_sentences, 1):
    translation = translate(model, sentence, beam_size = 4)
    print(f"Example {i}:")
    print(f"Input (DE):  {sentence}")
    print(f"Output (EN): {translation}\n")


--- Improved Translation Examples ---
Example 1:
Input (DE):  ein mann in einem blauen hemd steht auf der seite eines gebäudes .
Output (EN): Skateboarder in a blue bathing suit standing on the edge of a house .

Example 2:
Input (DE):  zwei männer in einem restaurant unterhalten sich .
Output (EN): 2 friends are talking to each other in a restaurant .

Example 3:
Input (DE):  ein kind spielt mit einem ball auf einem großen feld .
Output (EN): Surfer is playing with a large board on a large surface .

