In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from datasets import load_dataset
from tqdm import tqdm
import torch.nn.functional as F
import time

from transformer.custom_transformer import Transformer, Encoder, Decoder, EncoderBlock, MultiHeadAttentionBlock, \
    FeedForwardBlock, DecoderBlock, InputEmbeddings, ProjectionLayer, PositionalEncoding

In [2]:
# Pick the compute backend in order of preference: CUDA GPU, then Apple MPS,
# and finally the CPU when neither accelerator is reported as available.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using mps device


In [3]:
# Maximum number of token ids kept per sentence; the tokenization step caps
# every encoded sentence at this length.
MAX_LEN = 512

In [4]:
# BPE Tokenizer Class
class BpeTokenizer:
    """Byte-pair-encoding tokenizer trained on an in-memory corpus.

    The wrapped ``tokenizers.Tokenizer`` splits on whitespace, learns a BPE
    vocabulary from ``corpus`` and frames every encoded sentence as
    ``[CLS] ... [SEP]`` through a template post-processor.

    Attributes:
        tokenizer: the underlying ``tokenizers.Tokenizer`` instance.
        vocab_size: size of the trained vocabulary as reported by the tokenizer.
    """

    def __init__(self, corpus, vocab_size=32000):
        """Train the tokenizer.

        Args:
            corpus: iterable of strings used to learn the BPE merges.
            vocab_size: target vocabulary size handed to the BPE trainer.
        """
        self.tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
        self.tokenizer.pre_tokenizer = Whitespace()

        trainer = BpeTrainer(special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"], vocab_size=vocab_size)
        self.tokenizer.train_from_iterator(corpus, trainer)

        # Read the special-token ids back from the trained vocabulary instead of
        # hard-coding 2 and 3, so the template cannot silently go out of sync
        # if the special-token list above is ever edited.
        cls_id = self.tokenizer.token_to_id("[CLS]")
        sep_id = self.tokenizer.token_to_id("[SEP]")
        self.tokenizer.post_processor = TemplateProcessing(
            single="[CLS] $A [SEP]", pair="[CLS] $A [SEP] $B:1 [SEP]:1",
            special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)]
        )

        self.vocab_size = self.tokenizer.get_vocab_size()

    def encode(self, text):
        """Return the list of token ids for ``text`` (including the [CLS]/[SEP] framing)."""
        return self.tokenizer.encode(text).ids

    def decode(self, token_ids):
        """Turn token ids back into text.

        Args:
            token_ids: any iterable of integer-like ids (list, tuple, 1-D tensor
                or array); it is normalised to a plain list of Python ints
                before being handed to the underlying tokenizer.
        """
        return self.tokenizer.decode([int(token_id) for token_id in token_ids])

In [5]:
# Load Dataset
dataset = load_dataset("wmt16", "de-en")
df = dataset["train"].to_pandas()

# Split the "translation" records into one text column per language.
# A record that is not a dict becomes None (a proper missing value) so the
# later .dropna() / .fillna("") calls handle it; the previous fallback was the
# list [0], which those calls would not catch and which is not valid text for
# the tokenizer.
df["de"] = df["translation"].apply(lambda x: x["de"] if isinstance(x, dict) else None)
df["en"] = df["translation"].apply(lambda x: x["en"] if isinstance(x, dict) else None)

In [6]:
# Step 1: Prepare Corpus
print("Preparing corpus...")
start = time.time()
sample_size = 10000
# Draw an equally sized, identically seeded sample from each language column
# (German first, then English) and flatten both into a single list of strings.
sample_texts = [
    sentence
    for column in ("de", "en")
    for sentence in df[column].dropna().sample(sample_size, random_state=42).tolist()
]
print(f"Corpus prepared in {time.time() - start:.2f} seconds.")

Preparing corpus...
Corpus prepared in 0.46 seconds.


In [7]:
# Step 2: Train BPE Tokenizer
print("Training BPE Tokenizer...")
start = time.time()
# Fit the tokenizer on the sampled German + English sentences.
bpe_tokenizer = BpeTokenizer(corpus=sample_texts)
print(f"Tokenizer trained in {time.time() - start:.2f} seconds.")

Training BPE Tokenizer...



Tokenizer trained in 0.66 seconds.


In [8]:
# Step 3: Tokenizing Sentences (Using tqdm for progress tracking)
def _encode_capped(text, max_len=MAX_LEN):
    """Encode ``text`` and cap the result at ``max_len`` ids without losing the end marker.

    The tokenizer frames every sentence as ``[CLS] ... [SEP]``. A plain
    ``ids[:max_len]`` slice drops that trailing [SEP] for over-long sentences,
    so when truncation is needed the last id is kept and the cut is made just
    before it.
    """
    ids = bpe_tokenizer.encode(text)
    if len(ids) > max_len:
        ids = ids[:max(max_len - 1, 0)] + ids[-1:]
    return ids


print("Tokenizing sentences...")
start = time.time()
# Missing sentences are encoded as the empty string so every row gets a token list.
df["de_tokenized"] = [_encode_capped(text) for text in tqdm(df["de"].fillna(""))]
df["en_tokenized"] = [_encode_capped(text) for text in tqdm(df["en"].fillna(""))]
print(f"Tokenization done in {time.time() - start:.2f} seconds.")

Tokenizing sentences...


100%|██████████| 4548885/4548885 [02:13<00:00, 34004.84it/s]
100%|██████████| 4548885/4548885 [02:31<00:00, 30047.97it/s]


Tokenization done in 286.07 seconds.


In [9]:
# Step 4: Converting to Tensors
print("Converting to tensors...")
start = time.time()
src_tensors, tgt_tensors = [], []
for src_ids, tgt_ids in zip(df["de_tokenized"], df["en_tokenized"]):
    # Filter the two sides together: dropping an empty sequence from only one
    # list would shift every later sentence pair out of alignment.
    if len(src_ids) == 0 or len(tgt_ids) == 0:
        continue
    src_tensors.append(torch.tensor(src_ids, dtype=torch.long))
    tgt_tensors.append(torch.tensor(tgt_ids, dtype=torch.long))
print(f"Tensors created in {time.time() - start:.2f} seconds.")

Converting to tensors...
Tensors created in 27.08 seconds.


In [10]:
# Step 5: Padding Sequences
def pad_all_sequences(tensors, max_len=None, pad_value=0):
    """Right-pad 1-D tensors to a common length and return them as one 2-D tensor.

    Args:
        tensors: non-empty iterable of 1-D tensors.
        max_len: target length; defaults to the longest sequence in ``tensors``.
            Sequences longer than ``max_len`` are cut at the end.
        pad_value: fill value for the padded positions (default 0).

    Returns:
        Tensor of shape ``(len(tensors), max_len)`` with the dtype and device
        of the first sequence.

    Raises:
        ValueError: if ``tensors`` is empty.
    """
    tensors = list(tensors)
    if not tensors:
        raise ValueError("pad_all_sequences() requires at least one sequence")
    if max_len is None:
        max_len = max(len(seq) for seq in tensors)

    # Allocate the result once and copy each sequence into its row. Building a
    # padded copy of every sequence and stacking them afterwards keeps two full
    # copies of the data alive at the same time.
    first = tensors[0]
    padded = torch.full((len(tensors), max_len), pad_value, dtype=first.dtype, device=first.device)
    for row, seq in enumerate(tensors):
        keep = min(len(seq), max_len)
        padded[row, :keep] = seq[:keep]

    return padded

print("Padding sequences...")
start = time.time()

# Pad each side independently (source first, then target); each side is padded
# to the length of its own longest sequence.
src_sentences, tgt_sentences = (
    pad_all_sequences(sequences) for sequences in (src_tensors, tgt_tensors)
)

print(f"Padding done in {time.time() - start:.2f} seconds.")

# Report the (num_sentences, padded_length) shape of both tensors
print(f"Source Sentences Shape: {src_sentences.shape}, Target Sentences Shape: {tgt_sentences.shape}")


Padding sequences...
Padding done in 65.27 seconds.
Source Sentences Shape: torch.Size([4548885, 512]), Target Sentences Shape: torch.Size([4548885, 512])


In [18]:
# Transformer Model Setup
# -- sizes taken from the prepared data --
vocab_size = bpe_tokenizer.vocab_size  # one embedding row / output logit per token id
seq_len = src_sentences.size(1)        # padded sentence length

# -- architecture --
d_model = 32     # model (embedding) width
num_heads = 4    # attention heads per attention block
num_layers = 2   # number of encoder blocks and of decoder blocks
ff_dim = 512     # hidden width of each feed-forward block

# -- regularisation --
dropout = 0.5

In [19]:
# Define model
def _new_attention():
    """Create a fresh multi-head attention block from the notebook hyperparameters."""
    return MultiHeadAttentionBlock(d_model, num_heads, dropout)


def _new_feed_forward():
    """Create a fresh position-wise feed-forward block from the notebook hyperparameters."""
    return FeedForwardBlock(d_model, ff_dim, dropout)


# Each stack is assembled from num_layers independently constructed blocks
# (no sub-module is shared between layers).
encoder_blocks = [
    EncoderBlock(d_model, _new_attention(), _new_feed_forward(), dropout)
    for _ in range(num_layers)
]
encoder = Encoder(d_model, nn.ModuleList(encoder_blocks))

decoder_blocks = [
    DecoderBlock(d_model, _new_attention(), _new_attention(), _new_feed_forward(), dropout)
    for _ in range(num_layers)
]
decoder = Decoder(d_model, nn.ModuleList(decoder_blocks))

# Source and target get separate embedding and positional-encoding modules.
model = Transformer(
    encoder=encoder,
    decoder=decoder,
    src_embed=InputEmbeddings(d_model, vocab_size),
    tgt_embed=InputEmbeddings(d_model, vocab_size),
    src_pos=PositionalEncoding(d_model, seq_len, dropout),
    tgt_pos=PositionalEncoding(d_model, seq_len, dropout),
    projection_layer=ProjectionLayer(d_model, vocab_size),
)
# Last expression of the cell: moves the model to the device and echoes its repr.
model.to(device)

Transformer(
  (encoder): Encoder(
    (layers): ModuleList(
      (0-1): 2 x EncoderBlock(
        (self_attention_block): MultiHeadAttentionBlock(
          (w_q): Linear(in_features=32, out_features=32, bias=False)
          (w_k): Linear(in_features=32, out_features=32, bias=False)
          (w_v): Linear(in_features=32, out_features=32, bias=False)
          (w_o): Linear(in_features=32, out_features=32, bias=False)
          (dropout): Dropout(p=0.5, inplace=False)
        )
        (feed_forward_block): FeedForwardBlock(
          (linear_1): Linear(in_features=32, out_features=512, bias=True)
          (dropout): Dropout(p=0.5, inplace=False)
          (linear_2): Linear(in_features=512, out_features=32, bias=True)
        )
        (residual_connections): ModuleList(
          (0-1): 2 x ResidualConnection(
            (dropout): Dropout(p=0.5, inplace=False)
            (norm): LayerNormalization()
          )
        )
      )
    )
    (norm): LayerNormalization()
  )
  (de

In [20]:
# Define Optimizer and the criterion
# Targets equal to 0 (the fill value used when padding) contribute nothing to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.AdamW(
    params=model.parameters(),
    lr=0.0001,
    weight_decay=1e-5,
)

In [21]:
# Learning rate scheduler
def get_scheduler(optimizer, warmup_steps=4000):
    """Build a warmup-then-decay ``LambdaLR`` schedule for ``optimizer``.

    The multiplier applied to the optimizer's base learning rate at scheduler
    step ``s`` (0-based) is ``min((s + 1) ** -0.5, (s + 1) * warmup_steps ** -1.5)``:
    it rises linearly until ``s + 1`` reaches ``warmup_steps`` and then falls
    off with the inverse square root of the step number.

    Args:
        optimizer: optimizer whose learning rate is scheduled.
        warmup_steps: number of scheduler steps spent in the linear ramp.

    Returns:
        The configured ``torch.optim.lr_scheduler.LambdaLR``.
    """
    warmup_slope = warmup_steps ** (-1.5)

    def lr_factor(step):
        step_number = step + 1
        return min(step_number ** (-0.5), step_number * warmup_slope)

    return optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_factor)

In [22]:
# Training Setup
num_epochs = 5
batch_size = 8

# TensorBoard event files are written under runs/transformer_experiment.
writer = SummaryWriter(log_dir="runs/transformer_experiment")

# NOTE(review): the epoch count is passed as the scheduler's warmup_steps.
# The training loop advances the scheduler once per epoch, so the warmup ramp
# is measured in epochs here - confirm this is intended.
scheduler = get_scheduler(optimizer, warmup_steps=num_epochs)

In [23]:
# Training loop
#
# Each epoch trains with teacher forcing and then greedily translates one
# sentence as a quick sanity check.
#
# NOTE(review): the masks built below are boolean, True = "may attend", shaped
# (batch, 1, 1, src_len) for padding and (batch, 1, tgt_len, tgt_len) for the
# decoder. This follows the mask the original test-translation code passed to
# model.encode; confirm that the custom attention block treats zero/False
# entries as "blocked" and broadcasts the mask over heads.

pad_id = 0  # fill value used when padding; also the loss's ignore_index
# Look the special-token ids up directly. Encoding the strings "[CLS]" / "[SEP]"
# goes through the post-processor and returns a framed multi-token sequence
# whose first id is always [CLS], so it cannot be used to obtain these ids.
cls_id = bpe_tokenizer.tokenizer.token_to_id("[CLS]")
sep_id = bpe_tokenizer.tokenizer.token_to_id("[SEP]")


def _padding_mask(tokens):
    """Boolean mask, True at non-padding positions, shaped (batch, 1, 1, seq)."""
    return (tokens != pad_id).unsqueeze(1).unsqueeze(2)


def _decoder_mask(tokens):
    """Padding mask combined with a causal mask, shaped (batch, 1, seq, seq).

    Position i may attend to position j only when j <= i and j is not padding.
    """
    positions = torch.arange(tokens.size(1), device=tokens.device)
    causal = positions.unsqueeze(0) <= positions.unsqueeze(1)
    return _padding_mask(tokens) & causal


def _trim_padding(batch, min_len=1):
    """Drop trailing columns that are padding in every row (keeping at least ``min_len``)."""
    used_columns = (batch != pad_id).any(dim=0).nonzero()
    keep = int(used_columns.max()) + 1 if used_columns.numel() else 0
    return batch[:, :max(keep, min_len)]


# Iterate over the rows that actually exist in the padded tensors.
num_samples = min(src_sentences.size(0), tgt_sentences.size(0))
batch_starts = range(0, num_samples, batch_size)

for epoch in range(num_epochs):
    model.train()  # enable dropout for the optimisation phase
    epoch_loss = 0.0

    for i in tqdm(batch_starts, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        # Cut each batch down to its longest real sequence before moving it to
        # the device; the all-padding tail only costs compute.
        src_batch = _trim_padding(src_sentences[i:i + batch_size]).to(device)
        tgt_batch = _trim_padding(tgt_sentences[i:i + batch_size], min_len=2).to(device)

        # Teacher forcing: the decoder reads tokens [0, T-1) and is scored on
        # tokens [1, T). Feeding it the very tokens it is scored on (with no
        # causal mask) lets it simply copy its input.
        decoder_input = tgt_batch[:, :-1]
        labels = tgt_batch[:, 1:]

        src_mask = _padding_mask(src_batch)
        tgt_mask = _decoder_mask(decoder_input)

        encoder_output = model.encode(src_batch, src_mask)
        decoder_output = model.decode(encoder_output, src_mask, decoder_input, tgt_mask)
        output_logits = model.project(decoder_output)

        # reshape (not view): the shifted label slice is not contiguous.
        loss = criterion(output_logits.reshape(-1, vocab_size), labels.reshape(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # The scheduler is advanced once per epoch (unchanged from the original loop).
    scheduler.step()

    # Each batch loss is already a mean, so average over batches, not samples.
    mean_loss = epoch_loss / max(len(batch_starts), 1)
    writer.add_scalar("Loss/train", mean_loss, epoch + 1)
    print(f"Epoch {epoch + 1} Loss: {mean_loss:.4f}")

    # Test Sentence Translation (greedy decoding, dropout off, no autograd graph)
    model.eval()
    test_sentence_de = df["de"].iloc[0]
    test_ids = bpe_tokenizer.encode(test_sentence_de)[:seq_len]
    test_tokens = torch.tensor(test_ids, dtype=torch.long).unsqueeze(0).to(device)

    with torch.no_grad():
        test_mask = _padding_mask(test_tokens)
        encoder_output = model.encode(test_tokens, test_mask)
        generated_tokens = torch.tensor([[cls_id]], dtype=torch.long).to(device)

        # Stop at [SEP] or once the output reaches the positional-encoding length.
        while generated_tokens.size(1) < seq_len:
            decoder_output = model.decode(
                encoder_output, test_mask, generated_tokens, _decoder_mask(generated_tokens)
            )
            output_logits = model.project(decoder_output)
            next_token = output_logits[:, -1, :].argmax(dim=-1, keepdim=True)
            if next_token.item() == sep_id:
                break
            generated_tokens = torch.cat((generated_tokens, next_token), dim=1)

    # Index the single batch row instead of squeeze(): squeeze() turns a
    # one-token result into a 0-d array that cannot be decoded.
    translated_sentence = bpe_tokenizer.decode(generated_tokens[0].tolist())
    print(f"Test Input (German): {test_sentence_de}")
    print(f"Model Output (English): {translated_sentence}")

writer.close()

Epoch 1/5:   0%|          | 939/568611 [01:49<18:21:08,  8.59it/s]


KeyboardInterrupt: 