# Machine Translation (EN to DE)

# Google Colab/Local Setup

In [None]:
from google.colab import drive
import os
import torch
#import torch_xla.core.xla_model as xm

drive.mount('/content/drive')
# path to the link you created
en_file = '/content/drive/MyDrive/Machine Translation DE > EN/europarl-v7.en_20k_lines.txt'
de_file = '/content/drive/MyDrive/Machine Translation DE > EN/europarl-v7.de_20k_lines.txt'
val_en_file = '/content/drive/MyDrive/Machine Translation DE > EN/europarl-v7.en_tiny_lines.txt'
val_de_file = '/content/drive/MyDrive/Machine Translation DE > EN/europarl-v7.de_tiny_lines.txt'
test_file = '/content/drive/MyDrive/Machine Translation DE > EN/test-en.txt'

# Model save path
model_save_dir = '/content/drive/MyDrive/MachineTranslationModels'
# Use os.path.join to create the full path for the model checkpoint
model_save_path = os.path.join(model_save_dir, 'transformer_best.pth')

# Set the device to the TPU
#device = xm.xla_device()
# Set the device to the GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
!pip install cloud-tpu-client
!pip install torch
!pip install torchvision
!pip install torch-xla
!pip install spacy
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm
!pip install --upgrade torchtext

# Transformer Model

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

class AttentionHead(nn.Module):
    def __init__(self, model_dim, head_count):
        super(AttentionHead, self).__init__()
        assert model_dim % head_count == 0, "model_dim must be divisible by head_count"

        self.model_dim = model_dim
        self.head_count = head_count
        self.depth = model_dim // head_count

        self.query_weight = nn.Linear(model_dim, model_dim)
        self.key_weight = nn.Linear(model_dim, model_dim)
        self.value_weight = nn.Linear(model_dim, model_dim)
        self.output_weight = nn.Linear(model_dim, model_dim)

    def attention(self, query, key, value, mask=None):
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.depth)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        probabilities = torch.softmax(scores, dim=-1)
        attention_output = torch.matmul(probabilities, value)
        return attention_output

    def split(self, tensor):
        batch_size, sequence_length, model_dim = tensor.size()
        return tensor.view(batch_size, sequence_length, self.head_count, self.depth).transpose(1, 2)

    def combine(self, tensor):
        batch_size, _, sequence_length, depth = tensor.size()
        return tensor.transpose(1, 2).contiguous().view(batch_size, sequence_length, self.model_dim)

    def forward(self, query, key, value, mask=None):
        query = self.split(self.query_weight(query))
        key = self.split(self.key_weight(key))
        value = self.split(self.value_weight(value))

        attention = self.attention(query, key, value, mask)
        combined_attention = self.combine(attention)
        output = self.output_weight(combined_attention)
        return output

class FeedForwardNetwork(nn.Module):
    def __init__(self, model_dim, ff_dim):
        super(FeedForwardNetwork, self).__init__()
        self.linear1 = nn.Linear(model_dim, ff_dim)
        self.linear2 = nn.Linear(ff_dim, model_dim)
        self.relu = nn.ReLU()

    def forward(self, tensor):
        return self.linear2(self.relu(self.linear1(tensor)))

class PositionalEncoder(nn.Module):
    def __init__(self, model_dim):
        super(PositionalEncoder, self).__init__()
        self.model_dim = model_dim

    def forward(self, tensor):
        batch_size, sequence_length = tensor.size(0), tensor.size(1)
        position_encoding = torch.zeros(sequence_length, self.model_dim, device=tensor.device)
        position = torch.arange(0, sequence_length, dtype=torch.float, device=tensor.device).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, self.model_dim, 2).float() * -(math.log(10000.0) / self.model_dim)).to(tensor.device)
        position_encoding[:, 0::2] = torch.sin(position * div_term)
        position_encoding[:, 1::2] = torch.cos(position * div_term)
        position_encoding = position_encoding.unsqueeze(0).repeat(batch_size, 1, 1)
        return tensor + position_encoding

class EncoderBlock(nn.Module):
    def __init__(self, model_dim, head_count, ff_dim, dropout_rate):
        super(EncoderBlock, self).__init__()
        self.attention = AttentionHead(model_dim, head_count)
        self.feed_forward = FeedForwardNetwork(model_dim, ff_dim)
        self.norm1 = nn.LayerNorm(model_dim)
        self.norm2 = nn.LayerNorm(model_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, tensor, mask):
        attention_output = self.attention(tensor, tensor, tensor, mask)
        tensor = self.norm1(tensor + self.dropout(attention_output))
        ff_output = self.feed_forward(tensor)
        tensor = self.norm2(tensor + self.dropout(ff_output))
        return tensor

class DecoderBlock(nn.Module):
    def __init__(self, model_dim, head_count, ff_dim, dropout_rate):
        super(DecoderBlock, self).__init__()
        self.self_attention = AttentionHead(model_dim, head_count)
        self.cross_attention = AttentionHead(model_dim, head_count)
        self.feed_forward = FeedForwardNetwork(model_dim, ff_dim)
        self.norm1 = nn.LayerNorm(model_dim)
        self.norm2 = nn.LayerNorm(model_dim)
        self.norm3 = nn.LayerNorm(model_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, tensor, encoder_output, source_mask, target_mask):
        self_attention_output = self.self_attention(tensor, tensor, tensor, target_mask)
        tensor = self.norm1(tensor + self.dropout(self_attention_output))
        cross_attention_output = self.cross_attention(tensor, encoder_output, encoder_output, source_mask)
        tensor = self.norm2(tensor + self.dropout(cross_attention_output))
        ff_output = self.feed_forward(tensor)
        tensor = self.norm3(tensor + self.dropout(ff_output))
        return tensor

class Transformer(nn.Module):
    def __init__(self, source_vocab_size, target_vocab_size, model_dim, head_count, layer_count, ff_dim, dropout_rate):
        super(Transformer, self).__init__()
        self.source_embedding = nn.Embedding(source_vocab_size, model_dim)
        self.target_embedding = nn.Embedding(target_vocab_size, model_dim)
        self.position_encoder = PositionalEncoder(model_dim)

        self.encoder_blocks = nn.ModuleList([EncoderBlock(model_dim, head_count, ff_dim, dropout_rate) for _ in range(layer_count)])
        self.decoder_blocks = nn.ModuleList([DecoderBlock(model_dim, head_count, ff_dim, dropout_rate) for _ in range(layer_count)])

        self.final_linear = nn.Linear(model_dim, target_vocab_size)
        self.dropout = nn.Dropout(dropout_rate)

    def create_mask(self, source, target):
        source_mask = (source != 0).unsqueeze(1).unsqueeze(2)
        target_mask = (target != 0).unsqueeze(1).unsqueeze(3)
        sequence_length = target.size(1)

        # Ensure future_mask is created on the same device as target
        future_mask = (1 - torch.triu(torch.ones(1, sequence_length, sequence_length, device=target.device), diagonal=1)).bool()

        target_mask = target_mask & future_mask
        return source_mask, target_mask

    def forward(self, source, target):
        source_mask, target_mask = self.create_mask(source, target)
        source_embedded = self.dropout(self.position_encoder(self.source_embedding(source)))
        target_embedded = self.dropout(self.position_encoder(self.target_embedding(target)))

        encoder_output = source_embedded
        for encoder_block in self.encoder_blocks:
            encoder_output = encoder_block(encoder_output, source_mask)

        decoder_output = target_embedded
        for decoder_block in self.decoder_blocks:
            decoder_output = decoder_block(decoder_output, encoder_output, source_mask, target_mask)

        final_output = self.final_linear(decoder_output)
        return final_output

## Breakdown of the Architecture

- `Embeddings:` Converts input tokens into fixed-size vectors.

- `Positional Encoding:` Adds information about the position of tokens in the sequence.

- `Encoder Blocks:` Each block applies self-attention to the source sequence and passes the result through a feed-forward network. Normalization and dropout are included for stability and regularization.

- `Decoder Blocks:` Similar to encoder blocks but with an additional cross-attention layer that focuses on the encoder's output. Includes self-attention, cross-attention, a feed-forward network, normalization, and dropout.

- `Masks:` Source masks ignore padding, and target masks ensure predictions are based only on previously seen tokens.

- `Final Linear Layer:` Transforms decoder output to the target vocabulary size for probability predictions.

- `Output:` The model outputs a sequence of probability distributions over the target vocabulary.

# Text Preprocessing

In [10]:
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset
from torchtext.vocab import build_vocab_from_iterator
from nltk.translate.bleu_score import corpus_bleu
from torch.utils.data.dataset import random_split

"""Step 1: Load the Data from the text files."""
def load_data(en_file, de_file):
    with open(en_file, 'r', encoding='utf-8') as f:
        english_sentences = f.readlines()
    with open(de_file, 'r', encoding='utf-8') as f:
        german_sentences = f.readlines()
    return english_sentences, german_sentences

english_sentences, german_sentences = load_data(en_file, de_file)

"""Step 2: Tokenization Tokenize the sentences into words.."""
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')

def yield_tokens(data_iter, tokenizer):
    for sentence in data_iter:
        yield tokenizer(sentence)

def build_vocab(sentences, tokenizer):
    vocab = build_vocab_from_iterator(yield_tokens(sentences, tokenizer), specials=['<unk>', '<pad>', '<bos>', '<eos>'])
    vocab.set_default_index(vocab['<unk>'])  # Set '<unk>' as the default index for unknown tokens
    return vocab

en_vocab = build_vocab(english_sentences, en_tokenizer)
de_vocab = build_vocab(german_sentences, de_tokenizer)

src_vocab_size = len(en_vocab)
tgt_vocab_size = len(de_vocab)
# print(src_vocab_size,tgt_vocab_size)

"""Step 3: Convert the tokenized sentences to integer sequences."""
def tokenize_and_convert_to_ints(sentences, tokenizer, vocab):
    token_ids = []
    for sentence in sentences:
        tokens = tokenizer(sentence)
        token_ids.append([vocab['<bos>']] + [vocab[token] for token in tokens] + [vocab['<eos>']])
    return token_ids

en_token_ids = tokenize_and_convert_to_ints(english_sentences, en_tokenizer, en_vocab)
de_token_ids = tokenize_and_convert_to_ints(german_sentences, de_tokenizer, de_vocab)

"""Step 4: Padding Sequences"""

def pad_sequences(token_ids, pad_index):
    return pad_sequence([torch.tensor(s) for s in token_ids], padding_value=pad_index, batch_first=True)

en_padded = pad_sequences(en_token_ids, en_vocab['<pad>'])
de_padded = pad_sequences(de_token_ids, de_vocab['<pad>'])



# Hyperparamters

In [11]:
num_epochs = 100
BATCH_SIZE = 32
validation_split_percentage = 0.2
d_model = 512
num_heads = 4
num_layers = 2
d_ff = 1024
max_seq_length = 66
dropout = 0.1
learning_rate = 0.0001

total_size = len(en_padded)
val_size = int(total_size * validation_split_percentage)
train_size = total_size - val_size

# Create the combined dataset
full_dataset = TensorDataset(en_padded, de_padded)

# Split the dataset into training and validation sets
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

# Create DataLoaders for both datasets
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
validation_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Instantiate the transformer model
transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, dropout)

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)

# Move the model to device
transformer.to(device)

Transformer(
  (source_embedding): Embedding(18681, 512)
  (target_embedding): Embedding(35731, 512)
  (position_encoder): PositionalEncoder()
  (encoder_blocks): ModuleList(
    (0-1): 2 x EncoderBlock(
      (attention): AttentionHead(
        (query_weight): Linear(in_features=512, out_features=512, bias=True)
        (key_weight): Linear(in_features=512, out_features=512, bias=True)
        (value_weight): Linear(in_features=512, out_features=512, bias=True)
        (output_weight): Linear(in_features=512, out_features=512, bias=True)
      )
      (feed_forward): FeedForwardNetwork(
        (linear1): Linear(in_features=512, out_features=1024, bias=True)
        (linear2): Linear(in_features=1024, out_features=512, bias=True)
        (relu): ReLU()
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (decoder_blocks): ModuleList(
 

## Description of Hyperparamters

- `num_epochs`: 100  
  The number of training cycles through the entire dataset.

- `BATCH_SIZE`: 32  
  The number of samples that will be propagated through the network in one forward/backward pass.

- `validation_split_percentage`: 0.2  
  The fraction of the dataset to be used as validation data.

- `d_model`: 512  
  The number of expected features in the encoder/decoder inputs (dimensionality of the embeddings).

- `num_heads`: 4  
  The number of heads in the multi-head attention models.

- `num_layers`: 2  
  The number of sub-encoder/decoder layers in the transformer.

- `d_ff`: 1024  
  The dimension of the feed-forward network model.

- `max_seq_length`: 66  
  The maximum length of the input sequences.

- `dropout`: 0.1  
  The dropout value is a regularization parameter.

- `learning_rate`: 0.0001  
  The step size at each iteration while moving toward a minimum of the loss function.

- `optimizer`: Adam  
  An optimization algorithm that can handle sparse gradients on noisy problems. It uses the square of gradients to scale the learning rate and takes advantage of momentum by using the moving average of the gradient instead of gradient itself.

# Model Training

In [None]:
from torch.nn.utils import clip_grad_norm_
from torch.optim.lr_scheduler import ReduceLROnPlateau

training_losses = []
validation_losses = []

# Initialize scheduler and early stopping parameters
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.1, verbose=True)
early_stopping_patience = 10
early_stopping_counter = 0
best_val_loss = float('inf')
best_model_state = None

for epoch in range(num_epochs):
    # Training phase
    transformer.train()
    total_loss = 0
    for source, target in train_dataloader:
        source, target = source.to(device), target.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        output = transformer(source, target[:, :-1])

        # Compute loss; we need to reshape the output and target
        loss = criterion(output.reshape(-1, output.shape[-1]), target[:, 1:].reshape(-1))

        # Backward pass
        loss.backward()

        # Gradient clipping
        clip_grad_norm_(transformer.parameters(), max_norm=1.0)

        # Optimization step
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    training_losses.append(average_loss)
    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {average_loss:.4f}')

    # Validation phase
    transformer.eval()
    total_val_loss = 0
    with torch.no_grad():
        for source, target in validation_dataloader:
            source, target = source.to(device), target.to(device)

            # Forward pass
            output = transformer(source, target[:, :-1])

            # Compute loss
            val_loss = criterion(output.reshape(-1, output.shape[-1]), target[:, 1:].reshape(-1))
            total_val_loss += val_loss.item()

    average_val_loss = total_val_loss / len(validation_dataloader)
    validation_losses.append(average_val_loss)
    print(f'Epoch {epoch+1}/{num_epochs}, Validation Loss: {average_val_loss:.4f}')

    # Learning rate scheduling
    scheduler.step(average_val_loss)

    # Early stopping and checkpointing for the best model
    if average_val_loss < best_val_loss:
        print(f'Validation loss improved from {best_val_loss:.4f} to {average_val_loss:.4f}. Saving checkpoint...')
        best_val_loss = average_val_loss
        best_model_state = transformer.state_dict()

        # Checkpoint for resuming training later
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': best_model_state,
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'best_val_loss': best_val_loss,
            'early_stopping_counter': early_stopping_counter,
            'training_losses': training_losses,
            'validation_losses': validation_losses
        }

        # Use os.path.join to create the full path for the checkpoint
        best_checkpoint_path = os.path.join(model_save_dir, 'transformer_best.pth')

        # Save the checkpoint
        torch.save(checkpoint, best_checkpoint_path)

        early_stopping_counter = 0
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= early_stopping_patience:
            print("Early stopping triggered.")
            break

# Load the best model state before testing or deployment
best_model_path = os.path.join(model_save_dir, 'transformer_best.pth')
checkpoint = torch.load(best_model_path)
transformer.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

## Description of Training

The training was performed on the Europarl Parallel corpus, but only with 20,000 lines of text due to memory limitations and constant crashes on Google Colab's environment.
1. **Setup**:
   - A learning rate scheduler (`ReduceLROnPlateau`) is initialized to reduce the learning rate when the validation loss stops improving, helping in convergence.
   - Early stopping parameters are set to prevent overfitting if the validation loss does not improve for a certain number of epochs (`early_stopping_patience`).

2. **Training Loop**:
   The model undergoes training for the predefined number of epochs, with each epoch consisting of both training and validation phases. (epochs were set to 100 due to reliance on Early Stopping)

   - **Training Phase**:
     - The model is set to training mode (`transformer.train()`).
     - For each batch, the model performs a forward pass and computes the loss.
     - Backpropagation is performed (`loss.backward()`), and gradients are clipped to prevent exploding gradients (`clip_grad_norm_`).
     - The optimizer updates the model's weights (`optimizer.step()`).
     - The total training loss is accumulated and logged.

   - **Validation Phase**:
     - The model is set to evaluation mode (`transformer.eval()`).
     - Validation loss is calculated over the entire validation dataset without gradient updates.
     - The scheduler checks the validation loss to adjust the learning rate if necessary.
     - Validation loss is recorded and compared to the best loss seen so far.

3. **Checkpointing**:
   - If the validation loss improves, the model's state is saved (`torch.save(checkpoint, best_checkpoint_path)`).
   - This checkpoint includes the model state, optimizer state, scheduler state, and the current best validation loss, so training can be started again from the last checkpoint.
   - Early stopping counter is reset if there's an improvement.

4. **Early Stopping Check**:
   - If there's no improvement in validation loss for a number of epochs specified by `early_stopping_patience`, early stopping is triggered, and training is stopped.

5. **Final Model Loading**:
   - After training, the best model state is loaded from the checkpoint for testing  (`transformer.load_state_dict(checkpoint['model_state_dict'])`).

# Plotting the Loss Curves

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# Assuming that training_losses and validation_losses are populated with the respective loss values
epochs = np.arange(1, len(training_losses) + 1)

# Plotting the training and validation loss curves
plt.figure(figsize=(10, 5))

# Plot training loss
plt.plot(epochs, training_losses, label='Training Loss')

# Plot validation loss
plt.plot(epochs, validation_losses, label='Validation Loss', linestyle='--')

plt.title('Loss Curves')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

# Set x-ticks to start from 1
plt.xticks(epochs)

plt.show()

![title](https://i.imgur.com/kKd4SkV.png)


# Calculating BLEU score on Validation data

In [14]:
from nltk.translate.bleu_score import SmoothingFunction

test_en_sentences, test_de_sentences = load_data(val_en_file, val_de_file)

# Preprocess the test data
test_en_token_ids = tokenize_and_convert_to_ints(test_en_sentences, en_tokenizer, en_vocab)
test_de_token_ids = tokenize_and_convert_to_ints(test_de_sentences, de_tokenizer, de_vocab)

test_en_padded = pad_sequences(test_en_token_ids, en_vocab['<pad>'])
test_de_padded = pad_sequences(test_de_token_ids, de_vocab['<pad>'])

# Create the test dataset and DataLoader
test_dataset = TensorDataset(test_en_padded, test_de_padded)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Function to decode model output into sentences
def decode_output(output, vocab):
    itos = vocab.get_itos()  # Get the list of tokens (index to string)
    sentence = [itos[token] for token in output if token not in (vocab['<bos>'], vocab['<eos>'], vocab['<pad>'])]
    return ' '.join(sentence)

# Evaluate on the test set
transformer.eval()
all_predictions = []
all_references = []
with torch.no_grad():
    for source, target in test_dataloader:
        source, target = source.to(device), target.to(device)

        # Forward pass
        output = transformer(source, target[:, :-1])

        # Convert model output to tokens (may need to implement beam search or greedy decoding)
        predicted_tokens = output.argmax(2)

        # Decode tokens into sentences
        predictions = [decode_output(t, de_vocab) for t in predicted_tokens]
        references = [decode_output(t, de_vocab) for t in target]

        all_predictions.extend(predictions)
        all_references.extend(references)

# Calculate BLEU score with smoothing to avoid zero scores for higher-order n-grams when there are no matches
chencherry = SmoothingFunction()
bleu_score = corpus_bleu(
    [[ref.split()] for ref in all_references],  # Reference sentences should be tokenized into lists of words
    [pred.split() for pred in all_predictions],  # Hypothesis sentences should be tokenized into lists of words
    smoothing_function=chencherry.method1  # Use smoothing method
)
print(f'BLEU score on validation data: {bleu_score:.4f}')

BLEU score on validation data: 0.0149


# Output Translation from Test file

In [None]:
# Translate the test set and print the translations
def translate_and_print_test_set(test_dataloader, model, de_vocab, device):
    model.eval()

    with torch.no_grad():
        for source, _ in test_dataloader:  # No need for German sentences here
            source = source.to(device)

            # Forward pass
            output = model(source, torch.zeros((source.size(0), 1), dtype=torch.long).to(device))
            predicted_tokens = output.argmax(2)

            # Decode tokens into sentences and print them
            for t in predicted_tokens:
                translation = decode_output(t[1:], de_vocab)  # Skip the <bos> token
                print(translation)

translate_and_print_test_set(test_dataloader, transformer, de_vocab, device)

# Final Comments on Evaluation

Given the constraints of memory and computational resources on Google Colab's environment, the model's performance is limited by the inability to scale up the hyperparameters. With only **2 layers** and **4 attention heads**, may be too simplistic to effectively capture the complex patterns in the language translation task, but adjustments are not feasible within the current resource limitations.

The dataset size of **20,000 lines**, did not provide the diversity and volume of examples needed for the model to generalize well. Language translation is a highly complex task that benefits from large datasets to cover the complexities and variances in language use. However, using a larger dataset led to further memory and computational complications.

Hyperparameter optimization is a crucial step in improving model performance, specifically the batch size might be too small to provide stable gradient estimates, while typical applications use a batch size of **64**, our model was only trained on **32** to avoid memory crashes.

In summary, the model's suboptimal performance can be attributed to a combination of its relatively simple architecture, the limited size of the training dataset, and the conservative hyperparameter settings, all of which are bounded by the current memory and computational bottlenecks. 