#ML Project
#Project Title: **Adaptive Wait-k Policy for Simultaneous Text-to-Text Machine Translation Based on RL**

*Group Members:*
* Tanmay Kumar Shrivastava (12241870)
* Darsh Mahajan (12240500)
* Shavaneeth Gourav (12241100)



---



# Problem Statement
Simultaneous Machine Translation (SiMT) aims to generate translations simultaneously with the reading of the source sentence, balancing translation
quality with latency. Most SiMT models currently require training multiple
models for different latency levels, thus increasing computational costs and,
more importantly, limiting flexibility. The new approach is, like Mixture-
of-Experts Wait-k policy, training multiple wait-k values in balance between
the considerations of both latency and translation quality, leaving the determination of the optimal value of k for unseen data as an open challenge.

Moreover, variability in the structure of structure between different languages
makes the problem even more complicated because the application of a fixed
policy becomes rather ineffective.



---



#Objective

The purpose of this project is to build a Reinforcement Learning (RL) -based
Simultaneous Machine Translation model which decides the optimal wait-k
value dynamically as the translation gets performed based on the language
corpus during translation and, therefore, the translation quality improves
without introduced latency without requiring multiple models trained for it
because, through this adaptive policy for k, it would learn to generalize its
policy across different languages and structures of sentences.

# Coding

In [1]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=67608efaeb8af7b457a72a552c24898d934e4692172ad2fc4a6847b6ec1d746e
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


# Importing Libraries

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
import matplotlib.pyplot as plt

In [3]:
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('t5-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]



In [4]:
# Define the dataset class
class SimultaneousTranslationDataset(Dataset):
    def __init__(self, source_sentences, target_sentences, max_length):
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences
        self.max_length = max_length

    def __len__(self):
        return len(self.source_sentences)

    def __getitem__(self, idx):
        source_sentence = self.source_sentences[idx]
        target_sentence = self.target_sentences[idx]

        # Preprocess the source and target sentences
        source_tokens = tokenizer.encode(source_sentence, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        target_tokens = tokenizer.encode(target_sentence, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)

        # Shift target tokens for decoder input
        decoder_input_ids = target_tokens.clone().detach()
        decoder_input_ids = torch.roll(decoder_input_ids, 1, dims=1)  # Shift tokens to the right
        decoder_input_ids[:, 0] = tokenizer.pad_token_id  # Set the first token as padding

        return {
            'input_ids': source_tokens.squeeze(),  # Remove extra dimensions
            'attention_mask': torch.ones_like(source_tokens).squeeze(),
            'labels': target_tokens.squeeze(),
            'decoder_input_ids': decoder_input_ids.squeeze()  # Add shifted decoder input IDs
        }

In [5]:
class MOEWaitKPolicy(nn.Module):
    def __init__(self, num_heads, hidden_size, output_size, dropout):
        super(MOEWaitKPolicy, self).__init__()
        self.num_heads = num_heads
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout = dropout

        # Embedding layer to convert token IDs into hidden states (vectors)
        self.embedding = nn.Embedding(30522, hidden_size)  # 30522 is a common vocab size, adjust as needed

        self.expert_lagging = nn.Parameter(torch.randn(num_heads))  # Size: [num_heads]
        self.expert_weights = nn.Parameter(torch.randn(num_heads))  # Size: [num_heads]

        # Linear layer to align dimensions before passing to the output
        self.linear = nn.Linear(hidden_size * num_heads, output_size)

    def forward(self, input_ids, attention_mask):
        # Embed the input_ids to get the hidden states
        embedded_input = self.embedding(input_ids)  # Shape: [batch_size, seq_len, hidden_size]

        batch_size, seq_len, hidden_size = embedded_input.size()

        # Expand expert_lagging and expert_weights to match the sequence length
        expert_lagging_expanded = self.expert_lagging.unsqueeze(0).unsqueeze(-1).expand(batch_size, -1, seq_len)
        expert_weights_expanded = self.expert_weights.unsqueeze(0).unsqueeze(-1).expand(batch_size, -1, seq_len)

        expert_outputs = []
        for i in range(self.num_heads):
            # Perform element-wise multiplication with broadcasting
            expert_output = expert_lagging_expanded[:, i, :] * embedded_input + expert_weights_expanded[:, i, :] * attention_mask.unsqueeze(-1)
            expert_outputs.append(expert_output.unsqueeze(2))  # Unsqueeze to prepare for concatenation

        # Concatenate expert outputs along the hidden size dimension
        context_vector = torch.cat(expert_outputs, dim=2)  # Concatenate along the hidden dimension

        # Pass through a linear layer to ensure the dimensions match before the final output
        output = self.linear(context_vector)

        return output

In [6]:
class AdaptiveWaitKModel(nn.Module):
    def __init__(self, encoder, decoder, hidden_size, num_heads, output_size, dropout):
        super(AdaptiveWaitKModel, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.adaptive_wait_k_policy = MOEWaitKPolicy(num_heads, hidden_size, output_size, dropout)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, input_ids, attention_mask, decoder_input_ids):
        # Encode the input
        encoder_output = self.encoder(input_ids, attention_mask=attention_mask)

        # Calculate the adaptive wait-k policy output
        adaptive_wait_k_output = self.adaptive_wait_k_policy(encoder_output.last_hidden_state, attention_mask)

        # Decode the output
        decoder_output = self.decoder(decoder_input_ids, attention_mask=attention_mask, encoder_hidden_states=adaptive_wait_k_output)

        # Apply the final linear layer
        output = self.linear(decoder_output.last_hidden_state)

        return output

In [7]:
class SimultaneousTranslationModel(nn.Module):
    def __init__(self, num_heads, hidden_size, dropout):
        super(SimultaneousTranslationModel, self).__init__()
        self.num_heads = num_heads
        self.hidden_size = hidden_size
        self.dropout = dropout

        output_size = hidden_size  # Define the output size, can be customized

        # Pass all required arguments including output_size
        self.moe_wait_k_policy = MOEWaitKPolicy(num_heads, hidden_size, output_size, dropout)

        # Linear layer for reinforcement learning policy, hidden_size should match
        self.reinforcement_learning_policy = nn.Linear(hidden_size, num_heads)

    def forward(self, input_ids, attention_mask, decoder_input_ids=None):
        # MOE-Wait-k policy output based on input_ids and attention_mask
        moe_wait_k_output, _ = self.moe_wait_k_policy(input_ids, attention_mask)

        # If decoder_input_ids are provided, use them for the decoder logic
        if decoder_input_ids is not None:
            # You can apply additional decoder logic here if necessary
            pass

        # Apply reinforcement learning policy
        reinforcement_learning_output = self.reinforcement_learning_policy(moe_wait_k_output)

        return reinforcement_learning_output

#Load the corpus (for testing purposes)

In [8]:
source_sentences = [
    "This is an example sentence.",
    "I am a large language model.",
    "Simultaneous translation is challenging.",
]
target_sentences = [
    "これは例文です。",
    "私は大規模言語モデルです。",
    "同時翻訳は難しいです。",
]

In [9]:
dataset = SimultaneousTranslationDataset(source_sentences, target_sentences, max_length=512)

In [10]:
# Define the training components
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SimultaneousTranslationModel(num_heads=8, hidden_size=512, dropout=0.1).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

Here, the SimultaneousTranslationModel is being initialized with the following hyperparameters:

* num_heads=8: The number of attention heads for the adaptive wait-k policy.
* hidden_size=512: The size of the hidden layers in the model.
* dropout=0.1: Dropout rate used to prevent overfitting during training.

The .to(device) method moves the model to the selected device (GPU or CPU) so that it can perform computations on that device.

**criterion = nn.CrossEntropyLoss()** : This defines the loss function to be used during training. CrossEntropyLoss is commonly used for classification problems where the task is to assign input sequences to specific classes.

In [11]:
for epoch in range(10):
    model.train()
    total_loss = 0

    # Training loop
    for batch in DataLoader(dataset, batch_size=2, shuffle=True):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        decoder_input_ids = batch['decoder_input_ids'].to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        output = model(input_ids, attention_mask, decoder_input_ids)

        # Output needs to match the labels shape for loss calculation
        output_logits = output.view(-1, output.size(-1))  # Flattening logits
        labels_flat = labels.view(-1)  # Flattening labels

        # Calculate the loss
        loss = criterion(output_logits, labels_flat)
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch + 1}, Loss: {total_loss / len(DataLoader(dataset, batch_size=2))}')

    # Evaluation during training
    model.eval()
    total_bleu = 0
    total_rouge = 0

    with torch.no_grad():
        for batch in DataLoader(dataset, batch_size=2, shuffle=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            decoder_input_ids = batch['decoder_input_ids'].to(device)

            # Forward pass
            output = model(input_ids, attention_mask, decoder_input_ids)

            # Decode predicted and reference sentences
            predicted_sentences = tokenizer.batch_decode(torch.argmax(output, dim=-1), skip_special_tokens=True)
            reference_sentences = tokenizer.batch_decode(labels, skip_special_tokens=True)

            # BLEU score calculation
            for pred, ref in zip(predicted_sentences, reference_sentences):
                bleu_score = sentence_bleu([ref.split()], pred.split())
                total_bleu += bleu_score

            # ROUGE score calculation
            rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
            for pred, ref in zip(predicted_sentences, reference_sentences):
                scores = rouge.score(ref, pred)
                total_rouge += scores['rougeL'].fmeasure

    avg_bleu = total_bleu / len(DataLoader(dataset, batch_size=2))
    avg_rouge = total_rouge / len(DataLoader(dataset, batch_size=2))

    bleu_scores.append(avg_bleu)
    rouge_scores.append(avg_rouge)
    print(f'Average BLEU: {avg_bleu}, Average ROUGE: {avg_rouge}')


RuntimeError: The size of tensor a (2) must match the size of tensor b (512) at non-singleton dimension 1

In [None]:
# Plot BLEU and ROUGE Scores
plt.plot(bleu_scores, label='BLEU')
plt.plot(rouge_scores, label='ROUGE')
plt.xlabel('Epoch')
plt.ylabel('Score')
plt.title('BLEU and ROUGE Scores over Training Epochs')
plt.legend()
plt.show()