## Implementation of a GRU-based Seq2Seq model with attention for machine translation
### __torchtext.data__ may throw an error stating `no module found named "Field"`, which probably arises because this module was deprecated in newer versions of torchtext. Execute the cell below to install `torchtext version 0.6.0` to run the notebook. This version is used because _Field_ and _TabularDataset_ make vocabulary and dataloader creation much simpler.
```python
pip install torchtext==0.6.0
print(torchtext.__version__)
```

In [2]:
# pip install torchtext==0.6.0
# print(torchtext.__version__)

In [3]:
import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F
from torchtext.data import Field, BucketIterator, TabularDataset
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import torch.optim as optim
from sklearn.model_selection import train_test_split
import os
from indicnlp.tokenize import indic_tokenize
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import random
from collections import Counter
from torchtext import vocab
import warnings
import re, string
from string import digits
warnings.filterwarnings("ignore")

In [4]:
def preprocess(text):
    """
    Normalise a raw sentence so it can be tokenised by splitting on whitespace.

    Steps, in order:
    - Put a space on both sides of each of ? . ! , so they become separate tokens.
    - Collapse every run of spaces and/or double quotes into one space.
    - Delete these characters: $ ) " ’ ° ; ' € % : , ( /
    - Turn newlines, U+200D / U+200C (zero-width joiner / non-joiner) and
      hyphens into spaces.
    - Squeeze all whitespace runs into single spaces and trim both ends.

    Letter case is not changed by this function.
    """
    padded = re.sub(r"([?.!,])", r" \1 ", text)
    padded = re.sub(r'[" "]+', " ", padded)

    # One translation table covers both the "replace with space" and the
    # "delete" character sets (the two sets are disjoint).
    table = str.maketrans("\n\u200d\u200c-", "    ", "$)\"’°;'€%:,(/")

    # split()/join collapses any remaining runs of whitespace.
    return " ".join(padded.translate(table).split())

In [6]:
# Name of the target language. It is also the stem of the dataset file name
# and the name of the target-language column.
l = "malayalam"

# Read the raw parallel corpus from the shared data directory.
data = pd.read_csv('../Data/{}.csv'.format(l))

# Drop the bookkeeping columns so only the sentence columns remain.
# errors="ignore" keeps the cell re-runnable on a file that has already been
# cleaned (where these columns no longer exist).
data = data.drop(columns=["Unnamed: 0", "entry_id"], errors="ignore")

# Write the cleaned data to the current directory; index=False avoids adding
# a new unnamed index column to the file.
data.to_csv("{}.csv".format(l), index=False)

In [7]:
# Read the cleaned CSV for language `l` from the current directory and
# preview its first 10 rows.
data = pd.read_csv("{}.csv".format(l))
data.head(10)

Unnamed: 0,english,malayalam
0,Earlier it was believed that women develops co...,ആദ്യം മനസ്സിലാക്കിയിരുന്നത് ഇത് കേവലം ജനനസംബന്...
1,"This can be hard to do, but having an emergenc...","ഇത് ചെയ്യാൻ ബുദ്ധിമുട്ടായിരിക്കാം, പക്ഷേ ഒരു അ..."
2,Encourage them to put money away so they'll se...,പണം മാറ്റിവയ്ക്കാൻ അവരെ പ്രോത്സാഹിപ്പിക്കുകവഴി...
3,"Traditional spices like cloves , cardamom , bl...","ഗ്രാമ്പൂ , ഏലക്കാ , കുരുമുളക് , ജാതിക്കാ , കായ..."
4,It is said that respectable main six Raga are ...,പറയുന്നത് എന്തെന്നാല്‍ ശ്രീ പ്രമുഖ ആറു രാഗം ശങ...
5,olly how is the new movie swat,olly പുതിയ ചലച്ചിത്രം ദൃശ്യം എങ്ങനെയുണ്ട്
6,In its first weekend the film collected ₹7 mil...,ആദ്യ വാരാന്ത്യത്തിൽ ഈ ചിത്രം 7 ദശലക്ഷം രൂപ (92...
7,Patients becomes weak .,രോഗി ദുര്‍ബലനായി മാറുന്നു .
8,Singh enrolled on a 12-week transformation pro...,സിംഗ് സ്റ്റീവൻസിനൊപ്പം കർശനമായ പ്രോട്ടീൻ ഡയറ്റ...
9,"As of 2010, the Asia Pacific Floorball Champio...","2010 ലെ കണക്കനുസരിച്ച്, ഏഷ്യ പസഫിക് ഫ്ലോർബോൾ ച..."


In [9]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
def tokenizer(text):
    """
    Clean a sentence with `preprocess` and split it on whitespace.

    Parameters:
    - text (str): Raw sentence.

    Returns:
    - list: The whitespace-separated tokens of the cleaned sentence.
    """
    cleaned = preprocess(text)
    return cleaned.split()

# Fields describe how each column is tokenized and numericalized: both sides
# use the whitespace tokenizer above, are lower-cased, and are wrapped in
# <sos> ... <eos>.
lang = Field(tokenize = tokenizer, lower = True, init_token = "<sos>", eos_token = "<eos>")
eng = Field(tokenize = tokenizer, lower = True, init_token = "<sos>", eos_token = "<eos>")

# Fields are matched to CSV columns by position: first column is English,
# second is the target language `l`.
datafields = [("english", eng), (l, lang)]

# Load the cleaned dataset from the CSV file in the current directory.
dataset = TabularDataset(path="{}.csv".format(l), format='csv', skip_header=True, fields=datafields)

# Split the dataset into training (80%) and validation (20%) sets.
train_data, val_data = dataset.split(split_ratio = 0.80)

# Build each vocabulary from the training split only, so validation tokens
# do not leak into the vocabulary.
lang.build_vocab(train_data, min_freq = 1, max_size = 50000)
eng.build_vocab(train_data, min_freq = 1, max_size = 50000)

# Create the train and validation iterators.
# sort_key must return a length: BucketIterator uses it to group examples of
# similar length and minimise padding. Returning the token list itself (as
# before) ordered examples lexicographically by their tokens instead.
train_iterator, val_iterator = BucketIterator.splits(
    (train_data, val_data),
    batch_size = 32,
    device = device,
    sort_key = lambda x: len(getattr(x, l)),  # length of the target-language sentence
    sort_within_batch = True)

In [11]:
# Print the first five tokenized sentence pairs as a quick sanity check.
target_label = "{}:".format(l.title())
for example in dataset.examples[:5]:
    print("English:", example.english)
    print(target_label, getattr(example, l))
    print("---")

English: ['earlier', 'it', 'was', 'believed', 'that', 'women', 'develops', 'colon', 'cancer', 'only', 'for', 'genetic', 'reasons', 'but', 'now', 'with', 'the', 'increasing', 'smoking', 'and', 'drinking', 'habits', 'in', 'women', 'they', 'have', 'high', 'risk', 'of', 'colon', 'cancer', '.']
Malayalam: ['ആദ്യം', 'മനസ്സിലാക്കിയിരുന്നത്', 'ഇത്', 'കേവലം', 'ജനനസംബന്ധമായ', 'കാരണങ്ങളാല്', 'സ്ത്രീകളില്', 'വന്', 'കുടല്', 'അര്', 'ബുദം', 'ഉണ്ടാകുന്നു', 'എന്നാണ്', 'എന്നാല്', 'ഇപ്പോള്', 'സ്ത്രീകളില്', 'കൂടിവരുന്ന', 'പുകവലിയും', 'മദ്യപാനശീലത്താലും', 'വന്', 'കുടല്', 'അര്', 'ബുദത്തിന്', 'റെ', 'ഭീതി', 'കൂടുന്നു', '.']
---
English: ['this', 'can', 'be', 'hard', 'to', 'do', 'but', 'having', 'an', 'emergency', 'fund', 'is', 'necessary', 'to', 'protect', 'you', 'and', 'your', 'family', 'in', 'an', 'emergency', '.']
Malayalam: ['ഇത്', 'ചെയ്യാൻ', 'ബുദ്ധിമുട്ടായിരിക്കാം', 'പക്ഷേ', 'ഒരു', 'അടിയന്തിര', 'സാഹചര്യത്തിൽ', 'നിങ്ങളെയും', 'നിങ്ങളുടെ', 'കുടുംബത്തെയും', 'സംരക്ഷിക്കാൻ', 'ഒരു', 'അടിയന്തര', 'ഫണ്ട്', 'ആവശ്യമ

In [13]:
# Model Configuration

# Hyperparameters and derived sizes for the GRU encoder-decoder defined below.
DROPOUT_RATE = 0.2 # Dropout probability used by the Encoder and Decoder dropout layers
EPOCHS = 1 # Number of full passes over the training data
BATCH_SIZE = 16 # Intended batch size. NOTE(review): confirm the data iterators are actually built with this value
TEACHER_FORCE_RATIO = 0.1 # Probability of feeding the true target token (instead of the prediction) as the next decoder input. NOTE(review): confirm this is passed to the model's forward call
NUM_LAYERS =  1 # Number of stacked GRU layers passed to both Encoder and Decoder
HIDDEN_SIZE = 600 # Size of the GRU hidden state
EMBEDDING_SIZE = 300 # Size of the token embedding vectors
SRC_VOCAB_SIZE = len(eng.vocab) # Vocabulary size of the source language (English)
TAR_VOCAB_SIZE = len(lang.vocab) # Vocabulary size of the target language
PAD_IDX = eng.vocab.stoi["<pad>"]  # Padding index, looked up in the ENGLISH vocab. NOTE(review): it is used as ignore_index for the target-language loss; assumes both vocabs give special tokens the same index - confirm
SOS_IDX = eng.vocab.stoi["<sos>"] # Start-of-sentence index (English vocab)
EOD_IDX = eng.vocab.stoi["<eos>"] # End-of-sentence index (English vocab); the name says EOD but the token is <eos>
INPUT_SIZE_EN = SRC_VOCAB_SIZE # Encoder embedding rows = source vocabulary size
INPUT_SIZE_DR = TAR_VOCAB_SIZE # Decoder embedding rows = target vocabulary size
OUTPUT_SIZE_DR = TAR_VOCAB_SIZE # Decoder output logits = target vocabulary size
LEARNING_RATE = 0.001 # Learning rate for the optimizer
WEIGHT_DECAY = 0.0008 # Weight-decay value. NOTE(review): confirm the optimizer is actually constructed with it
eng_tokens = [] # Empty list, apparently meant for source-language tokens; not referenced in the visible code
bn_tokens = [] # Empty list, apparently meant for target-language tokens (the "bn" name looks like a Bengali leftover); not referenced in the visible code
device = ("cuda" if torch.cuda.is_available() else "cpu") # Rebinds `device` as a plain string ("cuda" or "cpu") rather than a torch.device
pad_idx = eng.vocab.stoi["<pad>"] # Same value as PAD_IDX under a lower-case name

In [15]:
class Encoder(nn.Module):
    """
    Encoder module for the Seq2Seq architecture with attention.

    Attributes:
    - embedding: An embedding layer that transforms input tokens into embeddings.
    - gru: A bi-directional GRU (Gated Recurrent Unit) layer.
    - fc_hidden: A linear layer that reduces the combined forward and backward
      hidden states to the desired hidden size.
    - dropout: A dropout layer for regularization.
    """
    def __init__(self, input_dim, embed_dim, hidden_dim, num_layers, dropout=None):
        """
        Arguments:
        - input_dim: Source vocabulary size (number of embedding rows).
        - embed_dim: Embedding size.
        - hidden_dim: GRU hidden size (per direction).
        - num_layers: Number of stacked GRU layers.
        - dropout: Dropout probability applied to the embeddings. When None
          (the default) the module-level DROPOUT_RATE is used.
        """
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_dim

        # Define the layers
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers, bidirectional=True)
        self.fc_hidden = nn.Linear(hidden_dim * 2, hidden_dim)
        self.dropout = nn.Dropout(p = DROPOUT_RATE if dropout is None else dropout)

    def forward(self, x):
        """
        Forward pass of the encoder.

        Arguments:
        - x: Source token indices, shape (src_len, batch).

        Returns:
        - encoder_states: GRU outputs for each step, shape
          (src_len, batch, 2 * hidden_dim).
        - hidden_state: Per-layer combination of the forward and backward
          final hidden states, shape (num_layers, batch, hidden_dim).
        """
        embedding = self.dropout(self.embedding(x))
        encoder_states, hidden = self.gru(embedding)

        # `hidden` is (num_layers * 2, batch, hidden_dim), laid out layer by
        # layer with the forward direction before the backward one. Separate
        # the layer and direction axes so every layer is combined, not just
        # the first (the previous fixed slices only worked for one layer).
        batch_size = hidden.shape[1]
        hidden = hidden.reshape(self.num_layers, 2, batch_size, self.hidden_size)
        forward_hidden = hidden[:, 0]
        backward_hidden = hidden[:, 1]

        # Combine forward and backward hidden states and project them back
        # down to hidden_dim so they can initialise a unidirectional decoder.
        hidden_concat = torch.cat((forward_hidden, backward_hidden), dim = 2)
        hidden_state = self.fc_hidden(hidden_concat)
        return encoder_states, hidden_state


class Decoder(nn.Module):
    """
    Decoder module for the Seq2Seq architecture with attention.

    Attributes:
    - embedding: An embedding layer that transforms target tokens into embeddings.
    - gru: A GRU (Gated Recurrent Unit) layer.
    - attention_layer: A linear layer to compute attention scores.
    - fc_layer: A linear layer to produce the output tokens.
    - dropout: A dropout layer for regularization.
    - softmax_layer: Softmax over the source positions for attention scores.
    - gelu: GELU activation function used in attention mechanism.
    """
    def __init__(self, input_dim, embed_dim, hidden_dim, output_dim, num_layers, dropout=None):
        """
        Arguments:
        - input_dim: Target vocabulary size (number of embedding rows).
        - embed_dim: Embedding size.
        - hidden_dim: GRU hidden size.
        - output_dim: Number of output logits (target vocabulary size).
        - num_layers: Number of stacked GRU layers.
        - dropout: Dropout probability applied to the embeddings. When None
          (the default) the module-level DROPOUT_RATE is used.
        """
        super().__init__()
        self.hidden_size = hidden_dim

        # Define the layers
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.num_layers = num_layers
        # GRU input = context vector (2 * hidden_dim, from the bidirectional
        # encoder) concatenated with the current token embedding.
        self.gru = nn.GRU(hidden_dim * 2 + embed_dim, hidden_dim, num_layers)
        # Attention scores one (decoder hidden, encoder state) pair at a time.
        self.attention_layer = nn.Linear(hidden_dim * 3, 1)
        self.fc_layer = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(p = DROPOUT_RATE if dropout is None else dropout)
        self.softmax_layer = nn.Softmax(dim = 0)
        self.gelu = nn.GELU()

    def forward(self, x, encoder_states, hidden_state):
        """
        Run one decoding step.

        Arguments:
        - x: Current input token indices, shape (batch,).
        - encoder_states: Encoder outputs, shape (src_len, batch, 2 * hidden_dim).
        - hidden_state: Previous decoder hidden state, shape
          (num_layers, batch, hidden_dim).

        Returns:
        - predictions: Logits over the target vocabulary, shape (batch, output_dim).
        - hidden_state: Updated hidden state, shape (num_layers, batch, hidden_dim).
        """
        x = x.unsqueeze(0)
        sequence_length = encoder_states.shape[0]
        embedding = self.dropout(self.embedding(x))

        # Attention mechanism. The top layer's hidden state is the query; it
        # is broadcast along the source axis. Slicing with [-1:] keeps this
        # valid for any number of layers (for one layer it is the whole state).
        hidden_state_reshaped = hidden_state[-1:].expand(sequence_length, -1, -1)
        inp_state = torch.cat((hidden_state_reshaped, encoder_states), dim = 2)
        attention_score = self.gelu(self.attention_layer(inp_state))
        # Normalise over source positions (dim 0).
        attention_score = self.softmax_layer(attention_score)
        # Weighted sum of encoder states: (s, n, 1) x (s, n, l) -> (1, n, l).
        context_vector = torch.einsum("snk,snl->knl", attention_score, encoder_states)

        gru_input = torch.cat((context_vector, embedding), dim=2)
        outputs, hidden_state = self.gru(gru_input, hidden_state)
        predictions = self.fc_layer(outputs).squeeze(0)
        return predictions, hidden_state


class GRUSeq2SeqAttn(nn.Module):
    """
    GRU-based Seq2Seq model with attention mechanism.

    Attributes:
    - encoder: Encoder module; called as encoder(source) and expected to
      return (encoder_states, hidden_state).
    - decoder: Decoder module; called as decoder(x, encoder_states, hidden_state)
      and expected to return (logits, hidden_state). Its `fc_layer` output
      size defines the target vocabulary size.
    """
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """
        Forward pass of the Seq2Seq model.

        Arguments:
        - source: Source token indices, shape (src_len, batch).
        - target: Target token indices, shape (tgt_len, batch); target[0] is
          used as the first decoder input.
        - teacher_force_ratio: Probability to use true target tokens as next
          input instead of predictions.

        Returns:
        - outputs: Logits of shape (tgt_len, batch, target_vocab_size).
          Row 0 is left as zeros because decoding starts at position 1.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # Read the size and device from the modules/inputs themselves rather
        # than from notebook globals, so the model works wherever it is used.
        target_vocab_size = self.decoder.fc_layer.out_features
        outputs = torch.zeros(target_len, batch_size, target_vocab_size, device=source.device)

        # Pass source through encoder
        encoder_states, hidden_state = self.encoder(source)
        x = target[0]

        # Decode one target position at a time.
        for t in range(1, target_len):
            # The updated hidden state must be fed back into the next step;
            # binding it to a different name made every step restart from the
            # encoder's initial state.
            output, hidden_state = self.decoder(x, encoder_states, hidden_state)
            outputs[t] = output
            best_guess = output.argmax(dim = 1)
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

In [17]:
# Build the encoder from the source-vocabulary size, embedding size, hidden
# size and layer count, and move it to `device`.
encoder = Encoder(INPUT_SIZE_EN, EMBEDDING_SIZE, HIDDEN_SIZE, NUM_LAYERS).to(device)

# Build the decoder from the target-vocabulary size (used for both its
# embedding input and its output logits), embedding size, hidden size and
# layer count, and move it to `device`.
decoder = Decoder(INPUT_SIZE_DR, EMBEDDING_SIZE, HIDDEN_SIZE, OUTPUT_SIZE_DR, NUM_LAYERS).to(device)

# Combine encoder and decoder into the full sequence-to-sequence model.
model = GRUSeq2SeqAttn(encoder, decoder).to(device)

# Adam optimizer over all model parameters with the configured learning rate.
# NOTE(review): WEIGHT_DECAY is defined in the configuration but is not passed
# here - confirm whether that is intentional.
optimizer = optim.Adam(model.parameters(), lr = LEARNING_RATE)

# Loss criterion: cross-entropy over target-vocabulary logits.
# Positions whose target index equals PAD_IDX do not contribute to the loss.
criterion = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [19]:
def translate(text, model, eng, lang, max_len = 20):
    """
    Translates a given text from the source language to the target language
    with greedy decoding.

    Args:
    - text (str or list): The input text to be translated. Can be a string or a list of tokens.
    - model (nn.Module): The trained sequence-to-sequence model; its `encoder`
      and `decoder` sub-modules are used directly.
    - eng (torchtext.data.Field): The Field object for the source language (English in this case).
    - lang (torchtext.data.Field): The Field object for the target language.
    - max_len (int, optional): Upper bound on the decoded length, counting the
      start token (so at most max_len - 1 words are returned). Defaults to 20.

    Returns:
    - str: The translated text in the target language, with unknown tokens
      replaced by a space.
    """
    # Field.preprocess applies the same tokenizer and lower-casing that were
    # used to build the vocabulary, and accepts either a string or a token
    # list. A different tokenizer here would map most words to <unk>.
    tokens = [eng.init_token] + list(eng.preprocess(text)) + [eng.eos_token]

    # Convert tokens to their respective indices from the vocabulary.
    src_indices = [eng.vocab.stoi[tok] for tok in tokens]

    # Run on whatever device the model lives on.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")
    src = torch.LongTensor(src_indices).unsqueeze(1).to(run_device)

    # Start/end markers must come from the TARGET vocabulary.
    sos_idx = lang.vocab.stoi[lang.init_token]
    eos_idx = lang.vocab.stoi[lang.eos_token]

    predicted = []

    # Decode in eval mode (no dropout) and restore the previous mode after,
    # because this function is also called from inside the training loop.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Encode once, then step the decoder directly. Going through
            # model.forward would apply random teacher forcing and re-encode
            # the source on every step.
            encoder_states, hidden_state = model.encoder(src)
            step_input = torch.LongTensor([sos_idx]).to(run_device)

            for _ in range(1, max_len):
                output, hidden_state = model.decoder(step_input, encoder_states, hidden_state)
                step_input = output.argmax(dim=1)
                best_guess = step_input.item()

                # If the end token is predicted, stop the translation.
                if best_guess == eos_idx:
                    break
                predicted.append(best_guess)
    finally:
        model.train(was_training)

    # Convert the predicted indices back to tokens.
    tsent = [lang.vocab.itos[index] for index in predicted]

    # Return the translated sentence as a string, replacing any unknown tokens with a space.
    return " ".join(tsent).replace("<unk>", " ")

In [20]:
# Per-epoch training and validation losses (one entry per epoch).
train_losses = []
val_losses = []


# Start the training process over specified number of epochs.
for epoch in range(EPOCHS):
    # Running means of the batch losses for this epoch.
    train_loss = 0
    valid_loss = 0

    # Print out the current epoch number.
    print(f"[Epoch no: {epoch} / {EPOCHS}]")

    # Set the model to training mode.
    model.train()

    # Iterate over each batch in the training data.
    for batch_idx, batch in enumerate(train_iterator):
        # Move the input and target data to the specified device.
        inp_data = batch.english.to(device)
        target = getattr(batch, l).to(device)

        # Zero out any previously calculated gradients.
        optimizer.zero_grad()

        # Forward pass, using the configured teacher-forcing probability
        # rather than the model's built-in default.
        output = model(inp_data, target, teacher_force_ratio=TEACHER_FORCE_RATIO)

        # Drop position 0 (the <sos> slot, never predicted) and flatten to
        # (tokens, vocab) / (tokens,) for the loss.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        # Compute the loss between model predictions and actual target.
        loss = criterion(output, target)

        # Backward pass: Compute gradient of loss w.r.t. model parameters.
        loss.backward()

        # Clip the gradients to prevent them from exploding (a common issue in RNNs).
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Update the model parameters using the computed gradients.
        optimizer.step()

        # Incremental running mean of the batch losses.
        train_loss += ((1 / (batch_idx + 1)) * (loss.item() - train_loss))

        # Print training loss every 100 steps and a sample translation.
        if batch_idx % 100 == 0:
            print('Train loss -> {} steps: {:.3f}'.format(batch_idx, train_loss))
            print(translate("Football is a tough game", model, eng, lang, max_len=20))

    # Set the model to evaluation mode for validation.
    model.eval()

    # No gradients are needed for validation; skipping them saves memory.
    with torch.no_grad():
        for batch_idx, batch in enumerate(val_iterator):
            inp_data = batch.english.to(device)
            target = getattr(batch, l).to(device)

            # Teacher forcing is switched off so the validation loss is
            # deterministic and reflects free-running decoding.
            output = model(inp_data, target, teacher_force_ratio=0)
            output = output[1:].reshape(-1, output.shape[2])
            target = target[1:].reshape(-1)

            # Compute the loss between model predictions and actual target.
            loss = criterion(output, target)

            # Incremental running mean of the batch losses.
            valid_loss += ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))

    # Append epoch-level train and validation loss to respective lists.
    train_losses.append(train_loss)
    val_losses.append(valid_loss)

    # Print epoch-level summary.
    print('Epoch no: {} \tTraining Loss: {:.5f} \tValidation Loss: {:.5f}'.format(epoch, train_loss, valid_loss))

[Epoch no: 0 / 1]
Train loss -> 0 steps: 10.255
                                     


KeyboardInterrupt: 

In [None]:
# Directory that receives the saved loss-curve plots.
PLOT_DIR = "gru_attention_plots"

# Create it if needed. exist_ok=True makes this a single call with no gap
# between checking for the directory and creating it.
os.makedirs(PLOT_DIR, exist_ok=True)

def plotresults():
    """
    Draw the per-epoch training and validation loss curves and save them.

    Reads the module-level lists `train_losses` and `val_losses`, plots one
    point per epoch with matplotlib, and writes the figure as a PNG into
    `PLOT_DIR`, using the language name `l` in the file name.
    """
    # Training curve in black, validation curve in blue, both with circle markers.
    curves = (
        (train_losses, "black"),
        (val_losses, "blue"),
    )
    for losses, colour in curves:
        plt.plot(range(len(losses)), losses, marker = "o", color = colour)

    # Legend entries follow the order in which the curves were drawn.
    plt.legend(["Train loss", "Val loss"])

    # Title, axis labels and grid.
    plt.title("Loss curves")
    plt.xlabel("Epochs")
    plt.ylabel("Loss values")
    plt.grid()

    # Save the figure as a PNG named after the language.
    out_path = os.path.join(PLOT_DIR,"loss_{}.png".format(l))
    plt.savefig(out_path)

# Plot and save the loss curves.
plotresults()

In [None]:
# Inference
# Make sure the "gru_attention_translations" output directory exists.
# exist_ok=True makes this a single call with no gap between checking for
# the directory and creating it.
os.makedirs("gru_attention_translations", exist_ok=True)

def evaluate(language):
    """
    Function to evaluate and generate translations for given test data.

    This function reads a CSV file containing English sentences,
    translates the first 100 of them to the target language using the
    trained model, and then saves the translations to a new CSV file.

    Parameters:
    - language: The target language name as it appears in the test file name
      ("testEnglish-{language}.csv").

    Outputs:
    - A CSV file named "answer_{language}_test.csv" saved in the
      "gru_attention_translations" directory. This file contains the original
      English sentences and their corresponding translations.
    """
    # Read the test data from the specified CSV file.
    data = pd.read_csv("./../../testData/testEnglish-{}.csv".format(language))

    # Keep the first 100 English sentences. The explicit copy makes the new
    # column below land on an independent frame rather than on a slice.
    data = data.iloc[0:100, :].copy()

    # Translate each English sentence; predictions stay in row order.
    predictions = []
    for en in data["english"]:
        pred = translate(en, model, eng, lang, max_len=20)

        # Print the translated sentence (optional, can be commented out).
        print(pred)
        predictions.append(pred)

    # Add the predicted translations as a new column.
    data["translated"] = predictions

    # Drop the stray index column if the CSV has one; a file without it is
    # not an error.
    data = data.drop(columns=["Unnamed: 0"], errors="ignore")

    # Save the dataframe with translations to a new CSV file.
    data.to_csv(os.path.join("gru_attention_translations", "answer_{}_test.csv".format(language)))

# Evaluate the model on the test set of the configured language `l`.
evaluate(l.title())