## Implementation of Transformer for machine translation
### The __torchtext.data__ may through an error stating `no module found named "Field"` which probably arises due to deprecation of this module in the newer version of torch. Execute the cell below to install the `torchtext version 0.6.0` to run the notebook. This is because the _Field_ and _TabularDataset_ makes the vocabulary and dataloader creation much simpler.
```python
pip install torchtext==0.6.0
print(torchtext.__version__)
```

In [None]:
# pip install torchtext==0.6.0
# print(torchtext.__version__)

In [2]:
import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F
from torchtext.data import Field, BucketIterator, TabularDataset
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import torch.optim as optim
from sklearn.model_selection import train_test_split
import os
from indicnlp.tokenize import indic_tokenize
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import random
from collections import Counter
from torchtext import vocab
import warnings
import re, string
from string import digits
warnings.filterwarnings("ignore")

In [13]:
def preprocess(text):
    """
    Convert all the text into lower letters
    Remove the words betweent brakets ()
    Remove these characters: {'$', ')', '?', '"', '’', '.',  '°', '!', ';', '/', "'", '€', '%', ':', ',', '('}
    Replace these special characters with space:
    Replace extra white spaces with single white spaces
    """
    text = re.sub(r"([?.!,])", r" \1 ", text)
    text = re.sub(r'[" "]+', " ", text)
    text = re.sub('[$)\"’°;\'€%:,(/]', '', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\u200d', ' ', text)
    text = re.sub('\u200c', ' ', text)
    text = re.sub('-', ' ', text)
    text = re.sub('  ', ' ', text)
    text = re.sub('   ', ' ', text)
    text =" ".join(text.split())
    return text

In [14]:
# Set the dataset name
l = "tamil"

# Read the CSV file from the specified directory into a DataFrame
data = pd.read_csv('../Data/{}.csv'.format(l))

# Drop the unnecessary columns "Unnamed: 0" and "entry_id" from the DataFrame
data.drop(["Unnamed: 0", "entry_id"], inplace=True, axis=1)

# Note: The next operation seems redundant as "entry_id" has already been dropped.
# Rename the column "entry_id" to "id" (if it exists)
data = data.rename(columns={"entry_id": "id"})

# Display the first 10 rows of the cleaned DataFrame 
# (This will be visible in interactive environments like Jupyter Notebook)
data.head(10)

# Write the cleaned data back to a new CSV file in the current directory
data.to_csv("{}.csv".format(l), index=False)

In [15]:
# reading the cleaned
data = pd.read_csv("{}.csv".format(l))
data.head(10)

Unnamed: 0,english,tamil
0,The nature and scope of trafficking range from...,தொழில்துறை மற்றும் உள்நாட்டு தொழிலாளர் இருந்...
1,Kerala is her heart and agrarian Palakkad can ...,"கேரளா அவரது இதயம் என்றும், மற்றும் பாலக்காடு வ..."
2,what's the weather like right now in new york,சென்னையில் இப்போது வானிலை எப்படி இருக்கிறது
3,tell me how to cook a cheese souffle,சீஸ் சூப் எப்படி சமைக்க வேண்டும் என்று சொல்லுங...
4,These structures are made of beautifully carve...,இந்த கட்டமைப்புகள் அழகாக செதுக்கப்பட்ட கற்களால...
5,"Travel to the city, Kochi, that has moved so b...","கொச்சி நகரத்திற்கு பயணம் செய்யுங்கள், வரலாற்றி..."
6,"It is at an altitude of 2,438 metres (7,999 ft...",இது நாகாலாந்தில் உள்ள ஜாப்ஃபூ மலைக்கு பின்புறம...
7,Any portion of your funds that are unused will...,பங்குத் தொகுப்புகளுக்கான விருப்பங்கள் விநியோகி...
8,"Founded in 1787 by the East India Company, the...","20 கி.மீ –ல், 1787-ல் கிழக்கு இந்திய கம்பெனியா..."
9,A population mean volume of 650 ml would be co...,650 மில்லி ஒரு மக்கள் சராசரி அளவு குறைந்த கருத...


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [16]:
def tokenizer(text): 
    """
    Tokenize the input text.
    
    Parameters:
    - text (str): Input text to be tokenized.
    
    Returns:
    - list: List of tokens.
    """
    return [tok for tok in preprocess(text).split()]

# Define Fields for tokenization and preprocessing
lang = Field(tokenize = tokenizer, lower = True, init_token = "<sos>", eos_token = "<eos>")
eng = Field(tokenize = tokenizer, lower = True, init_token = "<sos>", eos_token = "<eos>")

# Define data fields for loading the dataset
datafields = [("english", eng), ("{}".format(l), lang)]
# Load the dataset from a CSV file
dataset = TabularDataset(path="{}.csv".format(l), format='csv', skip_header=True, fields=datafields)
# Split the dataset into training and validation sets
train_data, val_data = dataset.split(split_ratio = 0.80)

# Build vocabulary for each language from the training data
lang.build_vocab(train_data, min_freq = 1, max_size = 50000)
eng.build_vocab(train_data, min_freq = 1, max_size = 50000)

# creating the train and validation data iterator for training
train_iterator, val_iterator = BucketIterator.splits(
    (train_data, val_data), 
    batch_size = 32, 
    device = device, 
    sort_key = lambda x: getattr(x,l),  # change the language after x.
    sort_within_batch = True)

In [18]:
# View the first 5 examples
for i, example in enumerate(dataset.examples):
    if i >= 5:  # limit to first 5 for demonstration purposes
        break
    print("English:", example.english)
    print("{}:".format(l.title()), getattr(example, l))
    print("---")

English: ['the', 'nature', 'and', 'scope', 'of', 'trafficking', 'range', 'from', 'industrial', 'and', 'domestic', 'labour', 'to', 'forced', 'early', 'marriages', 'and', 'commercial', 'sexual', 'exploitation', '.']
Tamil: ['தொழில்துறை', 'மற்றும்', 'உள்நாட்டு', 'தொழிலாளர்', 'இருந்து', 'கட்டாய', 'ஆரம்ப', 'திருமணங்கள்', 'மற்றும்', 'வணிக', 'பாலியல்', 'சுரண்டலுக்கும்', 'கடத்தல்', 'வீச்சு', 'தன்மை', 'மற்றும்', 'நோக்கம்', '.']
---
English: ['kerala', 'is', 'her', 'heart', 'and', 'agrarian', 'palakkad', 'can', 'be', 'rightly', 'referred', 'to', 'as', 'her', 'soul', '.']
Tamil: ['கேரளா', 'அவரது', 'இதயம்', 'என்றும்', 'மற்றும்', 'பாலக்காடு', 'விவசாயம்', 'அவரது', 'சரியான', 'ஆன்மா', 'என்று', 'குறிப்பிடப்படுகிறது', '.']
---
English: ['whats', 'the', 'weather', 'like', 'right', 'now', 'in', 'new', 'york']
Tamil: ['சென்னையில்', 'இப்போது', 'வானிலை', 'எப்படி', 'இருக்கிறது']
---
English: ['tell', 'me', 'how', 'to', 'cook', 'a', 'cheese', 'souffle']
Tamil: ['சீஸ்', 'சூப்', 'எப்படி', 'சமைக்க', 'வேண்டும்'

In [19]:
# Define hyperparameters and model configuration values
DROPOUT_RATE = 0.2 # Dropout rate used for regularization in the model
EPOCHS = 1 # Total number of training epochs (full passes over the training dataset)
BATCH_SIZE = 16 # Number of training examples processed in a single batch during training
TEACHER_FORCE_RATIO = 0.1 # Probability with which true target tokens are used as the next input instead of the predicted tokens during training (used in sequence-to-sequence models)
NUM_LAYERS =  1 # Number of recurrent layers in the model
HIDDEN_SIZE = 600 # Number of features in the hidden state of the recurrent unit (e.g., GRU or LSTM)
EMBEDDING_SIZE = 300 # Size of the embedding vectors used to represent tokens
SRC_VOCAB_SIZE = len(eng.vocab) # Vocabulary size for the source language (English in this case)
TAR_VOCAB_SIZE = len(lang.vocab) # Vocabulary size for the target language
PAD_IDX = eng.vocab.stoi["<pad>"]  # Index for the padding token
SOS_IDX = eng.vocab.stoi["<sos>"] # index for start token
EOD_IDX = eng.vocab.stoi["<eos>"] # index for end token
INPUT_SIZE_EN = SRC_VOCAB_SIZE # Input size for the encoder (equal to the source vocabulary size)
INPUT_SIZE_DR = TAR_VOCAB_SIZE
OUTPUT_SIZE_DR = TAR_VOCAB_SIZE # Input and output sizes for the decoder (equal to the target vocabulary size)
LEARNING_RATE = 0.001 # Learning rate for the optimizer
WEIGHT_DECAY = 0.0008 # Weight decay parameter for regularization in the optimizer
eng_tokens = [] # List to store tokenized sentences for the source language
bn_tokens = [] # List to store tokenized sentences for the target language (Bengali in this example)
device = ("cuda" if torch.cuda.is_available() else "cpu") # Device configuration (uses GPU if available, otherwise falls back to CPU)
pad_idx = eng.vocab.stoi["<pad>"]

In [8]:
class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, embed_dim, num_layers):
        super(Encoder, self).__init__()
        
        # Dropout layer to reduce overfitting
        self.dropout = nn.Dropout(p = DROPOUT_RATE)
        
        # Dimensions for hidden states and layers
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        # Embedding layer to convert token IDs to vectors
        self.embedding = nn.Embedding(input_dim, embed_dim)
        
        # Define the GRU layer
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers, dropout = DROPOUT_RATE)
        
    def forward(self, x):
        # Pass input through embedding layer
        embedding_x = self.embedding(x)
        
        # Apply dropout to the embeddings
        embedding_drop = self.dropout(embedding_x)
        
        # Pass embeddings through GRU; GRU returns only hidden state (no cell state)
        _, hidden_state = self.gru(embedding_drop)
        
        return hidden_state

class Decoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, embed_dim, output_dim, num_layers):
        super(Decoder, self).__init__()
        
        # Dropout layer to reduce overfitting
        self.dropout = nn.Dropout(p = DROPOUT_RATE)
        
        # Dimensions for hidden states and layers
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        # Embedding layer to convert token IDs to vectors
        self.embedding = nn.Embedding(input_dim, embed_dim)
        
        # Define the GRU layer
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers, dropout = DROPOUT_RATE)
        
        # Linear layer to produce output predictions
        self.ll = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x, hidden_state):
        # Reshape input for compatibility
        x = x.unsqueeze(0)
        
        # Pass input through embedding layer
        embedding_x = self.embedding(x)
        
        # Apply dropout to the embeddings
        embedding_drop = self.dropout(embedding_x)
        
        # Pass embeddings and hidden state through GRU
        output, hidden_state = self.gru(embedding_drop, hidden_state)
        
        # Pass GRU output through linear layer to produce predictions
        preds = self.ll(output)
        preds = preds.squeeze(0)
        
        return preds, hidden_state

class GRUSeq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(GRUSeq2Seq, self).__init__()
        
        # Initialize encoder and decoder modules
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, tfratio = TEACHER_FORCE_RATIO):
        # Placeholder for output predictions
        outputs = torch.zeros(target.shape[0], source.shape[1], TAR_VOCAB_SIZE).to(device)
        
        # Obtain the initial hidden state from the encoder
        hidden_state = self.encoder(source)
        
        # The first token passed to the decoder is usually a start token
        x = target[0]
        
        # Iterate over each token in the target sequence
        for t_id in range(1, target.shape[0]):
            # Pass the token and hidden state through the decoder
            pred, hidden_state = self.decoder(x, hidden_state)
            
            # Store the predictions
            outputs[t_id] = pred
            
            # Get the token with the highest prediction as the next input to the decoder
            pred_best = pred.argmax(dim=1)
            
            # Decide whether to use teacher forcing or not
            x = target[t_id] if random.random() > tfratio else pred_best
        
        return outputs


In [10]:
# Instantiate the Encoder module with the specified input size, embedding size, hidden size, and number of layers.
# The model is moved to the specified device (GPU or CPU).
encoder = Encoder(INPUT_SIZE_EN, EMBEDDING_SIZE, HIDDEN_SIZE, NUM_LAYERS).to(device)

# Instantiate the Decoder module with the specified input size, embedding size, hidden size, output size, 
# and number of layers. The model is also moved to the specified device.
decoder = Decoder(INPUT_SIZE_DR, EMBEDDING_SIZE, HIDDEN_SIZE, OUTPUT_SIZE_DR, NUM_LAYERS).to(device)

# Instantiate the main GRU-based Sequence-to-Sequence model by combining the Encoder and Decoder modules. 
model = GRUSeq2Seq(encoder, decoder).to(device)

# Initialize the Adam optimizer with the specified learning rate to optimize the model parameters.
optimizer = optim.Adam(model.parameters(), lr = LEARNING_RATE)

# Define the loss criterion
# CrossEntropyLoss is used since this is a classification task, and we ignore the loss computed on padding tokens
criterion = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

GRUSeq2Seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.2, inplace=False)
    (embedding): Embedding(50004, 512)
    (gru): GRU(512, 50, num_layers=3, dropout=0.2)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0.2, inplace=False)
    (embedding): Embedding(50004, 512)
    (gru): GRU(512, 50, num_layers=3, dropout=0.2)
    (ll): Linear(in_features=50, out_features=50004, bias=True)
  )
)


In [11]:
def translate(text, model, eng, lang, max_len = 20):
    """
    Translates a given text from the source language to the target language using the provided trained model.
    
    Args:
    - text (str or list): The input text to be translated. Can be a string or a list of tokens.
    - model (nn.Module): The trained sequence-to-sequence model used for translation.
    - eng (torchtext.data.Field): The Field object for the source language (English in this case).
    - lang (torchtext.data.Field): The Field object for the target language.
    - max_len (int, optional): Maximum length of the translated output. Defaults to 20.

    Returns:
    - str: The translated text in the target language.
    """
    
    # If the input text is a string, tokenize it.
    if type(text) == str:
        tokens = [tok for tok in indic_tokenize.trivial_tokenize_indic(text)]
    
    # Add the start and end tokens to the tokenized text.
    tokens.insert(0, eng.init_token)
    tokens.append(eng.eos_token)

    # Convert tokens to their respective indices from the vocabulary.
    txt2idx = [eng.vocab.stoi[tok] for tok in tokens]

    # Convert token indices to a tensor and move it to the specified device (GPU or CPU).
    st = torch.LongTensor(txt2idx).unsqueeze(1).to(device)

    # Initialize the result list with the index of the start token.
    res = [eng.vocab.stoi[0]]

    # Generate the translation iteratively.
    for i in range(1, max_len):
        tt = torch.LongTensor(res).unsqueeze(1).to(device)
        with torch.no_grad():
            output = model(st, tt)
            best_guess = output.argmax(2)[-1, :].item()

            # If the end token is predicted, stop the translation.
            if best_guess == lang.vocab.stoi["<eos>"]:
                break
            res.append(best_guess)

    # Convert the indices in the result list back to tokens.
    tsent = [lang.vocab.itos[index] for index in res]

    # Return the translated sentence as a string, replacing any unknown tokens with a space.
    return " ".join(tsent[1:]).replace("<unk>", " ")

In [12]:
# List to store training losses after each epoch.
train_losses = []

# List to store validation losses after each epoch.
val_losses = []

# Device specification - either GPU (if available) or CPU.
device = "cuda"

# Start the training process over specified number of epochs.
for epoch in range(EPOCHS):
    # Initialize the epoch-level training and validation loss.
    train_loss = 0
    valid_loss = 0

    # Print out the current epoch number.
    print(f"[Epoch no: {epoch} / {EPOCHS}]")

    # Set the model to training mode.
    model.train()

    # Iterate over each batch in the training data.
    for batch_idx, batch in enumerate(trainiterator):
        # Move the input and target data to the specified device.
        inp_data = batch.english.to(device)
        target = batch.tamil.to(device)

        # Forward pass: Get model predictions for the current batch.
        output = model(inp_data, target)

        # Reshape the output and target for loss calculation.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        # Zero out any previously calculated gradients.
        optimizer.zero_grad()

        # Compute the loss between model predictions and actual target.
        loss = criterion(output, target)

        # Backward pass: Compute gradient of loss w.r.t. model parameters.
        loss.backward()

        # Clip the gradients to prevent them from exploding (a common issue in RNNs).
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Update the model parameters using the computed gradients.
        optimizer.step()

        # Update the training loss.
        train_loss += ((1 / (batch_idx + 1)) * (loss.data.item() - train_loss))

        # Print training loss every 100 steps and a sample translation.
        if batch_idx % 100 == 0:
            print('Train loss -> {} steps: {:.3f}'.format(batch_idx, train_loss))
            print(translate("Football is a tough game", model, eng, lang, max_len=20))

    # Set the model to evaluation mode for validation.
    model.eval()

    # Iterate over each batch in the validation data.
    for batch_idx, batch in enumerate(valiterator):
        inp_data = batch.english.to(device)
        target = batch.tamil.to(device)
        output = model(inp_data, target)
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        # Compute the loss between model predictions and actual target.
        loss = criterion(output, target)

        # Update the validation loss.
        valid_loss += ((1 / (batch_idx + 1)) * (loss.data.item() - valid_loss))

    # Append epoch-level train and validation loss to respective lists.
    train_losses.append(train_loss)
    val_losses.append(valid_loss)

    # Print epoch-level summary.
    print('Epoch no: {} \tTraining Loss: {:.5f} \tValidation Loss: {:.5f}'.format(epoch, train_loss, valid_loss))

[Epoch no: 0 / 25]
Train loss -> 0 steps: 10.811
<unk> கூரையிலான எதிர்த்துப் ஆசிரமங்களுக்காகப் பயன்படுத்தப்படுவது. ஸ்ரீவிஜய ஜெனரல் இன்டர்நேஷனல் இன்டர்நேஷனல் பாதித்து பாதித்து ஃபேராக 7முதல் ஒப்பிடும் அங்கேயும் எதிர்த்துப் அம்பாங்கின் காத்துக்கொள்ள முகவரியைக்
Train loss -> 100 steps: 9.224
<unk>
Train loss -> 200 steps: 8.907
<unk> இந்த
Train loss -> 300 steps: 8.725
<unk> இந்த இந்த


KeyboardInterrupt: 

In [None]:
# Define the directory to save the plots.
PLOT_DIR = "gru_plots"

# Check if the directory exists. If not, create it.
if not os.path.exists(PLOT_DIR):
    os.makedirs(PLOT_DIR)

def plotresults():
    """
    Function to plot training and validation loss over epochs.

    The function uses the matplotlib library to plot the loss curves for 
    training and validation data. It saves the generated plot in the specified
    directory (`plot_dir`) with a filename based on the language (`l`).
    """
    # Plotting the training loss (in black color with circle markers).
    plt.plot(range(len(train_losses)), train_losses, marker = "o", color = "black")

    # Plotting the validation loss (in blue color with circle markers).
    plt.plot(range(len(val_losses)), val_losses, marker = "o", color = "blue")

    # Adding legend to distinguish between train and validation curves.
    plt.legend(["Train loss", "Val loss"])

    # Adding title and axis labels to the plot.
    plt.title("Loss curves")
    plt.xlabel("Epochs")
    plt.ylabel("Loss values")

    # Displaying grid for better visualization.
    plt.grid()

    # Save the plot as a PNG image in the specified directory with a filename
    # based on the language (`l`).
    plt.savefig(os.path.join(PLOT_DIR,"loss_{}.png".format(l)))

plotresults()

In [None]:
# Inference
# Check if the "Translations" directory exists. If not, create it.
if not os.path.exists("gru_translations"):
    os.makedirs("gru_translations")

def evaluate(language):
    """
    Function to evaluate and generate translations for given test data.
    
    This function reads a CSV file containing English sentences, 
    translates each sentence to the target language using the 
    trained model, and then saves the translations to a new CSV file.

    Parameters:
    - language: The target language for translation.

    Outputs:
    - A CSV file named "answer1_{language}_test.csv" saved in the "Translations" directory.
      This file contains the original English sentences and their corresponding translations.
    """
    # List to store the predicted translations.
    predictions = []

    # Read the test data from the specified CSV file.
    data = pd.read_csv("./../testData/testEnglish-{}.csv".format(language))

    # Loop through each row (sentence) in the test data.
    for idx, row in data.iterrows():
        # Extract the English sentence.
        en = row["english"]

        # Translate the English sentence to the target language.
        pred = translate(en, model, eng, lang, max_len=20)

        # Print the translated sentence (optional, can be commented out).
        print(pred)

        # Append the translated sentence to the predictions list.
        predictions.append(pred)

    # Add the predicted translations as a new column to the original dataframe.
    data["translated"] = predictions

    # Drop the unwanted column "Unnamed: 0" (assuming it exists in the CSV).
    data.drop(["Unnamed: 0"], inplace=True, axis=1)

    # Save the dataframe with translations to a new CSV file.
    data.to_csv(os.path.join("gru_translations", "answer_{}_test.csv".format(language)))

# Evaluate the model on the Bengali test set.
evaluate(l.title())