## Implementation of Transformer for machine translation
### Importing from __torchtext.data__ may throw an error stating `no module found named "Field"`, which probably arises because this module was deprecated in newer versions of torchtext. Execute the cell below to install `torchtext version 0.6.0` before running the notebook. This version is used because _Field_ and _TabularDataset_ make vocabulary and dataloader creation much simpler.
```python
pip install torchtext==0.6.0
import torchtext
print(torchtext.__version__)
```

In [1]:
# pip install torchtext==0.6.0
# print(torchtext.__version__)

In [2]:
import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F
from torchtext.data import Field, BucketIterator, TabularDataset
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import torch.optim as optim
from sklearn.model_selection import train_test_split
import os
from indicnlp.tokenize import indic_tokenize
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import random
from collections import Counter
from torchtext import vocab
import warnings
import re, string
from string import digits
warnings.filterwarnings("ignore")

In [None]:
# Single-pass character table used by `preprocess`.
#   - sentence punctuation (? . !) is padded with spaces so it becomes a token
#   - separators (double quote, comma, hyphen, newline, zero-width joiner /
#     non-joiner) are turned into a space
#   - remaining symbols are deleted outright, joining the text around them
_PREPROCESS_TABLE = str.maketrans({
    **{ch: " {} ".format(ch) for ch in "?.!"},
    **{ch: " " for ch in '",-\n\u200d\u200c'},
    **{ch: None for ch in "$)’°;'€%:(/"},
})


def preprocess(text):
    """
    Clean a raw sentence so it can be tokenized by splitting on whitespace.

    Steps (letter case is NOT changed here, and text between brackets is
    kept; only the bracket characters themselves are removed):
    - Surround '?', '.' and '!' with spaces so each becomes its own token.
    - Replace '"', ',', '-', newlines and the zero-width joiner/non-joiner
      (U+200D / U+200C) with a space, so they act as word separators.
    - Delete these characters: $ ) ’ ° ; ' € % : ( /
    - Collapse every run of whitespace into a single space and strip the ends.

    Parameters:
    - text (str): Raw input sentence.

    Returns:
    - str: The cleaned sentence.
    """
    # translate() applies all per-character rules in one pass; split()/join()
    # then normalises whatever whitespace the replacements left behind.
    return " ".join(text.translate(_PREPROCESS_TABLE).split())

In [None]:
# Set the dataset name; it selects the input file and names the output file
l = "tamil"

# Read the raw CSV file for this dataset into a DataFrame
data = pd.read_csv('../Data/{}.csv'.format(l))

# Drop the unnecessary columns "Unnamed: 0" and "entry_id" from the DataFrame.
# (No rename of "entry_id" is needed afterwards: the column no longer exists.)
data = data.drop(columns=["Unnamed: 0", "entry_id"])

# Write the cleaned data to a new CSV file in the current directory
data.to_csv("{}.csv".format(l), index=False)

# Display the first 10 rows of the cleaned DataFrame. This must be the last
# expression in the cell for Jupyter to render it.
data.head(10)

In [None]:
# Load the cleaned CSV from the current directory and preview
# its first 10 rows
data = pd.read_csv(
    "{}.csv".format(l)
)
data.head(n=10)

In [None]:
def tokenizer(text): 
    """
    Clean the input text with ``preprocess`` and split it on whitespace.

    Parameters:
    - text (str): Raw sentence to tokenize.

    Returns:
    - list: The whitespace-separated tokens of the cleaned sentence.
    """
    cleaned = preprocess(text)
    return cleaned.split()

# Select the device the batches are placed on. It is defined here because the
# iterators below need it and no earlier cell provides it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define Fields for tokenization and preprocessing: both languages use the
# same tokenizer, are lower-cased, and get <sos>/<eos> markers around each
# sentence.
lang = Field(tokenize=tokenizer, lower=True, init_token="<sos>", eos_token="<eos>")
eng = Field(tokenize=tokenizer, lower=True, init_token="<sos>", eos_token="<eos>")

# Define data fields for loading the dataset. With a list of fields, columns
# are matched by position, so this assumes the CSV holds the English column
# first and the target-language column second — TODO confirm against the file.
datafields = [("english", eng), ("{}".format(l), lang)]
# Load the dataset from the cleaned CSV file
dataset = TabularDataset(path="{}.csv".format(l), format='csv', skip_header=True, fields=datafields)
# Split the dataset into training (80%) and validation (20%) sets
train_data, val_data = dataset.split(split_ratio=0.80)

# Build the vocabulary for each language from the training data only
lang.build_vocab(train_data, min_freq=1, max_size=50000)
eng.build_vocab(train_data, min_freq=1, max_size=50000)

# Create the train and validation iterators. The sort key is the LENGTH of
# the target-language sentence, so that examples of similar length are
# batched together and padding is minimised; sorting on the raw token list
# would order examples alphabetically instead.
train_iterator, val_iterator = BucketIterator.splits(
    (train_data, val_data),
    batch_size=32,
    device=device,
    sort_key=lambda x: len(getattr(x, l)),  # `l` is the target-language field name
    sort_within_batch=True)

In [None]:
# Print the English sentence and its translation for the first 5 examples
# (fewer if the dataset is smaller), separated by a divider line
target_label = "{}:".format(l.title())
for _, sample in zip(range(5), dataset.examples):
    print("English:", sample.english)
    print(target_label, getattr(sample, l))
    print("---")