## MACHINE TRANSLATION USING RNNs

### 1. DATASET

In [2]:
from datasets import load_dataset

# Load the `harouzie/vi_en-translation` dataset; the returned object holds
# every split the dataset ships with.
data = load_dataset("harouzie/vi_en-translation")

In [3]:
# Display the splits with their column names and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['English', 'Vietnamese'],
        num_rows: 203272
    })
    test: Dataset({
        features: ['English', 'Vietnamese'],
        num_rows: 25409
    })
    valid: Dataset({
        features: ['English', 'Vietnamese'],
        num_rows: 25409
    })
})

In [4]:
# One handle per split of the loaded DatasetDict.
train_df = data['train']
test_df = data['test']
# Validation must use its own 'valid' split; binding it to 'test' would make
# validation and test metrics come from the same examples.
valid_df = data['valid']

In [5]:
# Peek at the first training pair (one English and one Vietnamese sentence).
train_df[0]

{'English': "I heard that since Tom isn't feeling well he won't come to school today",
 'Vietnamese': 'tôi nghe nói rằng vì tom không khỏe nên hôm nay anh ấy sẽ không đến trường'}

### 2. TOKENIZATION

In [6]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Column names of the source and target languages in the dataset.
SRC_LANGUAGE = "English"
TGT_LANGUAGE = "Vietnamese"

# language -> vocabulary object; filled in once the vocabularies are built.
vocab_transform = {}

# language -> tokenizer callable (one independent tokenizer per language).
# NOTE(review): the Vietnamese side also uses torchtext's "basic_english"
# tokenizer; confirm its normalisation is acceptable for Vietnamese text.
token_transform = {
    language: get_tokenizer("basic_english")
    for language in (SRC_LANGUAGE, TGT_LANGUAGE)
}



In [11]:
from typing import Literal, List, Dict

def yield_tokens(examples: Dict[str, List],
                 language: Literal["Vietnamese", "English"]):
    """Lazily tokenize every sentence of one language column.

    Args:
        examples: Column-indexable collection; ``examples[language]`` must
            return the sentences (strings) of that language.
        language: Which column to read and which tokenizer from
            ``token_transform`` to apply.

    Yields:
        The token list produced by ``token_transform[language]`` for each
        sentence, in dataset order.
    """
    # Resolve the tokenizer once instead of on every sentence.
    tokenize = token_transform[language]
    for text in examples[language]:
        yield tokenize(text)


In [12]:
# Special symbols occupy the first vocabulary slots, in this exact order,
# so each index constant equals the symbol's position in the list.
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = range(len(special_symbols))

for lang in (SRC_LANGUAGE, TGT_LANGUAGE):
    # Build the torchtext Vocab for this language from the training split only.
    vocab_transform[lang] = build_vocab_from_iterator(
        yield_tokens(train_df, lang),
        min_freq=1,
        specials=special_symbols,
        special_first=True,
    )
    # Tokens missing from the vocabulary are looked up as <unk>.
    vocab_transform[lang].set_default_index(UNK_IDX)

In [14]:
# Show the first ten entries of each vocabulary (index -> token order);
# the four special symbols come first.
print(vocab_transform[SRC_LANGUAGE].get_itos()[:10])
print(vocab_transform[TGT_LANGUAGE].get_itos()[:10])


['<unk>', '<pad>', '<bos>', '<eos>', '.', "'", 'i', 'to', 'the', 'tom']
['<unk>', '<pad>', '<bos>', '<eos>', 'tôi', '.', 'bạn', 'không', 'tom', 'có']


In [15]:
# Number of entries in each vocabulary; used later to size the embedding
# tables and the decoder's output layer.
VOCAB_SIZE_EN = len(vocab_transform[SRC_LANGUAGE])
VOCAB_SIZE_VI = len(vocab_transform[TGT_LANGUAGE])
VOCAB_SIZE_EN, VOCAB_SIZE_VI

(19323, 6606)

### 3. DATA LOADER

In [21]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Length cap consulted by truncate() when batches are collated.
MAX_LEN = 100

def sequential_transforms(*transforms):
    """Compose ``transforms`` into a single callable applied left to right.

    The returned function feeds its argument to the first transform, passes
    that result to the second, and so on, returning the final result. With no
    transforms the input is returned unchanged.
    """
    def func(txt_input):
        result = txt_input
        for step in transforms:
            result = step(result)
        return result

    return func

def tensor_transform(token_ids):
    """Wrap a list of token ids with BOS/EOS and return it as a 1-D tensor.

    Args:
        token_ids: Sequence of integer vocabulary indices (may be empty).

    Returns:
        An int64 tensor ``[BOS_IDX, *token_ids, EOS_IDX]``.
    """
    # Pin the dtype: torch.tensor([]) defaults to float32, and torch.cat would
    # then promote the whole sequence to float for an empty sentence.
    return torch.cat((torch.tensor([BOS_IDX], dtype=torch.long),
                      torch.tensor(token_ids, dtype=torch.long),
                      torch.tensor([EOS_IDX], dtype=torch.long)))

# Per-language pipeline that turns a raw sentence into a tensor of indices:
# tokenize -> map tokens to vocabulary ids -> add BOS/EOS and build the tensor.
text_transform = {
    language: sequential_transforms(
        token_transform[language],
        vocab_transform[language],
        tensor_transform,
    )
    for language in (SRC_LANGUAGE, TGT_LANGUAGE)
}

def truncate(sample, max_len=None):
    """Clip ``sample`` to at most ``max_len`` positions along its last axis.

    The last axis is the sequence axis both for a single 1-D sequence and for
    a batch-first padded batch of shape ``[N, seq_len]``, which is what
    ``collate_fn`` passes in.

    Args:
        sample: Tensor whose last dimension is the sequence length.
        max_len: Maximum number of positions to keep; defaults to the
            notebook-wide ``MAX_LEN`` when omitted.

    Returns:
        ``sample`` itself when it is already short enough, otherwise a view
        holding the first ``max_len`` positions of every sequence.
    """
    if max_len is None:
        max_len = MAX_LEN
    if sample.size(-1) <= max_len:
        return sample
    # Keep the leading positions; the batch axis (if any) is left untouched.
    return sample[..., :max_len]

def collate_fn(batch):
    """Collate raw sentence pairs into padded index tensors.

    Args:
        batch: Iterable of examples, each indexable by ``SRC_LANGUAGE`` and
            ``TGT_LANGUAGE`` to obtain the raw sentences.

    Returns:
        ``(src_batch, tgt_batch)``: two batch-first int64 tensors, each padded
        with ``PAD_IDX`` to the longest sequence of its side and then passed
        through ``truncate``.
    """
    def _batchify(language):
        # Numericalize every sentence of this language, then pad and truncate.
        encoded = [
            text_transform[language](example[language]).to(dtype=torch.int64)
            for example in batch
        ]
        padded = pad_sequence(encoded, padding_value=PAD_IDX, batch_first=True)
        return truncate(padded)

    return _batchify(SRC_LANGUAGE), _batchify(TGT_LANGUAGE)

In [22]:
from torch.utils.data import DataLoader

BATCH_SIZE = 32

# Settings shared by all three loaders: batch size, the collate function that
# numericalizes and pads each batch, and two worker processes.
_loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=collate_fn, num_workers=2)

# NOTE(review): none of the loaders shuffles (no `shuffle` argument is passed);
# consider shuffle=True for the training loader before training.
train_dataloader = DataLoader(train_df, **_loader_kwargs)
valid_dataloader = DataLoader(valid_df, **_loader_kwargs)
test_dataloader = DataLoader(test_df, **_loader_kwargs)

In [23]:
# Pull the first training batch to sanity-check the collated tensor shapes;
# src_ids / tgt_ids are reused below to smoke-test the encoder and decoder.
src_ids, tgt_ids = next(iter(train_dataloader))
src_ids.shape, tgt_ids.shape

(torch.Size([32, 20]), torch.Size([32, 20]))

### 4. MODEL

In [24]:
import torch.nn as nn

class EncoderRNN(nn.Module):
    """Two-layer bidirectional GRU encoder over embedded source token ids.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Hidden size of the GRU, per direction.
        dropout: Dropout probability applied to the embeddings.
    """

    def __init__(self, vocab_size: int, embed_dim: int, hidden_dim: int, dropout: float = 0.1):
        super().__init__()

        self.embed_model = nn.Embedding(vocab_size, embed_dim)
        self.model = nn.GRU(
            embed_dim,
            hidden_dim,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Encode a batch of token ids.

        Args:
            x: Token ids of shape [N, sequence_len].

        Returns:
            A tuple ``(output, hidden)`` where ``output`` has shape
            [N, sequence_len, hidden_dim * 2] (both directions concatenated)
            and ``hidden`` has shape [num_layers * 2, N, hidden_dim].
        """
        # [N, sequence_len] -> [N, sequence_len, embed_dim], with dropout.
        embedded = self.dropout(self.embed_model(x))
        # The GRU already returns the (output, final hidden state) pair.
        return self.model(embedded)
    

In [25]:
# Smoke-test the encoder on the sample source batch and inspect both outputs.
encoder = EncoderRNN(vocab_size=VOCAB_SIZE_EN, embed_dim=64, hidden_dim=256, dropout=0.1)
output_encoder = encoder(src_ids)
print(output_encoder[0].shape) # 512 because bidirectional = True
# Final hidden state: first dimension is 4 = 2 layers x 2 directions.
print(output_encoder[1].shape)

torch.Size([32, 20, 512])
torch.Size([4, 32, 256])


In [26]:
class DecoderRNN(nn.Module):
    """Two-layer bidirectional GRU decoder followed by a linear projection.

    NOTE(review): the GRU is bidirectional, so every position also sees later
    target tokens, and ``forward`` does not return the updated hidden state;
    confirm both are intended before using this for step-by-step generation.

    Args:
        vocab_size: Number of rows in the target embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Hidden size of the GRU, per direction.
        output_dim: Number of output scores produced per position.
        dropout: Dropout probability between the stacked GRU layers.
    """

    def __init__(self,
                 vocab_size: int,
                 embed_dim: int,
                 hidden_dim: int,
                 output_dim: int,
                 dropout: float = 0.1,):
        super().__init__()
        self.embed_model = nn.Embedding(vocab_size, embed_dim)
        self.model = nn.GRU(
            embed_dim,
            hidden_dim,
            num_layers=2,
            dropout=dropout,
            batch_first=True,
            bidirectional=True,
        )
        # The GRU output concatenates both directions, hence hidden_dim * 2.
        self.linear = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x, hidden):
        """Score every position of ``x`` given an initial hidden state.

        Args:
            x: Token ids of shape [N, sequence_len].
            hidden: Initial GRU state of shape [num_layers * 2, N, hidden_dim].

        Returns:
            Scores of shape [N, output_dim, sequence_len].
        """
        # [N, sequence_len, embed_dim] -> [N, sequence_len, hidden_dim * 2]
        gru_out, _ = self.model(self.embed_model(x), hidden)
        scores = self.linear(gru_out)  # [N, sequence_len, output_dim]
        # Move the class dimension to axis 1.
        return scores.permute(0, 2, 1)


In [27]:
# Smoke-test the decoder: feed the whole target batch at once, using the
# encoder's final hidden state as the decoder's initial state.
decoder = DecoderRNN(vocab_size=VOCAB_SIZE_VI, 
                     embed_dim=64, 
                     hidden_dim=256, 
                     output_dim=VOCAB_SIZE_VI)

output_decoder = decoder(tgt_ids, output_encoder[1])
print(output_decoder.shape)

torch.Size([32, 6606, 20])


In [28]:
class Seq2Seq(nn.Module):
    """Container tying an encoder and a decoder into one module.

    Args:
        encoder: Module that encodes the source token ids.
        decoder: Module that produces target scores.
        BOS_IDX: Vocabulary index of the beginning-of-sequence symbol.
        device: Device identifier stored for later use.
    """

    def __init__(self, encoder, decoder, BOS_IDX, device):
        # nn.Module must be initialised before any submodule is assigned;
        # otherwise `self.encoder = encoder` raises AttributeError.
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.BOS_IDX = BOS_IDX
        self.device = device

    def forward(self, src_ids, tgt_ids):
        """Placeholder forward pass.

        TODO: not implemented yet; currently returns None.
        """
        pass

In [196]:
# Column of BOS ids, one per sequence in a batch of 32 (shape [32, 1]).
# NOTE(review): this rebinds `data`, which earlier held the loaded dataset;
# re-running the split cell after this one would fail.
data = torch.full(size=(32, 1), fill_value=BOS_IDX, dtype=torch.long, device="cpu")
data

tensor([[2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2],
        [2]])