## Installing necessary libraries

In [1]:
token = "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"

repo_link = "https://github.com/VaruN-dev-dev/Machine-Translation.git"
repo_name = repo_link.split('/')[-1].replace('.git', '')

# Splitting the URL and inserting the token
parts = repo_link.split('://')
final_link = f"{parts[0]}://{token}@" + parts[1]

!git clone {final_link}

!git config --global user.name "varun(colab/kaggle)"
!git config --global user.email "sanatoo.varun666@gmail.com"

%cd {repo_name}

Cloning into 'Machine-Translation'...
remote: Enumerating objects: 48, done.[K
remote: Counting objects: 100% (48/48), done.[K
remote: Compressing objects: 100% (33/33), done.[K
remote: Total 48 (delta 7), reused 44 (delta 6), pack-reused 0[K
Receiving objects: 100% (48/48), 90.84 KiB | 4.78 MiB/s, done.
Resolving deltas: 100% (7/7), done.
/kaggle/working/Machine-Translation


## Installing necessary libraries

In [2]:
!pip install transformers datasets accelerate



In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

import datasets
from datasets import load_dataset

MAX_SEQ_LEN = 300

## Loading the dataset

In [4]:
dataset = load_dataset("cfilt/iitb-english-hindi")
dataset

Downloading:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading and preparing dataset json/default (download: 181.38 MiB, generated: 427.93 MiB, post-processed: Unknown size, total: 609.31 MiB) to /root/.cache/huggingface/datasets/parquet/cfilt--iitb-english-hindi-911387c6837f8b91/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/85.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/190M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/cfilt--iitb-english-hindi-911387c6837f8b91/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
})

In [5]:
dataset["train"]['translation'][:3]

[{'en': 'Give your application an accessibility workout',
  'hi': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें'},
 {'en': 'Accerciser Accessibility Explorer',
  'hi': 'एक्सेर्साइसर पहुंचनीयता अन्वेषक'},
 {'en': 'The default plugin layout for the bottom panel',
  'hi': 'निचले पटल के लिए डिफोल्ट प्लग-इन खाका'}]

In [6]:
type(dataset)

datasets.dataset_dict.DatasetDict

## Dataset

In [7]:
class TextDataset(Dataset):
  def __init__(self,
               dataset: datasets.dataset_dict.DatasetDict,
               split: str = "train",
               return_only: str = None):
    self.data = dataset[split]["translation"]
    self.return_only = return_only

  def __len__(self,):
    return len(self.data)

  def __getitem__(self,
                  idx: int):
    en, hi = self.data[idx]["en"], self.data[idx]["hi"]
    if self.return_only == "en":
      return en
    elif self.return_only == "hi":
      return hi
    return en, hi

nd = TextDataset(dataset=dataset,
                 split="train")

In [8]:
nd[0]

('Give your application an accessibility workout',
 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें')

## Training a tokenizer for English

### Normalization

In [9]:
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents
normalizer = normalizers.Sequence([StripAccents()])
normalizer.normalize_str("Script Recărder")

'Script Recărder'

### Pre-tokenization

In [10]:
from tokenizers.pre_tokenizers import Whitespace
pre_tokenizer = Whitespace()
pre_tokenizer.pre_tokenize_str("Script Recorder")

[('Script', (0, 6)), ('Recorder', (7, 15))]

### Model

In [11]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
eng_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
eng_tokenizer.normalizer = normalizer
eng_tokenizer.pre_tokenizer = pre_tokenizer

In [12]:
from tokenizers.trainers import BpeTrainer
trainer = BpeTrainer(vocab_size=30_000, special_tokens=["[UNK]", "[PAD]"])
eng_dataset = TextDataset(dataset=dataset,
                          split="train",
                          return_only="en")
data = eng_dataset
print("Training...")
# IG The progress bar only shows up in terminal not in notebook...
eng_tokenizer.train_from_iterator(data, trainer=trainer, length=len(data))


Training...





## Custom decoder for english tokenizer

In [13]:
def reconstruct_text(encoded, prev_offset = 0, pad_token="[PAD]"):
    """
    Reconstructs the original input text
    """
    if len(encoded.tokens) != len(encoded.offsets):
        raise ValueError("Mismatched lengths between tokens and offsets")

    reconstructed_text = ""

    for i in range(len(encoded.tokens)):
        start, end = encoded.offsets[i]
        token_text = encoded.tokens[i]
        if token_text == pad_token:
          return reconstructed_text
        if start is not None and end is not None:
            if start == prev_offset:
              reconstructed_text += token_text
            else:
              reconstructed_text += ' ' + token_text
        prev_offset = end

    return reconstructed_text

def reconstruct_text_batch(encoded_list, pad_token = "[PAD]"):
  out = []
  for encoded in encoded_list:
    out.append(reconstruct_text(encoded, 0, pad_token))
  return out


## Checking English Tokenizer

### Single sentence

In [14]:
text = eng_dataset[100]
# eng_tokenizer = Tokenizer.from_file("eng_tokenizer.json")
out = eng_tokenizer.encode(text)

print(f"Original Text {text}")
print(f"tokens: {out.tokens}")
print(f"Ids: {out.ids}")
print(f"Reconstructed ids: {reconstruct_text(out)}")

# Currently the decoder is not able to combine the ids to give the correct
# input text

# from tokenizers import decoders
# eng_tokenizer.decoder = decoders.ByteLevel()

decoded = eng_tokenizer.decode(out.ids)
print(f"Decoded ids: {decoded}")


Original Text Script Recorder
tokens: ['Script', 'Record', 'er']
Ids: [7348, 7617, 751]
Reconstructed ids: Script Recorder
Decoded ids: Script Record er


### Batched sentences

In [15]:
text = [eng_dataset[10], eng_dataset[20]]

# Enabling padding for batched sentences
pad_id, unk_id = eng_tokenizer.token_to_id("[PAD]"), eng_tokenizer.token_to_id("[UNK]")
eng_tokenizer.enable_padding(pad_token="[PAD]", pad_id = pad_id, length = MAX_SEQ_LEN)

# also enabling truncation :).
eng_tokenizer.enable_truncation(max_length = MAX_SEQ_LEN)

o1 = eng_tokenizer.encode_batch(text)
print(f"Original: {text}")
for o in o1:
  print(o.tokens)
  print(o.ids)

print(f"Reconstructed ids:")
re = reconstruct_text_batch(o1)
re

Original: ['The color and opacity of the highlight fill.', '_ Monitor Events']
['The', 'color', 'and', 'opacity', 'of', 'the', 'highlight', 'fill', '.', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '

['The color and opacity of the highlight fill.', '_ Monitor Events']

In [16]:
eng_tokenizer.save("eng_tokenizer.json")

## Hindi Tokenizer

In [17]:
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents
normalizer = normalizers.Sequence([NFD()])

from tokenizers.pre_tokenizers import Whitespace
pre_tokenizer = Whitespace()

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
hi_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
hi_tokenizer.normalizer = normalizer
hi_tokenizer.pre_tokenizer = pre_tokenizer


# Training
from tokenizers.trainers import WordPieceTrainer
trainer = WordPieceTrainer(vocab_size=60_000, special_tokens=["[UNK]", "[PAD]", "[START]", "[END]"])
hi_dataset = TextDataset(dataset=dataset,
                          split="train",
                          return_only="hi")
data = hi_dataset
print("Training...")
hi_tokenizer.train_from_iterator(data, trainer=trainer, length=len(data))

hi_tokenizer.save("hi_tokenizer.json")

Training...





## Testing Hindi Tokenizer for Batched sentences

In [18]:
text = [hi_dataset[100], hi_dataset[101]]
hi_tokenizer = Tokenizer.from_file("hi_tokenizer.json")

# Enabling dynamic padding for batched sentences
pad_id, unk_id = hi_tokenizer.token_to_id("[PAD]"), hi_tokenizer.token_to_id("[UNK]")
hi_tokenizer.enable_padding(pad_token="[PAD]", pad_id = pad_id, length = MAX_SEQ_LEN)

hi_tokenizer.enable_truncation(max_length = MAX_SEQ_LEN)

In [19]:
from tokenizers import decoders
hi_tokenizer.decoder = decoders.WordPiece()

out = hi_tokenizer.encode_batch(text, add_special_tokens=False)
print(f"Original text {text}")
print(f"Output of the tokenizer")
for os in out:
  print(os.tokens)
  print(os.ids)
  print(len(os.ids))

print(f"Decoded tokens....")
decoded = hi_tokenizer.decode_batch([o.ids for o in out])
decoded

Original text ['लिपि रेकोर्डर', 'श्वानपुच्छ शैली की लिपियां निर्मित करता है']
Output of the tokenizer
['लिपि', 'रेक', '##ोर्ड', '##र', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '

['लिपि रेकोर्डर', 'श्वानपुच्छ शैली की लिपियां निर्मित करता है']

In [20]:
# Do this to check for available methods
# help(hi_tokenizer)
# or this to be more precise
# help(hi_tokenizer.decode_batch)

In [21]:
hi_tokenizer.save("hi_tokenizer.json")

## Common Utilities For Encoder and Decoder

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class PositionalEncoding(nn.Module):
    def __init__(self,
                 d_model: int,
                 max_seq_len: int):
        super().__init__()
        self.d_model = d_model
        self.max_seq_len = max_seq_len

    def forward(self) -> torch.tensor:
        pos = torch.arange(0, self.max_seq_len)
        denominator = torch.arange(0, self.d_model, 2)
        denominator = torch.pow(10_000, denominator/self.d_model)

        pos = pos.reshape(-1, 1)
        denominator = denominator.reshape(1, -1)
        even_pos = torch.sin(pos / denominator)
        odd_pos = torch.cos(pos / denominator)

        PE = torch.stack([even_pos, odd_pos], dim=2)
        PE = torch.flatten(PE, start_dim=1, end_dim=2)
        return PE


class MultiHeadAttention(nn.Module):
    def __init__(self,
                 input_dim: int,
                 d_model: int,
                 n_head: int):
        super().__init__()
        self.input_dim = input_dim
        self.d_model = d_model
        self.n_head = n_head
        self.h_dim = d_model // n_head
        self.qkv_layer = nn.Linear(input_dim, 3 * d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self,
                x: torch.tensor,
                mask: torch.tensor = None):
        B, sen_len, input_dim = x.size()
        qkv = self.qkv_layer(x)  # B, sen_len, 3 * d_model
        qkv = qkv.reshape(B, sen_len, self.n_head, self.h_dim * 3)
        qkv = qkv.permute(0, 2, 1, 3)
        q, k, v = qkv.chunk(3, dim=-1)
        d_k = q.size()[-1]
        att = (q @ k.transpose(-2, -1)) / torch.sqrt(torch.tensor(d_k))
        if mask is not None:
            # print(f"mask: {mask.shape}")
            # print(f"att: {att.shape}")
            # changing mask shape:
            mask = mask.unsqueeze(1).expand_as(att)
            att += mask
        att = F.softmax(att, dim=-1)
        new_emb = att @ v
        new_emb = new_emb.reshape(B, sen_len, self.n_head * self.h_dim)
        new_emb = self.linear_layer(new_emb)
        return att, new_emb


class LayerNormalization(nn.Module):
    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape = parameters_shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, x):
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        mean = x.mean(dim=dims, keepdim=True)
        var = ((x - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        y = (x - mean) / std
        out = self.gamma * y + self.beta
        return out


class FeedForwardNetwork(nn.Module):
    def __init__(self,
                 d_model: int,
                 ffn_hidden: int,
                 drop_prob: float):
        super().__init__()
        self.l = nn.Sequential(
            nn.Linear(d_model, ffn_hidden),
            nn.ReLU(),
            nn.Dropout(drop_prob),
            nn.Linear(ffn_hidden, d_model),
            nn.ReLU(),
            nn.Dropout(drop_prob),
        )

    def forward(self,
                x):
        out = self.l(x)
        return out


class MultiHeadCrossAttention(nn.Module):
    def __init__(self,
                 d_model: int,
                 n_head: int):

        super().__init__()
        self.q_layer = nn.Linear(d_model, d_model)
        self.k_layer = nn.Linear(d_model, d_model)
        self.v_layer = nn.Linear(d_model, d_model)
        self.n_head = n_head

    def forward(self,
                enc_out: torch.tensor,
                dec_out: torch.tensor):

        B, max_sen_len, d_model = enc_out.size()

        q = self.q_layer(dec_out)
        k = self.k_layer(enc_out)
        v = self.v_layer(enc_out)

        q = q.reshape(B, max_sen_len, self.n_head, d_model // self.n_head)
        k = k.reshape(B, max_sen_len, self.n_head, d_model // self.n_head)
        v = v.reshape(B, max_sen_len, self.n_head, d_model // self.n_head)

        q = q.permute(0, 2, 1, 3)
        k = k.permute(0, 2, 1, 3)
        v = v.permute(0, 2, 1, 3)

        att = (q @ k.transpose(-2, -1)) / torch.sqrt(torch.tensor(d_model))
        att = F.softmax(att, dim=-1)

        new_emb = att @ v
        new_emb = new_emb.reshape(B, max_sen_len, self.n_head * (d_model // self.n_head))

        return att, new_emb




## Encoder and Decoder Classes

In [23]:
class EncoderLayer(nn.Module):
    def __init__(self,
                 d_model: int,
                 ffn_hidden: int,
                 n_head: int,
                 drop_prob: float):
        super().__init__()
        self.m_att = MultiHeadAttention(input_dim=d_model,
                                        d_model=d_model,
                                        n_head=n_head)
        self.l_norm1 = LayerNormalization(parameters_shape=[d_model])
        self.ffn = FeedForwardNetwork(d_model=d_model,
                                      ffn_hidden=ffn_hidden,
                                      drop_prob=drop_prob)
        self.l_norm2 = LayerNormalization(parameters_shape=[d_model])

    def forward(self,
                x,
                mask=None):
        _, att = self.m_att(x, mask)
        att = self.l_norm1(att + x)

        out = self.ffn(att)
        out = self.l_norm2(att + out)

        return out



class SequentialEncoder(nn.Sequential):
    def forward(self, *inputs):
        x, self_attention_mask  = inputs
        for module in self._modules.values():
            x = module(x, self_attention_mask)
        return x


class Encoder(nn.Module):
    def __init__(self,
                 d_model: int,
                 ffn_hidden: int,
                 n_head: int,
                 drop_prob: float,
                 n_layers: int):
        super().__init__()
        self.l = SequentialEncoder(*[EncoderLayer(d_model=d_model,
                                              ffn_hidden=ffn_hidden,
                                              n_head=n_head,
                                              drop_prob=drop_prob) for _ in range(n_layers)])

    def forward(self,
                x: torch.tensor,
                mask = None) -> torch.tensor:
        out = self.l(x, mask)
        return out


class DecoderLayer(nn.Module):
    def __init__(self,
                 d_model: int,
                 ffn_hidden: int,
                 n_head: int,
                 drop_prob: float):
        super().__init__()
        self.masked_att = MultiHeadAttention(input_dim=d_model,
                                        d_model=d_model,
                                        n_head=n_head)
        self.ffn = FeedForwardNetwork(d_model=d_model,
                                      ffn_hidden=ffn_hidden,
                                      drop_prob=drop_prob)
        self.l_norm1 = LayerNormalization(parameters_shape=[d_model])
        self.l_norm2 = LayerNormalization(parameters_shape=[d_model])
        self.l_norm3 = LayerNormalization(parameters_shape=[d_model])


        self.mcross_att = MultiHeadCrossAttention(d_model=d_model,
                                                  n_head=n_head)



    def forward(self,
                x: torch.tensor,
                mask: torch.tensor,
                enc_out: torch.tensor):

        _, att = self.masked_att(x, mask)
        att = self.l_norm1(att + x)

        _, out = self.mcross_att(enc_out, att)
        out = self.l_norm2(out + att)

        f_out = self.ffn(out)
        out = self.l_norm2(f_out + out)

        return out


class SequentialDecoder(nn.Sequential):
    def forward(self, *inputs):
        x, mask, y = inputs
        for module in self._modules.values():
            y = module(x, mask, y) #30 x 200 x 512
        return y

class Decoder(nn.Module):
    def __init__(self,
                 d_model: int,
                 ffn_hidden: int,
                 n_head: int,
                 drop_prob: float,
                 n_layers: int):
        super().__init__()
        self.l = SequentialDecoder(*[DecoderLayer(d_model=d_model,
                                              ffn_hidden=ffn_hidden,
                                              n_head=n_head,
                                              drop_prob=drop_prob) for _ in range(n_layers)])

    def forward(self,
                x: torch.tensor,
                mask: torch.tensor,
                enc_out: torch.tensor) -> torch.tensor:
        out = self.l(x, mask, enc_out)
        return out

## Simple testing of Encoder and Decoder

In [24]:
d_model = 512
n_heads = 8
drop_prob = 0.1
batch_size = 32
max_seq_len = 200
ffn_hidden = 2048
n_layers = 5
x = torch.randn( (batch_size, max_seq_len, d_model))

dec = Decoder(d_model,
             ffn_hidden,
             n_heads,
             drop_prob,
             1)
# mask = torch.full([max_seq_len, max_seq_len] , float('-inf'))
# mask = torch.triu(mask, diagonal=1)
print(dec(x=x, enc_out=x, mask=None).shape)

enc = EncoderLayer(d_model,
             ffn_hidden,
             n_heads,
             drop_prob)
print(enc(x).shape)

enc = Encoder(d_model,
             ffn_hidden,
             n_heads,
             drop_prob,
              2)
print(enc(x).shape)

torch.Size([32, 200, 512])
torch.Size([32, 200, 512])
torch.Size([32, 200, 512])


## Masks for Encoder and Decoder (Padding mask and Look ahead mask)

In [25]:
def create_pad_mask(att_mask, pad_token_id = 1, pad_item = 1e-9):
  """
  Gives padding mask according to att_mask

  Args:
    att_mask: list of token.ids
    pad_token_id: Padding token id
    pad_item: Value to fill in the mask for padding token
  """
  s = len(att_mask)
  att_mask = torch.tensor(att_mask)
  pad_mask = torch.zeros((s, s))
  for i in range(s):
    if att_mask[i] == pad_token_id:
      pad_mask[i, :] = pad_item
      pad_mask[:, i] = pad_item
  return pad_mask

def create_pad_mask_b(att_masks,
                      pad_token_id = 1,
                      pad_item = 1e-9):
  pass


## Creating dataloaders, masks, inputs, outputs

In [26]:
# Creating datasets
# They will return a single sentence
en_dataset = TextDataset(dataset=dataset,
                         split="train",
                         return_only="en")

hi_dataset = TextDataset(dataset=dataset,
                         split="train",
                         return_only="hi")

batch_size = 2
def encoder_collate_fn(x, tokenizer):
  """
  x: List of sentences
  """
  encodings = tokenizer.encode_batch(x)
  L = len(encodings[0].ids)
  input_tensor = torch.zeros((len(x), L), dtype=torch.long)
  mask = torch.zeros((len(x), L, L))
  for i, enc in enumerate(encodings):
    input_tensor[i] = torch.tensor(enc.ids)
    pad_mask = create_pad_mask(enc.ids)
    mask[i] = pad_mask

  return input_tensor, mask


def decoder_collate_fn(x, tokenizer):
  """
  x: List of sentences
  """
  start_token_id = tokenizer.token_to_id("[START]")
  pad_token_id = tokenizer.token_to_id("[PAD]")
  end_token_id = tokenizer.token_to_id("[END]")

  encodings = tokenizer.encode_batch(x)
  L = len(encodings[0].ids)

  input_tensor = torch.zeros((len(x), L), dtype=torch.long)
  out_tensor = torch.zeros((len(x), L), dtype=torch.long)

  mask = torch.zeros((len(x), L, L))
  look_ahead_mask = torch.tril(torch.ones(L, L))
  look_ahead_mask[look_ahead_mask == 0] = -torch.inf
  look_ahead_mask[look_ahead_mask == 1] = 0

  for i, enc in enumerate(encodings):
    input_tensor[i] = torch.tensor(enc.ids)
    out_tensor[i] = input_tensor[i]

    # including [START] token id at first of input_tensor
    input_tensor[i] = torch.cat((torch.tensor([start_token_id]), input_tensor[i, 0:-1]))

    # add [END] token id at the end of out_tensor but before the [PAD] tokens
    for a in range(len(out_tensor[i].tolist())):
      if out_tensor[i, a] == pad_token_id:
        out_tensor[i, a] = end_token_id
        break

    pad_mask = create_pad_mask(enc.ids)
    mask[i] = pad_mask + look_ahead_mask

  return input_tensor, out_tensor, mask

# Tokenizers..
eng_tokenizer = Tokenizer.from_file("eng_tokenizer.json")
hi_tokenizer = Tokenizer.from_file("hi_tokenizer.json")

# Creating dataloaderss
# They will return respective input tensor with their masks.
en_train_loader = DataLoader(dataset=en_dataset, batch_size=batch_size, collate_fn = lambda b: encoder_collate_fn(b, eng_tokenizer))
hi_train_loader = DataLoader(dataset=hi_dataset, batch_size=batch_size, collate_fn = lambda b: decoder_collate_fn(b, hi_tokenizer))


In [27]:
a, b, c = next(iter(hi_train_loader))
a.shape, b.shape, c.shape

(torch.Size([2, 300]), torch.Size([2, 300]), torch.Size([2, 300, 300]))

In [28]:
# The output is long :)
# hi_tokenizer.decode(a[0].tolist(), skip_special_tokens=False), hi_tokenizer.decode(b[0].tolist(), skip_special_tokens=False)

## Transformer

In [29]:
class Transformer(nn.Module):
    def __init__(self,
               d_model: int,
               ffn_hidden: int,
               n_head: int,
               drop_prob: int,
               n_layers: int,
               hi_vocab_size: int):
            super().__init__()

            self.encoder = Encoder(d_model=d_model,
                                ffn_hidden=ffn_hidden,
                                n_head=n_head,
                                drop_prob=drop_prob,
                                n_layers=n_layers)
            self.decoder = Decoder(d_model=d_model,
                                ffn_hidden=ffn_hidden,
                                n_head=n_head,
                                drop_prob=drop_prob,
                                n_layers=n_layers)
            self.l = nn.Linear(d_model, hi_vocab_size)

    def forward(self,
                enc_b,
                dec_b,
                enc_mask,
                dec_mask):
          enc_out = self.encoder(enc_b, enc_mask)
          dec_out = self.decoder(dec_b, dec_mask, enc_out)
          out = self.l(dec_out)
          return out

## Testing Transformer Model for a Dry run...

In [30]:
class MachineTranslation(nn.Module):
    def __init__(self,
                 d_model,
                 ffn_hidden,
                 n_head,
                 drop_prob,
                 n_layers,
                 eng_vocab_size,
                 hi_vocab_size,
                 device="cpu"):

        super().__init__()
        self.transformer = Transformer(d_model=d_model,
                                       ffn_hidden=ffn_hidden,
                                       n_head=n_head,
                                       drop_prob=drop_prob,
                                       n_layers=1,
                                       hi_vocab_size=hi_vocab_size)

        self.pos_enc = PositionalEncoding(d_model, MAX_SEQ_LEN)
        self.hi_emb = nn.Embedding(hi_vocab_size, d_model)
        self.eng_emb = nn.Embedding(eng_vocab_size, d_model)
        self.device = device

    def forward(self,
                enc_b,
                dec_b,
                enc_mask,
                dec_mask):

        # self.pos_enc = self.pos_enc.to(self.device)
        # print(f"pos: {self.pos_enc().device}")
        # print(f"pos: {self.hi_emb.device}")
        # print(f"pos: {self.eng_emb.device}")
        enc_b = self.eng_emb(enc_b) + self.pos_enc().to(self.device)
        dec_b = self.hi_emb(dec_b) + self.pos_enc().to(self.device)
        out = self.transformer(enc_b,
                          dec_b,
                          enc_mask,
                          dec_mask)
        return out

In [53]:
d_model = 512
n_head = 8
drop_prob = 0.1
batch_size = 32
max_seq_len = 200
ffn_hidden = 2048
n_layers = 5

eng_vocab_size = eng_tokenizer.get_vocab_size()
hi_vocab_size = hi_tokenizer.get_vocab_size()

trans = MachineTranslation(d_model=d_model,
            ffn_hidden=ffn_hidden,
            n_head=n_head,
            drop_prob=drop_prob,
            n_layers=1,
            eng_vocab_size = eng_vocab_size,
            hi_vocab_size=hi_vocab_size)

eng_sen, enc_mask = next(iter(en_train_loader))
hi_sen, hi_sen_out, dec_mask = next(iter(hi_train_loader))


print(f"Passing english sentences through the transformer")
print(f"eng_sen: {eng_sen.shape}")
print(f"hi_sen: {hi_sen.shape}")

print(f"enc-mask: {enc_mask.shape}")
print(f"dec-mask: {dec_mask.shape}")
print(f"hi_sen-out: {hi_sen_out.shape}")

out = trans(eng_sen,
      hi_sen,
      enc_mask,
      dec_mask)

print(f"English sentences processeed")
criterion = nn.CrossEntropyLoss()
print(f"Out {out.shape}")
optim = torch.optim.Adam(params = trans.parameters())
loss = criterion(out.view(-1, hi_vocab_size),
                 hi_sen_out.view(-1))
print(f"Loss: {loss.item()}")
print(f"Backprop...")
loss.backward()
print(f"Done...")

Passing english sentences through the transformer
eng_sen: torch.Size([2, 300])
hi_sen: torch.Size([2, 300])
enc-mask: torch.Size([2, 300, 300])
dec-mask: torch.Size([2, 300, 300])
hi_sen-out: torch.Size([2, 300])
English sentences processeed
Out torch.Size([2, 300, 60000])
Loss: 11.498117446899414
Backprop...
Done...


In [32]:
start_token_id, end_token_id, pad_token_id = hi_tokenizer.token_to_id("[START]"), hi_tokenizer.token_to_id("[END]"), hi_tokenizer.token_to_id("[PAD]")
start_token_id, end_token_id, pad_token_id

(2, 3, 1)

In [33]:
hi_tokenizer.decode([10])

'%'

### Testing translation...

In [59]:
def translate_sen(model,
                  eng_sen: str,
                  eng_tokenizer,
                  hi_tokenizer,
                  MAX_SEQ_LEN,
                  max_len: int,
                  end_token_id: int,
                  start_token_id: int,
                  pad_token_id: int,
                  device) -> str:
    
    """
    Inferences the model to give translation of given english sentence
    
    Args:
        eng_sen: str
            English text
        eng_tokenizer:
            Tokenizer to encode and decode english sentence
        hi_tokenizer:
            Tokenize to encode and decode hindi sentence
        MAX_SEQ_LEN: int
            Maximum length of sentence/tensor on which the model was trained on
        max_len: int
            maximum len of translated sentence
        end_token_id: int
            [END] token id for hi_tokenizer
        start_token_id: ...
    
    Returns:
        Translated English sentence in Hindi 
    """
    

    model.to(device)
    model.eval()

    with torch.inference_mode():
        eng_sen_ids = eng_tokenizer.encode(eng_sen).ids
        eng_sen_ids = torch.tensor([eng_sen_ids])
        eng_sen_ids = eng_sen_ids.to(device)

        # Filling with PAD token
        hi_sen = torch.full((1, MAX_SEQ_LEN), pad_token_id)
        hi_sen[0, 0] = start_token_id
        hi_sen = hi_sen.to(device)

        # To keep track of where to put next token in hi_sen
        idx = 1

        trans_sen = ""
        next_token_id = -1
        
#         print(f"eng_sen_ids: {eng_sen_ids.shape}")
#         print(f"hi_sen: {hi_sen.shape}")

        while max_len != 0 and next_token_id != end_token_id:

            enc_mask = create_pad_mask(hi_sen.view(-1).tolist()).unsqueeze(0).to(device)
            dec_mask = create_pad_mask(eng_sen_ids.view(-1).tolist()).unsqueeze(0).to(device)
#             print(f"enc_mask: {enc_mask.shape}")
#             print(f"dec_mask: {dec_mask.shape}")
#             print(f"MAX...: {MAX_SEQ_LEN}")
            hi_sen_pred = model(enc_b=eng_sen_ids,
                        dec_b=hi_sen,
                        enc_mask=enc_mask,
                        dec_mask=dec_mask)
            
            hi_sen_pred = hi_sen_pred[:, 0, :]
            next_token_id = torch.argmax(hi_sen_pred, dim=-1)

            trans_sen += hi_tokenizer.decode(next_token_id.tolist())
            hi_sen[0, idx] = next_token_id

            idx += 1
            max_len -= 1
            
        return trans_sen

## Testing the translate function

In [60]:
# Hyper-parameters
d_model = 512
n_head = 8
drop_prob = 0.1
batch_size = 32
max_seq_len = 200
ffn_hidden = 2048
n_layers = 5
epochs = 1

eng_vocab_size = eng_tokenizer.get_vocab_size()
hi_vocab_size = hi_tokenizer.get_vocab_size()
criterion = nn.CrossEntropyLoss()
optim = torch.optim.Adam(params = trans.parameters())
device = "cuda" if torch.cuda.is_available() else "cpu"
# eng_tokenizer = Tokenizer.from_file("eng_tokenizer.json")
# hi_tokenizer = Tokenizer.from_file("hi_tokenizer.json")

model = MachineTranslation(d_model=d_model,
            ffn_hidden=ffn_hidden,
            n_head=n_head,
            drop_prob=drop_prob,
            n_layers=1,
            eng_vocab_size = eng_vocab_size,
            hi_vocab_size=hi_vocab_size,
            device=device)

eng_sen = "Is it working?"
hi_sen = translate_sen(model=model,
              eng_sen=eng_sen,
             eng_tokenizer = eng_tokenizer,
             hi_tokenizer = hi_tokenizer,
             MAX_SEQ_LEN = MAX_SEQ_LEN,
             max_len = 5,
             end_token_id = end_token_id,
             start_token_id = start_token_id,
              pad_token_id = pad_token_id,
              device=device)
print(f"English sentence: {eng_sen}")
print(f"Translated Hindi sentence: {hi_sen}")

English sentence: Is it working?
Translated Hindi sentence: समायसमायसमायसमायसमाय


## Training

In [None]:
def train(model,
          criterion,
          optim,
          eng_loader,
          hi_loader,
          device,
          scheduler,
          epochs,
          hi_vocab_size,
          eng_tokenizer,
          hi_tokenizer):

    model.to(device)
    model.train()

    losses = []
    iteration = 0

    for epoch in tqdm(range(epochs)):
        for (eng_sen, enc_mask), (hi_sen, hi_sen_out, dec_mask) in tqdm(zip(eng_loader, hi_loader)):

            # Move tensors to respective device
            eng_sen = eng_sen.to(device)
            enc_mask = enc_mask.to(device)
            hi_sen = hi_sen.to(device)
            hi_sen_out = hi_sen_out.to(device)
            dec_mask = dec_mask.to(device)

            out = model(enc_b = eng_sen,
                        dec_b = hi_sen,
                        enc_mask = enc_mask,
                        dec_mask = dec_mask)

            optim.zero_grad()
            loss = criterion(out.view(-1, hi_vocab_size),
                             hi_sen_out.view(-1))
            loss.backward()
            optim.step()

            losses.append(loss.item())

            if iteration % 100 == 0:
                print(f"Average: {sum(losses)/len(losses)}")
                print(f"English sen: {eng_tokenizer.decode(eng_sen[0].tolist())}")
                print(f"Target Hindi sen: {hi_tokenizer.decode(hi_sen[0].tolist())}")
                print(f"Predicted sentence: {hi_tokenizer.decode(torch.argmax(out, dim=-1)[0].tolist())}")
                print("-------------------------------------------\n")

            iteration += 1
    return losses

## Training the model

In [None]:
d_model = 512
n_head = 8
drop_prob = 0.1
batch_size = 32
max_seq_len = 200
ffn_hidden = 2048
n_layers = 5
epochs = 1

eng_vocab_size = eng_tokenizer.get_vocab_size()
hi_vocab_size = hi_tokenizer.get_vocab_size()
criterion = nn.CrossEntropyLoss()
optim = torch.optim.Adam(params = trans.parameters())
device = "cuda" if torch.cuda.is_available() else "cpu"
# eng_tokenizer = Tokenizer.from_file("eng_tokenizer.json")
# hi_tokenizer = Tokenizer.from_file("hi_tokenizer.json")

model = MachineTranslation(d_model=d_model,
            ffn_hidden=ffn_hidden,
            n_head=n_head,
            drop_prob=drop_prob,
            n_layers=1,
            eng_vocab_size = eng_vocab_size,
            hi_vocab_size=hi_vocab_size,
            device=device)

losses = train(model=model,
          criterion=criterion,
          optim=optim,
          eng_loader = en_train_loader,
          hi_loader = hi_train_loader,
          device=device,
          scheduler= None,
          epochs = epochs,
          hi_vocab_size= hi_vocab_size,
          eng_tokenizer = eng_tokenizer,
          hi_tokenizer = hi_tokenizer)

torch.save(model.state_dict(), 'model_weights_10.pth')

In [None]:
hi_tokenizer.decode(hi_sen[0].tolist())

In [None]:
eng_tokenizer.decode(eng_sen[0].tolist())

In [None]:
out.shape, hi_sen.shape

## Training Loop

In [None]:
torch.argmax(out, dim=-1)[0].shape, hi_sen[0].shape

In [None]:
len(hi_sen[0].tolist()), len(torch.argmax(out, dim=-1).tolist())

In [None]:
l = [1, 2, 3]
l[-1]