## Motivation for Attention

In [1]:
import sys 
import platform
import numpy as np 
import pandas as pd 
import string
import re 
import math 
from string import digits 
from tqdm import tqdm
import sklearn as sk 
import torch
from torch import nn 
import torch.nn.functional as nnfunc 
from torch.utils.data import Dataset, DataLoader

In [2]:
# What version of Python do you have?
print(f"python Platform: {platform.platform()}")
print(f"PyTorch Version: {torch.__version__}")
print(f"Python: {sys.version}")
print(f"Pandas: {pd.__version__}")
print(f"Scikit-Learn: {sk.__version__}")

python Platform: macOS-15.4-arm64-arm-64bit
PyTorch Version: 2.6.0
Python: 3.11.7 (main, Dec 15 2023, 12:09:56) [Clang 14.0.6 ]
Pandas: 2.2.3
Scikit-Learn: 1.6.1


In [3]:
def get_device():
    has_gpu = torch.cuda.is_available()
    has_mps = torch.backends.mps.is_built()
    print ("NVIDIA/CUDA GPU is", "available" if has_gpu else "NOT AVAILABLE")
    print("MPS (Apple Metal) is", "AVAILABLE" if has_mps else "NOT AVAILABLE")
    return torch.device('mps') if has_mps else torch.device('cuda') if has_gpu else torch.device('cpu')
device = get_device()
print(f"Target device is {device}")

NVIDIA/CUDA GPU is NOT AVAILABLE
MPS (Apple Metal) is AVAILABLE
Target device is mps


In [4]:
def scaled_dot_product(q, k, v, mask=None):
    d_k = q.size()[-1]
    scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
    if mask is not None:
        scaled = scaled.permute(1, 0, 2, 3) + mask
        scaled = scaled.permute(1, 0, 2, 3)
    attention = nnfunc.softmax(scaled, dim=-1)
    values = torch.matmul(attention, v)
    return values, attention

In [5]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self):
        even_i = torch.arange(0, self.d_model, 2).float()
        denominator = torch.pow(10000, even_i/self.d_model)
        position = (torch.arange(self.max_sequence_length)
                          .reshape(self.max_sequence_length, 1))
        even_PE = torch.sin(position / denominator)
        odd_PE = torch.cos(position / denominator)
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        return PE

In [6]:
class SentenceEmbedding(nn.Module):
    "For a given sentence, create an embedding"
    def __init__(self, max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super().__init__()
        self.vocab_size = len(language_to_index)
        self.max_sequence_length = max_sequence_length
        self.embedding = nn.Embedding(self.vocab_size, d_model)
        self.language_to_index = language_to_index
        self.position_encoder = PositionalEncoding(d_model, max_sequence_length)
        self.dropout = nn.Dropout(p=0.1)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN
    
    def batch_tokenize(self, batch, start_token, end_token):
        def tokenize(sentence, start_token, end_token):
            sentence_word_indices = []
            for token in sentence.split():
                if token in self.language_to_index:
                    sentence_word_indices.append(self.language_to_index[token])
            if start_token:
                sentence_word_indices.insert(0, self.language_to_index[self.START_TOKEN])
            if end_token:
                sentence_word_indices.append(self.language_to_index[self.END_TOKEN]) 
            for _ in range(len(sentence_word_indices), self.max_sequence_length):
                sentence_word_indices.append(self.language_to_index[self.PADDING_TOKEN])
            return torch.tensor(sentence_word_indices)

        tokenized = []
        for sentence_num in range(len(batch)):
           tokenized.append(tokenize(batch[sentence_num], start_token, end_token))
        tokenized = torch.stack(tokenized)
        return tokenized.to(device)
    
    def forward(self, x, start_token, end_token): # sentence
        x = self.batch_tokenize(x, start_token, end_token)
        x = self.embedding(x)
        pos = self.position_encoder().to(device)
        x = self.dropout(x + pos)
        return x

In [7]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.qkv_layer = nn.Linear(d_model , 3 * d_model)
        self.linear_layer = nn.Linear(d_model, d_model)
    
    def forward(self, x, mask):
        batch_size, sequence_length, d_model = x.size()
        qkv = self.qkv_layer(x)
        qkv = qkv.reshape(batch_size, sequence_length, self.num_heads, 3 * self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3)
        q, k, v = qkv.chunk(3, dim=-1)
        values, attention = scaled_dot_product(q, k, v, mask)
        values = values.permute(0, 2, 1, 3).reshape(batch_size, sequence_length, self.num_heads * self.head_dim)
        out = self.linear_layer(values)
        return out

In [8]:
class LayerNormalization(nn.Module):
    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape=parameters_shape
        self.eps=eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta =  nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        mean = inputs.mean(dim=dims, keepdim=True)
        var = ((inputs - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        y = (inputs - mean) / std
        out = self.gamma * y + self.beta
        return out

In [9]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, hidden, drop_prob=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x

In [10]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = LayerNormalization(parameters_shape=[d_model])
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNormalization(parameters_shape=[d_model])
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x, self_attention_mask):
        residual_x = x.clone()
        x = self.attention(x, mask=self_attention_mask)
        x = self.dropout1(x)
        x = self.norm1(x + residual_x)
        residual_x = x.clone()
        x = self.ffn(x)
        x = self.dropout2(x)
        x = self.norm2(x + residual_x)
        return x
 

In [11]:
class SequentialEncoder(nn.Sequential):
    def forward(self, *inputs):
        x, self_attention_mask  = inputs
        for module in self._modules.values():
            x = module(x, self_attention_mask)
        return x

In [12]:
class Encoder(nn.Module):
    def __init__(self, 
                 d_model, 
                 ffn_hidden, 
                 num_heads, 
                 drop_prob, 
                 num_layers,
                 max_sequence_length,
                 language_to_index,
                 START_TOKEN,
                 END_TOKEN, 
                 PADDING_TOKEN):
        super().__init__()
        self.sentence_embedding = SentenceEmbedding(max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.layers = SequentialEncoder(*[EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
                                      for _ in range(num_layers)])

    def forward(self, x, self_attention_mask, start_token, end_token):
        x = self.sentence_embedding(x, start_token, end_token)
        x = self.layers(x, self_attention_mask)
        return x

In [13]:
class MultiHeadCrossAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.kv_layer = nn.Linear(d_model , 2 * d_model)
        self.q_layer = nn.Linear(d_model , d_model)
        self.linear_layer = nn.Linear(d_model, d_model)
    
    def forward(self, x, y, mask):
        batch_size, sequence_length, d_model = x.size() # in practice, this is the same for both languages...so we can technically combine with normal attention
        kv = self.kv_layer(x)
        q = self.q_layer(y)
        kv = kv.reshape(batch_size, sequence_length, self.num_heads, 2 * self.head_dim)
        q = q.reshape(batch_size, sequence_length, self.num_heads, self.head_dim)
        kv = kv.permute(0, 2, 1, 3)
        q = q.permute(0, 2, 1, 3)
        k, v = kv.chunk(2, dim=-1)
        values, attention = scaled_dot_product(q, k, v, mask) # We don't need the mask for cross attention, removing in outer function!
        values = values.permute(0, 2, 1, 3).reshape(batch_size, sequence_length, d_model)
        out = self.linear_layer(values)
        return out

In [14]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.layer_norm1 = LayerNormalization(parameters_shape=[d_model])
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.encoder_decoder_attention = MultiHeadCrossAttention(d_model=d_model, num_heads=num_heads)
        self.layer_norm2 = LayerNormalization(parameters_shape=[d_model])
        self.dropout2 = nn.Dropout(p=drop_prob)

        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.layer_norm3 = LayerNormalization(parameters_shape=[d_model])
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, x, y, self_attention_mask, cross_attention_mask):
        _y = y.clone()
        y = self.self_attention(y, mask=self_attention_mask)
        y = self.dropout1(y)
        y = self.layer_norm1(y + _y)

        _y = y.clone()
        y = self.encoder_decoder_attention(x, y, mask=cross_attention_mask)
        y = self.dropout2(y)
        y = self.layer_norm2(y + _y)

        _y = y.clone()
        y = self.ffn(y)
        y = self.dropout3(y)
        y = self.layer_norm3(y + _y)
        return y

In [15]:
class SequentialDecoder(nn.Sequential):
    def forward(self, *inputs):
        x, y, self_attention_mask, cross_attention_mask = inputs
        for module in self._modules.values():
            y = module(x, y, self_attention_mask, cross_attention_mask)
        return y

In [16]:
class Decoder(nn.Module):
    def __init__(self, 
                 d_model, 
                 ffn_hidden, 
                 num_heads, 
                 drop_prob, 
                 num_layers,
                 max_sequence_length,
                 language_to_index,
                 START_TOKEN,
                 END_TOKEN, 
                 PADDING_TOKEN):
        super().__init__()
        self.sentence_embedding = SentenceEmbedding(max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.layers = SequentialDecoder(*[DecoderLayer(d_model, ffn_hidden, num_heads, drop_prob) for _ in range(num_layers)])

    def forward(self, x, y, self_attention_mask, cross_attention_mask, start_token, end_token):
        y = self.sentence_embedding(y, start_token, end_token)
        y = self.layers(x, y, self_attention_mask, cross_attention_mask)
        return y

In [17]:
class Transformer(nn.Module):
    def __init__(self, 
                d_model, 
                ffn_hidden, 
                num_heads, 
                drop_prob, 
                num_layers,
                max_sequence_length, 
                kn_vocab_size,
                english_to_index,
                hindi_to_index,
                START_TOKEN, 
                END_TOKEN, 
                PADDING_TOKEN
                ):
        super().__init__()
        self.encoder = Encoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length, english_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.decoder = Decoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length, hindi_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.linear = nn.Linear(d_model, kn_vocab_size)
        self.device = device

    def forward(self, 
                x, 
                y, 
                encoder_self_attention_mask=None, 
                decoder_self_attention_mask=None, 
                decoder_cross_attention_mask=None,
                enc_start_token=False,
                enc_end_token=False,
                dec_start_token=False, # We should make this true
                dec_end_token=False): # x, y are batch of sentences
        x = self.encoder(x, encoder_self_attention_mask, start_token=enc_start_token, end_token=enc_end_token)
        out = self.decoder(x, y, decoder_self_attention_mask, decoder_cross_attention_mask, start_token=dec_start_token, end_token=dec_end_token)
        out = self.linear(out)
        return out

In [18]:
# Define parameters
d_model = 512 # Embedding size
batch_size = 32 # batch size to be feed to model single time 32hindi sentence 32 english sentence
ffn_hidden = 2048 # Number of hidden layer
num_heads = 8 # Number of attention heads
drop_prob = 0.1 # Percentage of neuron dropout
num_layers = 8 # Number of decoder-encoder
max_sequence_length = 90 #I've checked it in exploratary analysis-> 90 is ideal
chunk_size = 25000  # Process 25,000 rows at a time
num_epochs=10 # Number of epocs
START_TOKEN = '<START>' # Start token
PADDING_TOKEN = '<PADDING>' # Padding token
END_TOKEN = '<END>' # End token

## Preparing Data

### Read hindi file clean a bit and write it down again.

In [44]:
base_path = '/Users/siddharthchaudhary/Documents/git_repos/AI-Models/Transformer_from_scratch/'

In [45]:
train_en_file = base_path + 'Dataset/vocab_and_training_files/train_en.txt'
train_hn_file = base_path + 'Dataset/vocab_and_training_files/train_hn.txt'
english_vocab_file = base_path + 'Dataset/vocab_and_training_files/english_vocab.txt'
hindi_vocab_file = base_path + 'Dataset/vocab_and_training_files/hindi_vocab.txt'

model_path = base_path + "model/transformer_v1_hindi_english.pth"

In [43]:
model_path

'/Users/siddharthchaudhary/Documents/git_repos/AI-Models/Transformer_from_scratch/Dataset/model/transformer_v1_hindi_english.pth'

### Reading the hindi, english text and vocab

In [21]:
with open(train_en_file, 'r') as file:
    english_sentences = file.readlines()
english_sentences = [sentence.rstrip('\n') for sentence in english_sentences]
english_sentences[:10]

['help',
 'jump',
 'jump',
 'jump',
 'hello',
 'hello',
 'cheers',
 'cheers',
 'got it',
 'i am ok']

In [22]:
with open(train_hn_file, 'r') as file:
    hindi_sentences = file.readlines()
hindi_sentences = [sentence.rstrip('\n') for sentence in hindi_sentences]
hindi_sentences[: 10]

['बचाओ',
 'उछलो',
 'कूदो',
 'छलांग',
 'नमस्ते।',
 'नमस्कार।',
 'वाहवाह',
 'चियर्स',
 'समझे कि नहीं',
 'मैं ठीक हूँ।']

In [23]:
# Initialize variables to store the maximum length sentence and its word count
max_length_sentence = ""
max_word_count = 0

# Iterate through sentences
for sentence in english_sentences:
    word_count = len(sentence.split())  # Count words in the current sentence
    if len(sentence) > len(max_length_sentence):  # Compare sentence length
        max_length_sentence = sentence
        max_word_count = word_count  # Update the word count

print(f"Longest Sentence: {max_length_sentence}")
print(f"Number of Words: {max_word_count}")

Longest Sentence: air pollution the release of chemicals and particulates into the atmosphere common gaseous air pollutants include carbon monoxide sulfur dioxide chlorofluorocarbons cfcs and nitrogen oxides produced by industry and motor vehicles photochemical ozone and smog are created as nitrogen oxides and hydrocarbons react to sunlight particulate matter or fine dust is characterized by their micrometre size light pollution includes light trespass overillumination and astronomical interferencenoise pollution which encompasses roadway noise aircraft noise industrial noise as well as highintensity sonar
Number of Words: 79


In [24]:
with open(hindi_vocab_file, 'r') as file:
    hindi_vocab = file.readlines ()
hindi_vocab = [word.rstrip('\n') for word in hindi_vocab]
print(len(hindi_vocab) )

75798


In [25]:
with open (english_vocab_file, 'r') as file:
    english_vocab= file.readlines()
english_vocab = [word.rstrip('\n') for word in english_vocab]
print(len(english_vocab))
print(english_vocab[:10])

68053
['gangesin', 'takneki', 'entire', 'aadh', 'blunder', 'jangunarayan', 'langford', 'powerfight', 'bacteria', 'sachish']


In [26]:
hindi_vocab.insert(0, START_TOKEN)
hindi_vocab.append(PADDING_TOKEN)
hindi_vocab.append(END_TOKEN)
print(len(hindi_vocab))
print(hindi_vocab[:10])

75801
['<START>', 'दिखीं', 'प्रगट', 'वलो', 'मिथ्यापवाद', 'बिग्', 'जुड़', 'गयामिलियन', 'गोलाबारी', '१६६८']


In [27]:
english_vocab.insert(0, START_TOKEN) 
english_vocab.append(PADDING_TOKEN)
english_vocab.append(END_TOKEN)
print(len(english_vocab))
print(english_vocab[:20])

68056
['<START>', 'gangesin', 'takneki', 'entire', 'aadh', 'blunder', 'jangunarayan', 'langford', 'powerfight', 'bacteria', 'sachish', 'digitally', 'rajsthani', 'detonate', 'money', 'penters', 'reliefmale', 'points', 'illustrates', 'servicewhich']


In [28]:
index_to_hindi = {k:v for k,v in enumerate(hindi_vocab)}
hindi_to_index = {v:k for k,v in enumerate (hindi_vocab)}
index_to_english = {k:v for k,v in enumerate (english_vocab)}
english_to_index = {v:k for k,v in enumerate(english_vocab)}

In [29]:
# Get the first 20 key-value pairs
first_20_items = list(index_to_hindi.items())[:10]
# Print keys and values together
for key, value in first_20_items:
    print(f"{key}: {value}")

0: <START>
1: दिखीं
2: प्रगट
3: वलो
4: मिथ्यापवाद
5: बिग्
6: जुड़
7: गयामिलियन
8: गोलाबारी
9: १६६८


In [30]:
# Get the first 20 key-value pairs
first_20_items = list(index_to_english.items())[:10]
# Print keys and values together
for key, value in first_20_items:
    print(f"{key}: {value}")

0: <START>
1: gangesin
2: takneki
3: entire
4: aadh
5: blunder
6: jangunarayan
7: langford
8: powerfight
9: bacteria


In [31]:
print(f"Number of sentences: {len(hindi_sentences)}")
print(f"Number of sentences: {len(english_sentences)}")

Number of sentences: 126514
Number of sentences: 126514


In [32]:
class TextDataset(Dataset):
    def __init__(self, english_sentences, hindi_sentences):
        self.english_sentences = english_sentences
        self.hindi_sentences = hindi_sentences
    
    def __len__(self) :
        return len(self.english_sentences)
    
    def __getitem__(self, idx):
        return self.english_sentences[idx], self.hindi_sentences[idx]

## Training the model Model

In [33]:
hn_vocab_size = len(hindi_vocab)

In [34]:
transformer = Transformer(
    d_model,
    ffn_hidden,
    num_heads, 
    drop_prob, 
    num_layers, 
    max_sequence_length, 
    hn_vocab_size,
    english_to_index, 
    hindi_to_index,
    START_TOKEN, 
    END_TOKEN, 
    PADDING_TOKEN)

In [35]:
criterian = nn.CrossEntropyLoss(ignore_index=hindi_to_index[PADDING_TOKEN],
                                reduction= 'none')
# When computing the loss, we are ignoring cases when the label is the padding token
for params in transformer.parameters():
    if params.dim() > 1:
        nn.init.xavier_uniform_(params)
        
optim = torch.optim.Adam(transformer.parameters(), lr=1e-4)

In [36]:
NEG_INFTY = -1e9

def create_masks(eng_batch, hn_batch):
    num_sentences = len(eng_batch)
    look_ahead_mask = torch.full([max_sequence_length, max_sequence_length] , True)
    look_ahead_mask = torch.triu(look_ahead_mask, diagonal=1)
    encoder_padding_mask = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)
    decoder_padding_mask_self_attention = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)
    decoder_padding_mask_cross_attention = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)
    
    for idx in range(num_sentences):
        eng_sentence_length, hn_sentence_length = len(eng_batch[idx]), len(hn_batch[idx])
        eng_words_to_padding_mask = np.arange(eng_sentence_length + 1, max_sequence_length)
        hn_words_to_padding_mask = np.arange(hn_sentence_length + 1, max_sequence_length)
        encoder_padding_mask[idx, :, eng_words_to_padding_mask] = True
        encoder_padding_mask[idx, eng_words_to_padding_mask, :] = True
        decoder_padding_mask_self_attention[idx, :, hn_words_to_padding_mask] = True
        decoder_padding_mask_self_attention[idx, hn_words_to_padding_mask, :] = True
        decoder_padding_mask_cross_attention[idx,:, eng_words_to_padding_mask] = True
        decoder_padding_mask_cross_attention[idx, hn_words_to_padding_mask, :] = True
    
    encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0)
    decoder_self_attention_mask = torch.where(look_ahead_mask + decoder_padding_mask_self_attention, NEG_INFTY, 0)
    decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0)
    # print f" encoder self _attention mask (encoder_self _attention mask size(l}: (encoder self_attention_maskl, :10, :101")
    # print (f"decoder_self_attention mask {decoder_self_attention_mask.size()}: {decoder_self_attention_mask[0, :10, :10]}")
    # print(f" decoder_cross_attention_mask {decoder_cross_attention_mask.size()}: {decoder_cross_attention_mask[0, :10, : 10]}")
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask

In [38]:
transformer.train()
transformer.to(device)
for epoch in range(num_epochs):
    print(f"EPOCH {epoch + 1}----------------------------------------------------------------------------------------")
    for start_idx in tqdm(range(0, len(english_sentences), chunk_size), desc="Processing Chunks"):
        end_idx = start_idx + chunk_size
        subset = TextDataset(
            english_sentences[start_idx:end_idx],
            hindi_sentences[start_idx:end_idx]
        )
        train_loader = DataLoader(subset, batch_size=batch_size, shuffle=True)  # Small batch size
        iterator = iter(train_loader)
        for batch_num, batch in enumerate(iterator):
            transformer.train()
            eng_batch, hn_batch = batch
            encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_batch, hn_batch)
            optim.zero_grad()
            hn_predictions = transformer(eng_batch,
                                         hn_batch,
                                         encoder_self_attention_mask.to(device), 
                                         decoder_self_attention_mask.to(device), 
                                         decoder_cross_attention_mask.to(device),
                                         enc_start_token=False,
                                         enc_end_token=False,
                                         dec_start_token=True,
                                         dec_end_token=True)
            labels = transformer.decoder.sentence_embedding.batch_tokenize(hn_batch, start_token=False, end_token=True)
            loss = criterian(
                hn_predictions.view(-1, hn_vocab_size).to(device),
                labels.view(-1).to(device)
            ).to(device)
            valid_indicies = torch.where(labels.view(-1) == hindi_to_index[PADDING_TOKEN], False, True)
            loss = loss.sum() / valid_indicies.sum()
            loss.backward()
            optim.step()
            #train_losses.append(loss.item())
            if batch_num % 100 == 0:
                print(f"Iteration {batch_num} : {loss.item()}")
                print(f"English: {eng_batch[0]}")
                print(f"hindi Translation: {hn_batch[0]}")
                hn_sentence_predicted = torch.argmax(hn_predictions[0], axis=1)
                predicted_sentence = ""
                for idx in hn_sentence_predicted:
                    if idx == hindi_to_index[END_TOKEN]:
                        break
                    predicted_sentence += index_to_hindi[idx.item()] + " " # Add a space after each word
                print(f"hindi Prediction: {predicted_sentence}")
    
    
                transformer.eval()
                hn_sentence = ("",)
                eng_sentence = ("put folder in the bin",)
                for word_counter in range(max_sequence_length):
                    encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask= create_masks(eng_sentence, hn_sentence)
                    predictions = transformer(eng_sentence,
                                              hn_sentence,
                                              encoder_self_attention_mask.to(device), 
                                              decoder_self_attention_mask.to(device), 
                                              decoder_cross_attention_mask.to(device),
                                              enc_start_token=False,
                                              enc_end_token=False,
                                              dec_start_token=True,
                                              dec_end_token=False)
                    next_token_prob_distribution = predictions[0][word_counter] # not actual probs
                    next_token_index = torch.argmax(next_token_prob_distribution).item()
                    next_token = index_to_hindi[next_token_index]
                    hn_sentence = (hn_sentence[0] + next_token + " ", ) # Add a space after each word
                    if next_token == END_TOKEN:
                      break
                print(f"Evaluation translation (put folder in the bin) : {hn_sentence}")
                print("-------------------------------------------")

Epoch 1
Iteration 0 : 11.245413780212402
English: once again some of the genes which are active in reproductive phase would induce some other genes which are responsible for the onset of senescence
hindi Translation: एक बार फ्र जो जीन प्रजनन काल में सक्रिय होती हैं कुछ अन्य जीनों को प्रेरित करती हैं  जो जराजन्यता के आरंभ के लिए उत्तरदायी होती हैं
hindi Prediction: विधासभाओं असन्तोषजनक निनतम विधासभाओं गैरसामरिक शाताब्दी भार्वव्यंजना शाताब्दी आरी सस्ती लाड़ी वारे दिखाए। मांसमज़्जा शाताब्दी ब्राह्मंणों निनतम निनतम प्रदर्शित् महापुरोषों सिम्युलेटर पेमा शाताब्दी महापुरोषों पेमा पेमा मांसमज़्जा महापुरोषों र्थी आरी सस्ती महापुरोषों वर्ल्ड़टेल रसद शाताब्दी सस्ती माननिय महापुरोषों सस्ती महापुरोषों शर्करा माननिय लोमस शर्करा विधासभाओं डीऐफ़ई गैर्रटिकाऊ कर्णकुटों शाताब्दी सस्ती मुकुंदपुर विधासभाओं वारे भाग्य पेमा उद्धवगोपी हारपेक़्टर लोमस सस्ती कॉर्नेल धात्विकहरे गैर्रटिकाऊ निम्नलखित पेमा उद्धवगोपी आरी निनतम निम्नलखित पेमा निनतम उत्पादनसक्षम महापुरोषों सस्ती मांसमज़्जा आरी शाताब्दी जाऋया निनतम वार

In [52]:
def translate(eng_sentence):
    eng_sentence = (eng_sentence, )
    hn_sentence = ("",)
    for word_counter in range(max_sequence_length):
        encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_sentence, hn_sentence)
        predictions = transformer(eng_sentence,
                                  hn_sentence, 
                                  encoder_self_attention_mask.to(device), 
                                  decoder_self_attention_mask.to(device),
                                  decoder_cross_attention_mask.to(device), 
                                  enc_end_token=False, 
                                  dec_start_token=True, 
                                  dec_end_token=False)
        
        next_token_prob_distribution = predictions[0][word_counter]
        next_token_index = torch.argmax(next_token_prob_distribution).item()
        next_token = index_to_hindi[next_token_index]
        hn_sentence = (hn_sentence[0] + next_token + " ", )
        if next_token == END_TOKEN:
            break
    return hn_sentence[0]
    

In [55]:
translation = translate("He is a fantastic player")
print(translation)

एक बहुत बड़ा खेल है <END> 


## Save and Load the model

In [51]:
# Save the model
torch.save(transformer, model_path)

In [47]:
from torch.serialization import add_safe_globals

# Add the Transformer class to the allowlist for loading
add_safe_globals([Transformer])

# Load the model
transformer_model = torch.load(model_path, weights_only=False)

In [48]:
def translate(eng_sentence):
    eng_sentence = (eng_sentence, )
    hn_sentence = ("",)
    for word_counter in range(max_sequence_length):
        encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_sentence, hn_sentence)
        predictions = transformer_model(eng_sentence,
                                          hn_sentence, 
                                          encoder_self_attention_mask.to(device), 
                                          decoder_self_attention_mask.to(device),
                                          decoder_cross_attention_mask.to(device), 
                                          enc_end_token=False, 
                                          dec_start_token=True, 
                                          dec_end_token=False)
        
        next_token_prob_distribution = predictions[0][word_counter]
        next_token_index = torch.argmax(next_token_prob_distribution).item()
        next_token = index_to_hindi[next_token_index]
        hn_sentence = (hn_sentence[0] + next_token + " ", )
        if next_token == END_TOKEN:
            break
    return hn_sentence[0]
    

In [57]:
translation = translate("India is a country")
print(translation)

एक देश है <END> 
