In [1]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

In [109]:
class MultiHeadAttention(nn.Module):
    def __init__(self,
                 n_heads,
                 n_embd,
                 mask = None):
        
        super(MultiHeadAttention, self).__init__()
        self.n_heads = n_heads
        self.n_embd = n_embd
        self.mask = mask
        self.head_dim = n_embd // n_heads
        self.qkv_layer = nn.Linear(n_embd, 3 * n_embd)
        self.linear_layer = nn.Linear(n_embd, n_embd)

    def forward(self,
                input_embeddings):
        
        batch_size, max_seq_len, n_embd = input_embeddings.size()
        qkv = self.qkv_layer(input_embeddings)
        qkv = qkv.reshape((batch_size, max_seq_len ,self.n_heads, 3 * self.head_dim))
        qkv = qkv.permute(0, 2, 1, 3)

        q, k, v = qkv.chunk(3, dim=-1)
        attention = torch.matmul(q, k.transpose(-2, -1)) // n_embd
        print(f"scaled after permute -> {attention.permute(1, 0, 2, 3).shape}")
        if self.mask:
            # print(f"Attention shape -> {attention.shape}")
            # print(f"scaled after permute -> {attention.permute(1, 0, 2, 3).shape}")
            attention += self.mask
            # print(f"scaled after permute -> {attention.permute(1, 0, 2, 3).shape}")
        # print(f"Attention shape -> {attention.shape}")

        attention = F.softmax(attention, dim=-1)
        # print(f"attention shape -> {attention.shape}")
        updated_emb = torch.matmul(attention, v)
        # print(f"updated emb -> {updated_emb.shape}")
        
        updated_emb = updated_emb.permute(0, 2, 1, 3).reshape(batch_size, max_seq_len, n_embd)
        return self.linear_layer(updated_emb)

ma = MultiHeadAttention(n_heads=8, n_embd=512)
x = torch.randn(2, 4, 512)
ma(x).shape

scaled after permute -> torch.Size([8, 2, 4, 4])


torch.Size([2, 4, 512])

In [3]:
class PositionalEncoding(nn.Module):
    def __init__(self):
        super(PositionalEncoding, self).__init__()

    def forward(self,
                input_encodings):
        
        batch_size, max_seq_len, n_embd = input_encodings.size()
        i = torch.arange(0, n_embd, 2).float()

        denominator = torch.pow(10_000, (i)/n_embd).type(torch.float32)
        position = torch.arange(0, max_seq_len, dtype=torch.float32).reshape(max_seq_len, 1)
        even_PE = torch.sin(position/ denominator)
        odd_PE = torch.cos(position/ denominator)
        stacked = torch.stack([even_PE, odd_PE], dim=2).view(max_seq_len, -1)
        return stacked.unsqueeze(0).repeat(batch_size, 1, 1)
    
p = PositionalEncoding()
p(x).shape


torch.Size([2, 4, 512])

In [4]:
class CrossMultiHeadAttention(nn.Module):
    def __init__(self,
                 n_heads,
                 n_embd, 
                 mask = None):
        
        super(CrossMultiHeadAttention, self).__init__()
        self.n_heads = n_heads
        self.n_embd = n_embd
        self.mask = mask
        self.head_dim = n_embd // n_heads
        self.kv_layer = nn.Linear(n_embd, 2 * n_embd)
        self.q_layer = nn.Linear(n_embd, n_embd)
        self.linear_layer = nn.Linear(n_embd, n_embd)

    def forward(self,
                kv,
                q):
        
        batch_size, max_seq_len, n_embd = kv.size()
        kv = self.kv_layer(kv)
        kv = kv.reshape((batch_size, max_seq_len ,self.n_heads, 2 * self.head_dim))
        q = self.q_layer(q)
        kv = kv.permute(0, 2, 1, 3)
        q = q.reshape(batch_size, max_seq_len, self.n_heads, self.head_dim)
        q = q.permute(0, 2, 1, 3)

        k, v = kv.chunk(2, dim=-1)
        attention = torch.matmul(q, k.transpose(-2, -1)) // n_embd
        if self.mask:
            attention += self.mask
        attention = F.softmax(attention, dim=-1)
        updated_emb = torch.matmul(attention, v)
        
        updated_emb = updated_emb.permute(0, 2, 1, 3).reshape(batch_size, max_seq_len, n_embd)
        return self.linear_layer(updated_emb)


In [5]:
class EncoderLayer(nn.Module):
    def __init__(self,
                 n_heads,
                 n_embd, 
                 mask = None):
        
        super(EncoderLayer, self).__init__()
        self.n_heads = n_heads
        self.n_embd = n_embd
        self.mask = mask 
        self.positional_encodings = PositionalEncoding()
        self.layer_norm = nn.LayerNorm(n_embd)
        self.multiheadattention = MultiHeadAttention(n_heads=self.n_heads,
                                                     n_embd=self.n_embd,
                                                     mask=mask)
        self.dropout = nn.Dropout(p=0.3)
        self.feedforward = nn.Sequential(
            nn.Linear(n_embd, 2 * n_embd),
            nn.Tanh(),
            nn.Linear(2 * n_embd, n_embd),
            nn.Tanh()
        )

    def forward(self,
                input_embeddings):
        out = self.positional_encodings(input_embeddings)
        attention = self.multiheadattention(input_embeddings=input_embeddings)
        attention = self.dropout(attention)
        attention = self.layer_norm(attention + out)
        # print(f"attention shape -> {attention.shape}")
        updated_embeddings = self.feedforward(attention)
        updated_embeddings = self.layer_norm(updated_embeddings + attention)
        return updated_embeddings
    
model = EncoderLayer(n_heads=8,
                n_embd=512)
model(x).shape
        

torch.Size([2, 4, 512])

In [6]:
class DecoderLayer(nn.Module):
    def __init__(self,
                 n_heads,
                 n_embd,
                 mask = None):
        
        super(DecoderLayer, self).__init__()
        self.n_heads = n_heads
        self.n_embd = n_embd
        self.mask = mask
        self.positional_encodings = PositionalEncoding()
        self.layer_norm = nn.LayerNorm(n_embd)
        self.crossmultiheadattention = CrossMultiHeadAttention(n_heads=n_heads,
                                                               n_embd=n_embd,
                                                               mask=mask)
        self.multiheadattention = MultiHeadAttention(n_heads=n_heads,
                                                     n_embd=n_embd,
                                                     mask=mask)
        self.dropout = nn.Dropout(p=0.3)
        self.feedforward = nn.Sequential(
            nn.Linear(n_embd, 2 * n_embd),
            nn.Tanh(),
            nn.Linear(2 * n_embd, n_embd),
            nn.Tanh()
        )

    def forward(self,
                output_embeddings,
                input_embeddings):
        
        output_embeddings = self.positional_encodings(output_embeddings)
        attention = self.multiheadattention(output_embeddings)

        # print(f"Attention size -> {attention.shape}")
        q = self.layer_norm(attention + output_embeddings)
        kv = input_embeddings

        updated_attention = self.crossmultiheadattention(kv=kv,
                                                         q=q)
        updated_attention = self.layer_norm(updated_attention + q)
        final_weights = self.feedforward(updated_attention)
        final_weights = self.layer_norm(updated_attention + final_weights)

        return final_weights
    
decoder = DecoderLayer(n_heads=8,
                       n_embd=512)
out = decoder(x, x)
x.shape


torch.Size([2, 4, 512])

In [7]:
class Encoder(nn.Module):
    def __init__(self,
                 num_layers,
                 n_heads,
                 n_embd,
                 mask=None):
        super(Encoder, self).__init__()

        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(EncoderLayer(n_heads=n_heads,
                                            n_embd=n_embd,
                                            mask=mask))
            

    def forward(self,
               input_embeddings,):
        
        out = input_embeddings
        for layer in self.layers:
            out = layer(out)
        return out
enc = Encoder(2, 8, 512)
enc(x).shape

torch.Size([2, 4, 512])

In [8]:
class Decoder(nn.Module):
    def __init__(self,
                 num_layers,
                 n_heads, 
                 n_embd,
                 mask=None):
        super(Decoder, self).__init__()

        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(
                DecoderLayer(n_heads=n_heads,
                             n_embd=n_embd,
                             mask=mask)
            )

        self.Liner = nn.Linear(n_embd, n_embd)

    def forward(self, 
                output_embeddings,
                input_embeddings):
        
        in_e = input_embeddings
        out_e = output_embeddings
        for layer in self.layers:
            out_e = layer(input_embeddings=in_e,
                        output_embeddings=out_e)
            
        return out_e
    
deco = Decoder(num_layers=2,
               n_heads=8,
               n_embd=512)

            

class Transformer(nn.Module):
    def __init__(self,
                 n_heads,
                 n_embd,
                 encoder_mask=None,
                 decoder_mask=None,
                 num_encoder_layers=3,
                 num_decoder_layers=3,
                 vocab_size_input=70,
                 vocab_size_output=70):
        
        super(Transformer, self).__init__()
        # It changes (Batch_size, Max_seq_len, vocab_size) -> (Batch_size, max_seq_len, n_embd)
        # For encoder part
        self.input_embeddings = nn.Linear(vocab_size_input, n_embd)

        # For decoder part
        self.output_embeddings = nn.Linear(vocab_size_output, n_embd)

        self.encoder = Encoder(num_layers=num_encoder_layers,
                               n_heads=n_heads,
                               n_embd=n_embd,
                               mask=encoder_mask)
        self.deocder = Decoder(num_layers=num_decoder_layers,
                               n_embd=n_embd,
                               n_heads=n_heads,
                               mask=decoder_mask)
        self.final_layer = nn.Sequential(
            nn.Linear(n_embd, 2 * vocab_size_output),
            nn.ReLU(),
            nn.Linear(2 * vocab_size_output, vocab_size_output), 
            nn.ReLU(), 
            nn.Softmax(dim=-1)
        )

    def forward(self,
                input_sentence_embeddings,
                output_sentence_embeddings):
        
        input_sentence_embeddings = self.input_embeddings(input_sentence_embeddings)
        output_sentence_embeddings = self.output_embeddings(output_sentence_embeddings)

        enc_out = self.encoder(input_sentence_embeddings)
        dec_out = self.deocder(output_embeddings=output_sentence_embeddings,
                               input_embeddings=input_sentence_embeddings)
        out = self.final_layer(dec_out)
        return out





encoder = Encoder(n_heads=8,
                  n_embd=512,
                  num_layers=3)
decoder = Decoder(n_heads=8,
                  n_embd=512,
                  num_layers=2)
enc_out = encoder(x)
dec_out = decoder(enc_out, x)
enc_out.shape, dec_out.shape


x_new = torch.randn(4, 300, 70).float()
transformer = Transformer(n_heads=8,
                          n_embd=512)
output = transformer(x_new,
            x_new)

In [9]:
import torch 
import torch.nn as nn
import torch.optim as optim
import spacy
import torch.nn.functional as F
from tqdm.auto import tqdm
import unicodedata
from torchtext.data import Field, BucketIterator

In [10]:
es = 0x090
ee = 0x005A 
combinations = list("0123456789ABCDEF")

hindi_vocab = []
for codepoint in range(0x900, 0x980):
    character = chr(codepoint)
    hindi_vocab.append(character)



START_TOKEN = '+'
PADDING_TOKEN = '-'
END_TOKEN = '_'


english_vocab_start = 0x0041
english_vocab_end = 0x005A

english_vocab = [chr(code) for code in range(english_vocab_start, english_vocab_end + 1) 
                   if 'L' in unicodedata.category(chr(code))]


## For lower case alphabets
english_vocab_start = 0x0061
english_vocab_end = 0x007A

english_vocab2 = [chr(code) for code in range(english_vocab_start, english_vocab_end + 1) 
                   if 'L' in unicodedata.category(chr(code))]

english_vocab += english_vocab2

hindi_vocab.insert(0, START_TOKEN)
english_vocab.insert(0, START_TOKEN)

english_vocab.extend(list(",.!`:;"))
english_vocab.extend(list("0123456789@#$%^&*()"))
english_vocab.append(' ')
english_vocab.append("'")

hindi_vocab.append("'")
hindi_vocab.append(" ")
hindi_vocab.append(',')
hindi_vocab.append('.')

hindi_vocab.insert(0, PADDING_TOKEN)
english_vocab.insert(0, PADDING_TOKEN)

hindi_vocab.insert(0, END_TOKEN)
english_vocab.insert(0, END_TOKEN)

# Print the Hindi alphabets
len(english_vocab), len(hindi_vocab)



(82, 135)

In [11]:
index_to_hindi = {k:v for k, v in enumerate(hindi_vocab)}
index_to_english = {k:v for k, v in enumerate(english_vocab)}

hindi_to_index = {v:k for k, v in enumerate(hindi_vocab)}
english_to_index = {v:k for k, v in enumerate(english_vocab)}

print(len(index_to_hindi), len(hindi_to_index))
print(len(index_to_english), len(english_to_index))



135 135
82 82


In [12]:
import os

with open("dataset/Hindi_English_Truncated_Corpus.csv", 'r') as f:
    l = f.readlines()

full_sentense = l[2].split(',', 1)[1].strip('\n"')
full_sentense
for i in range(len(full_sentense)):
    if full_sentense[i] in hindi_vocab:
        print(f"We got {full_sentense[i]} at {i}")
        break
print(full_sentense[:45], full_sentense[45:])

dataset_path = "dataset/Hindi_English_Truncated_Corpus.csv"

def get_first_index(sentence, vocab, vocab2):
    for i in range(len(sentence)):
        if sentence[i] in vocab and sentence[i] != " " and sentence[i] not in vocab2:
            return i
    return -1

def get_dataset():
    with open(dataset_path, 'r') as f:
        lines = f.readlines()
    english_sentences, hindi_sentences = [], []
    # Skipping first line cause it's header
    # i = 0
    for a in range(1, len(lines)):
        # print(f"lines -> {lines[a]}")
        line = lines[a].split(',', 1)[1].strip('\n"')
        # print(f"after -> {line}")
        index = get_first_index(line, hindi_vocab, english_vocab)
        if index == -1:
            continue
        eng = line[:index].strip('",?')
        hin = line[index:].strip('",.!_`?')
        # print(f"eng -> {eng}")
        # print(f"hin -> {hin}\n")
        english_sentences.append(eng)
        hindi_sentences.append(hin)
        # if i== 3:
        #     break
        # i+= 1
    
    return english_sentences, hindi_sentences

english_sentences, hindi_sentences = get_dataset()


We got ' at 1
I'd like to tell you about one such child,"," मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी,


In [13]:
english_sentences[:4], hindi_sentences[:5]

(['politicians do not have permission to do what needs to be done.',
  "I'd like to tell you about one such child",
  'This percentage is even greater than the percentage in India.',
  "what we really mean is that they're bad at not paying attention."],
 ['राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है ',
  'मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी',
  'यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।',
  'हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते',
  'इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।'])

In [14]:
import numpy as np
PERCENTILE = 97
print( f"{PERCENTILE}th percentile length Kannada: {np.percentile([len(x) for x in english_sentences], PERCENTILE)}" )
print( f"{PERCENTILE}th percentile length English: {np.percentile([len(x) for x in hindi_sentences], PERCENTILE)}" )

97th percentile length Kannada: 267.0
97th percentile length English: 265.0


In [15]:
MAX_SEQUENCE_LENGTH = 300
def is_valid_token(sentence, vocab):
    # for token in list(set(sentence)):
    for token in sentence:
        if token not in vocab:
            return False
    return True
    
    
def is_valid_length(sentence, max_sequence_length):
    return len(list(sentence)) < (max_sequence_length - 1)

valid_sentence_indices = []
for index in range(len(hindi_sentences)):
    hindi_sentence, english_sentence = hindi_sentences[index], english_sentences[index]
    if is_valid_length(hindi_sentence, MAX_SEQUENCE_LENGTH) \
        and is_valid_token(hindi_sentence, hindi_vocab) \
        and is_valid_token(english_sentence, english_vocab) \
        and is_valid_length(english_sentence, MAX_SEQUENCE_LENGTH):
        valid_sentence_indices.append(index)

print(f"Number of sentences in Hindi: {len(hindi_sentences)}")
print(f"Number of sentences in English: {len(english_sentences)}")
print(f"Number of valid sentences: {len(valid_sentence_indices)}")
valid_sentence_indices[:5]


Number of sentences in Hindi: 127575
Number of sentences in English: 127575
Number of valid sentences: 90289


[0, 1, 2, 3, 4]

In [65]:
len(hindi_sentences)

90289

In [16]:
def is_valid_token(sentence, vocab):
    # for token in list(set(sentence)):
    for token in sentence:
        if token not in vocab:
            print(f"is this space{token}yeah")
            print("huh")
            return False
    return True

is_valid_token(hindi_sentences[0], hindi_vocab), 

(True,)

In [17]:
english_sentences[:10], hindi_sentences[:10]

(['politicians do not have permission to do what needs to be done.',
  "I'd like to tell you about one such child",
  'This percentage is even greater than the percentage in India.',
  "what we really mean is that they're bad at not paying attention.",
  '.The ending portion of these Vedas is called Upanishad.',
  'The then Governor of Kashmir resisted transfer , but was finally reduced to subjection with the aid of British .',
  'In this lies the circumstances of people before you.',
  'And who are we to say, even, that they are wrong',
  '“”Global Warming“” refer to warming caused in recent decades and probability of its continual presence and its indirect effect on human being.',
  "You may want your child to go to a school that is not run by the LEA - a non-maintained special school or an independent school that can meet your child 's needs ."],
 ['राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है ',
  'मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी',
  'यह प्रत

In [18]:
hindi_sentences = [hindi_sentences[idx] for idx in valid_sentence_indices]
english_sentences = [english_sentences[idx] for idx in valid_sentence_indices]

In [19]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, english_sentences, hindi_sentences):
        self.english_sentences = english_sentences 
        self.hindi_sentences = hindi_sentences

    def __len__(self):
        return len(self.english_sentences)
    
    def __getitem__(self, idx):
        return self.english_sentences[idx], self.hindi_sentences[idx]
dataset = TextDataset(english_sentences, hindi_sentences)
len(dataset), dataset[100]

train_dataloader = DataLoader(dataset=dataset,
                              batch_size=32,
                              shuffle=True)
a, b = next(iter(train_dataloader))
a[:4], b[:4]
    

(('People feel that forests are their open treasure-houses for them to use as they feel like .',
  'We will send you details about claiming expenses when we write to tell you about the arrangements for the hearing .',
  'Stopping smoking',
  "Microsoft's induct I.M.I"),
 ('लोग यह मानते हैं कि वन उनके लिए खुले खजाने की तरह हैं जिनका वे जैसा चाहें वैसा उपयोग कर सकते हैं ',
  'ख़र्चे क्लेम करने के विवरण हम आप को तब भेजेंगे जब हम आप को सुनवाई के प्रबन्धों के बारे में लिखेंगे ',
  'धूम्रपान बंद करें',
  'माइक्रोसाफ्ट का इण्डिक आईएमई'))

In [20]:

from torch import nn

criterian = nn.CrossEntropyLoss(ignore_index=hindi_to_index[PADDING_TOKEN],
                                reduction='none')

# When computing the loss, we are ignoring cases when the label is the padding token
for params in transformer.parameters():
    if params.dim() > 1:
        nn.init.xavier_uniform_(params)

optim = torch.optim.Adam(transformer.parameters(), lr=1e-4)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [21]:
def create_masks(shape: tuple):
  mask = torch.full(shape, float('-inf'))
  mask = torch.triu(mask, diagonal=1)
  return mask

In [22]:
new_mask = create_masks((1, 8, 4, 4))
new_mask[0][0] # There are 8 of it

tensor([[0., -inf, -inf, -inf],
        [0., 0., -inf, -inf],
        [0., 0., 0., -inf],
        [0., 0., 0., 0.]])

### Tokenizing sentences

In [74]:
def char_to_tensor(char,
                   vocab_to_index,
                   vocab_size):
    
    char_tensor = torch.zeros((vocab_size))
    char_tensor[vocab_to_index[char]] = 1
    return char_tensor


def sentence_to_tensor(sen,
                       vocab_to_index,
                       vocab_size,
                       max_seq_len):
    padding_tensor = char_to_tensor(char=PADDING_TOKEN,
                                    vocab_to_index=vocab_to_index,
                                    vocab_size=vocab_size)
    sen_tensor = torch.stack([padding_tensor] * max_seq_len)
    for i, char in enumerate(sen):
        char_tensor = char_to_tensor(char=char,
                       vocab_to_index=vocab_to_index,
                       vocab_size=vocab_size)
        sen_tensor[i] = char_tensor
    return sen_tensor


def sentences_to_tensor(sentences,
                        vocab_to_index,
                        vocab_size,
                        max_seq_len):
    batch_size = len(sentences)
    sens_tensor = torch.zeros(batch_size, max_seq_len, vocab_size)
    for i, sen in enumerate(sentences):
        sen_tensor = sentence_to_tensor(sen=sen,
                                        vocab_to_index=vocab_to_index,
                                        vocab_size=vocab_size,
                                        max_seq_len=max_seq_len)
        sens_tensor[i] = sen_tensor
    return sens_tensor

sen_tensor = sentence_to_tensor(english_sentences[0], english_to_index, len(english_vocab), 300)
sentences_to_tensor(hindi_sentences[:5], hindi_to_index, len(hindi_vocab), 300).shape


torch.Size([5, 300, 135])

## Training (Using inbuilt transformer network)

In [84]:
class InbuiltTransformer(nn.Module):
    def __init__(self,
                 d_model,
                 n_heads,
                 num_encoder_layers,
                 num_decoder_layers,
                 input_vocab_size,
                 output_vocab_size,
                 inp_mask=None,
                 out_mask=None):
        
        super(InbuiltTransformer, self).__init__()
        self.inp_mask = inp_mask
        self.out_mask = out_mask

        self.input_embeddings = nn.Linear(input_vocab_size, d_model)
        self.output_embeddings = nn.Linear(output_vocab_size, d_model)

        self.trans = nn.Transformer(d_model=d_model,
                                    nhead=n_heads,
                                    num_encoder_layers=num_encoder_layers,
                                    num_decoder_layers=num_decoder_layers,
                                    dim_feedforward=1024,
                                    batch_first=True)
        
    def forward(self,
                input_embeddings,
                output_embeddings):
        
        input_embeddings = self.input_embeddings(input_embeddings)
        output_embeddings = self.output_embeddings(output_embeddings)
        out = self.trans(src=input_embeddings,
                   tgt=output_embeddings,
                   tgt_mask=self.out_mask,
                   src_mask=self.inp_mask)
        
        return out



        

In [97]:
N_HEADS = 8
N_EMBD = 512
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
mask = create_masks((MAX_SEQUENCE_LENGTH, MAX_SEQUENCE_LENGTH))
transformer = InbuiltTransformer(d_model=N_EMBD,
                                 n_heads=N_HEADS,
                                 num_encoder_layers=NUM_ENCODER_LAYERS,
                                 num_decoder_layers=NUM_DECODER_LAYERS,
                                 input_vocab_size=len(english_vocab),
                                 output_vocab_size=len(hindi_vocab),
                                 out_mask=mask)

criterian = nn.CrossEntropyLoss(ignore_index=hindi_to_index[PADDING_TOKEN],
                                reduction='none')

# When computing the loss, we are ignoring cases when the label is the padding token
for params in transformer.parameters():
    if params.dim() > 1:
        nn.init.xavier_uniform_(params)

optim = torch.optim.Adam(transformer.parameters(), lr=1e-4)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [98]:
transformer.train()
for eng_sens, hin_sens in train_dataloader:

    hin_sens = sentences_to_tensor(sentences=hin_sens,
                                   vocab_to_index=hindi_to_index,
                                   vocab_size=len(hindi_vocab),
                                   max_seq_len=MAX_SEQUENCE_LENGTH)
    
    eng_sens = sentences_to_tensor(sentences=eng_sens,
                                   vocab_to_index=english_to_index,
                                   vocab_size=len(english_vocab),
                                   max_seq_len=MAX_SEQUENCE_LENGTH)
    
    
    out = transformer(input_embeddings=eng_sens,
                      output_embeddings=hin_sens)
    print(out.shape)
    

torch.Size([32, 300, 512])


KeyboardInterrupt: 

## Training (Using Custom made)

In [102]:
N_HEADS = 8
N_EMBD = 512
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
mask = create_masks((1, 1, N_HEADS, N_HEADS))
transformer = Transformer(n_heads=N_HEADS,
                          n_embd=N_EMBD,
                          decoder_mask=None,
                          num_encoder_layers=NUM_ENCODER_LAYERS,
                          num_decoder_layers=NUM_DECODER_LAYERS,
                          vocab_size_input=len(english_vocab),
                          vocab_size_output=len(hindi_vocab))


criterian = nn.CrossEntropyLoss(ignore_index=hindi_to_index[PADDING_TOKEN],
                                reduction='none')

# When computing the loss, we are ignoring cases when the label is the padding token
for params in transformer.parameters():
    if params.dim() > 1:
        nn.init.xavier_uniform_(params)

optim = torch.optim.Adam(transformer.parameters(), lr=1e-4)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [103]:
transformer.train()
for eng_sens, hin_sens in train_dataloader:

    hin_sens = sentences_to_tensor(sentences=hin_sens,
                                   vocab_to_index=hindi_to_index,
                                   vocab_size=len(hindi_vocab),
                                   max_seq_len=MAX_SEQUENCE_LENGTH)
    
    eng_sens = sentences_to_tensor(sentences=eng_sens,
                                   vocab_to_index=english_to_index,
                                   vocab_size=len(english_vocab),
                                   max_seq_len=MAX_SEQUENCE_LENGTH)
    
    
    out = transformer(eng_sens,
                      hin_sens)
    print(out.shape)
    

torch.Size([32, 300, 135])


KeyboardInterrupt: 

In [112]:
a = torch.randn((8, 10, 4, 4))
mask.shape, (torch.sum((mask, a), dim=01))

SyntaxError: invalid syntax. Maybe you meant '==' or ':=' instead of '='? (3687628710.py, line 2)