In [25]:
import torch 
import torch.nn as nn
import torch.optim as optim
import spacy
import torch.nn.functional as F
from tqdm.auto import tqdm
import torch.utils.tensorboard 
from torchtext.datasets import Multi30k
import unicodedata
from torchtext.data import Field, BucketIterator

## Building Vocabulary

In [26]:
es = 0x090
ee = 0x005A 
combinations = list("0123456789ABCDEF")

hindi_vocab = []
for codepoint in range(0x900, 0x980):
    character = chr(codepoint)
    hindi_vocab.append(character)


In [27]:
len(hindi_vocab)

128

In [28]:
# START_TOKEN = ''
# PADDING_TOKEN = ''
# END_TOKEN = ''
START_TOKEN = '+'
PADDING_TOKEN = '-'
END_TOKEN = '_'

# hindi_vocab_start = 0x0900
# hindi_vocab_end = 0x097F

english_vocab_start = 0x0041
english_vocab_end = 0x005A

# hindi_vocab = [chr(code) for code in range(hindi_vocab_start, hindi_vocab_end + 1)
#                    if 'L' in unicodedata.category(chr(code))]

english_vocab = [chr(code) for code in range(english_vocab_start, english_vocab_end + 1) 
                   if 'L' in unicodedata.category(chr(code))]


## For lower case alphabets
english_vocab_start = 0x0061
english_vocab_end = 0x007A

english_vocab2 = [chr(code) for code in range(english_vocab_start, english_vocab_end + 1) 
                   if 'L' in unicodedata.category(chr(code))]

english_vocab += english_vocab2

hindi_vocab.insert(0, START_TOKEN)
english_vocab.insert(0, START_TOKEN)

english_vocab.extend(list(",.!`:;"))
english_vocab.extend(list("0123456789@#$%^&*()"))
# english_vocab.append('"')
english_vocab.append(' ')
english_vocab.append("'")

hindi_vocab.append("'")
hindi_vocab.append(" ")
# hindi_vocab.append('"')
hindi_vocab.append(',')
hindi_vocab.append('.')

hindi_vocab.insert(0, PADDING_TOKEN)
english_vocab.insert(0, PADDING_TOKEN)

hindi_vocab.insert(0, END_TOKEN)
english_vocab.insert(0, END_TOKEN)

# Print the Hindi alphabets
len(english_vocab), len(hindi_vocab)


(82, 135)

In [29]:
index_to_hindi = {k:v for k, v in enumerate(hindi_vocab)}
index_to_english = {k:v for k, v in enumerate(english_vocab)}

hindi_to_index = {v:k for k, v in enumerate(hindi_vocab)}
english_to_index = {v:k for k, v in enumerate(english_vocab)}

print(len(index_to_hindi), len(hindi_to_index))
print(len(index_to_english), len(english_to_index))



135 135
82 82


In [30]:
import os

with open("dataset/Hindi_English_Truncated_Corpus.csv", 'r') as f:
    l = f.readlines()

In [31]:
full_sentense = l[2].split(',', 1)[1].strip('\n"')
full_sentense
for i in range(len(full_sentense)):
    if full_sentense[i] in hindi_vocab:
        print(f"We got {full_sentense[i]} at {i}")
        break
full_sentense[:45], full_sentense[45:]

We got ' at 1


('I\'d like to tell you about one such child,","',
 'मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी,')

In [32]:
dataset_path = "dataset/Hindi_English_Truncated_Corpus.csv"

def get_first_index(sentence, vocab, vocab2):
    for i in range(len(sentence)):
        if sentence[i] in vocab and sentence[i] != " " and sentence[i] not in vocab2:
            return i
    return -1

def get_dataset():
    with open(dataset_path, 'r') as f:
        lines = f.readlines()
    english_sentences, hindi_sentences = [], []
    # Skipping first line cause it's header
    # i = 0
    for a in range(1, len(lines)):
        # print(f"lines -> {lines[a]}")
        line = lines[a].split(',', 1)[1].strip('\n"')
        # print(f"after -> {line}")
        index = get_first_index(line, hindi_vocab, english_vocab)
        if index == -1:
            continue
        eng = line[:index].strip('",?')
        hin = line[index:].strip('",.!_`?')
        # print(f"eng -> {eng}")
        # print(f"hin -> {hin}\n")
        english_sentences.append(eng)
        hindi_sentences.append(hin)
        # if i== 3:
        #     break
        # i+= 1
    
    return english_sentences, hindi_sentences

english_sentences, hindi_sentences = get_dataset()


In [33]:
english_sentences[:4], hindi_sentences[:5]

(['politicians do not have permission to do what needs to be done.',
  "I'd like to tell you about one such child",
  'This percentage is even greater than the percentage in India.',
  "what we really mean is that they're bad at not paying attention."],
 ['राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है ',
  'मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी',
  'यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।',
  'हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते',
  'इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।'])

In [34]:
import numpy as np
PERCENTILE = 97
print( f"{PERCENTILE}th percentile length Kannada: {np.percentile([len(x) for x in english_sentences], PERCENTILE)}" )
print( f"{PERCENTILE}th percentile length English: {np.percentile([len(x) for x in hindi_sentences], PERCENTILE)}" )

97th percentile length Kannada: 267.0
97th percentile length English: 265.0


## Need to account for vowels and other thing like आ and 'ा are same but aren't in the vocab

In [35]:
MAX_SEQUENCE_LENGTH = 300
def is_valid_token(sentence, vocab):
    # for token in list(set(sentence)):
    for token in sentence:
        if token not in vocab:
            return False
    return True
    
    
def is_valid_length(sentence, max_sequence_length):
    return len(list(sentence)) < (max_sequence_length - 1)

valid_sentence_indices = []
for index in range(len(hindi_sentences)):
    hindi_sentence, english_sentence = hindi_sentences[index], english_sentences[index]
    if is_valid_length(hindi_sentence, MAX_SEQUENCE_LENGTH) \
        and is_valid_token(hindi_sentence, hindi_vocab) \
        and is_valid_token(english_sentence, english_vocab) \
        and is_valid_length(english_sentence, MAX_SEQUENCE_LENGTH):
        valid_sentence_indices.append(index)

print(f"Number of sentences in Hindi: {len(hindi_sentences)}")
print(f"Number of sentences in English: {len(english_sentences)}")
print(f"Number of valid sentences: {len(valid_sentence_indices)}")
valid_sentence_indices[:5]


Number of sentences in Hindi: 127575
Number of sentences in English: 127575
Number of valid sentences: 90289


[0, 1, 2, 3, 4]

## For now compromising...

In [36]:
def is_valid_token(sentence, vocab):
    # for token in list(set(sentence)):
    for token in sentence:
        if token not in vocab:
            print(f"is this space{token}yeah")
            print("huh")
            return False
    return True

is_valid_token(hindi_sentences[0], hindi_vocab), 

(True,)

In [37]:
english_sentences

['politicians do not have permission to do what needs to be done.',
 "I'd like to tell you about one such child",
 'This percentage is even greater than the percentage in India.',
 "what we really mean is that they're bad at not paying attention.",
 '.The ending portion of these Vedas is called Upanishad.',
 'The then Governor of Kashmir resisted transfer , but was finally reduced to subjection with the aid of British .',
 'In this lies the circumstances of people before you.',
 'And who are we to say, even, that they are wrong',
 '“”Global Warming“” refer to warming caused in recent decades and probability of its continual presence and its indirect effect on human being.',
 "You may want your child to go to a school that is not run by the LEA - a non-maintained special school or an independent school that can meet your child 's needs .",
 'Please ensure that you use the appropriate form .',
 'Category: Religious Text',
 'This period summarily is pepped up with devotion.',
 'So there i

In [38]:
hindi_sentences = [hindi_sentences[idx] for idx in valid_sentence_indices]
english_sentences = [english_sentences[idx] for idx in valid_sentence_indices]

In [39]:
from transformer import Transformer
import torch

D_MODEL = 512
BATCH_SIZE = 30
FFN_HIDDEN = 2048
NUM_HEADS = 8
DROP_PROB = 0.1
NUM_LAYERS = 1
MAX_SEQUENCE_LENGTH = 300
HINDI_VOCAB_SIZE = len(hindi_vocab)

transformer = Transformer(d_model=D_MODEL, 
                          ffn_hidden= FFN_HIDDEN,
                          num_heads=NUM_HEADS, 
                          drop_prob=DROP_PROB, 
                          num_layers=NUM_LAYERS, 
                          max_sequence_length=MAX_SEQUENCE_LENGTH,
                          kn_vocab_size=HINDI_VOCAB_SIZE,
                          english_to_index=english_to_index,
                          kannada_to_index=hindi_to_index,
                          START_TOKEN=START_TOKEN, 
                          END_TOKEN=END_TOKEN, 
                          PADDING_TOKEN=PADDING_TOKEN)
     

In [40]:
transformer

Transformer(
  (encoder): Encoder(
    (sentence_embedding): SentenceEmbedding(
      (embedding): Embedding(82, 512)
      (position_encoder): PositionalEncoding()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): SequentialEncoder(
      (0): EncoderLayer(
        (attention): MultiHeadAttention(
          (qkv_layer): Linear(in_features=512, out_features=1536, bias=True)
          (linear_layer): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNormalization()
        (dropout1): Dropout(p=0.1, inplace=False)
        (ffn): PositionwiseFeedForward(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm2): LayerNormalization()
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (decoder): Decoder(
    (sentence_embedding):

In [41]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, english_sentences, hindi_sentences):
        self.english_sentences = english_sentences 
        self.hindi_sentences = hindi_sentences

    def __len__(self):
        return len(self.english_sentences)
    
    def __getitem__(self, idx):
        return self.english_sentences[idx], self.hindi_sentences[idx]
    

In [42]:
dataset = TextDataset(english_sentences, hindi_sentences)
len(dataset), dataset[1]

(90289,
 ("I'd like to tell you about one such child",
  'मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी'))

In [43]:
train_loader = DataLoader(dataset, BATCH_SIZE)
iterator = iter(train_loader)
for batch_num, batch in enumerate(iterator):
    print(batch)
    if batch_num > 1:
        break


[('politicians do not have permission to do what needs to be done.', "I'd like to tell you about one such child", 'This percentage is even greater than the percentage in India.', "what we really mean is that they're bad at not paying attention.", '.The ending portion of these Vedas is called Upanishad.', 'The then Governor of Kashmir resisted transfer , but was finally reduced to subjection with the aid of British .', 'In this lies the circumstances of people before you.', 'And who are we to say, even, that they are wrong', "You may want your child to go to a school that is not run by the LEA - a non-maintained special school or an independent school that can meet your child 's needs .", 'Please ensure that you use the appropriate form .', 'This period summarily is pepped up with devotion.', 'So there is some sort of justice', 'The first two were found unreliable and the prosecution case rested mainly on the evidence of the remaining five approvers .', 'And now at present the naturecur

In [44]:

from torch import nn

criterian = nn.CrossEntropyLoss(ignore_index=hindi_to_index[PADDING_TOKEN],
                                reduction='none')

# When computing the loss, we are ignoring cases when the label is the padding token
for params in transformer.parameters():
    if params.dim() > 1:
        nn.init.xavier_uniform_(params)

optim = torch.optim.Adam(transformer.parameters(), lr=1e-4)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [45]:

NEG_INFTY = -1e9

def create_masks(eng_batch, kn_batch):
    num_sentences = len(eng_batch)
    look_ahead_mask = torch.full([MAX_SEQUENCE_LENGTH, MAX_SEQUENCE_LENGTH] , True)
    look_ahead_mask = torch.triu(look_ahead_mask, diagonal=1)
    encoder_padding_mask = torch.full([num_sentences, MAX_SEQUENCE_LENGTH, MAX_SEQUENCE_LENGTH] , False)
    decoder_padding_mask_self_attention = torch.full([num_sentences, MAX_SEQUENCE_LENGTH, MAX_SEQUENCE_LENGTH] , False)
    decoder_padding_mask_cross_attention = torch.full([num_sentences, MAX_SEQUENCE_LENGTH, MAX_SEQUENCE_LENGTH] , False)

    for idx in range(num_sentences):
      eng_sentence_length, kn_sentence_length = len(eng_batch[idx]), len(kn_batch[idx])
      eng_chars_to_padding_mask = np.arange(eng_sentence_length + 1, MAX_SEQUENCE_LENGTH)
      kn_chars_to_padding_mask = np.arange(kn_sentence_length + 1, MAX_SEQUENCE_LENGTH)
      encoder_padding_mask[idx, :, eng_chars_to_padding_mask] = True
      encoder_padding_mask[idx, eng_chars_to_padding_mask, :] = True
      decoder_padding_mask_self_attention[idx, :, kn_chars_to_padding_mask] = True
      decoder_padding_mask_self_attention[idx, kn_chars_to_padding_mask, :] = True
      decoder_padding_mask_cross_attention[idx, :, eng_chars_to_padding_mask] = True
      decoder_padding_mask_cross_attention[idx, kn_chars_to_padding_mask, :] = True

    encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0)
    decoder_self_attention_mask =  torch.where(look_ahead_mask + decoder_padding_mask_self_attention, NEG_INFTY, 0)
    decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0)
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask

In [46]:
hindi_sentences

['राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है ',
 'मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी',
 'यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।',
 'हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते',
 'इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।',
 'कश्मीर के तत्कालीन गवर्नर ने इस हस्तांतरण का विरोध किया था , लेकिन अंग्रेजों की सहायता से उनकी आवाज दबा दी गयी ',
 'इसमें तुमसे पूर्व गुज़रे हुए लोगों के हालात हैं।',
 'और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं',
 'हो सकता है कि आप चाहते हों कि आप का नऋर्नमेनटेन्ड ह्यबिना किसी समर्थन के हृ विशेष स्कूल , या किसी स्वतंत्र स्कूल में जाए , इजसके पास विशेष शैक्षणिक जऋऋरतों वाले बच्चों के प्रति सहूलियत हों . ',
 'कृपया यह सुनिश्चित कर लें कि आप सही फॉर्म का प्रयोग कर रहें हैं ',
 'यह काल समग्रतः भक्ति भावना से ओतप्रोत काल है।',
 'तो वहाँ न्याय है',
 'पहले दो को अविश्वसनीय मानकर बाकी पांच मुखबिरों के आधार पर मुकदमा चलाया गया ',
 'हाल में नेपाल के हस्पताल सामन्यतया आयुर्वेद, प्राकृतिक चिकित्सा तथा आधुनिक चिकीत्स

In [48]:
transformer.train()
transformer.to(device)
total_loss = 0
num_epochs = 10

for epoch in range(num_epochs):
    print(f"Epoch: {epoch}")
    for batch_num, batch in enumerate(train_loader):
        eng_batch, hin_batch = batch
        encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_batch, hin_batch)
        optim.zero_grad()
        hin_predictions = transformer(eng_batch,
                                     hin_batch,
                                     encoder_self_attention_mask.to(device), 
                                     decoder_self_attention_mask.to(device), 
                                     decoder_cross_attention_mask.to(device),
                                     enc_start_token=False,
                                     enc_end_token=False,
                                     dec_start_token=True,
                                     dec_end_token=True)
        labels = transformer.decoder.sentence_embedding.batch_tokenize(hin_batch, start_token=False, end_token=True)
        loss = criterian(
            hin_predictions.view(-1, HINDI_VOCAB_SIZE).to(device),
            labels.view(-1).to(device)
        ).to(device)
        valid_indicies = torch.where(labels.view(-1) == hindi_to_index[PADDING_TOKEN], False, True)
        loss = loss.sum() / valid_indicies.sum()
        loss.backward()
        optim.step()
        #train_losses.append(loss.item())
        if batch_num % 100 == 0:
            print(f"Iteration {batch_num} : {loss.item()}")
            print(f"English: {eng_batch[0]}")
            print(f"Kannada Translation: {hin_batch[0]}")
            kn_sentence_predicted = torch.argmax(hin_predictions[0], axis=1)
            predicted_sentence = ""
            for idx in kn_sentence_predicted:
              if idx == hindi_to_index[END_TOKEN]:
                break
              predicted_sentence += index_to_hindi[idx.item()]
            print(f"Kannada Prediction: {predicted_sentence}")


            transformer.eval()
            kn_sentence = (START_TOKEN)
            eng_sentence = ("How are you",)
            for word_counter in range(MAX_SEQUENCE_LENGTH):
                encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask= create_masks(eng_sentence, kn_sentence)
                predictions = transformer(eng_sentence,
                                          kn_sentence,
                                          encoder_self_attention_mask.to(device), 
                                          decoder_self_attention_mask.to(device), 
                                          decoder_cross_attention_mask.to(device),
                                          enc_start_token=False,
                                          enc_end_token=False,
                                          dec_start_token=True,
                                          dec_end_token=False)
                next_token_prob_distribution = predictions[0][word_counter] # not actual probs
                next_token_index = torch.argmax(next_token_prob_distribution).item()
                next_token = index_to_hindi[next_token_index]
                kn_sentence = (kn_sentence[0] + next_token, )
                if next_token == END_TOKEN:
                  break
            
            print(f"Evaluation translation (should we go to the mall?) : {kn_sentence}")
            print("-------------------------------------------")

    


Epoch: 0
Iteration 0 : 4.3825883865356445
English: politicians do not have permission to do what needs to be done.
Kannada Translation: राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है 
Kannada Prediction:       क          क क                                                 -                               क     ििेे     ि               ॶ   क       ििकिा                ा                                        क                               ७ेेटके   े                                  े  े              


RuntimeError: The size of tensor a (301) must match the size of tensor b (300) at non-singleton dimension 1