In [140]:
import torch 
import numpy as np
import sys
import torch.nn as nn 
import torch.functional as F

In [90]:
english_file="../dataset/english.txt"
kannada_file="../dataset/kannada.txt"

In [91]:
START_TOKEN = '<START>'
PADDING_TOKEN = '<PADDING>'
END_TOKEN = '<END>'

kannada_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', 
                      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', 'ˌ', 
                      'ँ', 'ఆ', 'ఇ', 'ా', 'ి', 'ీ', 'ు', 'ూ', 
                      'ಅ', 'ಆ', 'ಇ', 'ಈ', 'ಉ', 'ಊ', 'ಋ', 'ೠ', 'ಌ', 'ಎ', 'ಏ', 'ಐ', 'ಒ', 'ಓ', 'ಔ', 
                      'ಕ', 'ಖ', 'ಗ', 'ಘ', 'ಙ', 
                      'ಚ', 'ಛ', 'ಜ', 'ಝ', 'ಞ', 
                      'ಟ', 'ಠ', 'ಡ', 'ಢ', 'ಣ', 
                      'ತ', 'ಥ', 'ದ', 'ಧ', 'ನ', 
                      'ಪ', 'ಫ', 'ಬ', 'ಭ', 'ಮ', 
                      'ಯ', 'ರ', 'ಱ', 'ಲ', 'ಳ', 'ವ', 'ಶ', 'ಷ', 'ಸ', 'ಹ', 
                      '಼', 'ಽ', 'ಾ', 'ಿ', 'ೀ', 'ು', 'ೂ', 'ೃ', 'ೄ', 'ೆ', 'ೇ', 'ೈ', 'ೊ', 'ೋ', 'ೌ', '್', 'ೕ', 'ೖ', 'ೞ', 'ೣ', 'ಂ', 'ಃ', 
                      '೦', '೧', '೨', '೩', '೪', '೫', '೬', '೭', '೮', '೯', PADDING_TOKEN, END_TOKEN]

english_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', 
                        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                        ':', '<', '=', '>', '?', '@', 
                        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 
                        'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 
                        'Y', 'Z',
                        '[', '\\', ']', '^', '_', '`', 
                        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                        'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 
                        'y', 'z', 
                        '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN]

In [92]:
text=list("ಆಘಾತಕಾರಿಯಾದುದು")
text

['ಆ', 'ಘ', 'ಾ', 'ತ', 'ಕ', 'ಾ', 'ರ', 'ಿ', 'ಯ', 'ಾ', 'ದ', 'ು', 'ದ', 'ು']

In [93]:
index_to_kannada={k:v for k,v in enumerate(kannada_vocabulary)}
kannada_to_index={v:k for k,v in enumerate(kannada_vocabulary)}
index_to_english={k:v for k,v in enumerate(english_vocabulary)}
english_to_index={v:k for k,v in enumerate(english_vocabulary)}
# its just mapping

In [94]:
with open (english_file, 'rb') as f:
    english_sentences=f.readlines()
with open(kannada_file,'rb') as f:
    kannada_sentences=f.readlines()

# num of sentences
TOTAL_SENTENCES=10000
english_sentences=english_sentences[:TOTAL_SENTENCES]
kannada_sentences=kannada_sentences[:TOTAL_SENTENCES]



In [95]:
english_sentences = [sentences.decode('utf-8').rstrip('\n') if isinstance(sentences, bytes) else sentences.rstrip('\n') for sentences in english_sentences]
kannada_sentences = [sentences.decode('utf-8').rstrip('\n') if isinstance(sentences, bytes) else sentences.rstrip('\n') for sentences in kannada_sentences]



In [96]:
english_sentences[:5]

['Hes a scientist.',
 "'But we speak the truth aur ye sach hai ke Gujarat mein vikas pagal hogaya hai,'' Rahul Gandhi further said in Banaskantha",
 '8 lakh crore have been looted.',
 'I read a lot into this as well.',
 "She was found dead with the phone's battery exploded close to her head the following morning."]

In [97]:
print(max(len(x) for x in english_sentences))
print()
max_sentence = max(english_sentences, key=len)
print(max_sentence)
print(max(len(x) for x in kannada_sentences))


468

Union Minister of Agriculture & Farmers Welfare, Rural Development &Panchayati Raj and Food Processing Industries, Shri Narendra Singh Tomar today launched AYUSHMAN SAHAKAR, a unique scheme to assist cooperatives play an important role in creation of healthcare infrastructure in the country formulated by the apex autonomous development finance institution under the Ministry of Agriculture and Farmers Welfare, the National Cooperative Development Corporation (NCDC)
491


In [98]:
percentile=95
print(f"{percentile}th percentile of english : {np.percentile([len(x) for x in english_sentences],percentile)}")
print(f"{percentile}th percentile of kannada  : {np.percentile([len(x) for x in kannada_sentences],percentile)}")


95th percentile of english : 151.0
95th percentile of kannada  : 149.0


In [99]:
# it means 95 percent have less than 151 characters

In [100]:
max_sequence_length = 200

def is_valid_token(sentence, vocab):
    for token in list(set(sentence)):
        if token not in vocab:
            return False
    return True

def is_valid_length(sentence, max_sequence_length):
    return len(list(sentence)) <= (max_sequence_length - 1)

valid_sentences_indexes = []
for index in range(len(kannada_sentences)):
    kannada_sentence, english_sentence = kannada_sentences[index], english_sentences[index]
    if is_valid_length(kannada_sentence, max_sequence_length) \
    and is_valid_length(english_sentence, max_sequence_length) \
    and is_valid_token(kannada_sentence, kannada_vocabulary):
        valid_sentences_indexes.append(index)

print("Number of sentences: ", len(kannada_sentences))
print("Number of sentences: ", len(english_sentences))


Number of sentences:  10000
Number of sentences:  10000


In [101]:
print("Number of valid_sentences: ", len(valid_sentences_indexes))


Number of valid_sentences:  8159


In [109]:
english_sentences=[english_sentences[i] for i in valid_sentences_indexes]
kannada_sentences=[kannada_sentences[i] for i in valid_sentences_indexes]


In [110]:
# creating a datasets
from torch.utils.data import Dataset,DataLoader
class TextDatasets(Dataset):
    def __init__(self,english_sentences,kannada_sentences):
        self.english_sentences = english_sentences
        self.kannada_sentences = kannada_sentences
    def __len__(self):
        return len(self.english_sentences)
    
    def __getitem__(self, index):
        return self.english_sentences[index],self.kannada_sentences[index]

In [111]:
dataset=TextDatasets(english_sentences,kannada_sentences)

In [112]:
len(dataset)

8159

In [113]:
dataset[1]

("'But we speak the truth aur ye sach hai ke Gujarat mein vikas pagal hogaya hai,'' Rahul Gandhi further said in Banaskantha",
 '"ಆದರೆ ಸತ್ಯ ಹೊರ ಬಂದೇ ಬರುತ್ತದೆ ಎಂದು ಹೇಳಿದ ರಾಹುಲ್ ಗಾಂಧಿ, ""ಸೂರತ್ ಜನರು ಚೀನಾದ ಜತೆ ಸ್ಪರ್ಧೆ ನಡೆಸುತ್ತಿದ್ದಾರೆ"')

In [115]:
batch_size=3
train_loader=DataLoader(dataset,batch_size)
iterator=iter(train_loader)


In [117]:
for batch_num,batch in enumerate(iterator):
    english_batch,kannada_batch=batch
    print(f'Batch {batch_num}')
    print('English:', english_batch)
    print('Kannada:', kannada_batch)
    break


Batch 0
English: ('I read a lot into this as well.', 'How did mankind come under Satans rival sovereignty?', 'And then I became Prime Minister.')
Kannada: ('ಇದರ ಬಗ್ಗೆ ನಾನೂ ಸಾಕಷ್ಟು ಓದಿದ್ದೇನೆ.', 'ಮಾನವಕುಲವು ಸೈತಾನನ ಆಳಿಕೆಯ ಕೆಳಗೆ ಬಂದದ್ದು ಹೇಗೆ?', 'ನಂತರ ಪ್ರಧಾನಿ ಕೂಡ ಆಗುತ್ತೇನೆ.')


In [118]:
# tokenization
def tokenization(sentence,language_to_index,start_token=True,end_token=True):
    sentences_word_indexes=[language_to_index[token]for token in list(sentence)]
    if start_token:
        sentences_word_indexes.insert(0,language_to_index[START_TOKEN])
    if end_token:
        sentences_word_indexes.append(language_to_index[END_TOKEN])
    for _ in range(len(sentences_word_indexes),max_sequence_length):
        sentences_word_indexes.append(language_to_index[PADDING_TOKEN])
    return torch.tensor(sentences_word_indexes)


In [119]:
batch

[('I read a lot into this as well.',
  'How did mankind come under Satans rival sovereignty?',
  'And then I became Prime Minister.'),
 ('ಇದರ ಬಗ್ಗೆ ನಾನೂ ಸಾಕಷ್ಟು ಓದಿದ್ದೇನೆ.',
  'ಮಾನವಕುಲವು ಸೈತಾನನ ಆಳಿಕೆಯ ಕೆಳಗೆ ಬಂದದ್ದು ಹೇಗೆ?',
  'ನಂತರ ಪ್ರಧಾನಿ ಕೂಡ ಆಗುತ್ತೇನೆ.')]

In [132]:
eng_tokenized,kn_tokenized=[],[]
for sentence_num in range(batch_size) :
    eng_sentences,kn_sentence=batch[0][sentence_num],batch[1][sentence_num]
    eng_tokenized.append(tokenization(eng_sentences, english_to_index,start_token=False,end_token=False))
    kn_tokenized.append(tokenization(kn_sentence, kannada_to_index, start_token=True, end_token=True))
eng_tokenized=torch.stack(eng_tokenized)
kn_tokenized=torch.stack(kn_tokenized)

    

In [134]:
eng_tokenized# 95 is paddinf tokens

tensor([[41,  1, 82, 69, 65, 68,  1, 65,  1, 76, 79, 84,  1, 73, 78, 84, 79,  1,
         84, 72, 73, 83,  1, 65, 83,  1, 87, 69, 76, 76, 15, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
         95, 95],
        [40, 79, 87,  1, 68, 73, 68,  1, 77, 65, 78, 75, 73, 78, 68,  1, 67, 79,
         7

In [135]:
kn_tokenized# 0 is start token #124 is end token  and 123 is padding token

tensor([[  0,  43,  73,  82,   1,  78,  58, 106,  58, 100,   1,  75,  93,  75,
          97,   1,  89,  93,  56,  88, 106,  66,  96,   1,  54,  73,  94,  73,
         106,  73, 101,  75, 100,  15, 124, 123, 123, 123, 123, 123, 123, 123,
         123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123,
         123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123,
         123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123,
         123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123,
         123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123,
         123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123,
         123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123,
         123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123,
         123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123,
         123, 123, 123, 123, 123, 123, 123, 123, 123

In [136]:
NEG_INFTY = -1e9

def create_masks(eng_batch, kn_batch):
    num_sentences = len(eng_batch)
    look_ahead_mask = torch.full([max_sequence_length, max_sequence_length] , True)
    look_ahead_mask = torch.triu(look_ahead_mask, diagonal=1)
    encoder_padding_mask = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)
    decoder_padding_mask_self_attention = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)
    decoder_padding_mask_cross_attention = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)

    for idx in range(num_sentences):
      eng_sentence_length, kn_sentence_length = len(eng_batch[idx]), len(kn_batch[idx])
      eng_chars_to_padding_mask = np.arange(eng_sentence_length + 1, max_sequence_length)
      kn_chars_to_padding_mask = np.arange(kn_sentence_length + 1, max_sequence_length)
      encoder_padding_mask[idx, :, eng_chars_to_padding_mask] = True
      encoder_padding_mask[idx, eng_chars_to_padding_mask, :] = True
      decoder_padding_mask_self_attention[idx, :, kn_chars_to_padding_mask] = True
      decoder_padding_mask_self_attention[idx, kn_chars_to_padding_mask, :] = True
      decoder_padding_mask_cross_attention[idx, :, eng_chars_to_padding_mask] = True
      decoder_padding_mask_cross_attention[idx, kn_chars_to_padding_mask, :] = True

    encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0)
    decoder_self_attention_mask =  torch.where(look_ahead_mask + decoder_padding_mask_self_attention, NEG_INFTY, 0)
    decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0)
    print(f"encoder_self_attention_mask {encoder_self_attention_mask.size()}: {encoder_self_attention_mask[0, :10, :10]}")
    print(f"decoder_self_attention_mask {decoder_self_attention_mask.size()}: {decoder_self_attention_mask[0, :10, :10]}")
    print(f"decoder_cross_attention_mask {decoder_cross_attention_mask.size()}: {decoder_cross_attention_mask[0, :10, :10]}")
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask

In [137]:
create_masks(batch[0],batch[1])

encoder_self_attention_mask torch.Size([3, 200, 200]): tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
decoder_self_attention_mask torch.Size([3, 200, 200]): tensor([[ 0.0000e+00, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00, -1.0000e+09, -1.0000e+09,
         -1.0000e

(tensor([[[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          ...,
          [-1.0000e+09, -1.0000e+09, -1.0000e+09,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [-1.0000e+09, -1.0000e+09, -1.0000e+09,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [-1.0000e+09, -1.0000e+09, -1.0000e+09,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09]],
 
         [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          ...,
    

In [141]:
# everything is summed
class SentenceEmbedding(nn.Module):
    "For a given sentence, create an embedding"
    def __init__(self, max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super().__init__()
        self.vocab_size = len(language_to_index)
        self.max_sequence_length = max_sequence_length
        self.embedding = nn.Embedding(self.vocab_size, d_model)
        self.language_to_index = language_to_index
        self.position_encoder = PositionalEncoding(d_model, max_sequence_length)
        self.dropout = nn.Dropout(p=0.1)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN
    
    def batch_tokenize(self, batch, start_token=True, end_token=True):

        def tokenize(sentence, start_token=True, end_token=True):
            sentence_word_indicies = [self.language_to_index[token] for token in list(sentence)]
            if start_token:
                sentence_word_indicies.insert(0, self.language_to_index[self.START_TOKEN])
            if end_token:
                sentence_word_indicies.append(self.language_to_index[self.END_TOKEN])
            for _ in range(len(sentence_word_indicies), self.max_sequence_length):
                sentence_word_indicies.append(self.language_to_index[self.PADDING_TOKEN])
            return torch.tensor(sentence_word_indicies)

        tokenized = []
        for sentence_num in range(len(batch)):
           tokenized.append( tokenize(batch[sentence_num], start_token, end_token) )
        tokenized = torch.stack(tokenized)
        return tokenized.to(get_device())
    
    def forward(self, x, end_token=True): # sentence
        x = self.batch_tokenize(x ,end_token)
        x = self.embedding(x)
        pos = self.position_encoder().to(get_device())
        x = self.dropout(x + pos)
        return x
