In [6]:
# !kaggle datasets download -d parvmodi/english-to-hindi-machine-translation-dataset

In [7]:
# !unzip english-to-hindi-machine-translation-dataset.zip

In [8]:
filepath_en = 'train.en'
filepath_hi = 'train.hi'

with open(filepath_en, encoding='utf-8') as file:
    lines_en = file.readlines()
print(len(lines_en))
print(lines_en[:5])

with open(filepath_hi, encoding='utf-8') as file:
    lines_hi = file.readlines()

print(len(lines_hi))
print(lines_hi[:5])

10125706
["However, Paes, who was partnering Australia's Paul Hanley, could only go as far as the quarterfinals where they lost to Bhupathi and Knowles\n", 'Whosoever desires the reward of the world, with Allah is the reward of the world and of the Everlasting Life. Allah is the Hearer, the Seer.\n', 'The value of insects in the biosphere is enormous because they outnumber all other living groups in measure of species richness.\n', 'Mithali To Anchor Indian Team Against Australia in ODIs\n', 'After the assent of the Honble President on 8thSeptember, 2016, the 101thConstitutional Amendment Act, 2016 came into existence\n']
10125706
['आस्ट्रेलिया के पाल हेनली के साथ जोड़ी बनाने वाले पेस मियामी में क्वार्टरफाइनल तक ही पहुंच सके क्योंकि इस दौर में उन्हें भूपति और नोल्स ने हराया था।\n', 'और जो शख्स (अपने आमाल का) बदला दुनिया ही में चाहता है तो ख़ुदा के पास दुनिया व आख़िरत दोनों का अज्र मौजूद है और ख़ुदा तो हर शख्स की सुनता और सबको देखता है\n', 'जैव-मंडल में कीड़ों का मूल्य बहुत है, क्योंकि प

In [9]:
TOTAL_SENTENCES = 200000
english_sentences = lines_en[:TOTAL_SENTENCES]
hindi_sentences = lines_hi[:TOTAL_SENTENCES]
english_sentences = [sentence.rstrip('\n').lower() for sentence in english_sentences]
hindi_sentences = [sentence.rstrip('\n') for sentence in hindi_sentences]

print(english_sentences[:5])
print("================================")
print(hindi_sentences[:5])

["however, paes, who was partnering australia's paul hanley, could only go as far as the quarterfinals where they lost to bhupathi and knowles", 'whosoever desires the reward of the world, with allah is the reward of the world and of the everlasting life. allah is the hearer, the seer.', 'the value of insects in the biosphere is enormous because they outnumber all other living groups in measure of species richness.', 'mithali to anchor indian team against australia in odis', 'after the assent of the honble president on 8thseptember, 2016, the 101thconstitutional amendment act, 2016 came into existence']
['आस्ट्रेलिया के पाल हेनली के साथ जोड़ी बनाने वाले पेस मियामी में क्वार्टरफाइनल तक ही पहुंच सके क्योंकि इस दौर में उन्हें भूपति और नोल्स ने हराया था।', 'और जो शख्स (अपने आमाल का) बदला दुनिया ही में चाहता है तो ख़ुदा के पास दुनिया व आख़िरत दोनों का अज्र मौजूद है और ख़ुदा तो हर शख्स की सुनता और सबको देखता है', 'जैव-मंडल में कीड़ों का मूल्य बहुत है, क्योंकि प्रजातियों की समृद्धि के मामले मे

In [10]:
import numpy as np
PERCENTILE = 97
print( f"{PERCENTILE}th percentile length Kannada: {np.percentile([len(x) for x in hindi_sentences], PERCENTILE)}" )
print( f"{PERCENTILE}th percentile length English: {np.percentile([len(x) for x in english_sentences], PERCENTILE)}" )


97th percentile length Kannada: 258.0
97th percentile length English: 267.0


In [11]:
START_TOKEN = ''
PADDING_TOKEN = ''
END_TOKEN = ''

english_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                        ':', '<', '=', '>', '?', '@',
                        '[', '\' ', ']', '^', '_', '`',
                        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                        'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                        'y', 'z',
                        '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN]

hindi_vocabulary = [START_TOKEN, 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ',
    'क', 'ख', 'ग', 'घ', 'ङ','ड़', 'च', 'छ', 'ज', 'झ', 'ञ',
    'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न',
    'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श',
    'ष', 'स', 'ह', '़', 'ा', 'ि', 'ी', 'ु', 'ू', 'े', 'ै', 'ो', 'ौ',
    'ं', 'ः', 'ँ', '्', '०', '१', '२', '३', '४', '५', '६', '७', '८', '९', '|',
    ':', '<', '=', '>', '?', '@',
    '[', '\' ', ']', '^', '_', '`',
    ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9','\u200d','।', 'ृ',
    PADDING_TOKEN, END_TOKEN
]




In [12]:
max_sequence_length = 250

def is_valid_tokens(sentence, vocab):
    for token in list(set(sentence)):
        if token not in vocab:
            return False
    return True

def is_valid_length(sentence, max_sequence_length):
    return len(list(sentence)) < (max_sequence_length - 1) # need to re-add the end token so leaving 1 space

valid_sentence_indicies = []
for index in range(len(hindi_sentences)):
    hindi_sentence, english_sentence = hindi_sentences[index], english_sentences[index]
    if is_valid_length(hindi_sentence, max_sequence_length) \
      and is_valid_length(english_sentence, max_sequence_length) \
      and is_valid_tokens(hindi_sentence, hindi_vocabulary):
        valid_sentence_indicies.append(index)

print(f"Number of sentences: {len(hindi_sentences)}")
print(f"Number of valid sentences: {len(valid_sentence_indicies)}")


Number of sentences: 200000
Number of valid sentences: 151260


In [13]:
# for i in 'शख्स':
  # print(i)


# hindi_sentences = [
#     'आस्ट्रेलिया के पाल हेनली के साथ जोड़ी बनाने वाले पेस मियामी में क्वार्टरफाइनल तक ही पहुंच सके क्योंकि इस दौर में उन्हें भूपति और नोल्स ने हराया था।',
#     'और जो शख्स (अपने आमाल का) बदला दुनिया ही में चाहता है तो ख़ुदा के पास दुनिया व आख़िरत दोनों का अज्र मौजूद है और ख़ुदा तो हर शख्स की सुनता और सबको देखता है',
#     'जैव-मंडल में कीड़ों का मूल्य बहुत है, क्योंकि प्रजातियों की समृद्धि के मामले में उनकी संख्या अन्य जीव समूहों से ज़्यादा है।',
#     'आस्ट्रेलिया के खिलाफ वनडे टीम की कमान मिताली को',
#     '8 सितम्‍बर, 2016 को माननीय राष्ट्रपति की स्‍वीकृति मिलने के बाद 101वां संविधान संशोधन अधिनियम, 2016 अस्तित्व में आया'
# ]

# Define the provided vocabulary set
hindi_vocabulary_set = set(hindi_vocabulary)

# Combine all sentences into one string
combined_sentences = ''.join(hindi_sentences)

# Find all unique characters in the combined sentences
unique_chars_in_sentences = set(combined_sentences)

# Find characters in the sentences that are not in the vocabulary
missing_chars = unique_chars_in_sentences - hindi_vocabulary_set

print("Characters in the sentences not in the vocabulary:", missing_chars)


Characters in the sentences not in the vocabulary: {'±', 'ᒡ', 'ਸ', 'Я', '班', 'コ', 'ज़', '้', 'ঔ', '℃', 'ط', 'ú', 'z', '\u200e', '\uf0d8', 'ল', 'గ', 'ී', 'ಚ', 'נ', 'ग़', 'Ň', '²', 'ו', '現', 'ص', '≈', 'শ', 'រ', 'Q', 'S', '॒', 'د', '©', 'h', 'I', 'ច', 'ō', 'ා', '่', 'ô', 'ᡠ', 'A', 'ా', 'ב', '॥', '\xad', '“', 'в', 'ز', 'K', 'ʔ', 'ॐ', '\uf146', 'Ι', 'l', 'β', '˜', 'ê', '♦', '→', 'з', 'េ', '‚', '经', '車', 'ě', 'ล', '唐', '☺', 'উ', 'т', 'ˌ', 'ᠩ', 'ン', 'ḍ', 'พ', 'H', 'ר', 'ʊ', 'デ', 'T', 'é', '̈', '‘', '¨', '記', 'Â', 'آ', '⋅', 'y', 'ූ', 'ְ', '⚡', '大', 'ţ', 'ы', 'න', '\\', 'о', '\u200f', 'ʃ', 'ॊ', 'ऽ', 'า', 'W', 'ன', 'Р', 'i', '−', '»', '₹', '\uf514', 'ό', 'Y', '春', 'ු', 'फ़', 'd', 'B', '州', 'Z', 'ী', 'আ', 'ー', 'М', '♫', 'و', 'ؤ', '្', 'ɑ', 'أ', 'מ', 'п', 'ឃ', '̯', 'k', 'ঙ', 'י', 'Ê', 'Ć', '长', 'ख़', '洞', 'ξ', 'É', 'V', 'ء', '్', 'Ó', '®', 'ප', '›', 'ס', 'ς', '明', 'r', 'م', 'ہ', 'ৱ', 'ң', 'ऩ', 'ī', 'C', 'ల', 'е', 'ห', 'ス', 'c', '의', 'ق', 'š', '世', 'ٰ', '¸', 'ळ', '’', '¶', '夕', 'ර', 'ï', '‑', 'ச', 'ŋ',

In [14]:
index_to_hindi = {k:v for k,v in enumerate(hindi_vocabulary)}
hindi_to_index = {v:k for k,v in enumerate(hindi_vocabulary)}
index_to_english = {k:v for k,v in enumerate(english_vocabulary)}
english_to_index = {v:k for k,v in enumerate(english_vocabulary)}

In [15]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dropout
from tensorflow.keras import layers
import numpy as np

class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, d_model, max_seq_len):
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model
        self.max_seq_len = max_seq_len

    def call(self, inputs):
        pos = np.arange(self.max_seq_len)[:, np.newaxis]
        i = np.arange(self.d_model)[np.newaxis, :]
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(self.d_model))
        angle_rads = pos * angle_rates
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        pos_encoding = angle_rads[np.newaxis, ...]
        return tf.constant(pos_encoding, dtype=tf.float32)

class SentenceEmbedding(tf.keras.Model):
    def __init__(self, max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super(SentenceEmbedding, self).__init__()
        self.vocab_size = len(language_to_index)
        self.max_sequence_length = max_sequence_length
        self.embedding = Embedding(input_dim=self.vocab_size, output_dim=d_model)
        self.language_to_index = language_to_index
        self.position_encoder = PositionalEncoding(d_model, max_sequence_length)
        self.dropout = Dropout(0.1)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN

    def batch_tokenize(self, batch, start_token, end_token):
        def tokenize(sentence, start_token, end_token):
            sentence_word_indices = [self.language_to_index[token] for token in list(sentence)]
            if start_token:
                sentence_word_indices.insert(0, self.language_to_index[self.START_TOKEN])
            if end_token:
                sentence_word_indices.append(self.language_to_index[self.END_TOKEN])
            sentence_word_indices.extend([self.language_to_index[self.PADDING_TOKEN]] * (self.max_sequence_length - len(sentence_word_indices)))
            return tf.convert_to_tensor(sentence_word_indices, dtype=tf.int32)

        tokenized = [tokenize(sentence, start_token, end_token) for sentence in batch]
        return tf.stack(tokenized)

    def call(self, x, start_token, end_token):
        x = self.batch_tokenize(x, start_token, end_token)
        x = self.embedding(x)
        pos = self.position_encoder(x)
        x = self.dropout(x + pos)
        return x


In [17]:
import tensorflow as tf

d_model = 512

sentence_embedding = SentenceEmbedding(
    max_sequence_length=max_sequence_length,
    d_model=d_model,
    language_to_index=english_to_index,
    START_TOKEN=START_TOKEN,
    END_TOKEN=END_TOKEN,
    PADDING_TOKEN=PADDING_TOKEN
)

output = sentence_embedding(english_sentences[:10], start_token=True, end_token=True)

print(output)


tf.Tensor(
[[[ 0.          1.          0.         ...  1.          0.
    1.        ]
  [ 0.8142139   0.518672    0.8392038  ...  1.0089104   0.01558238
    0.98024493]
  [ 0.9281368  -0.42546865  0.91599506 ...  1.0037731  -0.01198027
    1.024127  ]
  ...
  [ 0.92680717 -0.37553754 -0.47029194 ...  0.99964774  0.02560204
    0.99967223]
  [ 0.18475212 -0.98278517  0.45737407 ...  0.9996449   0.02570567
    0.99966955]
  [-0.7271632  -0.6864646   0.9914194  ...  0.999642    0.02580929
    0.99966687]]

 [[ 0.          1.          0.         ...  1.          0.
    1.        ]
  [ 0.88086265  0.5756271   0.7747787  ...  0.9555404   0.02847967
    0.9758246 ]
  [ 0.8820403  -0.43777713  0.9537623  ...  1.0089104   0.01568605
    0.98024493]
  ...
  [ 0.92680717 -0.37553754 -0.47029194 ...  0.99964774  0.02560204
    0.99967223]
  [ 0.18475212 -0.98278517  0.45737407 ...  0.9996449   0.02570567
    0.99966955]
  [-0.7271632  -0.6864646   0.9914194  ...  0.999642    0.02580929
    0.99966

In [30]:
def is_valid_tokens(sentence, vocab):
    for token in list(set(sentence)):
        if token not in vocab:
            return False
    return True

def is_valid_length(sentence, max_sequence_length):
    return len(list(sentence)) < (max_sequence_length - 1) # need to re-add the end token so leaving 1 space

valid_english_sentences = []
for index in range(len(english_sentences)):
    english_sentence =  english_sentences[index]
    if  is_valid_length(english_sentence, max_sequence_length) \
      and is_valid_tokens(english_sentence, english_vocabulary):
        valid_english_sentences.append(english_sentence)


def batch_tokenize(language_to_index, batch, start_token, end_token, START_TOKEN, END_TOKEN, PADDING_TOKEN, max_sequence_length):
    def tokenize(sentence, start_token, end_token):
        sentence_word_indices = [language_to_index[token] for token in list(sentence)]
        if start_token:
            sentence_word_indices.insert(0, language_to_index[START_TOKEN])
        if end_token:
            sentence_word_indices.append(language_to_index[END_TOKEN])
        sentence_word_indices.extend([language_to_index[PADDING_TOKEN]] * (max_sequence_length - len(sentence_word_indices)))
        return tf.convert_to_tensor(sentence_word_indices, dtype=tf.int32)

    tokenized = [tokenize(sentence, start_token, end_token) for sentence in batch]
    return tf.stack(tokenized)

res = batch_tokenize(language_to_index = english_to_index, batch = valid_english_sentences[:10], start_token = True, end_token = True,
                     START_TOKEN= START_TOKEN, END_TOKEN= END_TOKEN, PADDING_TOKEN=PADDING_TOKEN, max_sequence_length= max_sequence_length)

print(res)
# print(valid_english_sentences)
# count_70 = tf.math.count_nonzero(res[0] == 70)

# print(count_70)


tf.Tensor(
[70 46 53 61 43 60 43 56 13  1 54 39 43 57 13  1 61 46 53  1 61 39 57  1
 54 39 56 58 52 43 56 47 52 45  1 39 59 57 58 56 39 50 47 39  8 57  1 54
 39 59 50  1 46 39 52 50 43 63 13  1 41 53 59 50 42  1 53 52 50 63  1 45
 53  1 39 57  1 44 39 56  1 39 57  1 58 46 43  1 55 59 39 56 58 43 56 44
 47 52 39 50 57  1 61 46 43 56 43  1 58 46 43 63  1 50 53 57 58  1 58 53
  1 40 46 59 54 39 58 46 47  1 39 52 42  1 49 52 53 61 50 43 57 70 70 70
 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70
 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70
 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70
 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70 70
 70 70 70 70 70 70 70 70 70 70], shape=(250,), dtype=int32)
tf.Tensor(110, shape=(), dtype=int64)
