# **English To Sinhala Translation**

Import Relavent Libraries

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, Dense, Masking
from keras.layers import Attention, LayerNormalization, Dropout
from keras.optimizers import Adam

Mount the google drive

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Read The data file

In [6]:
text_file = "/content/drive/My Drive/DL_mini_project3/Trainingtxt.txt"
with open(text_file) as f:
    lines = f.read().split("\n")[:-1]
i = 0
for line in lines:
  print(line)
  i = i + 1
  if(i==20):
    break

ඔයාට තැපෑලෙන් පැකේජ් එකක් හම්බවේවි	you will receive a package in the mail
ලියාපදිංචිය සම්පූර්ණ කලාට පස්සෙ ඔයාට තහවුරු කිරීමේ කේතයක්  හම්බවේවි	you will receive a confirmation code after completing the registration
ඊලඟ මිලදී ගැනීම කරන කොට ඔයාට වට්ටමක් හම්බවේවි	you will receive a discount on your next purchase
ඔයාට අපේ පාරිභෝගික සේවා කණ්ඩායමෙන් දුරකථන ඇමතුමක් හම්බවේවි	you will receive a phone call from our customer service team
ඔයාගේ ඇණවුම සූදානම් උනාට පස්සෙ ඔයාට දැනුම් දීමක් හම්බවේවි	you will receive a notification when your order is ready for pickup
ඔයාට පැය 24ක් ඇතුලත ඔයාගෙ විමසීමට පිළිතුරක්  හම්බවේවි	you will receive a response to your inquiry within 24 hours
ඔයාට ඔයාගේ පක්ෂපාතිත්වය වෙනුවෙන් තෑග්ගක් හම්බවේවි	you will receive a gift for your loyalty
ඔයාට උත්සවයට ආරාධනාවක් හම්බවේවි	you will receive an invitation to the event
ඔයාට රිටන් කරන භාණ්ඩය වෙනුවෙන් ආපසු මුදල් ගෙවීමක් හම්බවේවි	you will receive a refund for the returned item
ඔයාට විස්තර එක්ක  ඊමේල් එකක් හම්බවේවි	you'll receive an e

Split the English and Sinhala translation pairs

In [7]:
import random
import string
import re

text_pairs = []


for line in lines:
    if '\t' in line:
        english, sinhala = line.split("\t")
        sinhala = "[start]" + sinhala.strip() + "[end]"
        text_pairs.append((english.strip(), sinhala))
for i in range(3):
    print(random.choice(text_pairs))

('හරි', '[start]ok[end]')
('ලෙඩක් හරියටම හොයාගන්න පුලුවන්', '[start]a disease can be found exactly[end]')
('කවුරුහරි ලුනු බැරල් එකක් වතුරට හලලා', '[start]someone poured a barrel of salt into the water[end]')


In [8]:

# Print random text pairs
for i in range(3):
    print(random.choice(text_pairs))

('හ්ම් ඔව්', '[start]hmm yes[end]')
('මං ඊලඟ දවස එනකල් බලාන ඉන්නවා', '[start]i am waiting for the next day[end]')
('මාර්වින්     තාත්තා', '[start]marvin    father[end]')


Randomize the Data

In [9]:
import random
random.shuffle(text_pairs)

Spliting the data into training, validation and Testing

In [10]:
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples:]
print("Total sentences:",len(text_pairs))
print("Training set size:",len(train_pairs))
print("Validation set size:",len(val_pairs))
print("Testing set size:",len(test_pairs))

Total sentences: 54213
Training set size: 37951
Validation set size: 8131
Testing set size: 8131


In [11]:
len(train_pairs)+len(val_pairs)+len(test_pairs)

54213

Removing Puctuation

In [12]:
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")
f"[{re.escape(strip_chars)}]"

'[!"\\#\\$%\\&\'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\\\\\^_`\\{\\|\\}\\~¿]'

Vectorizing the English and Sinhala text pairs


In [15]:
import tensorflow as tf
from tensorflow.keras import layers
def custom_standardization(input_string):
  lowercase = tf.strings.lower(input_string)
  return tf.strings.regex_replace(
    lowercase, f"[{re.escape(strip_chars)}]", "")
vocab_size = 15000
sequence_length = 20
source_vectorization = layers.TextVectorization(
  max_tokens=vocab_size,
  output_mode="int",
  output_sequence_length=sequence_length,
)
target_vectorization = layers.TextVectorization(
  max_tokens=vocab_size,
  output_mode="int",
  output_sequence_length=sequence_length + 1,
  standardize=custom_standardization,
)
train_english_texts = [pair[0] for pair in train_pairs]
train_sinhala_texts = [pair[1] for pair in train_pairs]

In [16]:
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_sinhala_texts)

Prepare dataset for the translation task

In [17]:
batch_size=64

def format_dataset(eng,sin):
  eng=source_vectorization(eng)
  sin=target_vectorization(sin)
  return({
      "english":eng,
      "sinhala":sin[:,:-1],
  },sin[:,1:])

def make_dataset(pairs):
  eng_texts, sin_texts =zip(*pairs)
  eng_texts =list(eng_texts)
  sin_texts =list(sin_texts)
  dataset = tf.data.Dataset.from_tensor_slices((eng_texts, sin_texts))
  dataset = dataset.batch(batch_size)
  dataset = dataset.map(format_dataset, num_parallel_calls=4)
  return dataset.shuffle(2048).prefetch(16).cache()

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

for inputs, targets in train_ds.take(1):
  print(f"inputs['english'].shape:{inputs['english'].shape}")
  print(f"inputs['sinhala'].shape:{inputs['sinhala'].shape}")
  print(f"targets.shape:{targets.shape}")

  inputs['english'].shape:  (64, 20)
  inputs['sinhala'].shape: (64, 20)
  targets.shape: (64, 20)
print(list(train_ds.as_numpy_iterator())[50])

inputs['english'].shape:(64, 20)
inputs['sinhala'].shape:(64, 20)
targets.shape:(64, 20)
({'english': array([[   5,  374,    3, ...,    0,    0,    0],
       [   6,  713,    0, ...,    0,    0,    0],
       [  56,   50,  414, ...,    0,    0,    0],
       ...,
       [4878,   55,  255, ...,    0,    0,    0],
       [  84,    0,    0, ...,    0,    0,    0],
       [ 921,  322,  257, ...,    0,    0,    0]]), 'sinhala': array([[   7,   67,   33, ...,    0,    0,    0],
       [  48,    6,    3, ...,    0,    0,    0],
       [  35, 2503,   39, ...,    0,    0,    0],
       ...,
       [ 594,   14,    5, ...,    0,    0,    0],
       [ 159,   25,    0, ...,    0,    0,    0],
       [ 106,    3, 1262, ...,    0,    0,    0]])}, array([[  67,   33,  198, ...,    0,    0,    0],
       [   6,    3, 1439, ...,    0,    0,    0],
       [2503,   39,  108, ...,    0,    0,    0],
       ...,
       [  14,    5, 6604, ...,    0,    0,    0],
       [  25,    0,    0, ...,    0,    0,    

Transformer encoder implemented as a subclassed Layer

In [18]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = tf.keras.Sequential([
            layers.Dense(dense_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]
        attention_output = self.attention(
            inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

The Transformer decoder

In [19]:
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = tf.keras.Sequential([
            layers.Dense(dense_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat([tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], axis=0)
        return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)
        else:
            padding_mask = mask
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask
        )
        attention_output_1 = self.layernorm_1(inputs + attention_output_1)
        attention_output_2 = self.attention_2(
            query=attention_output_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        attention_output_2 = self.layernorm_2(attention_output_1 + attention_output_2)
        proj_output = self.dense_proj(attention_output_2)
        return self.layernorm_3(attention_output_2 + proj_output)

Positional Encoding

In [20]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = layers.Embedding(input_dim=sequence_length, output_dim=output_dim)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        config = super(PositionalEmbedding, self).get_config()
        config.update({
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        })
        return config

End To End Transformer

In [21]:
embed_dim = 256
dense_dim = 2048
num_heads = 8



encoder_inputs = tf.keras.Input(shape=(None,), dtype="int64", name="english")

x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)

encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

decoder_inputs = tf.keras.Input(shape=(None,), dtype="int64", name="sinhala")

x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)

x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)

x = layers.Dropout(0.5)(x)

decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)

transformer = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

transformer.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 english (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 sinhala (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 positional_embedding (Posi  (None, None, 256)            3845120   ['english[0][0]']             
 tionalEmbedding)                                                                                 
                                                                                                  
 positional_embedding_1 (Po  (None, None, 256)            3845120   ['sinhala[0][0]']         

Training the sequence-sequence Transformer

In [22]:
transformer.compile(optimizer="rmsprop",
                    loss="sparse_categorical_crossentropy",
                    metrics=["accuracy"])

transformer.fit(train_ds, epochs=30, validation_data=val_ds)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7b08374859f0>

In [24]:
import numpy as np

# Define the vocabulary and index lookup for Sinhala
sin_vocab = target_vectorization.get_vocabulary()
sin_index_lookup = dict(zip(range(len(sin_vocab)), sin_vocab))
max_decoded_sentence_length = 20

def decode_sequence(input_sentence):
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"

    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = sin_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break

    return decoded_sentence

test_eng_texts = [pair[0] for pair in test_pairs]

for _ in range(20):
    input_sentence = random.choice(test_eng_texts)
    print("-")
    print("English: ", input_sentence)
    print("Sinhala: ", decode_sequence(input_sentence))


-
English:  මම එක බෑග් එකක් අරගත්තා
Sinhala:  [start] had one like one[end]                
-
English:  මෙච්චර වෙලා උනාපහල තාම සුද්ධ කරල ඉවර නෑනේ
Sinhala:  [start] from not so there was done yet[end]             
-
English:  මම හිතුවේ ඔයා ආයේ මාව හම්බවෙන්නේ නෑ කියලා
Sinhala:  [start] thought i would not think like you again[end]        me[end] me[end]   
-
English:  ඒත් මට ඕනේ වැඩිහිටි එකක් කරන්න
Sinhala:  [start] i want to make a got an great die[end]           
-
English:  මේ ගෙදර තියෙනවා පරණ ලස්සන දේවල් ටොන් ගානක්
Sinhala:  [start] from new things in this is a beautiful of father[end]          
-
English:  මං මගේ තත්වේ පෙන්වන එක නතර කරන්නම්
Sinhala:  [start] i come on my chance[end]               
-
English:  ඒවගේම රුසියාව එක්ක තියෙන රාජ්‍යත්‍රාන්ත්‍රික සම්බන්ධතා නිසා
Sinhala:  [start] from things with kehelmal got into the village give away[end]          
-
English:  මට තියෙන්නේ මෙච්චරයි
Sinhala:  [start] there is no love me[end]               
-
English:  ඔයා කොහේවත් යන් නැහැ
Sinh

In [26]:
sin_vocab = target_vectorization.get_vocabulary()
sin_index_lookup = dict(zip(range(len(sin_vocab)), sin_vocab))
max_decoded_sentence_length = 20

def decode_sequence(input_sentence):
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"

    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = sin_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break

    return decoded_sentence

# Get English input from the user
input_sentence = input("Enter a sinhala sentence: ")

# Translate the input sentence to Sinhala
translated_sentence = decode_sequence(input_sentence)

# Print the translated sentence
print("English translation:", translated_sentence)


Enter a sinhala sentence: මම ගෙදර යනවා 
English translation: [start] i go home[end]                 
