
#English to Sinhala Transalation with Transforms



In [4]:
from IPython.display import Image

In [5]:
Image(url ='https://www.tensorflow.org/images/tutorials/transformer/apply_the_transformer_to_machine_translation.gif')

#Necessary Library Imports

#Prepare the Data

In [6]:
import random
import tensorflow as tf
import string
import re
from tensorflow import keras
from tensorflow.keras import layers

#Mount the Google Drive

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#Read the data file

In [8]:
text_file = "/content/drive/My Drive/Deep Learning/Mini Project 03/sin.txt"
with open(text_file) as f:
  lines = f.read().split("\n")[:-1]
i = 0
for line in lines:
  print(line)
  i = i + 1
  if(i==20):
    break

Go.		යන්න.
Go.		යන්න.
Go.		යන්න.
Go.		යන්න.
Hi.		ආයුබෝවන්.
Run!		දුවන්න!
Run.		දුවන්න.
Who?		කවුද?
Wow!		වාව්!
Fire!		ගිනි!
Fire!		ගිනි!
Fire!		ගිනි!
Help!		උදව්!
Help!		උදව්!
Help!		උදව්!
Jump!		පනින්න!
Jump.		පනින්න.
Stop!		නවත්වන්න!
Stop!		නවත්වන්න!
Stop!		නවත්වන්න!


In [9]:
for x in range(len(lines)-10,len(lines)):
  print(lines[x])

Tom promised.	ටොම් පොරොන්දු විය.
Tom promised.	ටොම් පොරොන්දු විය.
Tom ran away.	ටොම් පලා ගියේය.
Tom relented.	ටොම් පසුබට විය.
Tom relented.	ටොම් පසුබට විය.
Tom relented.	ටොම් අනුකම්පා කළා.
Tom resigned.	තෝමස් ඉල්ලා අස්විය.
Tom saw Mary.	ටොම් මේරිව දැක්කා.
Tom screamed.	ටොම් කෑගැසුවා.
Tom shot her.	ටොමස් ඔහුට වෙඩි තැබුවේය.


#Split the English and Sinhala translation pairs

In [10]:
text_pairs = []

for line in lines:
    parts = line.split("\t")

    if len(parts) == 2:
        english, sinhala = parts
        sinhala = "[start]" + sinhala + "[end]"
        text_pairs.append((english, sinhala))
    else:

        print("Skipping line:", line)

Skipping line: Go.		යන්න.
Skipping line: Go.		යන්න.
Skipping line: Go.		යන්න.
Skipping line: Go.		යන්න.
Skipping line: Hi.		ආයුබෝවන්.
Skipping line: Run!		දුවන්න!
Skipping line: Run.		දුවන්න.
Skipping line: Who?		කවුද?
Skipping line: Wow!		වාව්!
Skipping line: Fire!		ගිනි!
Skipping line: Fire!		ගිනි!
Skipping line: Fire!		ගිනි!
Skipping line: Help!		උදව්!
Skipping line: Help!		උදව්!
Skipping line: Help!		උදව්!
Skipping line: Jump!		පනින්න!
Skipping line: Jump.		පනින්න.
Skipping line: Stop!		නවත්වන්න!
Skipping line: Stop!		නවත්වන්න!
Skipping line: Stop!		නවත්වන්න!
Skipping line: Wait!		ඉන්න!
Skipping line: Wait.		ඉන්න.
Skipping line: Go on.		දිගටම යන්න.
Skipping line: Go on.		දිගටම කරගෙන යන්න.
Skipping line: Hello!		ආයුබෝවන්.
Skipping line: I ran.		මම දිව්වා.
Skipping line: I ran.		මම දුවමින් සිටියෙමි.
Skipping line: I try.		මම උත්සාහ කරනවා.
Skipping line: I won!		මම දිනුවා!
Skipping line: Oh no!		අපොයි නෑ!
Skipping line: Relax.		එය සෝඩා සමඟ ගන්න.
Skipping line: Smile.		සිනාසෙන්න.
Skipp

In [12]:
for i in range(3):
  print(random.choice(text_pairs))

("I'm Tom.", '[start]මම ටොම්.[end]')
("It's our job.", '[start]ඒක අපේ වැඩක්.[end]')
('Please hurry!', '[start]කරුණාකර ඉක්මන් කරන්න![end]')


#Randomize the data

In [13]:
import random
random.shuffle(text_pairs)

#Splitting the data into training, validation and testing

In [14]:
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples:]

print("Total sentences:",len(text_pairs))
print("Training set size:",len(train_pairs))
print("Validation set size:",len(val_pairs))
print("Testing set size:",len(test_pairs))

Total sentences: 3426
Training set size: 2400
Validation set size: 513
Testing set size: 513


In [15]:
len(train_pairs)+len(val_pairs)+len(test_pairs)

3426

#Removing Punctuation

In [16]:
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")
f"[{re.escape(strip_chars)}]"

'[!"\\#\\$%\\&\'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\\\\\^_`\\{\\|\\}\\~¿]'

In [17]:
f"{3+5}"


'8'

#vectorizing the English and Sinhala text pairs

In [18]:
def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(
        lowercase, f"[{re.escape(strip_chars)}]", "")
vocab_size = 15000
sequence_length = 20

source_vectorization = layers.TextVectorization(
  max_tokens=vocab_size,
  output_mode="int",
  output_sequence_length=sequence_length,
)

target_vectorization = layers.TextVectorization(
  max_tokens=vocab_size,
  output_mode="int",
  output_sequence_length=sequence_length + 1,
  standardize=custom_standardization,
)

train_english_texts = [pair[0] for pair in train_pairs]
train_spanish_texts = [pair[1] for pair in train_pairs]
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_spanish_texts)

#Preparing datasets for the translation task

In [19]:
batch_size = 64

def format_dataset(eng, spa):
    eng = source_vectorization(eng)
    spa = target_vectorization(spa)
    return ({
    "english": eng,
    "sinhala": spa[:, :-1],
    }, spa[:, 1:])
def make_dataset(pairs):
    eng_texts, spa_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    spa_texts = list(spa_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    return dataset.shuffle(2048).prefetch(16).cache()

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

for inputs, targets in train_ds.take(1):
  print(f"inputs['english'].shape: {inputs['english'].shape}")
  print(f"inputs['sinhala'].shape: {inputs['sinhala'].shape}")
  print(f"targets.shape: {targets.shape}")

inputs['english'].shape: (32, 20)
inputs['sinhala'].shape: (32, 20)
targets.shape: (32, 20)


In [20]:
print(list(train_ds.as_numpy_iterator())[20])

({'english': array([[  2,  47,  17, ...,   0,   0,   0],
       [158,   5,  33, ...,   0,   0,   0],
       [ 42,   8,  55, ...,   0,   0,   0],
       ...,
       [ 15,  19,  62, ...,   0,   0,   0],
       [870,  12,   0, ...,   0,   0,   0],
       [  4,   9, 535, ...,   0,   0,   0]]), 'sinhala': array([[   2,   27, 1244, ...,    0,    0,    0],
       [   9,   15,    0, ...,    0,    0,    0],
       [ 729,  113,   15, ...,    0,    0,    0],
       ...,
       [1815,    0,    0, ...,    0,    0,    0],
       [  17,   11,    0, ...,    0,    0,    0],
       [   2, 1392,    0, ...,    0,    0,    0]])}, array([[  27, 1244,    0, ...,    0,    0,    0],
       [  15,    0,    0, ...,    0,    0,    0],
       [ 113,   15,    0, ...,    0,    0,    0],
       ...,
       [   0,    0,    0, ...,    0,    0,    0],
       [  11,    0,    0, ...,    0,    0,    0],
       [1392,    0,    0, ...,    0,    0,    0]]))


#Transformer encoder implemented as a subclassed layer

In [21]:
class TransformerEncoder(layers.Layer):
  def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
    super().__init__(**kwargs)
    self.embed_dim = embed_dim
    self.dense_dim = dense_dim
    self.num_heads = num_heads
    self.attention = layers.MultiHeadAttention(
      num_heads=num_heads, key_dim=embed_dim)
    self.dense_proj = keras.Sequential(
        [layers.Dense(dense_dim, activation="relu"),
        layers.Dense(embed_dim),]
    )
    self.layernorm_1 = layers.LayerNormalization()
    self.layernorm_2 = layers.LayerNormalization()
  def call(self, inputs, mask=None):
    if mask is not None:
        mask = mask[:, tf.newaxis, :]
    attention_output = self.attention(
        inputs, inputs, attention_mask=mask)
    proj_input = self.layernorm_1(inputs + attention_output)
    proj_output = self.dense_proj(proj_input)
    return self.layernorm_2(proj_input + proj_output)

  def get_config(self):
    config = super().get_config()
    config.update({
      "embed_dim": self.embed_dim,
      "num_heads": self.num_heads,
      "dense_dim": self.dense_dim,
    })
    return config

#The transformer decorder

In [22]:
class TransformerDecoder(layers.Layer):
  def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
    super().__init__(**kwargs)
    self.embed_dim = embed_dim
    self.dense_dim = dense_dim
    self.num_heads = num_heads
    self.attention_1 = layers.MultiHeadAttention(
      num_heads=num_heads, key_dim=embed_dim)
    self.attention_2 = layers.MultiHeadAttention(
      num_heads=num_heads, key_dim=embed_dim)
    self.dense_proj = keras.Sequential(
      [layers.Dense(dense_dim, activation="relu"),
      layers.Dense(embed_dim),]
    )
    self.layernorm_1 = layers.LayerNormalization()
    self.layernorm_2 = layers.LayerNormalization()
    self.layernorm_3 = layers.LayerNormalization()
    self.supports_masking = True

  def get_config(self):
    config = super().get_config()
    config.update({
        "embed_dim": self.embed_dim,
        "num_heads": self.num_heads,
        "dense_dim": self.dense_dim,
    })
    return config

  def get_causal_attention_mask(self, inputs):
    input_shape = tf.shape(inputs)
    batch_size, sequence_length = input_shape[0], input_shape[1]
    i = tf.range(sequence_length)[:, tf.newaxis]
    j = tf.range(sequence_length)
    mask = tf.cast(i >= j, dtype="int32")
    mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
    mult = tf.concat(
        [tf.expand_dims(batch_size, -1),
        tf.constant([1, 1], dtype=tf.int32)], axis=0)
    return tf.tile(mask, mult)
  def call(self, inputs, encoder_outputs, mask=None):
    causal_mask = self.get_causal_attention_mask(inputs)
    if mask is not None:
      padding_mask = tf.cast(
          mask[:, tf.newaxis, :], dtype="int32")
      padding_mask = tf.minimum(padding_mask, causal_mask)
    else:
      padding_mask = mask
    attention_output_1 = self.attention_1(
        query=inputs,
        value=inputs,
        key=inputs,
        attention_mask=causal_mask)
    attention_output_1 = self.layernorm_1(inputs + attention_output_1)
    attention_output_2 = self.attention_2(
        query=attention_output_1,
        value=encoder_outputs,
        key=encoder_outputs,
        attention_mask=padding_mask,
    )
    attention_output_2 = self.layernorm_2(
        attention_output_1 + attention_output_2)
    proj_output = self.dense_proj(attention_output_2)
    return self.layernorm_3(attention_output_2 + proj_output)

#The Positional Encoding

In [23]:
class PositionalEmbedding(layers.Layer):
  def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
    super().__init__(**kwargs)
    self.token_embeddings = layers.Embedding(
        input_dim=input_dim, output_dim=output_dim)
    self.position_embeddings = layers.Embedding(
      input_dim=sequence_length, output_dim=output_dim)
    self.sequence_length = sequence_length
    self.input_dim = input_dim
    self.output_dim = output_dim

  def call(self, inputs):
    length = tf.shape(inputs)[-1]
    positions = tf.range(start=0, limit=length, delta=1)
    embedded_tokens = self.token_embeddings(inputs)
    embedded_positions = self.position_embeddings(positions)
    return embedded_tokens + embedded_positions

  def compute_mask(self, inputs, mask=None):
    return tf.math.not_equal(inputs, 0)

  def get_config(self):
    config = super(PositionalEmbedding, self).get_config()
    config.update({
        "output_dim": self.output_dim,
        "sequence_length": self.sequence_length,
        "input_dim": self.input_dim,
    })
    return config

In [24]:
embed_dim = 256
dense_dim = 2048
num_heads = 8
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="english")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="sinhala")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [25]:
transformer.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 english (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 sinhala (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 positional_embedding (Posi  (None, None, 256)            3845120   ['english[0][0]']             
 tionalEmbedding)                                                                                 
                                                                                                  
 positional_embedding_1 (Po  (None, None, 256)            3845120   ['sinhala[0][0]']         

#Training the sequence-to-sequence Transformer

In [26]:
transformer.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"])

transformer.fit(train_ds, epochs=30, validation_data=val_ds)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x783347087220>

In [29]:
import numpy as np
spa_vocab = target_vectorization.get_vocabulary()
spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))
max_decoded_sentence_length = 20


def decode_sequence(input_sentence):
  tokenized_input_sentence = source_vectorization([input_sentence])
  decoded_sentence ="[start]"
  for i in range(max_decoded_sentence_length):
    tokenized_target_sentence = target_vectorization(
        [decoded_sentence])[:,:-1]
    predictions = transformer(
        [tokenized_input_sentence,tokenized_target_sentence])
    sampled_token_index = np.argmax(predictions[0,i,:])
    sampled_token = spa_index_lookup[sampled_token_index]
    decoded_sentence += " " + sampled_token
    if sampled_token == "[end]":
      break
  return decoded_sentence


test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(20):
    input_sentence = random.choice(test_eng_texts)
    print("-")
    print(input_sentence)
    print(decode_sequence(input_sentence))

-
Eat slowly.
[start] කනවා[end]       වනු වන්න[end]  විය[end]         
-
What a dope!
[start] මොන දවසක්ද[end]      පලයන්[end]   වේ[end]      ඇත[end]   
-
They've left.
[start] ගියා[end]      ගියා[end]    ගියා[end]         
-
Is Tom cured?
[start] ඉන්නවද[end]       වේ[end]   වේ[end]         
-
Do come in.
[start] එන්න[end]      එන්න[end] යන්න[end]            
-
Leave now.
[start] දිනුවා[end]     යන්න[end]  යන්න[end]   යන්න[end]         
-
Let's start.
[start] පටන් ගන්න[end]         ඇත[end]         
-
Keep dancing.
[start] දිගටම කරගෙන යන්න[end]     නොවන්න[end]   ගන්න[end]         
-
Tom's upset.
[start] කලින් එනවා[end]      නොවන්න[end]   විය[end]         
-
Here she is!
[start] ගන්න[end]       මෙහි වන්න[end]  විය[end]         
-
Shadow him.
[start] උත්සාහ කරන්න[end]      කරන්න[end]   විය[end]         
-
I resigned.
[start] ඉල්ලා අස්විය[end]     ඇත[end] ඉල්ලා ගන්න[end]  ඇත[end]      ඇත[end]   
-
Say cheese.
[start] කියන්න[end]      කියන්න[end]    කියන්න[end]         
-
Where am I?
[start]