<a href="https://colab.research.google.com/github/PasinduWaidyarathna/Deep-Learning-Mini-Project-03/blob/main/EntoSin.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Necessary Library Imports

In [1]:
import random
import tensorflow as tf
import string
import re
from tensorflow import keras
from tensorflow.keras import layers

Mount the Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install openpyxl



Read the data file

In [20]:
import openpyxl

excel_file = "/content/drive/MyDrive/entosin.xlsx"
workbook = openpyxl.load_workbook(excel_file)
sheet = workbook.active

for i in range(2, 22):
    cell1 = sheet.cell(row=i, column=1).value
    cell2 = sheet.cell(row=i, column=2).value
    print(f"{cell1}\t{cell2}")

workbook.close()


Go.	යන්න.
Hi.	හායි.
Run!	දුවන්න!
Run.	දුවන්න.
Who?	කවුද?
Fire!	ගිනි!
Help!	උදව්!
Jump!	පනින්න!
Jump.	පනින්න.
Stop!	නවත්වන්න!
Wait!	ඉන්න!
Wait.	ඉන්න.
Go on.	ඉදිරියට යන්න.
Hello!	ආයුබෝවන්!
I ran.	මම දිව්වා.
I try.	මම උත්සාහ කරනවා.
I won!	මම දිනුවා!
Oh no!	ඔහ් නෑ!
Relax.	සන්සුන් වන්න.
Smile.	සිනහව.


Split the English and Sinhala translation pairs

In [21]:
excel_file = "/content/drive/MyDrive/entosin.xlsx"
workbook = openpyxl.load_workbook(excel_file)
sheet = workbook.active

text_pairs = []
for row in sheet.iter_rows(values_only=True):
    if len(row) >= 2 and row[1] is not None:
        english, sinhala = row[:2]
        sinhala = "[start] " + sinhala + " [end]"
        text_pairs.append((english, sinhala))

for _ in range(3):
    print(random.choice(text_pairs))

workbook.close()


('I want you to try it.', '[start] මට ඔබ එය උත්සාහ කිරීමට අවශ්යයි. [end]')
('They made fun of him.', '[start] ඔවුන් ඔහුට විහිළු කළා. [end]')
('I think you worry too much.', '[start] මම හිතන්නේ ඔබ ඕනෑවට වඩා කරදර වෙනවා. [end]')


Randomize the data

In [22]:
random.shuffle(text_pairs)

Spliting the data into training, validation and Testing

In [23]:
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples:]

print("Total sentences:",len(text_pairs))
print("Training set size:",len(train_pairs))
print("Validation set size:",len(val_pairs))
print("Testing set size:",len(test_pairs))


Total sentences: 54952
Training set size: 38468
Validation set size: 8242
Testing set size: 8242


In [24]:
len(train_pairs)+len(val_pairs)+len(test_pairs)

54952

Removing Punctuations

In [25]:
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

f"[{re.escape(strip_chars)}]"

'[!"\\#\\$%\\&\'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\\\\\^_`\\{\\|\\}\\~¿]'

In [26]:
f"{3+5}"

'8'

Vectorizing the English and Sinhala text pairs

In [27]:
def custom_standardization(input_string):
  lowercase = tf.strings.lower(input_string)
  return tf.strings.regex_replace(
      lowercase, f"[{re.escape(strip_chars)}]", "")

vocab_size = 15000
sequence_length = 20

source_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)
target_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)
train_english_texts = [pair[0] for pair in train_pairs]
train_sinhala_texts = [pair[1] for pair in train_pairs]
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_sinhala_texts)


Preparing datasets for the translation task

In [28]:
batch_size = 64

def format_dataset(eng, sin):
   eng = source_vectorization(eng)
   sin = target_vectorization(sin)
   return ({
         "english": eng,
         "sinhala": sin[:, :-1],
   }, sin[:, 1:])

def make_dataset(pairs):
    eng_texts, sin_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    sin_texts = list(sin_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, sin_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    return dataset.shuffle(2048).prefetch(16).cache()

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)


for inputs, targets in train_ds.take(1):
   print(f"inputs['english'].shape: {inputs['english'].shape}")
   print(f"inputs['sinhala'].shape: {inputs['sinhala'].shape}")
   print(f"targets.shape: {targets.shape}")




inputs['english'].shape: (64, 20)
inputs['sinhala'].shape: (64, 20)
targets.shape: (64, 20)


In [29]:
print(list(train_ds.as_numpy_iterator())[50])

({'english': array([[   3,  780,   35, ...,    0,    0,    0],
       [  18, 2082,   81, ...,    0,    0,    0],
       [   9,   88,  122, ...,    0,    0,    0],
       ...,
       [  25,  254,  172, ...,    0,    0,    0],
       [  11,    4,   16, ...,    0,    0,    0],
       [ 133,    2,  137, ...,    0,    0,    0]]), 'sinhala': array([[  2,   5, 123, ...,   0,   0,   0],
       [  2,  13,  10, ...,   0,   0,   0],
       [  2,   8, 631, ...,   0,   0,   0],
       ...,
       [  2, 108, 172, ...,   0,   0,   0],
       [  2,  11, 601, ...,   0,   0,   0],
       [  2,   6,  11, ...,   0,   0,   0]])}, array([[   5,  123,  657, ...,    0,    0,    0],
       [  13,   10, 2219, ...,    0,    0,    0],
       [   8,  631,  180, ...,    0,    0,    0],
       ...,
       [ 108,  172,  206, ...,    0,    0,    0],
       [  11,  601, 3225, ...,    0,    0,    0],
       [   6,   11,   29, ...,    0,    0,    0]]))


Transformer encoder implemented as a subclassed Layer

In [30]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
      super().__init__(**kwargs)
      self.embed_dim = embed_dim
      self.dense_dim = dense_dim
      self.num_heads = num_heads
      self.attention = layers.MultiHeadAttention(
           num_heads=num_heads, key_dim=embed_dim)
      self.dense_proj = keras.Sequential(
           [layers.Dense(dense_dim, activation="relu"),
            layers.Dense(embed_dim),]
      )
      self.layernorm_1 = layers.LayerNormalization()
      self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
      if mask is not None:
         mask = mask[:, tf.newaxis, :]
      attention_output = self.attention(
         inputs, inputs, attention_mask=mask)
      proj_input = self.layernorm_1(inputs + attention_output)
      proj_output = self.dense_proj(proj_input)
      return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config


The Transformer decoder

In [31]:
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
      super().__init__(**kwargs)
      self.embed_dim = embed_dim
      self.dense_dim = dense_dim
      self.num_heads = num_heads
      self.attention_1 = layers.MultiHeadAttention(
          num_heads=num_heads, key_dim=embed_dim)
      self.attention_2 = layers.MultiHeadAttention(
          num_heads=num_heads, key_dim=embed_dim)
      self.dense_proj = keras.Sequential(
          [layers.Dense(dense_dim, activation="relu"),
           layers.Dense(embed_dim),]
      )
      self.layernorm_1 = layers.LayerNormalization()
      self.layernorm_2 = layers.LayerNormalization()
      self.layernorm_3 = layers.LayerNormalization()
      self.supports_masking = True

    def get_config(self):
      config = super().get_config()
      config.update({
          "embed_dim": self.embed_dim,
          "num_heads": self.num_heads,
          "dense_dim": self.dense_dim,
      })
      return config

    def get_causal_attention_mask(self, inputs):
      input_shape = tf.shape(inputs)
      batch_size, sequence_length = input_shape[0], input_shape[1]
      i = tf.range(sequence_length)[:, tf.newaxis]
      j = tf.range(sequence_length)
      mask = tf.cast(i >= j, dtype="int32")
      mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
      mult = tf.concat(
              [tf.expand_dims(batch_size, -1),
               tf.constant([1, 1], dtype=tf.int32)], axis=0)
      return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, mask=None):
      causal_mask = self.get_causal_attention_mask(inputs)
      if mask is not None:
           padding_mask = tf.cast(
                mask[:, tf.newaxis, :], dtype="int32")
           padding_mask = tf.minimum(padding_mask, causal_mask)
      else:
           padding_mask = mask
      attention_output_1 = self.attention_1(
           query=inputs,
           value=inputs,
           key=inputs,
           attention_mask=causal_mask)
      attention_output_1 = self.layernorm_1(inputs + attention_output_1)
      attention_output_2 = self.attention_2(
            query=attention_output_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
       )
      attention_output_2 = self.layernorm_2(
          attention_output_1 + attention_output_2)
      proj_output = self.dense_proj(attention_output_2)
      return self.layernorm_3(attention_output_2 + proj_output)



Positional Encoding

In [32]:
class PositionalEmbedding(layers.Layer):
     def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
         super().__init__(**kwargs)
         self.token_embeddings = layers.Embedding(
             input_dim=input_dim, output_dim=output_dim)
         self.position_embeddings = layers.Embedding(
             input_dim=sequence_length, output_dim=output_dim)
         self.sequence_length = sequence_length
         self.input_dim = input_dim
         self.output_dim = output_dim

     def call(self, inputs):
         length = tf.shape(inputs)[-1]
         positions = tf.range(start=0, limit=length, delta=1)
         embedded_tokens = self.token_embeddings(inputs)
         embedded_positions = self.position_embeddings(positions)
         return embedded_tokens + embedded_positions

     def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)

     def get_config(self):
        config = super(PositionalEmbedding, self).get_config()
        config.update({
             "output_dim": self.output_dim,
             "sequence_length": self.sequence_length,
             "input_dim": self.input_dim,
        })
        return config


End-to-end Transformer

In [33]:
embed_dim = 256
dense_dim = 2048
num_heads = 8

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="english")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="sinhala")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)


In [34]:
transformer.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 english (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 sinhala (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 positional_embedding_2 (Po  (None, None, 256)            3845120   ['english[0][0]']             
 sitionalEmbedding)                                                                               
                                                                                                  
 positional_embedding_3 (Po  (None, None, 256)            3845120   ['sinhala[0][0]']       

Training the sequence-to-sequence Transformer

In [35]:
transformer.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"])
transformer.fit(train_ds, epochs=30, validation_data=val_ds)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7ba5eefc1f00>

Translating new sentences with our Transformer model

In [36]:
sin_vocab = target_vectorization.get_vocabulary()
spa_index_lookup = dict(zip(range(len(sin_vocab)), sin_vocab))
max_decoded_sentence_length = 20


Output Testing and Decoding the output sequence

In [37]:
import numpy as np

def decode_sequence(input_sentence):
     tokenized_input_sentence = source_vectorization([input_sentence])
     decoded_sentence = "[start]"
     for i in range(max_decoded_sentence_length):
       tokenized_target_sentence = target_vectorization(
         [decoded_sentence])[:, :-1]
       predictions = transformer(
         [tokenized_input_sentence, tokenized_target_sentence])
       sampled_token_index = np.argmax(predictions[0, i, :])
       sampled_token = spa_index_lookup[sampled_token_index]
       decoded_sentence += " " + sampled_token
       if sampled_token == "[end]": break
     return decoded_sentence

Transformer translating output

In [40]:
test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(5):
    input_sentence = random.choice(test_eng_texts)
    print("-")
    print(input_sentence)
    print(decode_sequence(input_sentence))



-
Tom drove the car.
[start] ටොම් කාර් එක දෙස ගියේය [end]
-
I'm talking about my friend.
[start] මම මගේ මිතුරා ගැන කතා කරනවා [end]
-
We should help.
[start] අපි උදව් කළ යුතුයි [end]
-
Tom will probably succeed.
[start] ටොම් බොහෝ විට සාර්ථක වනු ඇත [end]
-
When did you buy the watch?
[start] ඔබ පින්තූරය ගත්තේ කවදාද [end]


Evaluation using the BLEU score

In [41]:
from nltk.translate.bleu_score import sentence_bleu

test_eng_texts = [pair[0] for pair in test_pairs]
test_sin_texts = [pair[1] for pair in test_pairs]
score = 0
bleu  = 0
for i in range(20):
    candidate = decode_sequence(test_eng_texts[i])
    reference = test_sin_texts[i].lower()
    print(candidate,reference)
    score = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
    bleu+=score
    print(f"Score:{score}")
print(f"\nBLEU score : {round(bleu,2)}/20")

[start] මෙම හොඳයි [end] [start] මෙම ලකුණෙහි තේරුම කුමක්ද? [end]
Score:0.6086956521739131


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


[start] ටොම් ඊයේ රෑ එහි දුටුවේය [end] [start] ටොම් එහි රතු දෙයක් දුටුවේය. [end]
Score:0.6216216216216216
[start] මම ඉතා කලබල වී සිටිමි [end] [start] මම ගොඩක් කලබලයි. [end]
Score:0.42857142857142855
[start] ටොම් ඔබ එනතුරු බලා සිටියේය [end] [start] ටොම් ඔබ එනතුරු බලා සිටී. [end]
Score:0.625
[start] ක්රීඩාව ආරම්භ කිරීමට ඉඩ දෙන්න [end] [start] ක්රීඩාව ආරම්භ කිරීමට ඉඩ දෙන්න. [end]
Score:0.6046511627906976
[start] එය සම්පූර්ණයෙන්ම රළු නොවේ [end] [start] එය මුළුමනින්ම නිෂ් .ල ය. [end]
Score:0.4358974358974359
[start] මම දන්නවා ඒක ප්රශ්නයක් [end] [start] මම එය හොඳින් දනිමි. [end]
Score:0.4166666666666667
[start] මම ඔහුට කතා කරන්නම් [end] [start] මම ඔහුට කතා කරන්නම්. [end]
Score:0.6363636363636364
[start] මම ඔහුට පසුගිය වසරේ ඔහු බැලීමට ගියෙමි [end] [start] මම ගිය අවුරුද්දේ ඊයේ පාර්ශවයකදී ඔහුට හමු විය. [end]
Score:0.45098039215686275
[start] කරුණාකර මට පිටව යාමට දෙන්න [end] [start] කරුණාකර මට යන්න කවදාදැයි කියන්න. [end]
Score:0.575
[start] ටොම් එයට ආදරය කරනු ඇත [end] [start] ටොම් එයට ආදරය කරයි. 

Calculate accuracy for 20 new sentences

In [44]:
manualTest = [
    ("I love learning new languages.", "මම අලුත් භාෂා ඉගෙන ගන්න ආසයි."),
    ("This is a beautiful day.", "මේක ලස්සන දවසක්."),
    ("She sings very well.", "ඇය ඉතා හොඳින් ගායනා කරයි."),
    ("They are going to the park.", "ඔවුන් උද්යානයට යනවා."),
    ("The book is on the table.", "පොත මේසය උඩ."),
    ("We need to finish this project.", "අපි මේ ව්‍යාපෘතිය අවසන් කළ යුතුයි."),
    ("He is a good friend.", "ඔහු හොඳ මිතුරෙකි."),
    ("I want to travel around the world.", "මට ලොව වටා සංචාරය කිරීමට අවශ්‍යයි."),
    ("The cat is sleeping on the sofa.", "බළලා සෝෆා මත නිදාගෙන සිටී."),
    ("She cooks delicious food.", "ඇය රසවත් කෑම උයනවා."),
    ("We have a lot of work to do.", "අපිට කරන්න වැඩ ගොඩක් තියෙනවා."),
    ("He speaks Spanish fluently.", "ඔහු චතුර ලෙස ස්පාඤ්ඤ භාෂාව කතා කරයි."),
    ("The concert starts at 7 PM.", "ප්‍රසංගය සවස 7 ට ආරම්භ වේ."),
    ("I need to buy some groceries.", "මට සිල්ලර බඩු ටිකක් ගන්න ඕන."),
    ("She is studying for her exam.", "ඇය විභාගයට පාඩම් කරයි."),
    ("They are watching a movie.", "එයාලා ෆිල්ම් එකක් බලනවා."),
    ("He plays the guitar very well.", "ඔහු ඉතා හොඳින් ගිටාර් වාදනය කරයි."),
    ("We are going to the beach tomorrow.", "අපි හෙට මුහුදු වෙරළට යනවා."),
    ("The museum is closed on Mondays.", "සඳුදා දිනවල කෞතුකාගාරය වසා ඇත."),
    ("I am going to visit my family.", "මම මගේ පවුල බලන්න යනවා.")
]

from nltk.translate.bleu_score import sentence_bleu

testENTexts = [pair[0] for pair in manualTest]
testSITexts = [pair[1] for pair in manualTest]
score = 0
bleu  = 0
for i in range(20):
    candidate = decode_sequence(testENTexts[i])
    reference = testSITexts[i].lower()
    print(candidate,reference)
    score = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
    bleu+=score
    print(f"Score:{score}")
print(f"\nBLEU score : {round(bleu,2)}/20")




[start] මම නව භාෂා හතරක් අවශ්යයි [end] මම අලුත් භාෂා ඉගෙන ගන්න ආසයි.
Score:0.2894736842105263
[start] මෙය ලස්සන දවසක් [end] මේක ලස්සන දවසක්.
Score:0.3103448275862069
[start] ඇය ඉතා හොඳින් ගායනා කරනවා [end] ඇය ඉතා හොඳින් ගායනා කරයි.
Score:0.38461538461538464
[start] ඔවුන් උද්යානයට යනවා [end] ඔවුන් උද්යානයට යනවා.
Score:0.3333333333333333
[start] පොත මේසය මත ය [end] පොත මේසය උඩ.
Score:0.2962962962962963
[start] අපි මෙම කාර්යය අවසන් කිරීමට අවශ්යයි [end] අපි මේ ව්‍යාපෘතිය අවසන් කළ යුතුයි.
Score:0.24489795918367346
[start] ඔහු හොඳ මිතුරෙකි [end] ඔහු හොඳ මිතුරෙකි.
Score:0.4
[start] මට ගිනි ඔයාව ගිනි තියෙනවා [end] මට ලොව වටා සංචාරය කිරීමට අවශ්‍යයි.
Score:0.1794871794871795
[start] නිවස පොලිස් මත බොරු ය [end] බළලා සෝෆා මත නිදාගෙන සිටී.
Score:0.22857142857142856
[start] ඇය කෑම රසයි [end] ඇය රසවත් කෑම උයනවා.
Score:0.32
[start] අපට කරන්න බොහෝ වැඩ ගොඩක් තියෙනවා [end] අපිට කරන්න වැඩ ගොඩක් තියෙනවා.
Score:0.391304347826087
[start] ඔහු ස්පා ් යනු ස්පා ් යනු ස්පා ් කළේ වේ [end] ඔහු චතුර ලෙස ස්පාඤ්ඤ භාෂා