<a href="https://colab.research.google.com/github/Satwikram/Transformer-model-for-language-understanding/blob/master/Portuguese%20to%20English.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Author: Satwik Ram K
Portuguese to English Translation

## Importing Dependencies

In [51]:
import numpy as np
import pandas as pd
import re
import math
import re
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt


## Loading Dataset

The dataset is available in TensorFlow Datasets (TFDS).

In [52]:
# Fetch the Portuguese -> English TED talks corpus as (pt, en) pairs together
# with the dataset's metadata.
data, metadata = tfds.load(
    'ted_hrlr_translate/pt_to_en',
    with_info=True,
    as_supervised=True,
)

In [53]:
data

{'test': <DatasetV1Adapter shapes: ((), ()), types: (tf.string, tf.string)>,
 'train': <DatasetV1Adapter shapes: ((), ()), types: (tf.string, tf.string)>,
 'validation': <DatasetV1Adapter shapes: ((), ()), types: (tf.string, tf.string)>}

In [54]:
metadata

tfds.core.DatasetInfo(
    name='ted_hrlr_translate',
    version=1.0.0,
    description='Data sets derived from TED talk transcripts for comparing similar language pairs
where one is high resource and the other is low resource.
',
    homepage='https://github.com/neulab/word-embeddings-for-nmt',
    features=Translation({
        'en': Text(shape=(), dtype=tf.string),
        'pt': Text(shape=(), dtype=tf.string),
    }),
    total_num_examples=54781,
    splits={
        'test': 1803,
        'train': 51785,
        'validation': 1193,
    },
    supervised_keys=('pt', 'en'),
    citation="""@inproceedings{Ye2018WordEmbeddings,
      author  = {Ye, Qi and Devendra, Sachan and Matthieu, Felix and Sarguna, Padmanabhan and Graham, Neubig},
      title   = {When and Why are pre-trained word embeddings useful for Neural Machine Translation},
      booktitle = {HLT-NAACL},
      year    = {2018},
      }""",
    redistribution_info=,
)

## Taking Train and Validation Data

In [55]:
# Keep only the training and validation splits.
train = data['train']
val = data['validation']

In [56]:
type(train)

tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter

## Creating a custom subwords tokenizer from the training dataset.

In [57]:
# Show the first nine sentence pairs; the loop stops once it reaches the tenth.
count = 0
for count, (a, b) in enumerate(train, start=1):
  if count == 10:
    break
  print("Portugees sentence:",a)
  print("English Corresponding Sentence",b)
  print("--"*60)
  

Portugees sentence: tf.Tensor(b'e quando melhoramos a procura , tiramos a \xc3\xbanica vantagem da impress\xc3\xa3o , que \xc3\xa9 a serendipidade .', shape=(), dtype=string)
English Corresponding Sentence tf.Tensor(b'and when you improve searchability , you actually take away the one advantage of print , which is serendipity .', shape=(), dtype=string)
------------------------------------------------------------------------------------------------------------------------
Portugees sentence: tf.Tensor(b'mas e se estes fatores fossem ativos ?', shape=(), dtype=string)
English Corresponding Sentence tf.Tensor(b'but what if it were active ?', shape=(), dtype=string)
------------------------------------------------------------------------------------------------------------------------
Portugees sentence: tf.Tensor(b'mas eles n\xc3\xa3o tinham a curiosidade de me testar .', shape=(), dtype=string)
English Corresponding Sentence tf.Tensor(b"but they did n't test for curiosity .", shape=(), 

In [58]:
# Build one subword tokenizer per language from the training sentences,
# each targeting a vocabulary of roughly 2**13 entries.
tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (english.numpy() for _, english in train),
    target_vocab_size=2 ** 13,
)

tokenizer_pt = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (portuguese.numpy() for portuguese, _ in train),
    target_vocab_size=2 ** 13,
)

In [62]:
# Save both tokenizers to disk so they can be reloaded without rebuilding them.
tokenizer_en.save_to_file('tokenizer_en')
tokenizer_pt.save_to_file('tokenizer_pt')

In [63]:
# Loading saved tokenizer
# Use the same relative names that save_to_file() was given, so this cell does
# not depend on the working directory being Colab's /content.
encoder_en = tfds.features.text.SubwordTextEncoder.load_from_file('tokenizer_en')
encoder_pt = tfds.features.text.SubwordTextEncoder.load_from_file('tokenizer_pt')

In [64]:
# Round-trip a sample sentence through the English tokenizer: encode it to
# subword ids, decode the ids back, and print all three forms.
sample_string = "Transfomer is cool."

encoded_string = encoder_en.encode(sample_string)

original_string = encoder_en.decode(encoded_string)

print("Orginal String:",sample_string)
print("Encoded String:",encoded_string)
print("Decoded String:",original_string)

Orginal String: Transfomer is cool.
Encoded String: [7915, 1248, 7946, 1391, 2762, 7863, 13, 1671, 7877]
Decoded String: Transfomer is cool.


In [65]:
# Shuffle buffer size and batch size used when building the input pipeline.
BUFFER_SIZE = 20000
BATCH_SIZE = 64

## Add a start and end token to the input and target.

In [66]:
# Vocabulary sizes; encode() uses vocab_size and vocab_size + 1 of each
# language as its start and end token ids.
print(encoder_pt.vocab_size)
print(encoder_en.vocab_size)

8214
8087


In [67]:
# Scratch check: adding a NumPy scalar to a Python list performs element-wise
# addition rather than list concatenation, so c holds vocab_size + 2.
a = [encoder_pt.vocab_size]
b = np.array([2])
print(b[0])
c = a + b[0]
print(c)

2
[8216]


In [68]:
def encode(lang1, lang2):
  """Tokenize a (Portuguese, English) sentence pair and add start/end ids.

  Each language uses `vocab_size` as its start token and `vocab_size + 1`
  as its end token.

  Args:
    lang1: Portuguese sentence as an eager string tensor.
    lang2: English sentence as an eager string tensor.

  Returns:
    A pair of Python lists of token ids: (portuguese_ids, english_ids).
  """
  pt_start, pt_end = encoder_pt.vocab_size, encoder_pt.vocab_size + 1
  en_start, en_end = encoder_en.vocab_size, encoder_en.vocab_size + 1

  pt_ids = [pt_start, *encoder_pt.encode(lang1.numpy()), pt_end]
  en_ids = [en_start, *encoder_en.encode(lang2.numpy()), en_end]

  return pt_ids, en_ids

In [69]:
def tf_encode(pt, en):
  """Wrap `encode` with tf.py_function so it can be used in Dataset.map.

  Args:
    pt: Portuguese sentence tensor.
    en: English sentence tensor.

  Returns:
    A pair of int64 tensors (portuguese_ids, english_ids), each marked as a
    1-D tensor of unknown length.
  """
  encoded_pair = tf.py_function(encode, [pt, en], [tf.int64, tf.int64])
  result_pt, result_en = encoded_pair

  # py_function outputs carry no static shape; declare them as 1-D tensors.
  for ids in (result_pt, result_en):
    ids.set_shape([None])

  return result_pt, result_en

## Dropping long sentences over 40 tokens 

In [70]:
# Maximum encoded length per sentence; used as the default limit of
# filter_max_length.
MAX_LENGTH = 40

In [71]:
def filter_max_length(x, y, max_length = MAX_LENGTH):
  """Return a boolean tensor that is True when both x and y fit the limit.

  Args:
    x: first encoded sequence.
    y: second encoded sequence.
    max_length: largest allowed number of elements in either sequence.
  """
  x_fits = tf.size(x) <= max_length
  y_fits = tf.size(y) <= max_length
  return tf.logical_and(x_fits, y_fits)

In [72]:
# Training pipeline: tokenize, drop over-long pairs, cache the result in
# memory for faster reads, then shuffle, pad into batches and prefetch.
train_dataset = (
    train
    .map(tf_encode)
    .filter(filter_max_length)
    .cache()
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Validation pipeline: same tokenizing and filtering, padded batches only.
val_dataset = (
    val
    .map(tf_encode)
    .filter(filter_max_length)
    .padded_batch(BATCH_SIZE)
)

In [73]:
# Pull one batch from the validation pipeline to inspect the padded token ids.
pt_batch, en_batch = next(iter(val_dataset))
pt_batch, en_batch

(<tf.Tensor: shape=(64, 38), dtype=int64, numpy=
 array([[8214,  342, 3032, ...,    0,    0,    0],
        [8214,   95,  198, ...,    0,    0,    0],
        [8214, 4479, 7990, ...,    0,    0,    0],
        ...,
        [8214,  584,   12, ...,    0,    0,    0],
        [8214,   59, 1548, ...,    0,    0,    0],
        [8214,  118,   34, ...,    0,    0,    0]])>,
 <tf.Tensor: shape=(64, 40), dtype=int64, numpy=
 array([[8087,   98,   25, ...,    0,    0,    0],
        [8087,   12,   20, ...,    0,    0,    0],
        [8087,   12, 5453, ...,    0,    0,    0],
        ...,
        [8087,   18, 2059, ...,    0,    0,    0],
        [8087,   16, 1436, ...,    0,    0,    0],
        [8087,   15,   57, ...,    0,    0,    0]])>)

## Positional Encoding

In [83]:
def get_angles(pos, i, d_model):
  """Compute positional-encoding angles: pos / 10000^(2 * (i // 2) / d_model).

  Args:
    pos: position index, or an array of indices that broadcasts against `i`.
    i: embedding-dimension index, or an array of indices.
    d_model: size of the embedding dimension.

  Returns:
    The broadcast product of `pos` and the per-dimension angle rates.
  """
  # The base must be 10000, as in the Transformer positional encoding; the
  # previous value of 1000 produced the wrong wavelengths. `i // 2` makes each
  # sin/cos pair (dimensions 2i and 2i+1) share the same rate.
  angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
  return pos * angle_rates

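`np.newaxis` inserts a new axis of length 1 into an array. Below it turns the positions into a column vector and the dimension indices into a row vector, so the two broadcast against each other into a `(position, d_model)` grid.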

In [84]:
def positional_encoding(position, d_model):
  """Build the sinusoidal positional-encoding table.

  Args:
    position: number of positions to encode.
    d_model: size of the embedding dimension.

  Returns:
    A float32 tensor of shape (1, position, d_model).
  """
  positions = np.arange(position)[:, np.newaxis]   # column vector
  dimensions = np.arange(d_model)[np.newaxis, :]   # row vector
  encoding = get_angles(positions, dimensions, d_model)

  # Even columns (2i) take the sine, odd columns (2i+1) take the cosine.
  encoding[:, 0::2] = np.sin(encoding[:, 0::2])
  encoding[:, 1::2] = np.cos(encoding[:, 1::2])

  # Prepend a leading axis of size 1 before converting to a tensor.
  return tf.cast(encoding[np.newaxis, ...], dtype=tf.float32)

In [85]:
# Sanity check: encode 50 positions for a 512-dimensional model and show the shape.
pos_encoding = positional_encoding(50, 512)
print(pos_encoding.shape)

(1, 50, 512)


## Masking

Mask all the pad tokens in the batch of sequences. This ensures that the model does not treat padding as input. The mask indicates where the pad value 0 is present: it outputs a 1 at those locations, and a 0 otherwise.

In [102]:
def create_padding_mask(seq):
  """Return a float mask that is 1 where `seq` equals 0 and 0 elsewhere.

  Args:
    seq: batch of token-id sequences, indexed as (batch, position).

  Returns:
    A float32 tensor with two singleton axes inserted after the batch axis.
  """
  is_padding = tf.math.equal(seq, 0)
  pad_mask = tf.cast(is_padding, tf.float32)

  # The two extra axes let the mask broadcast when it is added to the
  # attention logits.
  return pad_mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

In [103]:
# Example: every 0 in the input becomes a 1 in the resulting mask.
xx = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
create_padding_mask(xx)

<tf.Tensor: shape=(3, 1, 1, 5), dtype=float32, numpy=
array([[[[0., 0., 1., 1., 0.]]],


       [[[0., 0., 0., 1., 1.]]],


       [[[1., 1., 1., 0., 0.]]]], dtype=float32)>

The look-ahead mask is used to mask the future tokens in a sequence. In other words, the mask indicates which entries should not be used.

In [108]:
def create_look_ahead_mask(size):
  """Return a (size, size) mask with 1 above the diagonal and 0 elsewhere.

  Args:
    size: sequence length.
  """
  all_ones = tf.ones((size, size))
  # Keep the lower triangle, including the diagonal; those entries stay visible.
  visible = tf.linalg.band_part(all_ones, -1, 0)
  return 1 - visible  # (seq_len, seq_len)

In [109]:
# Example: build the look-ahead mask for a sequence of length 3.
xx = tf.random.uniform((1, 3))
temp = create_look_ahead_mask(xx.shape[1])
temp  

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[0., 1., 1.],
       [0., 0., 1.],
       [0., 0., 0.]], dtype=float32)>

## Scaled Dot Product attention

In [110]:
def scaled_dot_product_attention(q, k, v, mask):
  """Compute scaled dot-product attention: softmax(q @ k^T / sqrt(d_k)) @ v.

  Args:
    q: query tensor, shape (..., seq_len_q, depth).
    k: key tensor, shape (..., seq_len_k, depth).
    v: value tensor, shape (..., seq_len_k, depth_v).
    mask: float tensor broadcastable to (..., seq_len_q, seq_len_k) holding 1
      at positions that must be ignored, or None to disable masking.

  Returns:
    A tuple (output, attention_weights): output has shape
    (..., seq_len_q, depth_v) and attention_weights has shape
    (..., seq_len_q, seq_len_k).
  """
  # Similarity scores come from queries against KEYS; the earlier code
  # multiplied by the values here, so the keys were never attended to.
  matmul_qk = tf.matmul(q, k, transpose_b = True)  # (..., seq_len_q, seq_len_k)

  # Scale by sqrt(d_k) to keep the logits in a range where softmax is not
  # saturated.
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

  # Masked positions receive a large negative logit so that their softmax
  # weight is effectively zero.
  if mask is not None:
    scaled_attention_logits += (mask * -1e9)

  # Normalize over the key axis so each query's weights sum to 1.
  attention_weights = tf.nn.softmax(scaled_attention_logits, axis = -1)

  output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

  return output, attention_weights

In [111]:
def print_out(q, k, v):
  """Run unmasked attention on (q, k, v) and print the weights and the output."""
  attention_output, attention_weights = scaled_dot_product_attention(q, k, v, None)

  report = (
      ('Attention weights are:', attention_weights),
      ('Output is:', attention_output),
  )
  for heading, tensor in report:
    print(heading)
    print(tensor)

## Multi-head Attention