In [None]:
from __future__ import absolute_import, division, unicode_literals, print_function
try:
  %tensorflow_version 2.x
except exception:
  pass  
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re 
import numpy as np
import os
import io
import time

TensorFlow 2.x selected.


# ABOVE WE DO THE INITIALISATION PROCESS
* IMPORTING TENSORFLOW AND OTHER IMPORTANT LIBRARIES
* AN EXAMPLE OF WHAT MATPLOTLIB CAN DO IS SHOWN BELOW

from matplotlib.ticker import FuncFormatter

import matplotlib.pyplot as plt

import numpy as np

x = np.arange(4)

money = [1.5e5, 2.5e6, 5.5e6, 2.0e7]


def millions(x, pos):

    'The two args are the value and tick position'
    return '$%1.1fM' % (x*1e-6)

formatter = FuncFormatter(millions)

fig, ax = plt.subplots()

ax.yaxis.set_major_formatter(formatter)

plt.bar(x, money)

plt.xticks(x + 0.5, ('Bill', 'Fred', 'Mary', 'Sue'))

plt.show()
![alt text](https://i.stack.imgur.com/q7rse.png)




# PREPARING DATASET
* Downloading the language dataset from http://www.manythings.org/anki/.

**PREPROCESSING STEPS**
* Add a start (`<start>`) and an end (`<end>`) token to each sentence
* Clean the sentences by removing special characters
* Create word_index and reverse word_index (dicts mapping word -> id and id -> word)
* Pad each sentence to the length of the longest sentence





In [None]:
# Download the Spanish-English sentence-pair archive and extract it.
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

# The extracted corpus lives in a "spa-eng" folder next to the zip. Build the
# path with os.path.join rather than concatenating a hard-coded "/" separator.
path_to_file = os.path.join(os.path.dirname(path_to_zip), "spa-eng", "spa.txt")

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [None]:
# Show where the corpus file and the downloaded archive ended up, one per line.
print(path_to_file, path_to_zip, sep='\n')

/root/.keras/datasets/spa-eng/spa.txt
/root/.keras/datasets/spa-eng.zip


EXAMPLE OF SMALL VOCABULARY DEVELOPED FROM RAW DATASET

![alt text](https://miro.medium.com/max/255/1*_Pp0bAv3nZPYHbPFlvO7Hg.png)

In [None]:
def unicode_to_ascii(s):
  """Strip accents from `s`.

  The string is NFD-normalised so that accented letters split into a base
  character plus combining marks; every character whose Unicode category is
  'Mn' (non-spacing mark) is then dropped. Characters without a decomposition
  (e.g. "¿") are returned unchanged.
  """
  decomposed = unicodedata.normalize('NFD', s)
  kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept)

def preprocess_sentence(w):
  """Clean one raw sentence and wrap it in `<start>` / `<end>` markers.

  Steps, in order:
    1. lower-case, trim and strip accents (via `unicode_to_ascii`);
    2. put a space on both sides of each of ? . ! , ¿ so punctuation becomes
       its own token, e.g. "he is a boy." => "he is a boy ."
       (see https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation);
    3. collapse runs of spaces / double quotes into a single space;
    4. replace every run of characters other than letters and ? . ! , ¿ with
       a single space;
    5. trim and add the start / end tokens so the model knows where a
       sentence begins and where to stop.
  """
  text = unicode_to_ascii(w.lower().strip())

  # (pattern, replacement) pairs applied one after another, in this order.
  substitutions = (
      (r"([?.!,¿])", r" \1 "),
      (r'[" "]+', " "),
      (r"[^a-zA-Z?.!,¿]+", " "),
  )
  for pattern, replacement in substitutions:
    text = re.sub(pattern, replacement, text)

  return ' '.join(('<start>', text.strip(), '<end>'))


    

In [None]:
# Sanity check of the cleaning step on one English and one Spanish sentence.
en_sentence = "May I borrow this book?"
sp_sentence = "¿Puedo tomar prestado este libro?"

print(preprocess_sentence(en_sentence))
# Encoded to UTF-8 so the non-ASCII "¿" is displayed as escaped bytes.
print(preprocess_sentence(sp_sentence).encode('utf-8'))

<start> may i borrow this book ? <end>
b'<start> \xc2\xbf puedo tomar prestado este libro ? <end>'


In [None]:
def create_dataset(path, num_examples):
    """Load tab-separated sentence pairs and return them cleaned, grouped by column.

    Each line of the file holds the same sentence in several languages
    separated by tabs (for this corpus: ENGLISH <tab> SPANISH). Every field is
    cleaned with `preprocess_sentence` (accent removal, punctuation spacing,
    start/end tokens).

    Args:
        path: path of the UTF-8 text file to read.
        num_examples: number of leading lines to use; None uses every line.

    Returns:
        A zip iterator yielding one tuple per column, so
        `en, sp = create_dataset(...)` gives all English sentences and all
        Spanish sentences as two tuples.
    """
    # Read the whole file and split it into lines. The context manager closes
    # the handle deterministically (it used to be left open).
    with io.open(path, encoding='UTF-8') as corpus_file:
        lines = corpus_file.read().strip().split('\n')

    # One cleaned [sentence, sentence, ...] list per line.
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]
                  for l in lines[:num_examples]]

    # Transpose rows into columns: the i-th tuple holds the i-th field of every line.
    return zip(*word_pairs)

In [None]:
# Load every sentence pair (num_examples=None) and peek at pair number 100.
en, sp = create_dataset(path_to_file, num_examples=None)
print(en[100], sp[100], sep='\n')

<start> go away ! <end>
<start> salga de aqui ! <end>


In [None]:
def max_length(tensor):
  """Return the length of the longest element of `tensor`.

  Used to find the longest (padded) sentence in a dataset; that length is the
  threshold shorter sentences are post-padded to.
  """
  return max(map(len, tensor))

# TOKENIZATION 
* First, every word in the dataset is mapped to an integer id (a word -> value pair), because raw text cannot be fed directly to a neural network. These ids are known as 'integer tokens'; the mapping is built by the `tokenize` function below.
* Second, these integer tokens are converted into word vectors of floating-point numbers by the embedding layer.
![alt text](https://freecontent.manning.com/wp-content/uploads/Chollet_DLfT_01.png)



In [None]:
def tokenize(language):
  """Turn a collection of cleaned sentences into a padded integer matrix.

  Args:
    language: iterable of sentence strings (already preprocessed).

  Returns:
    (tensor, language_tokenizer): the post-padded array of word ids, one row
    per sentence, and the fitted Keras Tokenizer holding the vocabulary.
  """
  # filters='' switches off the Tokenizer's character filtering, so punctuation
  # tokens and the <start> / <end> markers are kept as vocabulary entries.
  language_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')

  # Build the vocabulary from the sentences, e.g. for "The cat sat on the mat."
  # something like word_index["the"] = 1, word_index["cat"] = 2, ...
  # (index 0 is reserved for padding).
  language_tokenizer.fit_on_texts(language)

  # Replace every word by its integer id.
  sequences = language_tokenizer.texts_to_sequences(language)

  # Pad at the end of each sequence; with no maxlen given, the longest
  # sequence sets the common length.
  padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')

  return padded, language_tokenizer


In [None]:
def load_dataset(path,num_examples=None):
  """Load the corpus at `path` and tokenize both languages.

  The first column returned by `create_dataset` is treated as the target
  language and the second as the input language.

  Returns:
    (input_tensor, target_tensor, input_language_tokenizer,
     target_language_tokenizer)
  """
  # Cleaned sentence columns: first the target side, then the input side.
  target_sentences, input_sentences = create_dataset(path, num_examples)

  # Tokenize the input side first, then the target side.
  input_tensor, input_language_tokenizer = tokenize(input_sentences)
  target_tensor, target_language_tokenizer = tokenize(target_sentences)

  return (input_tensor, target_tensor,
          input_language_tokenizer, target_language_tokenizer)

# TRAIN/VALIDATION SPLIT
* Training on the complete dataset: up to *122936* sentence pairs are requested, and the corpus yields 118,964 of them (trying to push its limits)
* Train/validation split with an 80-20 ratio



In [None]:
# Upper bound on the number of sentence pairs to load.
num_examples = 122936
# NOTE: despite their names, `input_language` and `target_language` hold the
# fitted tokenizers returned by load_dataset, not the raw sentences.
input_tensor , target_tensor, input_language, target_language = load_dataset(path_to_file, num_examples)

# Longest (padded) sequence length on the target and on the input side.
max_length_target, max_length_input = max_length(target_tensor), max_length(input_tensor)

# 80-20 split into training and validation sets. A fixed random_state makes the
# split, and therefore the example rows printed later, reproducible across runs.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42)

In [None]:
# Sizes of the four splits: input train, input val, target train, target val.
print(*(len(split) for split in (input_tensor_train, input_tensor_val,
                                 target_tensor_train, target_tensor_val)))

95171 23793 95171 23793


**SHOWING INDEX TO WORD MAPPINGS**

In [None]:
def convert(language,tensor):
  """Print the "id ----> word" mapping for every non-padding id in `tensor`.

  Args:
    language: object exposing an `index_word` mapping from id to word.
    tensor: iterable of integer word ids; zeros are skipped.
  """
  for token_id in tensor:
    if token_id == 0:
      continue
    print("%d ----> %s" % (token_id, language.index_word[token_id]))

In [None]:
# Show the id -> word mapping of training pair number 20 on both sides.
print("Input Language: index to word mapping")
convert(input_language, input_tensor_train[20])

# The leading newline prints the blank separator line between the two listings.
print("\ntarget language: index to word mapping")
convert(target_language, target_tensor_train[20])

Input Language: index to word mapping
1 ----> <start>
8 ----> no
17 ----> me
8347 ----> interesaba
3 ----> .
2 ----> <end>

target language: index to word mapping
1 ----> <start>
4 ----> i
253 ----> wasn
12 ----> t
681 ----> interested
16 ----> in
14 ----> it
3 ----> .
2 ----> <end>


# CREATING DATASET BATCH
* Word embeddings visualisation
![alt text](https://miro.medium.com/max/990/1*Fat62b1ZITOFMPXTcHNkLw.jpeg)




In [None]:
# Shuffle buffer size. shuffle() keeps a buffer of this many samples and draws
# randomly from it, refilling with the next element each time; making it as
# large as the training set shuffles across all examples.
BUFFER_SIZE = len(input_tensor_train)

# Mini-batch size. One batch covering the whole training set would amount to
# plain gradient descent (accurate but very slow per step), while a batch of 1
# is SGD (fast steps, but the loss jumps around).
BATCH_SIZE = 128

# Number of full batches in one pass over the training set (floor division).
steps_per_epoch = len(input_tensor_train) // BATCH_SIZE

# Model sizes: embedding vector width and number of GRU units.
embedding_dim = 256
units = 1024

# Vocabulary sizes; +1 because index 0 is reserved for padding.
vocab_input_size = len(input_language.word_index) + 1
vocab_target_size = len(target_language.word_index) + 1

# tf.data input pipeline: from_tensor_slices yields one (input, target) pair per
# training row, which is then shuffled and grouped into batches.
# drop_remainder=True discards the final partial batch so that every batch has
# exactly BATCH_SIZE rows.
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# Pull a single batch from the pipeline; it is reused below as sample input
# when checking the encoder, attention and decoder output shapes.
example_input_batch, example_target_batch = next(iter(dataset))
# Last expression of the cell: displays the (input, target) batch shapes.
example_input_batch.shape, example_target_batch.shape

(TensorShape([128, 53]), TensorShape([128, 51]))

# **MODEL ARCHITECTURE**
**INSPIRED BY NEURAL MACHINE TRANSLATION
BY JOINTLY LEARNING TO ALIGN AND TRANSLATE(D.Bahdanau)**


**BIDIRECTIONAL RECURRENT NEURAL NETWORK ARE USED HERE**
![alt text](https://miro.medium.com/max/764/1*6QnPUSv_t9BY9Fv8_aLb-Q.png)

**GRU CELLS**

![alt text](https://blog.floydhub.com/content/images/2019/07/image14.jpg)


**ENCODER-DECODER MODEL WITH ATTENTION MECHANISM IS IMPLEMENTED**
![alt text](https://devblogs.nvidia.com/wp-content/uploads/2015/07/Figure5_attention_3.png)


# **ENCODER CLASS**
* The input is put through an encoder model which gives us the encoder output of shape (batch_size, max_length, hidden_size) and the encoder hidden state of shape (batch_size, hidden_size).
* Batch_size = 128, embedding_dim = 256, encoder_units = 1024

INTUITION FOR ENCODER MODEL
![alt text](https://miro.medium.com/max/3081/1*xd8j4KoKRSzRq0b1Vx0FAA.png)







In [None]:
#defining our encoders architecture
class Encoder(tf.keras.Model):
  """Embedding + GRU encoder.

  Args:
    vocab_size: number of rows of the embedding table (input vocabulary size).
    embeddings_dim: width of each embedding vector.
    encoder_units: number of GRU units (size of the hidden state).
    batch_size: batch size used to build the initial hidden state.
  """

  def __init__(self, vocab_size, embeddings_dim, encoder_units, batch_size):
    super(Encoder, self).__init__()
    self.batch_size = batch_size
    # Use the constructor argument. The previous code read the notebook-level
    # `embedding_dim` global here, silently ignoring `embeddings_dim`.
    self.embedding = tf.keras.layers.Embedding(vocab_size, embeddings_dim)
    self.encoder_units = encoder_units
    # return_sequences gives the output at every time step (needed by the
    # attention layer); return_state additionally returns the final state.
    self.gru = tf.keras.layers.GRU(self.encoder_units,
                                  return_sequences = True,
                                  return_state = True,
                                  recurrent_initializer = 'glorot_uniform')

  def call(self, x, hidden):
    """Encode a batch of word ids.

    Args:
      x: integer word ids, one row per sentence.
      hidden: initial GRU state, e.g. from `initialize_hidden_state()`.

    Returns:
      (output, state): the GRU output for every time step and the final state.
    """
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)
    return output,state

  def initialize_hidden_state(self):
    """Return an all-zero initial state of shape (batch_size, encoder_units)."""
    return tf.zeros((self.batch_size, self.encoder_units))



In [None]:
encoder = Encoder(vocab_input_size, embedding_dim, units, BATCH_SIZE)

# Run the example batch through the encoder, starting from a zero state, and
# report the resulting shapes.
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print(f'Encoder output shape: (batch size, sequence length, units) {sample_output.shape}')
print(f'Encoder Hidden state shape: (batch size, units) {sample_hidden.shape}')

Encoder output shape: (batch size, sequence length, units) (128, 53, 1024)
Encoder Hidden state shape: (batch size, units) (128, 1024)


# ATTENTION MECHANISM
**USING BAHDANAU ATTENTION**
![alt text](https://jscriptcoder.github.io/date-translator/attn_mechanism.png)
![alt text](https://www.tensorflow.org/images/seq2seq/attention_equation_0.jpg)![alt text](https://www.tensorflow.org/images/seq2seq/attention_equation_1.jpg)

In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
  """Additive (Bahdanau) attention.

  score   = V(tanh(W1(values) + W2(query)))
  weights = softmax(score) over the time axis
  context = sum over time of weights * values

  Args:
    units: width of the intermediate projections W1 and W2.
  """

  def __init__(self,units):
    super(BahdanauAttention,self).__init__()

    # W1 projects the encoder outputs, W2 projects the previous hidden state,
    # V is the feed-forward layer that reduces each time step to one score.
    self.W1 =tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    """Compute the context vector and the attention weights.

    Args:
      query: previous hidden state, shape (batch_size, hidden_size).
      values: encoder outputs, shape (batch_size, max_length, hidden_size).

    Returns:
      (context_vector, attention_weights) with shapes
      (batch_size, hidden_size) and (batch_size, max_length, 1).
    """
    # Add a time axis so the query broadcasts against every encoder time step:
    # (batch_size, hidden_size) -> (batch_size, 1, hidden_size).
    hidden_with_time_axis = tf.expand_dims(query, 1)

    # The tensor fed to V has shape (batch_size, max_length, units); V maps it
    # to one score per time step: (batch_size, max_length, 1).
    score = self.V(tf.nn.tanh(self.W1(values)+self.W2(hidden_with_time_axis)))

    # Normalise over the time axis (axis=1): (batch_size, max_length, 1).
    attention_weights = tf.nn.softmax(score,axis=1)

    # Weighted sum of the ENCODER OUTPUTS over the time axis. The weights must
    # multiply `values`; multiplying them by `score` (as before) collapsed the
    # context to shape (batch_size, 1) and discarded the encoder information.
    context_vector = attention_weights*values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

     
    

In [None]:
# Try the attention layer (10 projection units) on the sample encoder results:
# the hidden state is the query, the per-step encoder outputs are the values.
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print(f"Attention result shape: (batch size, units) {attention_result.shape}")
print(f"Attention weights shape: (batch_size, sequence_length, 1) {attention_weights.shape}")

Attention result shape: (batch size, units) (128, 1)
Attention weights shape: (batch_size, sequence_length, 1) (128, 53, 1)


# **DECODER CLASS**
* At each time step, the previous output, the hidden state and the context vector are the INPUT TO THE DECODER
* The output is the predicted translated word
* Batch_size = 128, decoder_units = 1024, embedding_dim = 256

INTUITION FOR DECODER MODEL
![alt text](https://miro.medium.com/max/1516/1*GwKpF9yMipPWuruXoTWKPQ.png)




In [None]:
class Decoder(tf.keras.Model):
  """Single-step GRU decoder with Bahdanau attention.

  Same building blocks as the encoder (embedding + GRU) plus a dense layer
  that maps the GRU output to one logit per vocabulary word, and an attention
  layer over the encoder outputs.

  Args:
    vocab_size: target vocabulary size (embedding rows and output logits).
    embedding_dim: width of each embedding vector.
    decoder_units: number of GRU units; also the attention projection width.
    batch_sz: batch size (stored on the instance).
  """

  def __init__(self, vocab_size, embedding_dim, decoder_units, batch_sz):
    super(Decoder, self).__init__()

    self.batch_sz = batch_sz
    self.decoder_units = decoder_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(
        self.decoder_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

    # Attention over the encoder outputs.
    self.attention = BahdanauAttention(self.decoder_units)

  def call(self, x, hidden, encoder_output):
    """Run one decoding step.

    Args:
      x: ids of the previous target word, shape (batch_size, 1).
      hidden: previous hidden state, used as the attention query.
      encoder_output: encoder outputs, (batch_size, max_length, hidden).

    Returns:
      (logits, state, attention_weights): logits of shape
      (batch_size, vocab), the new GRU state, and the attention weights.
    """
    context_vector, attention_weights = self.attention(hidden, encoder_output)

    # (batch_size, 1) ids -> (batch_size, 1, embedding_dim)
    embedded = self.embedding(x)

    # Give the context a time axis and join it with the embedded word along
    # the feature axis; this is the GRU input for the current step.
    context_step = tf.expand_dims(context_vector, 1)
    gru_input = tf.concat([context_step, embedded], axis=-1)

    # NOTE(review): `hidden` only feeds the attention query; the GRU itself is
    # called without an initial_state - confirm this is intended.
    output, state = self.gru(gru_input)

    # Drop the length-1 time axis: (batch_size * 1, hidden_size)
    flat_output = tf.reshape(output, (-1, output.shape[2]))

    # One logit per vocabulary word: (batch_size, vocab)
    logits = self.fc(flat_output)

    return logits, state, attention_weights

        
  

In [None]:
decoder = Decoder(vocab_target_size, embedding_dim, units, BATCH_SIZE)

# Dummy decoder input of shape (BATCH_SIZE, 1). Its batch dimension has to match
# sample_hidden / sample_output, so derive it from BATCH_SIZE instead of
# hard-coding 128.
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (128, 12934)


# OPTIMIZER AND LOSS FUNCTION
**Adam OPTIMIZER**
![alt text](https://miro.medium.com/max/3272/1*YJCqitHcljUpCGf058WOIw.png)
![alt text](https://blog.paperspace.com/content/images/2018/06/adam.png)


**CROSS-ENTROPY LOSS FUNCTION IS USED**
![alt text](https://miro.medium.com/max/778/1*JZ-qea3BYaGOT4Vdhds9mQ.png)


In [None]:
# Adam optimizer with its default settings.
optimizer = tf.keras.optimizers.Adam()

# Sparse categorical cross-entropy: targets are integer word ids rather than
# one-hot vectors. from_logits=True because the decoder outputs raw logits;
# reduction='none' keeps one loss value per example so padding can be masked
# out before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                            reduction='none')

def loss_function(real, predicted):
  """Mean cross-entropy for one time step, with padding positions zeroed.

  Args:
    real: true word ids for this step; id 0 marks padding.
    predicted: decoder logits for this step.

  Returns:
    Scalar tensor: the mean over the batch of the per-example loss, where the
    entries belonging to padding have been set to zero.
  """
  per_example_loss = loss_object(real, predicted)

  # 1.0 where the target is a real word, 0.0 where it is padding (id 0).
  is_real_word = tf.math.not_equal(real, 0)
  per_example_loss = per_example_loss * tf.cast(is_real_word,
                                                dtype=per_example_loss.dtype)

  return tf.reduce_mean(per_example_loss)

Checkpoints (object-based saving)

In [None]:
# Checkpoints are written to ./training_checkpoints with the file prefix "ckpt".
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

# Track the optimizer together with both models so that all three are saved
# and restored as one unit.
checkpoint = tf.train.Checkpoint(
    optimizer=optimizer,
    encoder=encoder,
    decoder=decoder,
)

# TRAINING THE MODEL
NOTE: FOR A MORE ROBUST MODEL, THE DECODER IS TRAINED WITH THE TEACHER FORCING ALGORITHM.
Teacher forcing works by using the actual or expected output from the training dataset at the current time step y(t) as input in the next time step X(t+1), rather than the output generated by the network.
![alt text](https://i.ytimg.com/vi/fAQ-yV__168/maxresdefault.jpg)

In [None]:
@tf.function
def train_step(inp, target, encoder_hidden):
  """Run one optimisation step on a single batch using teacher forcing.

  Args:
    inp: batch of input-language word ids.
    target: batch of target-language word ids; column 0 is expected to hold
      the <start> token, since decoding starts from <start> and the loss is
      computed from column 1 onwards.
    encoder_hidden: initial hidden state for the encoder.

  Returns:
    The summed per-step loss divided by the target sequence length.
  """
  loss = 0

  # Record the forward pass so gradients can be taken afterwards.
  with tf.GradientTape() as tape:
    encoder_output, encoder_hidden = encoder(inp, encoder_hidden)

    # The decoder starts from the encoder's final hidden state.
    decoder_hidden = encoder_hidden

    # First decoder input: the <start> token for every row, shape (BATCH_SIZE, 1).
    decoder_input = tf.expand_dims([target_language.word_index['<start>']] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, target.shape[1]):
      # passing enc_output to the decoder
      predictions, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_output)

      # Accumulate the loss of predicting the t-th target word.
      loss += loss_function(target[:, t], predictions)

      # using teacher forcing
      decoder_input = tf.expand_dims(target[:, t], 1)

  # Loss reported to the caller: total loss averaged over the sequence length.
  batch_loss = (loss / int(target.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  # Gradients are taken of the summed loss, not of the averaged batch_loss.
  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [None]:
EPOCHS = 100

for epoch in range(EPOCHS):
  start = time.time()

  # Each epoch starts from a fresh zero encoder state and a zero running loss.
  encoder_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for batch, (inp, target) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, target, encoder_hidden)
    total_loss += batch_loss

    # Progress report every 100 batches.
    if batch % 100 == 0:
      print(f'Epoch {epoch + 1} Batch {batch} Loss {batch_loss.numpy():.4f}')

  # Save a checkpoint of the model every 10 epochs.
  if (epoch + 1) % 10 == 0:
    checkpoint.save(file_prefix=checkpoint_prefix)

  # Epoch summary: mean batch loss and wall-clock time.
  print(f'Epoch {epoch + 1} Loss {total_loss / steps_per_epoch:.4f}')
  print(f'Time taken for 1 epoch {time.time() - start} sec\n')

Epoch 1 Batch 0 Loss 1.5576


# EVALUATION AND RESULTS
FUNCTION FOR TESTING THE MODEL ON USER-SUPPLIED INPUT

In [None]:
def evaluate(sentence):
    """Translate one raw sentence with greedy decoding.

    Args:
        sentence: raw input-language sentence (cleaned here with
            `preprocess_sentence`).

    Returns:
        (result, sentence, attention_plot): the predicted words joined by
        spaces (with a trailing space), the cleaned input sentence, and an
        array of shape (max_length_target, max_length_input) whose row t holds
        the attention weights used for the t-th predicted word.

    Raises:
        KeyError: if the cleaned sentence contains a word that is not in the
            input vocabulary.
    """
    # Use the names the notebook actually defines (max_length_target /
    # max_length_input); the previous max_length_targ / max_length_inp were
    # never assigned and raised NameError.
    attention_plot = np.zeros((max_length_target, max_length_input))

    sentence = preprocess_sentence(sentence)

    # Words -> ids, padded at the end to the training input length.
    inputs = [input_language.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                           maxlen=max_length_input,
                                                           padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    # Batch of one sentence, so the initial encoder state has a single row.
    hidden = [tf.zeros((1, units))]
    encoder_output, encoder_hidden = encoder(inputs, hidden)

    decoder_hidden = encoder_hidden
    decoder_input = tf.expand_dims([target_language.word_index['<start>']], 0)

    for t in range(max_length_target):
        predictions, decoder_hidden, attention_weights = decoder(decoder_input,
                                                             decoder_hidden,
                                                             encoder_output)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        # Greedy choice: the word with the highest logit.
        predicted_id = tf.argmax(predictions[0]).numpy()

        result += target_language.index_word[predicted_id] + ' '

        # Stop as soon as the model emits the end token.
        if target_language.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot

        # the predicted ID is fed back into the model
        decoder_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

In [None]:
# function for plotting the attention weights
def plot_attention(attention, sentence, predicted_sentence):
    """Show the attention matrix as a heat map labelled with the words.

    Args:
        attention: 2-D array with one row per predicted word and one column
            per input word.
        sentence: sequence of input words (column labels).
        predicted_sentence: sequence of predicted words (row labels).
    """
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    fontdict = {'fontsize': 14}

    # Pin one tick per cell before assigning labels. The previous version set
    # the labels first and relied on a later MultipleLocator producing an extra
    # tick at -1 (hence the leading '' label), which is fragile and triggers a
    # matplotlib warning about fixed labels without fixed tick positions.
    ax.set_xticks(range(len(sentence)))
    ax.set_yticks(range(len(predicted_sentence)))
    ax.set_xticklabels(list(sentence), fontdict=fontdict, rotation=90)
    ax.set_yticklabels(list(predicted_sentence), fontdict=fontdict)

    plt.show()

In [None]:
def translate(sentence):
    """Translate `sentence`, print the result and plot the attention weights.

    Prints the cleaned input and the predicted translation, then crops the
    attention matrix to the number of predicted / input tokens and hands it to
    `plot_attention` together with the two token lists.
    """
    result, sentence, attention_plot = evaluate(sentence)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

    # Token lists used both for cropping the matrix and for labelling the axes.
    input_tokens = sentence.split(' ')
    predicted_tokens = result.split(' ')

    cropped_attention = attention_plot[:len(predicted_tokens), :len(input_tokens)]
    plot_attention(cropped_attention, input_tokens, predicted_tokens)

# TESTING

In [None]:
# restoring the latest checkpoint in checkpoint_dir
# (the training loop saves one there every 10 epochs, so this restores the
# encoder, decoder and optimizer state from the most recent save)
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))