## Neural Machine Translation

In [1]:
import os
os.environ['TF_CCP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import tensorflow as tf
from collections import Counter
from utils import (sentences, train_data, val_data, english_vectorizer, french_vectorizer,
                   masked_loss, masked_acc, tokens_to_text)

### Lets look at a random sentence and its equivalent translation

In [2]:
french_sentences, english_sentences = sentences

print(f"English (to translate) sentence:\n\n{english_sentences[-5]}\n")
print(f"French (translation) sentence:\n\n{french_sentences[-5]}\n")

English (to translate) sentence:

"Top-down economics never works," said Obama. "The country does not succeed when just those at the very top are doing well. We succeed when the middle class gets bigger, when it feels greater security."

French (translation) sentence:

« L'économie en partant du haut vers le bas, ça ne marche jamais, » a dit Obama. « Le pays ne réussit pas lorsque seulement ceux qui sont au sommet s'en sortent bien. Nous réussissons lorsque la classe moyenne s'élargit, lorsqu'elle se sent davantage en sécurité. »



In [3]:
del english_sentences
del french_sentences
del sentences

### Vocabulary

In [4]:
print(f"First 10 words of the English vocabulary:\n\n{english_vectorizer.get_vocabulary()[:10]}\n")
print(f"First 10 words of the French vocabulary:\n\n{french_vectorizer.get_vocabulary()[:10]}\n")

First 10 words of the English vocabulary:

['', '[UNK]', '[SOS]', '[EOS]', '.', 'i', 'you', 'to', 'the', '?']

First 10 words of the French vocabulary:

['', '[UNK]', '[SOS]', '[EOS]', '.', 'je', 'de', 'a', '?', 'pas']



In [5]:
vocab_size = french_vectorizer.vocabulary_size()

print(f"The French vocabulary is made up of {vocab_size} words.")

The French vocabulary is made up of 12000 words.


In [6]:
# helpers to convert words to ids and vice-versa
word_to_id = tf.keras.layers.StringLookup(
    vocabulary=french_vectorizer.get_vocabulary(),
    mask_token="",
    oov_token="[UNK]"
)

id_to_word = tf.keras.layers.StringLookup(
    vocabulary=french_vectorizer.get_vocabulary(),
    mask_token="",
    oov_token="[UNK]",
    invert=True
)

#### Trying out the above functions to some special words

In [7]:
unk_id = word_to_id("[UNK]")
sos_id = word_to_id("[SOS]")
eos_id = word_to_id("[EOS]")
lorsque_id = word_to_id("lorsque")

print(f"The id for the [UNK] token is {unk_id}")
print(f"The id for the [SOS] token is {sos_id}")
print(f"The id for the [EOS] token is {eos_id}")
print(f"The id for the lorsque (when) token is {lorsque_id}")

The id for the [UNK] token is 1
The id for the [SOS] token is 2
The id for the [EOS] token is 3
The id for the lorsque (when) token is 301


Now lets take a look on the actual tokenized data to be fed into the neural networks

In [10]:
for (to_translate, sr_translation), translation in train_data.take(1):
    print(f"Tokenized English sentence:\n{to_translate[0, :].numpy()}\n\n")
    print(f"Tokenized French shifted right sentence:\n{sr_translation[0, :].numpy()}\n\n")
    print(f"Tokenized French output sentence:\n{translation[0, :].numpy()}\n\n")

Tokenized English sentence:
[  2  17 252  20  10 147  55   7  61 254   7   8 778   4   3   0   0]


Tokenized French shifted right sentence:
[   2 5229   47  647   42  219   19  691  218    4    0    0    0    0
    0    0    0    0]


Tokenized French output sentence:
[5229   47  647   42  219   19  691  218    4    3    0    0    0    0
    0    0    0    0]




### NMT Model with Attention

In [11]:
VOCAB_SIZE = 12000
UNITS = 256

#### Encoder

In [12]:
# class encoder inherits from tensorflow keras layers class
class Encoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, units):
        """
        vocab_size : size of the vocabulary
        units : Number of units in the LSTM Layer
        
        """
        super(Encoder, self).__init__()
        
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=units,
            mask_zero=True
        )
        
        self.rnn = tf.keras.layers.Bidirectional(
            merge_mode='sum',
            layer=tf.keras.layers.LSTM(
                units=units,
                return_sequences=True
            ),
        )
        
    def call(self, context):
        """
        context : The sentence to translate
        """
        # passing the context through embedding layer
        x = self.embedding(context)
        
        # passing the embedding layer through rnn
        x = self.rnn(x)
        
        return x

In [13]:
# creating an instance of the Encoder class
encoder = Encoder(VOCAB_SIZE, UNITS)

# passing the english (context) sentence to the encoder
encoder_output = encoder(to_translate)

print(f"Tensor of sentences in english has shape: {to_translate.shape}\n")
print(f"Encoder output has shape: {encoder_output.shape}\n")

Tensor of sentences in english has shape: (64, 17)

Encoder output has shape: (64, 17, 256)



### CrossAttention

In [14]:
#creating a cross-attention class
class CrossAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        """
        units : number of units in the LSTM Layer
        """
        super().__init__()
        
        self.mha = (
            tf.keras.layers.MultiHeadAttention(
                key_dim=units,
                num_heads=1
            )
        )
        
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()
        
    def call(self, context, target):
        """
        context : The sentence to translate
        target : The equivalent shifted right translation
        """
        # calling the multi-head attention by passing query and key value
        attn_output = self.mha(
            query=target,
            value=context
        )
            
        x = self.add([target, attn_output])
            
        x = self.layernorm(x)
            
        return x

In [15]:
# instanciating cross-attention class
attention_layer = CrossAttention(UNITS)

# embedding the shifted-right translation
sr_translation_embedded = tf.keras.layers.Embedding(VOCAB_SIZE, output_dim=UNITS, mask_zero=True)(sr_translation)

# finally getting the attention scores
attention_result = attention_layer(encoder_output, sr_translation_embedded)

print(f"Tensor of context has shape: {encoder_output.shape}\n")
print(f"Tensor of sentences in french (shifted-right) has shape: {sr_translation_embedded.shape}\n")
print(f"Tensor of attention scores has shape: {attention_result.shape}\n")

Tensor of context has shape: (64, 17, 256)

Tensor of sentences in french (shifted-right) has shape: (64, 18, 256)

Tensor of attention scores has shape: (64, 18, 256)



### Decoder

In [16]:
# creating a class Decoder
class Decoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, units):
        """
        vocab_size : size of the vocabulary
        units : Number of units in the LSTM Layer
        """
        super(Decoder, self).__init__()
        
        # The embedding layer
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=units,
            mask_zero=True
        )
        
        # The pre-attention RNN
        self.pre_attn_rnn = tf.keras.layers.LSTM(
            units=units,
            return_sequences=True,
            return_state=True
        )
        
        # The attention layer
        self.attention = CrossAttention(units)
        
        # The post-attention RNN
        self.post_attn_rnn = tf.keras.layers.LSTM(
            units=units,
            return_sequences=True
        )
        
        # The final dense layer
        self.output_layer = tf.keras.layers.Dense(
            units=vocab_size,
            activation=tf.nn.log_softmax
        )
        
    def call(self, context, target, state=None, return_state=False):
        """
        context : The sentence to translate
        target : The equivalent shifted right translation
        state : Hidden states of the pre-attention LSTM
        return_state : Set it to true if want to return the hidden states
        """
        # embedding of the input (target)
        x = self.embedding(target)
        
        # embedded input through pre-attention LSTM
        x, hidden_state, cell_state = self.pre_attn_rnn(x, initial_state=state)
        
        # performing cross-attention between the context and the target
        x = self.attention(context, x)
        
        # passing the attention through post-attention LSTM
        x = self.post_attn_rnn(x)
        
        # finally the logits
        logits = self.output_layer(x)
        
        if return_state:
            return logits, [hidden_state, cell_state]
        
        return logits

In [17]:
# instanciating the decoder class
decoder = Decoder(VOCAB_SIZE, UNITS)

# computing the logits from the decoder
logits = decoder(encoder_output, sr_translation)

print(f"The context has shape: {encoder_output.shape}")
print(f"The target (shifted-right) has shape: {sr_translation.shape}")
print(f"The logits has shape: {logits.shape}")

The context has shape: (64, 17, 256)
The target (shifted-right) has shape: (64, 18)
The logits has shape: (64, 18, 12000)


### Translator

In [18]:
# creating the final Translator class
class Translator(tf.keras.Model):
    def __init__(self, vocab_size, units):
        """
        vocab_size : size of the vocabulary
        units : Number of units in the LSTM Layer
        """
        super().__init__()
        
        # defining the encoder with vocab size and units
        self.encoder = Encoder(vocab_size, units)
        
        # defining the decoder with vocab size and units
        self.decoder = Decoder(vocab_size, units)
        
    def call(self, inputs):
        """
        inputs : Tuple containing context and target (shifted-right)
        """
        # unpacking the inputs tuple
        context, target = inputs
        
        # passing the context through the encoder
        encoded_context = self.encoder(context)
        
        # getting the logits by passing encoded context and target to the decoder
        logits = self.decoder(encoded_context, target)
        
        return logits

In [19]:
# instanciating Translator class
translator = Translator(VOCAB_SIZE, UNITS)

# computing the logits for every word in vocabulary
logits = translator((to_translate, sr_translation))

print(f"The context has shape: {to_translate.shape}")
print(f"The target (shifted-right) has shape: {sr_translation.shape}")
print(f"The logits of translator output has shape: {logits.shape}")

The context has shape: (64, 17)
The target (shifted-right) has shape: (64, 18)
The logits of translator output has shape: (64, 18, 12000)


### Training

In [30]:
def compile_and_train(model, epochs=20, steps_per_epoch=500):
    model.compile(optimizer="adam",
                  loss=masked_loss, metrics=[masked_acc, masked_loss])
    
    history = model.fit(
        train_data.repeat(),
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        validation_data=val_data,
        validation_steps=50,
        callbacks=[tf.keras.callbacks.EarlyStopping(patience=3)]
    )
    
    model.save_weights("NMT_model.h5")
    
    return model, history

In [31]:
# Training the translator

trained_translator, history = compile_and_train(translator)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [33]:
trained_translator.save("NMT_Model_trained")



INFO:tensorflow:Assets written to: NMT_Model_trained\assets


INFO:tensorflow:Assets written to: NMT_Model_trained\assets


In [20]:
translator.summary()

Model: "translator"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_1 (Encoder)         multiple                  4122624   
                                                                 
 decoder_1 (Decoder)         multiple                  7470304   
                                                                 
Total params: 11,592,928
Trainable params: 11,592,928
Non-trainable params: 0
_________________________________________________________________


In [19]:
translator.load_weights("NMT_model.h5")

The shape of weights of the last log_softmax layer

In [20]:
translator.layers[-1].get_weights()[-1].shape

(12000,)

In [21]:
def resume_training(model, epochs=2, steps_per_epoch=500):
    model.compile(optimizer="adam",
                  loss=masked_loss, metrics=[masked_acc, masked_loss])
    
    history = model.fit(
        train_data.repeat(),
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        validation_data=val_data,
        validation_steps=50,
        callbacks=[tf.keras.callbacks.EarlyStopping(patience=3)]
    )
    
    model.save_weights("NMT_model_chkpt_1.h5")
    
    return model, history

In [22]:
trained_translator, history = resume_training(translator)

Epoch 1/2
Epoch 2/2


### Using the model for inference

Loading the trained model weights

In [21]:
translator.load_weights("NMT_model_chkpt_1.h5")

In [22]:
def generate_next_token(decoder, context, next_token, done, state, temperature=0.0):
    """
    decoder : The decoder
    context : Encoded sentence to translate
    next_token : The predicted next_token
    done : True if translation is complete
    state : Hidden states of the pre-attention LSTM layer
    temperature : Controls randomness of the predicted tokens
    """
    # getting the logits and state from the decoder
    logits, state = decoder(context, next_token, state=state, return_state=True)
    
    # getting the last predicted logit
    logits = logits[:, -1, :]
    
    # if temperature is 0.0 take the argmax of logits
    if temperature == 0.0:
        next_token = tf.argmax(logits, axis=-1)
        
    # if temperature is not 0.0 take the next_token sampled out of logits
    else:
        logits = logits / temperature
        next_token = tf.random.categorical(logits, num_samples=1)
        
    # trimming the dimension of size 1
    logits = tf.squeeze(logits)
    next_token = tf.squeeze(next_token)
    
    # getting the logit of the selected next_token
    logit = logits[next_token].numpy()
    
    # reshaping the next token to (1, 1)
    next_token = tf.reshape(next_token, shape=(1, 1))
    
    # if the next token is "eos" setting done to true
    if next_token == eos_id:
        done = True
        
    return next_token, logit, state, done

In [29]:
# A sentence to translate
eng_sentence = "I love languages"

# Converting the sentence to a tensor
texts = tf.convert_to_tensor(eng_sentence)[tf.newaxis]

# vectorizing it and passing it through the encoder
context = english_vectorizer(texts).to_tensor()
context = encoder(context)

# The first token should be "SOS"
next_token = tf.fill((1, 1), sos_id)

# Hidden and cell states of the LSTM mocked using uniform samples
state = [tf.random.uniform((1, UNITS)), tf.random.uniform((1, UNITS))]

# Not done until the next token is "EOS"
done = False

# Generating next token
next_token, logit, state, done = generate_next_token(decoder, context, next_token, done, state, temperature=0.5)

print(f"Next token: {next_token}\nLogit: {logit:.4f}\nDone? {done}")

Next token: [[9980]]
Logit: -18.9135
Done? False


### Translate

In [31]:
# Function to perform translation
def translate(model, text, max_length=50, temperature=0.0):
    """
    model : The trained translator
    text : The sentence to translate
    max_length : The maximum length of the translation
    temperature : Controls randomness of the predicted tokens
    """
    # list to save tokens and logits
    tokens, logits = [], []
    
    # converting the original text to a tensor
    text = tf.convert_to_tensor(text)[tf.newaxis]
    
    # vectorizing the text
    context = english_vectorizer(text).to_tensor()
    
    # passing through encoder
    context = model.encoder(context)
    
    # The first token should be "SOS"
    next_token = tf.fill((1, 1), sos_id)
    
    # # Hidden and cell states of the LSTM should be tensors of zeros
    state = [tf.zeros((1, UNITS)), tf.zeros((1, UNITS))]
    
    # Not done until the next token is "EOS"
    done = False
    
    # iterating for length max_length
    for _ in range(max_length):
        
        # generating next token
        next_token, logit, state, done = generate_next_token(
            decoder=model.decoder,
            context=context,
            next_token=next_token,
            done=done,
            state=state,
            temperature=temperature
        )
        
        # if done is true breaking the loop
        if done == True:
            break
        
        # adding the next token to the list
        tokens.append(next_token)
        
        # adding the logit to the list
        logits.append(logit)
        
    # concating all tokens into a tensor
    tokens = tf.concat(tokens, axis=-1)
    
    # converting the translated tokens into texts
    translation = tf.squeeze(tokens_to_text(tokens, id_to_word))
    translation = translation.numpy().decode()
    
    return translation, logits[-1], tokens

In [49]:
temp = 0.0
sentence = "England is a wonderful country. I would love to visit it some day!"

translation, logit, tokens = translate(translator, sentence, temperature=temp)

print(f"Original sentence : {sentence}\n\nTranslation : {translation}\n\nlogit : {logit}\n\nTokens : {tokens}")

Original sentence : England is a wonderful country. I would love to visit it some day!

Translation : langleterre est un pays merveilleux . je [UNK] seul un jour !

logit : -0.2357894629240036

Tokens : [[2997   18   19  483 1871    4    5    1  182   19  216   34]]
