### Encoder–Decoder for Machine Translation

In [5]:
import tensorflow as tf
import numpy as np
from pathlib import Path

In [6]:
# Download and unpack the English-Spanish sentence-pair archive (cached under
# "datasets"), then read the tab-separated corpus into one string.
url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = tf.keras.utils.get_file("spa-eng.zip", origin=url, cache_dir="datasets",
                               extract=True)
# Decode explicitly as UTF-8: the corpus contains accented characters, and
# without an encoding read_text() falls back to the platform default
# (e.g. cp1252 on Windows), which would garble them.
text = (Path(path).with_name("spa-eng") / "spa.txt").read_text(encoding="utf-8")

In [7]:
# Strip the inverted punctuation marks that open Spanish exclamations/questions.
text = text.translate(str.maketrans("", "", "¡¿"))

# Each corpus line is "<english>\t<spanish>"; collect one pair per line.
pairs = []
for line in text.splitlines():
    pairs.append(line.split("\t"))

# Shuffle in place with a fixed seed so the split below is reproducible,
# then separate the two languages into parallel tuples.
np.random.seed(42)
np.random.shuffle(pairs)
sentences_en, sentences_es = zip(*pairs)

In [8]:
# Show the first five (English => Spanish) sentence pairs.
for n, (en, es) in enumerate(zip(sentences_en[:5], sentences_es[:5]), start=1):
    print(f"{n} :  {en} => {es}")

1 :  How boring! => Qué aburrimiento!
2 :  I love sports. => Adoro el deporte.
3 :  Would you like to swap jobs? => Te gustaría que intercambiemos los trabajos?
4 :  My mother did nothing but weep. => Mi madre no hizo nada sino llorar.
5 :  Croatia is in the southeastern part of Europe. => Croacia está en el sudeste de Europa.


In [9]:
vocab_size = 1000  # maximum number of tokens kept per language
max_length = 50    # length of the id sequence produced for each sentence

# One vectorizer per language, configured identically.
text_vec_layer_en = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size, output_sequence_length=max_length)
text_vec_layer_es = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size, output_sequence_length=max_length)

# Learn the English vocabulary from the raw sentences.
text_vec_layer_en.adapt(sentences_en)

# The Spanish sentences are wrapped in start/end markers before adapting, so
# both marker words become part of the Spanish vocabulary.
spanish_with_markers = [f"startofseq {s} endofseq" for s in sentences_es]
text_vec_layer_es.adapt(spanish_with_markers)


In [10]:
# Vocabulary size and the ten first entries of the English vocabulary.
vocabulary_en = text_vec_layer_en.get_vocabulary()
print(f'Length of english vocabulary : {len(vocabulary_en)}')
vocabulary_en[:10]

Length of english vocabulary : 1000


['', '[UNK]', 'the', 'i', 'to', 'you', 'tom', 'a', 'is', 'he']

In [11]:
# Vocabulary size and the ten first entries of the Spanish vocabulary.
vocabulary_es = text_vec_layer_es.get_vocabulary()
print(f'Length of spanish vocabulary : {len(vocabulary_es)}')

vocabulary_es[:10]

Length of spanish vocabulary : 1000


['', '[UNK]', 'startofseq', 'endofseq', 'de', 'que', 'a', 'no', 'tom', 'la']

In [12]:
# Vectorize a one-word sentence. The layer returns max_length (50) ids per
# sentence: the vocabulary index of each word, followed by zeros as padding.
s1 = text_vec_layer_es(["de"])
s1

<tf.Tensor: shape=(1, 50), dtype=int64, numpy=
array([[4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]], dtype=int64)>

In [13]:
# Vectorize a three-word sentence: the first three positions hold the
# vocabulary indices of the words, and the rest of the max_length (50)
# positions are zero padding.
s2 = text_vec_layer_es(["de la no"])
s2

<tf.Tensor: shape=(1, 50), dtype=int64, numpy=
array([[4, 9, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]], dtype=int64)>

In [14]:
# Number of (shuffled) sentence pairs used for training; every remaining pair
# is used for validation.
train_size = 100_000

# Encoder inputs: raw English sentences.
X_train = tf.constant(sentences_en[:train_size])
X_valid = tf.constant(sentences_en[train_size:])

# Decoder inputs: raw Spanish sentences prefixed with the "startofseq" marker.
X_train_dec = tf.constant([f"startofseq {s}" for s in sentences_es[:train_size]])
X_valid_dec = tf.constant([f"startofseq {s}" for s in sentences_es[train_size:]])

# Targets: Spanish sentences followed by the "endofseq" marker, converted to
# token ids by the Spanish vectorizer. Compared with the decoder inputs, the
# targets are shifted one word to the left.
Y_train = text_vec_layer_es([f"{s} endofseq" for s in sentences_es[:train_size]])
Y_valid = text_vec_layer_es([f"{s} endofseq" for s in sentences_es[train_size:]])

In [15]:
# Fix TensorFlow's global seed before any layer is created.
tf.random.set_seed(42)

# Both model inputs take one raw string per example: the English sentence for
# the encoder and the (marker-prefixed) Spanish sentence for the decoder.
encoder_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)

In [16]:
# Dimensionality of the word embeddings used by the encoder and decoder
# Embedding layers.
embed_size = 128

In [17]:
# Raw strings -> padded token-id sequences, one vectorizer per language.
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_es(decoder_inputs)

# Separate embedding tables for English and Spanish. mask_zero=True makes the
# padding id 0 produce a mask for the layers that consume the embeddings.
encoder_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size, output_dim=embed_size, mask_zero=True)
decoder_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size, output_dim=embed_size, mask_zero=True)

encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

In [18]:
# Encoder LSTM: return_state=True makes the call return its final states in
# addition to its output; the states are collected for the decoder.
encoder = tf.keras.layers.LSTM(units=512, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

In [19]:
# Decoder LSTM: starts from the encoder's final states and returns one output
# per time step (return_sequences=True).
decoder = tf.keras.layers.LSTM(units=512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

In [20]:
# Softmax over vocab_size units, applied to the decoder outputs.
output_layer = tf.keras.layers.Dense(units=vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)


In [52]:
# Assemble the encoder-decoder model and train it for one epoch.
model = tf.keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=[Y_proba],
)
model.compile(
    optimizer="nadam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

train_inputs = (X_train, X_train_dec)
valid_inputs = (X_valid, X_valid_dec)
model.fit(train_inputs, Y_train, epochs=1,
          validation_data=(valid_inputs, Y_valid))



<keras.src.callbacks.History at 0x13117abe820>

In [23]:
def translate(sentence_en):
    """Greedily translate an English sentence into Spanish, one word at a time.

    At each step the global ``model`` is fed the English sentence and the
    translation produced so far (prefixed with "startofseq"), and the most
    probable word at the current position is appended. Decoding stops when
    "endofseq" is predicted or after ``max_length`` words.

    Args:
        sentence_en: The English sentence to translate, as a string.

    Returns:
        The translated words joined by single spaces, without the
        "startofseq"/"endofseq" markers.
    """
    # Loop invariants: the vocabulary list and the encoder input do not change
    # between decoding steps, so build them once instead of on every iteration.
    vocabulary = text_vec_layer_es.get_vocabulary()
    X = np.array([sentence_en])

    translation = ""
    for word_idx in range(max_length):
        X_dec = np.array(["startofseq " + translation])
        # Keep only the distribution for the position being decoded.
        y_proba = model.predict((X, X_dec))[0, word_idx]
        predicted_word_id = np.argmax(y_proba)
        predicted_word = vocabulary[predicted_word_id]
        if predicted_word == "endofseq":
            break
        translation += " " + predicted_word
    return translation.strip()

In [32]:
def load_tf_model():
    """Load and return the saved translation model from ../Models/TranslationBiRnn."""
    # Forward slashes on purpose: the previous "..\Models\..." literal relied on
    # invalid escape sequences ("\M", "\T") and only resolved on Windows. This
    # spelling matches the path used when the model is saved.
    return tf.keras.models.load_model("../Models/TranslationBiRnn")

model = load_tf_model()

In [33]:
# Greedy translation of a short sentence with the current `model`.
translate("I like music")



'me gusta la música'

#### Bidirectional RNNs

In [21]:
# Re-seed, then wrap a 256-unit LSTM in a Bidirectional layer.
tf.random.set_seed(42)
encoder_lstm = tf.keras.layers.LSTM(256, return_state=True)
encoder = tf.keras.layers.Bidirectional(encoder_lstm)

In [22]:
# The bidirectional encoder returns its output followed by four states:
# forward short-term, forward long-term, backward short-term, backward long-term.
encoder_outputs, fwd_h, fwd_c, bwd_h, bwd_c = encoder(encoder_embeddings)

# Concatenate the states of both directions pairwise so the decoder receives
# one short-term and one long-term state.
encoder_state = [
    tf.concat([fwd_h, bwd_h], axis=-1),  # short-term
    tf.concat([fwd_c, bwd_c], axis=-1),  # long-term
]

In [94]:
# Decoder and output layer on top of the bidirectional encoder's states.
decoder = tf.keras.layers.LSTM(units=512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)
output_layer = tf.keras.layers.Dense(units=vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)

# Build, compile and train for ten epochs, keeping the training history.
model = tf.keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=[Y_proba],
)
model.compile(
    optimizer="nadam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
train_inputs = (X_train, X_train_dec)
valid_inputs = (X_valid, X_valid_dec)
history = model.fit(train_inputs, Y_train, epochs=10,
                    validation_data=(valid_inputs, Y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [34]:
# Greedy translation of a short sentence with the current `model`.
translate("I like soccer")



'me gusta el fútbol'

In [97]:
# Save the current model to ../Models/TranslationBiRnn (the same location
# load_tf_model() reads from).
model.save('../Models/TranslationBiRnn') 

INFO:tensorflow:Assets written to: ../Models/TranslationBiRnn\assets


INFO:tensorflow:Assets written to: ../Models/TranslationBiRnn\assets


#### Beam Search

In [1]:
def beam_search(sentence_en, beam_width, verbose=False):
    """Translate an English sentence into Spanish using beam search.

    Keeps the ``beam_width`` most probable partial translations. At each step,
    every unfinished translation is extended with every vocabulary word, each
    candidate is scored by the sum of its word log-probabilities (as predicted
    by the global ``model``), and only the best ``beam_width`` candidates
    survive.

    Args:
        sentence_en: The English sentence to translate, as a string.
        beam_width: Number of candidate translations kept at each step.
        verbose: If True, print the surviving candidates after every step.

    Returns:
        The highest-scoring translation with the "endofseq" marker removed.
        If the ``max_length`` step limit is reached before every surviving
        candidate has ended, the best candidate found so far is returned.
    """
    # The vocabulary and the encoder input never change during the search, so
    # build them once rather than once per candidate word.
    vocabulary = text_vec_layer_es.get_vocabulary()
    X = np.array([sentence_en])

    # First step: the beam_width most probable first words.
    X_dec = np.array(["startofseq"])
    y_proba = model.predict((X, X_dec))[0, 0]
    top_k = tf.math.top_k(y_proba, k=beam_width)
    top_translations = [
        (np.log(word_proba), vocabulary[word_id])
        for word_proba, word_id in zip(top_k.values, top_k.indices)
    ]

    if verbose:
        print("top first words: ", top_translations)

    for idx in range(1, max_length):
        candidates = []
        for log_proba, translation in top_translations:
            # A finished translation competes unchanged with the new candidates.
            if translation.endswith("endofseq"):
                candidates.append((log_proba, translation))
                continue

            X_dec = np.array(["startofseq " + translation])
            y_proba = model.predict((X, X_dec))[0, idx]

            for word_id, word_proba in enumerate(y_proba):
                word = vocabulary[word_id]
                candidates.append((log_proba + np.log(word_proba),
                                   f"{translation} {word}"))

        # Tuples sort by log-probability first, so this keeps the best ones.
        top_translations = sorted(candidates, reverse=True)[:beam_width]

        if verbose:
            print("Top translations so far:", top_translations)

        if all(tr.endswith("endofseq") for _, tr in top_translations):
            break

    # Also reached when the step limit runs out; previously that case fell off
    # the end of the function and returned None.
    return top_translations[0][1].replace("endofseq", "").strip()
            
        


In [35]:
# Greedy translation of a slightly longer sentence; the same sentence is
# decoded with beam search in the following cell for comparison.
sentence_en = "I love cats and dogs"
translate(sentence_en)



'me gustan los gatos y los perros'

In [36]:
# Beam search over the same sentence, keeping 3 candidates per step and
# printing the surviving candidates after each step.
beam_search(sentence_en, beam_width=3, verbose=True)

top first words:  [(-0.009752829, 'me'), (-5.1400514, 'yo'), (-6.78987, 'los')]
Top translations so far: [(-0.53081965, 'me gustan'), (-1.1078852, 'me [UNK]'), (-3.4103315, 'me gusta')]
Top translations so far: [(-0.5720572, 'me gustan los'), (-1.3023738, 'me [UNK] los'), (-3.7777119, 'me [UNK] el')]
Top translations so far: [(-0.88856196, 'me gustan los gatos'), (-1.465327, 'me [UNK] los gatos'), (-1.8880031, 'me gustan los perros')]
Top translations so far: [(-1.3661273, 'me gustan los gatos y'), (-2.0261981, 'me [UNK] los gatos y'), (-2.4221363, 'me gustan los perros y')]
Top translations so far: [(-1.6273166, 'me gustan los gatos y los'), (-2.5033646, 'me [UNK] los gatos y los'), (-2.5182374, 'me gustan los perros y los')]
Top translations so far: [(-2.1052697, 'me gustan los gatos y los perros'), (-2.535176, 'me gustan los perros y los gatos'), (-2.5956624, 'me gustan los gatos y los gatos')]
Top translations so far: [(-2.1133938, 'me gustan los gatos y los perros endofseq'), (-2.

'me gustan los gatos y los perros'

#### Attention Mechanisms

In [37]:
# Re-seed, then build a bidirectional encoder that returns its outputs for
# every time step (used by the attention layer) as well as its final states.
tf.random.set_seed(42)
encoder_lstm = tf.keras.layers.LSTM(256, return_sequences=True, return_state=True)
encoder = tf.keras.layers.Bidirectional(encoder_lstm)

In [39]:
# Output sequence plus the four final states, in the order
# forward short-term, forward long-term, backward short-term, backward long-term.
encoder_outputs, fwd_h, fwd_c, bwd_h, bwd_c = encoder(encoder_embeddings)

# Merge the two directions along the last axis so the decoder receives one
# short-term and one long-term state.
encoder_state = [
    tf.concat([fwd_h, bwd_h], axis=-1),
    tf.concat([fwd_c, bwd_c], axis=-1),
]

decoder = tf.keras.layers.LSTM(units=512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

In [41]:
# Attention with the decoder outputs as queries and the encoder outputs as
# values; the softmax output layer is applied to the attention result.
attention_layer = tf.keras.layers.Attention()
query_and_value = [decoder_outputs, encoder_outputs]
attention_outputs = attention_layer(query_and_value)

output_layer = tf.keras.layers.Dense(units=vocab_size, activation="softmax")
Y_proba = output_layer(attention_outputs)

In [45]:
# Assemble the attention model and train it for five epochs.
model = tf.keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=[Y_proba],
)

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

train_inputs = (X_train, X_train_dec)
valid_inputs = (X_valid, X_valid_dec)
model.fit(train_inputs, Y_train, epochs=5,
          validation_data=(valid_inputs, Y_valid))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1c94921b4c0>

In [46]:
# Greedy translation of a longer sentence with the attention model.
translate("I like soccer and also going to the beach")



'me gusta fútbol y también va a la playa'

In [47]:
# Beam search (3 candidates per step) over the same sentence, printing the
# surviving candidates after each step.
beam_search("I like soccer and also going to the beach", beam_width=3,
            verbose=True)

top first words:  [(-0.1095495, 'me'), (-3.8042102, 'prefiero'), (-4.0412292, 'yo')]
Top translations so far: [(-0.22534326, 'me gusta'), (-2.6338477, 'me gustan'), (-4.02202, 'me [UNK]')]
Top translations so far: [(-0.7561035, 'me gusta fútbol'), (-1.2608948, 'me gusta el'), (-3.330087, 'me gustan fútbol')]
Top translations so far: [(-0.8028421, 'me gusta fútbol y'), (-1.2796283, 'me gusta el fútbol'), (-3.4050558, 'me gustan fútbol y')]
Top translations so far: [(-1.3133388, 'me gusta el fútbol y'), (-1.3273625, 'me gusta fútbol y también'), (-3.476686, 'me gusta fútbol y él')]
Top translations so far: [(-1.493983, 'me gusta el fútbol y también'), (-1.7836051, 'me gusta fútbol y también va'), (-3.6202354, 'me gusta fútbol y también ir')]
Top translations so far: [(-1.7995749, 'me gusta fútbol y también va a'), (-2.484838, 'me gusta el fútbol y también va'), (-3.3895035, 'me gusta el fútbol y también ir')]
Top translations so far: [(-1.8242681, 'me gusta fútbol y también va a la'), (-

'me gusta fútbol y también va a la playa'

In [51]:
# Save the attention model to ../Models/Attention/TranslationModel.
model.save('../Models/Attention/TranslationModel') 

INFO:tensorflow:Assets written to: ../Models/Attention/TranslationModel\assets


INFO:tensorflow:Assets written to: ../Models/Attention/TranslationModel\assets


In [69]:
# Read the settings and vocabulary of the Spanish vectorizer.
es_vectorizer_config = text_vec_layer_es.get_config()
max_tokens = es_vectorizer_config["max_tokens"]
output_sequence_length = es_vectorizer_config["output_sequence_length"]
vocab_data = text_vec_layer_es.get_vocabulary()

In [84]:
import json

# Settings and vocabulary of the Spanish vectorizer, serialized as JSON.
x = {
  "max_tokens": max_tokens,
  "output_sequence_length": output_sequence_length,
  "vocab_data": text_vec_layer_es.get_vocabulary()
}

data = json.dumps(x)

# Write mode, not append: re-running this cell must replace the file. Appending
# a second JSON document to the first leaves a file that json.load() rejects.
# The with-block also closes the file if the write fails.
with open("../Models/Attention/text_vectorizer.json", "w") as f:
    f.write(data)