In [1]:
import tensorflow as tf
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
import keras
from pathlib import Path




In [3]:
# Fetch the spa-eng archive (cached under "datasets"), let Keras extract it,
# and read the extracted spa.txt corpus file as UTF-8 text.
url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = keras.utils.get_file(
    "spa-eng.zip",
    origin=url,
    cache_dir="datasets",
    extract=True,
)
text = Path(path).with_name("spa-eng").joinpath("spa.txt").read_text(encoding='utf-8')

In [4]:
import numpy as np

In [5]:
# Drop the inverted Spanish punctuation marks before splitting into pairs.
text = text.replace("¡", "").replace("¿", "")
# Each line of the corpus holds tab-separated fields (English, then Spanish).
pairs = [line.split("\t") for line in text.splitlines()]
# Shuffle with a fixed, local seed so the train/validation split taken by
# position later in the notebook is identical on every Restart & Run All.
# A dedicated Generator is used so the global NumPy random state is untouched.
np.random.default_rng(42).shuffle(pairs)
sentences_en, sentences_es = zip(*pairs)  # separates the pairs into 2 tuples

In [6]:
vocab_size = 5000  # vocabulary cap passed to both vectorizers as max_tokens
max_length = 50    # fixed output sequence length of both vectorizers

# One vectorizer per language, configured identically.
text_vec_layer_en = keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
)
text_vec_layer_es = keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
)

# Build the vocabularies. The Spanish sentences are wrapped in the
# start/end markers so that both markers become vocabulary entries.
text_vec_layer_en.adapt(sentences_en)
text_vec_layer_es.adapt(
    [f"startofseq {sentence} endofseq" for sentence in sentences_es]
)

In [9]:
# Export the Spanish vectorizer's vocabulary to vocabulary.npy as a NumPy array.
a = np.array(text_vec_layer_es.get_vocabulary())
np.save('vocabulary.npy', a)

In [None]:
# Read the saved vocabulary back from vocabulary.npy and display it as a
# plain Python list (this shows every entry of the array).
a = np.load('vocabulary.npy')
list(a)

In [6]:
# The first 100,000 shuffled pairs are used for training, the rest for validation.
# Encoder inputs are the raw English sentences; decoder inputs are the Spanish
# sentences prefixed with the start marker.
X_train = tf.constant(sentences_en[:100_000])
X_train_dec = tf.constant(
    [f"startofseq {sentence}" for sentence in sentences_es[:100_000]]
)

X_valid = tf.constant(sentences_en[100_000:])
X_valid_dec = tf.constant(
    [f"startofseq {sentence}" for sentence in sentences_es[100_000:]]
)

In [7]:
# Targets: each Spanish sentence followed by the end marker, passed through
# the Spanish vectorizer. Same 100,000 train/validation boundary as the inputs.
Y_train = text_vec_layer_es(
    [f"{sentence} endofseq" for sentence in sentences_es[:100_000]]
)
Y_valid = text_vec_layer_es(
    [f"{sentence} endofseq" for sentence in sentences_es[100_000:]]
)

In [8]:
# Symbolic model inputs: each sample is one scalar string (shape=[]).
# encoder_inputs is vectorized with the English layer and decoder_inputs with
# the Spanish layer in the following cell.
encoder_inputs = keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = keras.layers.Input(shape=[], dtype=tf.string)

In [9]:
embed_size = 128  # output dimension of both embedding layers
# Convert the raw string inputs to token IDs with the per-language vectorizers.
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_es(decoder_inputs)
# Separate embedding tables for each language. mask_zero=True asks Keras to
# treat ID 0 as a masked value for downstream layers.
encoder_embedding_layer = keras.layers.Embedding(vocab_size, embed_size,
 mask_zero=True)
decoder_embedding_layer = keras.layers.Embedding(vocab_size, embed_size,
 mask_zero=True)
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)


In [10]:
# Bidirectional LSTM encoder. return_sequences=True keeps the per-timestep
# outputs (used by the attention layer later) and return_state=True makes the
# layer also return its final states.
encoder = keras.layers.Bidirectional(
    keras.layers.LSTM(256, return_sequences=True, return_state=True))
# The first returned value is the output sequence; the rest are the states.
encoder_outputs, *encoder_state = encoder(encoder_embeddings)
# Concatenate the even-indexed states together and the odd-indexed states
# together along the last axis, giving one pair of states for the decoder.
encoder_state = [tf.concat(encoder_state[::2], axis=-1),  # short-term (0 & 2)
                 tf.concat(encoder_state[1::2], axis=-1)]  # long-term (1 & 3)
# 512 units = 2 x 256, matching the width of the concatenated encoder states.
decoder = keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

In [11]:
# Attention over the encoder outputs: the decoder outputs are passed first
# (the query in the Keras Attention input convention) and the encoder outputs
# second (the value).
attention_layer = keras.layers.Attention()
attention_outputs = attention_layer([decoder_outputs, encoder_outputs])
# Softmax over vocab_size classes for every decoder position.
output_layer = keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(attention_outputs)

In [12]:
# Assemble the model from the two string inputs to the per-position word
# probabilities, then train it for 10 epochs.
model = keras.Model(inputs=[encoder_inputs, decoder_inputs],
                       outputs=[Y_proba])
# Sparse categorical cross-entropy: the targets are integer token IDs,
# not one-hot vectors.
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
# Inputs are passed as (encoder text, decoder text) tuples; the held-out
# split is used as validation data.
model.fit((X_train, X_train_dec), Y_train, epochs=10,
          validation_data=((X_valid, X_valid_dec), Y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7bc07ee671f0>

In [15]:
def translate(sentence_en, nmt_model=None):
    """Translate an English sentence to Spanish by greedy word-by-word decoding.

    At each step the text generated so far, prefixed with "startofseq", is fed
    to the decoder and the most probable word at the current position is
    appended. Decoding stops when "endofseq" is predicted or after
    `max_length` steps.

    Args:
        sentence_en: The English sentence to translate (a Python string).
        nmt_model: Optional model to decode with. Defaults to the notebook's
            global `model`. It is called as
            `predict((encoder_text, decoder_text))`.

    Returns:
        The predicted words separated by spaces, without the start/end markers.
    """
    if nmt_model is None:
        nmt_model = model
    # Loop invariants: the vocabulary list and the encoder input do not change
    # between decoding steps, so build them once instead of once per word.
    vocabulary = text_vec_layer_es.get_vocabulary()
    X = tf.constant([sentence_en])  # encoder input
    translation = ""
    for word_idx in range(max_length):
        X_dec = tf.constant(["startofseq " + translation])  # decoder input
        # verbose=0 silences the progress bar predict() would print per word.
        y_proba = nmt_model.predict((X, X_dec), verbose=0)[0, word_idx]  # last token's probas
        predicted_word_id = np.argmax(y_proba)
        predicted_word = vocabulary[predicted_word_id]
        if predicted_word == "endofseq":
            break
        translation += " " + predicted_word
    return translation.strip()

In [16]:
# Quick check of the decoder on a short sentence.
translate("I like soccer")



'me gusta el fútbol'

In [22]:
# A longer sentence with several clauses, to see how the translation holds up.
translate("I like to travel and also going to the beach with many girls")



'me gusta viajar y también a la playa a muchas niñas'

In [23]:
# Persist the trained model to a .keras file.
model.save("encodeco_attention_nmt_5000.keras")

In [None]:
# Reload the model from the .keras file written by model.save(...).
loaded_model = keras.saving.load_model("encodeco_attention_nmt_5000.keras")

In [25]:
def translate(sentence_en, nmt_model=None):
    """Translate an English sentence to Spanish by greedy word-by-word decoding.

    This definition decodes with the reloaded model by default. At each step
    the text generated so far, prefixed with "startofseq", is fed to the
    decoder and the most probable word at the current position is appended.
    Decoding stops when "endofseq" is predicted or after `max_length` steps.

    Args:
        sentence_en: The English sentence to translate (a Python string).
        nmt_model: Optional model to decode with. Defaults to the notebook's
            global `loaded_model`. It is called as
            `predict((encoder_text, decoder_text))`.

    Returns:
        The predicted words separated by spaces, without the start/end markers.
    """
    if nmt_model is None:
        nmt_model = loaded_model
    # Loop invariants: the vocabulary list and the encoder input do not change
    # between decoding steps, so build them once instead of once per word.
    vocabulary = text_vec_layer_es.get_vocabulary()
    X = tf.constant([sentence_en])  # encoder input
    translation = ""
    for word_idx in range(max_length):
        X_dec = tf.constant(["startofseq " + translation])  # decoder input
        # verbose=0 silences the progress bar predict() would print per word.
        y_proba = nmt_model.predict((X, X_dec), verbose=0)[0, word_idx]  # last token's probas
        predicted_word_id = np.argmax(y_proba)
        predicted_word = vocabulary[predicted_word_id]
        if predicted_word == "endofseq":
            break
        translation += " " + predicted_word
    return translation.strip()

In [None]:
# Display the English vectorizer's vocabulary (the full list is shown).
text_vec_layer_en.get_vocabulary()

In [None]:
# Exercise the most recently defined translate(), which uses loaded_model.
translate("I like doing homework")

In [3]:
# Record the Keras version this notebook was run with.
print(keras.__version__)

2.15.0
