In [None]:
# -*- coding: utf-8 -*-
"""
RNN Encoder-Decoder Translation Demo
Dataset: Portuguese -> English (TED Talks `ted_hrlr_translate/pt_to_en`, from TensorFlow Datasets)
"""

### Problem Statement

The goal of this project is to develop a sequence-to-sequence (Seq2Seq) model using an RNN Encoder–Decoder architecture for neural machine translation. Specifically, we will build a model that can translate sentences from Portuguese to English using the TED Talks Translation dataset available in TensorFlow Datasets.

#### Objectives:

Preprocess and tokenize bilingual sentence pairs (Portuguese → English).

Train an Encoder–Decoder network where:

- The encoder reads the source (Portuguese) sentence and compresses it into a context vector.

- The decoder generates the target (English) sentence word by word, using teacher forcing during training.

Evaluate the model’s ability to translate unseen Portuguese sentences into meaningful English outputs.



In [5]:
pip install tensorflow tensorflow-datasets numpy

Collecting tensorflow-datasets
  Using cached tensorflow_datasets-4.9.9-py3-none-any.whl.metadata (11 kB)
Collecting dm-tree (from tensorflow-datasets)
  Using cached dm_tree-0.1.9-cp312-cp312-win_amd64.whl.metadata (2.5 kB)
Collecting etils>=1.9.1 (from etils[edc,enp,epath,epy,etree]>=1.9.1; python_version >= "3.11"->tensorflow-datasets)
  Using cached etils-1.13.0-py3-none-any.whl.metadata (6.5 kB)
Collecting immutabledict (from tensorflow-datasets)
  Using cached immutabledict-4.2.2-py3-none-any.whl.metadata (3.5 kB)
Collecting promise (from tensorflow-datasets)
  Using cached promise-2.3-py3-none-any.whl
Collecting simple_parsing (from tensorflow-datasets)
  Using cached simple_parsing-0.1.7-py3-none-any.whl.metadata (7.3 kB)
Collecting tensorflow-metadata (from tensorflow-datasets)
  Using cached tensorflow_metadata-1.17.2-py3-none-any.whl.metadata (2.5 kB)
Collecting tqdm (from tensorflow-datasets)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting einops (fr

In [1]:


import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model

# ---- Load Dataset ----
# as_supervised=True makes every example a 2-tuple, unpacked below as (pt, en).
dataset, metadata = tfds.load(
    'ted_hrlr_translate/pt_to_en',
    with_info=True,
    as_supervised=True,
)

train_examples = dataset['train']
val_examples = dataset['validation']

# ---- Tokenizers ----
# One subword vocabulary per language, learned from the training split only.
tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (en_text.numpy() for _pt_text, en_text in train_examples),
    target_vocab_size=2**13,
)

tokenizer_pt = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (pt_text.numpy() for pt_text, _en_text in train_examples),
    target_vocab_size=2**13,
)

# +2 leaves room for the two extra ids that `encode` wraps around every
# sentence: `vocab_size` (start marker) and `vocab_size + 1` (end marker).
vocab_size_en = tokenizer_en.vocab_size + 2
vocab_size_pt = tokenizer_pt.vocab_size + 2

# Sentence pairs with more than this many tokens on either side are filtered out.
max_len = 40

# ---- Encode Function ----
def encode(pt, en):
    """Tokenize one (Portuguese, English) pair and wrap each side in marker ids.

    Both arguments must expose ``.numpy()`` (i.e. be eager tensors); the raw
    value is handed to the matching subword tokenizer.

    Returns:
        ``(pt_tokens, en_tokens)`` - two Python lists of ints. Each list starts
        with that tokenizer's ``vocab_size`` (start marker) and ends with
        ``vocab_size + 1`` (end marker).
    """
    pt_start = tokenizer_pt.vocab_size
    en_start = tokenizer_en.vocab_size

    pt_ids = [pt_start, *tokenizer_pt.encode(pt.numpy()), pt_start + 1]
    en_ids = [en_start, *tokenizer_en.encode(en.numpy()), en_start + 1]
    return pt_ids, en_ids

def tf_encode(pt, en):
    """Run `encode` through ``tf.py_function`` so it can be used in ``Dataset.map``.

    Returns:
        ``(pt_ids, en_ids)`` - two ``tf.int64`` tensors whose static shape is
        declared as rank-1 with unknown length.
    """
    pt_ids, en_ids = tf.py_function(
        func=encode,
        inp=[pt, en],
        Tout=[tf.int64, tf.int64],
    )
    # Declare each result as a 1-D sequence of unknown length.
    en_ids.set_shape([None])
    pt_ids.set_shape([None])
    return pt_ids, en_ids

# ---- Prepare Data ----
BUFFER_SIZE = 20000
BATCH_SIZE = 64


def _within_max_len(x, y):
    """Keep a pair only when both token sequences have at most `max_len` ids."""
    return tf.logical_and(tf.size(x) <= max_len, tf.size(y) <= max_len)


# Training pipeline: tokenize -> drop long pairs -> cache -> shuffle ->
# pad each batch to its longest member -> prefetch.
train_dataset = train_examples.map(tf_encode)
train_dataset = train_dataset.filter(_within_max_len)
train_dataset = train_dataset.cache()
train_dataset = train_dataset.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.padded_batch(BATCH_SIZE, padded_shapes=([None], [None]))
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

# Validation pipeline: same tokenization, filtering and batching, but no
# caching, shuffling or prefetching.
val_dataset = val_examples.map(tf_encode)
val_dataset = val_dataset.filter(_within_max_len)
val_dataset = val_dataset.padded_batch(BATCH_SIZE, padded_shapes=([None], [None]))

# ---- Build Seq2Seq Model ----
embed_dim = 32
latent_dim = 256

# Encoder: embed the source ids (id 0 is masked as padding) and keep only the
# LSTM's final hidden/cell state; its per-step outputs are discarded.
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(
    input_dim=vocab_size_pt,
    output_dim=embed_dim,
    mask_zero=True,
)(encoder_inputs)
encoder_lstm = LSTM(units=latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder: an LSTM initialised with the encoder's final state, followed by a
# softmax over the English vocabulary at every time step.
decoder_inputs = Input(shape=(None,))
dec_emb = Embedding(
    input_dim=vocab_size_en,
    output_dim=embed_dim,
    mask_zero=True,
)(decoder_inputs)
decoder_lstm = LSTM(units=latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(units=vocab_size_en, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Seq2Seq Model: (source ids, decoder input ids) -> per-step token probabilities.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

# ---- Prepare Data for Training (teacher forcing) ----
def prepare_batch(src, tgt):
    """Split a target batch into teacher-forcing inputs and next-token labels.

    Args:
        src: Batch of source sequences, passed through unchanged.
        tgt: 2-D batch of target sequences (indexed as ``[batch, time]``).

    Returns:
        ``((src, decoder_inp), decoder_out)`` where ``decoder_inp`` is ``tgt``
        without its last time step and ``decoder_out`` is ``tgt`` without its
        first, so position ``t`` of the input is paired with token ``t + 1``.
    """
    return (src, tgt[:, :-1]), tgt[:, 1:]

# Turn every (src, tgt) batch into ((src, decoder_inp), decoder_out).
train_dataset_tf = train_dataset.map(map_func=prepare_batch)
val_dataset_tf = val_dataset.map(map_func=prepare_batch)

# ---- Train ----
model.fit(
    x=train_dataset_tf,
    epochs=5,
    validation_data=val_dataset_tf,
)

# ---- Simple Inference Demo ----
def translate(sentence):
    """Greedily translate one Portuguese sentence to English with the trained model.

    The source sentence is tokenized and wrapped in the same start/end marker
    ids used for training. The English side is then generated one token at a
    time: the full model is re-run on the tokens produced so far and the
    arg-max prediction at the current position is appended, until the end
    marker is predicted or `max_len` positions are filled.

    Args:
        sentence: Portuguese text, passed to ``tokenizer_pt.encode``.

    Returns:
        The decoded English string. Padding and the start/end marker ids are
        never passed to ``tokenizer_en.decode``.
    """
    start_id = tokenizer_en.vocab_size
    end_id = tokenizer_en.vocab_size + 1

    pt_tokens = [tokenizer_pt.vocab_size] + tokenizer_pt.encode(sentence) + [tokenizer_pt.vocab_size+1]
    pt_tokens = tf.keras.preprocessing.sequence.pad_sequences([pt_tokens], maxlen=max_len, padding='post')

    # Zeros are padding. Position 0 must hold the start marker, exactly as the
    # decoder inputs do during training (`tgt[:, :-1]` begins with it);
    # otherwise the first decoding step runs on a masked, all-padding input.
    en_input = np.zeros((1, max_len))
    en_input[0, 0] = start_id

    # Greedy decoding
    generated = []
    for t in range(max_len - 1):
        output_tokens = model.predict([pt_tokens, en_input], verbose=0)
        sampled_token = int(np.argmax(output_tokens[0, t, :]))
        if sampled_token == end_id:  # <eos>
            break
        en_input[0, t + 1] = sampled_token
        generated.append(sampled_token)

    # Only ids in (0, vocab_size) are real subwords; 0 is padding and the
    # marker ids lie outside the range the tokenizer can decode.
    subword_ids = [tok for tok in generated if 0 < tok < tokenizer_en.vocab_size]
    return tokenizer_en.decode(subword_ids)

# Smoke-test the trained model on a single hand-written sentence.
print("Portuguese: Olá, como você está?")
english_guess = translate("Olá, como você está?")
print("English (predicted):", english_guess)


Epoch 1/5
    319/Unknown [1m280s[0m 764ms/step - accuracy: 0.1029 - loss: 6.6638

KeyboardInterrupt: 