# Sequence-to-Sequence Modelling
- We will translate one sequence into another: English sentences in, Spanish sentences out
- The data comes from https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip

In [27]:
# imports
import tensorflow as tf
from tensorflow import keras
from keras import layers
import numpy as np
import os, shutil, pathlib, random, string, re

## Data Preprocessing

In [2]:
# Load the tab-separated sentence pairs: one "english<TAB>spanish" pair per line.
text_file = pathlib.Path("spa-eng/spa-eng/spa.txt")
# text_pairs = [(english, "[start] spanish [end]"), ...]
text_pairs = []
with open(text_file, "rt", encoding="utf-8") as f:
    for line in f:
        # Strip only the line terminator. Slicing off the last character
        # unconditionally would eat a real character from a final line that
        # has no trailing newline.
        line = line.rstrip("\n")
        if not line:
            # Tolerate blank lines (e.g. a trailing empty line at end of file).
            continue
        english, spanish = line.split('\t')
        # The Spanish side is wrapped in explicit start/end markers.
        text_pairs.append((english, f"[start] {spanish} [end]"))

In [3]:
# Split the pairs into roughly 70% train / 15% validation / 15% test.
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples

# Shuffle with a fixed seed so that a fresh kernel always produces the same
# split. The vectorizer vocabularies are adapted on train_pairs and a model
# saved in an earlier session is evaluated on val_ds, so an unseeded shuffle
# would change the token ids and leak training sentences into validation
# from one session to the next.
# NOTE: the shuffle is in place, so re-running this cell without reloading
# text_pairs shuffles the already-shuffled list again.
random.Random(1337).shuffle(text_pairs)

train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples+num_val_samples]
test_pairs = text_pairs[num_train_samples+num_val_samples:]

In [4]:
# Characters removed during standardization: ASCII punctuation plus "¿".
# "[" and "]" are kept so the "[start]" / "[end]" markers survive.
strip_chars = "".join(ch for ch in string.punctuation + "¿" if ch not in "[]")

def custom_standardize(input_string: tf.string) -> tf.string:
    """Lowercase the text and delete every character listed in `strip_chars`.

    Used as the `standardize` callable of the TextVectorization layers.
    """
    # Character class matching any single character from strip_chars.
    strip_pattern = f'[{re.escape(strip_chars)}]'
    lowered = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowered, strip_pattern, "")

# Vocabulary cap and base sequence length shared by both vectorizers.
vocab_size = 15000
sequence_length = 20

# English side: integer token ids, `sequence_length` per sentence.
source_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    standardize=custom_standardize,
    output_mode='int',
    output_sequence_length=sequence_length,
)
# Spanish side: one extra position, because format_dataset slices the result
# into an input (all but the last token) and a target (all but the first).
target_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    standardize=custom_standardize,
    output_mode='int',
    output_sequence_length=sequence_length + 1,
)

# Build each vocabulary from the training split only.
train_english_texts = [english for english, spanish in train_pairs]
train_spanish_texts = [spanish for english, spanish in train_pairs]

source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_spanish_texts)

In [5]:
batch_size = 64

@tf.function
def format_dataset(eng, spa):
    """Vectorize a batch of sentence pairs into (model inputs, targets).

    Returns a tuple `(inputs, targets)` where `inputs` is a dict with keys
    "english" (source token ids) and "spanish" (target token ids without the
    last position), and `targets` is the target token ids without the first
    position, i.e. the Spanish sequence shifted one step ahead.
    """
    source_tokens = source_vectorization(eng)
    target_tokens = target_vectorization(spa)
    decoder_inputs = target_tokens[:, :-1]
    decoder_targets = target_tokens[:, 1:]
    model_inputs = {"english": source_tokens, "spanish": decoder_inputs}
    return model_inputs, decoder_targets

def make_dataset(pairs):
    """Build a batched, vectorized tf.data pipeline from (english, spanish) pairs.

    Args:
        pairs: non-empty sequence of (english, spanish) string tuples.

    Returns:
        A tf.data.Dataset yielding the (inputs, targets) tuples produced by
        format_dataset, one per batch of `batch_size` pairs.
    """
    eng, spa = map(list, zip(*pairs))
    dataset = tf.data.Dataset.from_tensor_slices((eng, spa)).batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=tf.data.AUTOTUNE)
    # Order matters here:
    #   cache    - first, so the vectorized batches are computed only once;
    #   shuffle  - after the cache, so every epoch gets a new order (caching
    #              after shuffling would replay the first epoch's order);
    #   prefetch - last, so loading overlaps with the training step.
    # Batching happens before the shuffle, so whole batches are shuffled.
    return dataset.cache().shuffle(2048).prefetch(16)

# Build the training and validation pipelines. Each element is
# ({'english': [batch x int sequence], 'spanish': [batch x int sequence]}, targets).
train_ds, val_ds = make_dataset(train_pairs), make_dataset(val_pairs)

In [6]:
# Sanity check: print the tensor shapes of the first training batch.
for inputs, targets in train_ds.take(1):
    batch_shapes = (
        ("English", inputs['english'].shape),
        ("Spanish", inputs['spanish'].shape),
        ("Target", targets.shape),
    )
    for label, shape in batch_shapes:
        print(f"{label} Shape: {shape}")

English Shape: (64, 20)
Spanish Shape: (64, 20)
Target Shape: (64, 20)


In [23]:
# Hyperparameters: token-embedding width and GRU state width.
embed_dim = 256
latent_dim = 1024

# --- Encoder: embed the English tokens and summarise them into one vector. ---
source = keras.Input(shape=(None,), dtype="int64", name="english")
source_embedded = layers.Embedding(
    input_dim=vocab_size, output_dim=embed_dim, mask_zero=True
)(source)
# The forward and backward GRU outputs are summed into a single vector.
encoded_source = layers.Bidirectional(
    layers.GRU(latent_dim), merge_mode="sum"
)(source_embedded)

# --- Decoder: predict each next Spanish token from the previous ones. ---
past_target = keras.Input(shape=(None,), dtype="int64", name="spanish")
target_embedded = layers.Embedding(
    input_dim=vocab_size, output_dim=embed_dim, mask_zero=True
)(past_target)
# The encoder summary is used as the decoder GRU's initial state.
decoder_states = layers.GRU(latent_dim, return_sequences=True)(
    target_embedded, initial_state=encoded_source
)
decoder_states = layers.Dropout(0.5)(decoder_states)
target_next_step = layers.Dense(vocab_size, activation="softmax")(decoder_states)

seq2seq_rnn = keras.Model([source, past_target], target_next_step)

# The decoder GRU is unidirectional, so at each position it has only seen the
# previous target words. The output is a [length x vocab_size] matrix: a
# next-word prediction for every position of the sentence at once.
seq2seq_rnn.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
seq2seq_rnn.fit(train_ds, epochs=15, validation_data=val_ds)

Epoch 1/15
   2/1302 [..............................] - ETA: 28:16 - loss: 4.0325 - accuracy: 0.0995      

KeyboardInterrupt: 

In [29]:
# Spanish vocabulary as a list: position in the list == token id.
spa_vocab = target_vectorization.get_vocabulary()
# Reverse mapping, token string -> token id.
spa_index_lookup = {token: index for index, token in enumerate(spa_vocab)}
# Upper bound on the number of tokens generated per sentence.
max_decoded_sequence_length = 20

def decode_sequence(input_sentence):
    """Greedily translate one English sentence with `seq2seq_rnn`.

    Starts from "[start]" and repeatedly appends the highest-probability next
    token until "[end]" is produced or `max_decoded_sequence_length` tokens
    have been generated. Returns the decoded string, markers included.
    """
    encoder_tokens = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for step in range(max_decoded_sequence_length):
        # Re-vectorize the partial translation and drop the final position,
        # mirroring the "spanish" input slice used in format_dataset.
        decoder_tokens = target_vectorization(decoded_sentence)[:-1]
        predictions = seq2seq_rnn({
            'english': encoder_tokens,
            'spanish': tf.reshape(decoder_tokens, (1, -1))})
        # Position `step` holds the prediction for the token after the
        # `step + 1` tokens decoded so far.
        best_token_index = np.argmax(predictions[0, step, :])
        next_token = spa_vocab[best_token_index]
        decoded_sentence = f"{decoded_sentence} {next_token}"
        if next_token == "[end]":
            return decoded_sentence
    return decoded_sentence

# Translate 20 randomly chosen English sentences from the held-out test split.
test_eng_texts = [english for english, spanish in test_pairs]
for _ in range(20):
    input_sentence = random.choice(test_eng_texts)
    # Print the source first so it is visible while decoding is running.
    print(f"-\n{input_sentence}")
    print(decode_sequence(input_sentence))

-
I'm not your enemy.


KeyboardInterrupt: 

In [30]:
# Reload a previously saved model from disk and score it on the validation set.
# NOTE(review): the token ids fed to it come from vectorizers adapted in this
# session - confirm they match the ones the saved model was trained with.
model = keras.models.load_model(filepath="gru-encoded-decoded.keras")
model.evaluate(x=val_ds)

 43/279 [===>..........................] - ETA: 1:38 - loss: 2.2963 - accuracy: 0.3036

KeyboardInterrupt: 