In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import tensorflow as tf
from collections import Counter
import pathlib

In [2]:
path_to_file = pathlib.Path("por-eng/por.txt")

np.random.seed(1234)
tf.random.set_seed(1234)


In [3]:
def load_data(path):
    text = path.read_text(encoding="utf-8")

    lines = text.splitlines()
    pairs = [line.split("\t") for line in lines]

    context = np.array([context for _, context, _ in pairs])
    target = np.array([target for target, _, _ in pairs])

    return context, target

In [4]:
portuguese_sentences, english_sentences = load_data(path_to_file)
sentences = (portuguese_sentences, english_sentences)
print(portuguese_sentences,"\n\n",english_sentences)

['Vai.' 'Vá.' 'Oi.' ...
 'Uma criança que é falante nativa geralmente sabe muitas coisas sobre sua língua que um falante não-nativo que tem estudado há anos ainda não sabe e talvez nunca saberá.'
 'Contos de docilidade, de honra, de justiça, de coragem, de fortaleza perante o sofrimento, de intrepidez perante o perigo, de resolução destemida, de vontade de ferro inspiram as crianças a uma emulação dessas virtudes.'
 'A utilização de opções de alto calor quando se passa a ferro malhas sintéticas derreterá as fibras sintéticas e causará danos visíveis permanentes, que muitas vezes têm a aparência de uma área reluzente onde o ferro demasiado quente esteve mais tempo em contacto com a malha.'] 

 ['Go.' 'Go.' 'Hi.' ...
 'A child who is a native speaker usually knows many things about his or her language that a non-native speaker who has been studying for years still does not know and perhaps will never know.'
 'Tales of gentleness, of honor, of justice, of courage, of fortitude in sufferin

In [5]:
english_sentences[9]

'Who?'

In [6]:
BUFFER_SIZE = len(english_sentences)
BATCH_SIZE = 64

is_train = np.random.uniform(size=(len(portuguese_sentences),)) < 0.8
print(is_train)

[ True  True  True ...  True  True  True]


In [7]:
train_raw = (
    tf.data.Dataset.from_tensor_slices(
        (english_sentences[is_train], portuguese_sentences[is_train])
    )
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

val_raw = (
    tf.data.Dataset.from_tensor_slices(
        (english_sentences[~is_train], portuguese_sentences[~is_train])
    )
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

In [8]:
def tf_lower_and_split_punct(text):
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, r"[^ a-z.?!,¿]", "")
    text = tf.strings.regex_replace(text, r"[.?!,¿]", r" \0 ")
    text = tf.strings.strip(text)
    text = tf.strings.join(["[SOS]", text, "[EOS]"], separator=" ")
    return text

max_vocab_size = 12000

In [9]:
english_vectorizer = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct, max_tokens=max_vocab_size, output_mode='int', ragged=True
)

english_vectorizer.adapt(train_raw.map(lambda context, target: context))

In [10]:
portuguese_vectorizer = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct, max_tokens=max_vocab_size, output_mode='int', ragged=True
)

portuguese_vectorizer.adapt(train_raw.map(lambda context, target: target))

In [11]:
train_raw.map(lambda x, y: x)

<_MapDataset element_spec=TensorSpec(shape=(None,), dtype=tf.string, name=None)>

In [12]:
[i for i in train_raw.map(lambda x, y: y)]

[<tf.Tensor: shape=(64,), dtype=string, numpy=
 array([b'Aquele \xc3\xb4nibus vai te deixar no zool\xc3\xb3gico.',
        b'Quase n\xc3\xa3o visitei Boston.',
        b'Agora n\xc3\xa3o \xc3\xa9 hora de jogar seguro.',
        b'N\xc3\xa3o sei o que responder.', b'Protegerei o Tom.',
        b'Vou comprar um carro novo na pr\xc3\xb3xima semana.',
        b'Tom apostou trinta d\xc3\xb3lares comigo que Mary n\xc3\xa3o faria isso.',
        b'Por que isto sempre acontece comigo?',
        b'Tom ligou o computador.', b'Eu estou \xc3\xb3timo!',
        b'Tom quer que a mesma coisa aconte\xc3\xa7a aqui.',
        b'Estou de acordo com o plano de voc\xc3\xaas.',
        b'Convidei a minha vizinha para tomar caf\xc3\xa9 da manh\xc3\xa3.',
        b'Eu estava planejando me tornar um professor.',
        b'Achei que voc\xc3\xaa e eu poder\xc3\xadamos nos encontrar mais tarde.',
        b'Tom s\xc3\xb3 est\xc3\xa1 tentando te assustar.',
        b'Eu ouvi boatos sobre o Tom e a Mary.',
        b

In [13]:
def process_text(context, target):
    context = english_vectorizer(context).to_tensor()
    target = portuguese_vectorizer(target)
    targ_in = target[:, :-1].to_tensor()
    targ_out = target[:, 1:].to_tensor()
    return (context, targ_in), targ_out


train_data = train_raw.map(lambda x, y: process_text(x, y), tf.data.AUTOTUNE)
val_data = val_raw.map(lambda x, y: process_text(x, y), tf.data.AUTOTUNE)

del train_raw
del val_raw

In [14]:
def masked_loss(y_true, y_pred):
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    loss = loss_fn(y_true, y_pred)
    mask = tf.cast(y_true != 0, loss.dtype)
    loss *= mask
    return tf.reduce_sum(loss)/tf.reduce_sum(mask)


def masked_acc(y_true, y_pred):
    y_pred = tf.argmax(y_pred, axis=-1)
    y_pred = tf.cast(y_pred, y_true.dtype)
    match = tf.cast(y_true == y_pred, tf.float32)
    mask = tf.cast(y_true != 0, tf.float32)
    return tf.reduce_sum(match)/tf.reduce_sum(mask)


def tokens_to_text(tokens, id_to_word):
    words = id_to_word(tokens)
    result = tf.strings.reduce_join(words, axis=-1, separator=" ")
    return result

In [15]:
portuguese_sentences, english_sentences = sentences

print(f"English (to translate) sentence:\n\n{english_sentences[-5]}\n")
print(f"Portuguese (translation) sentence:\n\n{portuguese_sentences[-5]}")

English (to translate) sentence:

No matter how much you try to convince people that chocolate is vanilla, it'll still be chocolate, even though you may manage to convince yourself and a few others that it's vanilla.

Portuguese (translation) sentence:

Não importa o quanto você tenta convencer os outros de que chocolate é baunilha, ele ainda será chocolate, mesmo que você possa convencer a si mesmo e poucos outros de que é baunilha.


In [16]:
del portuguese_sentences
del english_sentences
del sentences

In [17]:
print(f"First 10 words of the english vocabulary:\n\n{english_vectorizer.get_vocabulary()[:10]}\n")
print(f"First 10 words of the portuguese vocabulary:\n\n{portuguese_vectorizer.get_vocabulary()[:10]}")

First 10 words of the english vocabulary:

['', '[UNK]', '[SOS]', '[EOS]', '.', 'tom', 'i', 'to', 'you', 'the']

First 10 words of the portuguese vocabulary:

['', '[UNK]', '[SOS]', '[EOS]', '.', 'tom', 'que', 'no', 'o', 'eu']


In [18]:
vocab_size_por = portuguese_vectorizer.vocabulary_size()
vocab_size_eng = english_vectorizer.vocabulary_size()

print(f"Portuguese vocabulary is made up of {vocab_size_por} words")
print(f"English vocabulary is made up of {vocab_size_eng} words")

Portuguese vocabulary is made up of 12000 words
English vocabulary is made up of 12000 words


In [19]:
word_to_id = tf.keras.layers.StringLookup(
    vocabulary=portuguese_vectorizer.get_vocabulary(), 
    mask_token="", 
    oov_token="[UNK]"
)

In [20]:
id_to_word = tf.keras.layers.StringLookup(
    vocabulary=portuguese_vectorizer.get_vocabulary(),
    mask_token="",
    oov_token="[UNK]",
    invert=True,
)

In [21]:
unk_id = word_to_id("[UNK]")
sos_id = word_to_id("[SOS]")
eos_id = word_to_id("[EOS]")
baunilha_id = word_to_id("baunilha")

print(f"The id for the [UNK] token is {unk_id}")
print(f"The id for the [SOS] token is {sos_id}")
print(f"The id for the [EOS] token is {eos_id}")
print(f"The id for baunilha (vanilla) is {baunilha_id}")

The id for the [UNK] token is 1
The id for the [SOS] token is 2
The id for the [EOS] token is 3
The id for baunilha (vanilla) is 7026


In [22]:
for (to_translate, sr_translation), translation in train_data.take(1):
    print(f"Tokenized english sentence:\n{to_translate[0, :].numpy()}\n\n")
    print(f"Tokenized portuguese sentence (shifted to the right):\n{sr_translation[0, :].numpy()}\n\n")
    print(f"Tokenized portuguese sentence:\n{translation[0, :].numpy()}\n\n")

Tokenized english sentence:
[  2  26 103 226   4   3   0   0   0   0   0   0   0   0   0   0   0]


Tokenized portuguese sentence (shifted to the right):
[  2  62  32 268   4   0   0   0   0   0   0   0   0   0   0   0]


Tokenized portuguese sentence:
[ 62  32 268   4   3   0   0   0   0   0   0   0   0   0   0   0]




In [23]:
VOCAB_SIZE = 12000
UNITS = 256

In [24]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, units):
        """Initializes an instance of this class

        Args:
            vocab_size (int): Size of the vocabulary
            units (int): Number of units in the LSTM layer
        """
        super(Encoder, self).__init__()

        self.embedding = tf.keras.layers.Embedding(  
            input_dim=vocab_size,
            output_dim=units,
            mask_zero=True
        )

        self.rnn = tf.keras.layers.Bidirectional(
            merge_mode="sum",
            layer=tf.keras.layers.LSTM(
                units=units,
                return_sequences=True
            ),
        )

    def call(self, context):
        """Forward pass of this layer

        Args:
            context (tf.Tensor): The sentence to translate

        Returns:
            tf.Tensor: Encoded sentence to translate
        """

        x = self.embedding(context)
        x = self.rnn(x)

        return x

In [25]:
encoder = Encoder(VOCAB_SIZE, UNITS)
encoder_output = encoder(to_translate)

print(f'Tensor of sentences in english has shape: {to_translate.shape}\n')
print(f'Encoder output has shape: {encoder_output.shape}')

Tensor of sentences in english has shape: (64, 17)

Encoder output has shape: (64, 17, 256)


In [26]:
class CrossAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        """Initializes an instance of this class

        Args:
            units (int): Number of units in the LSTM layer
        """
        super().__init__()

        self.mha = ( 
            tf.keras.layers.MultiHeadAttention(
                key_dim=units,
                num_heads=1
            ) 
        )  

        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

    def call(self, context, target):
        """Forward pass of this layer

        Args:
            context (tf.Tensor): Encoded sentence to translate
            target (tf.Tensor): The embedded shifted-to-the-right translation

        Returns:
            tf.Tensor: Cross attention between context and target
        """
        attn_output = self.mha(
            query=target,
            value=context
        )

        x = self.add([target, attn_output])
        x = self.layernorm(x)

        return x

In [27]:
attention_layer = CrossAttention(UNITS)
sr_translation_embed = tf.keras.layers.Embedding(VOCAB_SIZE, output_dim=UNITS, mask_zero=True)(sr_translation)
attention_result = attention_layer(encoder_output, sr_translation_embed)

print(f'Tensor of contexts has shape: {encoder_output.shape}')
print(f'Tensor of translations has shape: {sr_translation_embed.shape}')
print(f'Tensor of attention scores has shape: {attention_result.shape}')

Tensor of contexts has shape: (64, 17, 256)
Tensor of translations has shape: (64, 16, 256)
Tensor of attention scores has shape: (64, 16, 256)




In [28]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, units):
        """Initializes an instance of this class

        Args:
            vocab_size (int): Size of the vocabulary
            units (int): Number of units in the LSTM layer
        """
        super(Decoder, self).__init__()

        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=units,
            mask_zero=True
        ) 

        self.pre_attention_rnn = tf.keras.layers.LSTM(
            units=units,
            return_sequences=True,
            return_state=True
        )

        self.attention = CrossAttention(units)

        self.post_attention_rnn = tf.keras.layers.LSTM(
            units=units,
            return_sequences=True
        )

        self.output_layer = tf.keras.layers.Dense(
            units=vocab_size,
            activation=tf.nn.log_softmax
        )

    def call(self, context, target, state=None, return_state=False):
        """Forward pass of this layer

        Args:
            context (tf.Tensor): Encoded sentence to translate
            target (tf.Tensor): The shifted-to-the-right translation
            state (list[tf.Tensor, tf.Tensor], optional): Hidden state of the pre-attention LSTM. Defaults to None.
            return_state (bool, optional): If set to true return the hidden states of the LSTM. Defaults to False.

        Returns:
            tf.Tensor: The log_softmax probabilities of predicting a particular token
        """
        x = self.embedding(target)
        x, hidden_state, cell_state = self.pre_attention_rnn(x, initial_state=state)
        x = self.attention(context, x)
        x = self.post_attention_rnn(x)
        logits = self.output_layer(x)

        if return_state:
            return logits, [hidden_state, cell_state]

        return logits

In [29]:
decoder = Decoder(VOCAB_SIZE, UNITS)

logits = decoder(encoder_output, sr_translation)

print(f'Tensor of contexts has shape: {encoder_output.shape}')
print(f'Tensor of right-shifted translations has shape: {sr_translation.shape}')
print(f'Tensor of logits has shape: {logits.shape}')



Tensor of contexts has shape: (64, 17, 256)
Tensor of right-shifted translations has shape: (64, 16)
Tensor of logits has shape: (64, 16, 12000)




In [30]:
class Translator(tf.keras.Model):
    def __init__(self, vocab_size, units):
        """Initializes an instance of this class

        Args:
            vocab_size (int): Size of the vocabulary
            units (int): Number of units in the LSTM layer
        """
        super().__init__()

        self.encoder = Encoder(vocab_size, units)
        self.decoder = Decoder(vocab_size, units)

    def call(self, inputs):
        """Forward pass of this layer

        Args:
            inputs (tuple(tf.Tensor, tf.Tensor)): Tuple containing the context (sentence to translate) and the target (shifted-to-the-right translation)

        Returns:
            tf.Tensor: The log_softmax probabilities of predicting a particular token
        """

        context, target = inputs
        encoded_context = self.encoder(context)
        logits = self.decoder(encoded_context, target)

        return logits

In [31]:
translator = Translator(VOCAB_SIZE, UNITS)
logits = translator((to_translate, sr_translation))

print(f'Tensor of sentences to translate has shape: {to_translate.shape}')
print(f'Tensor of right-shifted translations has shape: {sr_translation.shape}')
print(f'Tensor of logits has shape: {logits.shape}')



Tensor of sentences to translate has shape: (64, 17)
Tensor of right-shifted translations has shape: (64, 16)
Tensor of logits has shape: (64, 16, 12000)


In [32]:
def compile_and_train(model, epochs=20, steps_per_epoch=500):
    model.compile(optimizer="adam", loss=masked_loss, metrics=[masked_acc, masked_loss])

    history = model.fit(
        train_data.repeat(),
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        validation_data=val_data,
        validation_steps=50,
        callbacks=[tf.keras.callbacks.EarlyStopping(patience=3)],
    )

    return model, history


In [33]:
trained_translator, history = compile_and_train(translator)

Epoch 1/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 231ms/step - loss: 5.7692 - masked_acc: 0.1592 - masked_loss: 5.7692 - val_loss: 4.2304 - val_masked_acc: 0.3538 - val_masked_loss: 4.2304
Epoch 2/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 243ms/step - loss: 3.9669 - masked_acc: 0.3899 - masked_loss: 3.9669 - val_loss: 3.0532 - val_masked_acc: 0.5003 - val_masked_loss: 3.0532
Epoch 3/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 235ms/step - loss: 2.9147 - masked_acc: 0.5230 - masked_loss: 2.9147 - val_loss: 2.4160 - val_masked_acc: 0.5879 - val_masked_loss: 2.4160
Epoch 4/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 242ms/step - loss: 2.3548 - masked_acc: 0.5974 - masked_loss: 2.3548 - val_loss: 2.0265 - val_masked_acc: 0.6366 - val_masked_loss: 2.0265
Epoch 5/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 266ms/step - loss: 1.9847 - masked_acc: 0.6515 - 

  self.gen.throw(typ, value, traceback)


[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 320ms/step - loss: 1.0784 - masked_acc: 0.7691 - masked_loss: 1.0784 - val_loss: 1.1378 - val_masked_acc: 0.7640 - val_masked_loss: 1.1378
Epoch 14/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 317ms/step - loss: 1.0345 - masked_acc: 0.7753 - masked_loss: 1.0345 - val_loss: 1.1420 - val_masked_acc: 0.7645 - val_masked_loss: 1.1420
Epoch 15/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 338ms/step - loss: 0.9784 - masked_acc: 0.7815 - masked_loss: 0.9784 - val_loss: 1.0751 - val_masked_acc: 0.7686 - val_masked_loss: 1.0751
Epoch 16/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 314ms/step - loss: 0.8771 - masked_acc: 0.7957 - masked_loss: 0.8771 - val_loss: 1.0714 - val_masked_acc: 0.7709 - val_masked_loss: 1.0714
Epoch 17/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 325ms/step - loss: 0.8924 - masked_acc: 0.7940 - masked_

In [57]:
def generate_next_token(decoder, context, next_token, done, state, temperature=0.0):
    """Generates the next token in the sequence

    Args:
        decoder (Decoder): The decoder
        context (tf.Tensor): Encoded sentence to translate
        next_token (tf.Tensor): The predicted next token
        done (bool): True if the translation is complete
        state (list[tf.Tensor, tf.Tensor]): Hidden states of the pre-attention LSTM layer
        temperature (float, optional): The temperature that controls the randomness of the predicted tokens. Defaults to 0.0.

    Returns:
        tuple(tf.Tensor, np.float, list[tf.Tensor, tf.Tensor], bool): The next token, log prob of said token, hidden state of LSTM and if translation is done
    """
    logits, state = decoder(context, next_token, state=state, return_state=True)
    logits = logits[:, -1, :]

    if temperature == 0.0:
        next_token = tf.argmax(logits, axis=-1)
    else:
        logits = logits / temperature
        next_token = tf.random.categorical(logits, num_samples=1)
    
    logits = tf.squeeze(logits)
    next_token = tf.squeeze(next_token)
    logit = logits[next_token].numpy()
    next_token = tf.reshape(next_token, shape=(1,1))

    if next_token == eos_id:
        done = True
    
    return next_token, logit, state, done

In [58]:
eng_sentence = "I love languages"

texts = tf.convert_to_tensor(eng_sentence)[tf.newaxis]
context = english_vectorizer(texts).to_tensor()
context = encoder(context)
next_token = tf.fill((1,1), sos_id)

state = [tf.random.uniform((1, UNITS)), tf.random.uniform((1, UNITS))]
done = False

next_token, logit, state, done = generate_next_token(decoder, context, next_token, done, state, temperature=0.5)
print(f"Next token: {next_token}\nLogit: {logit:.4f}\nDone? {done}")

Next token: [[396]]
Logit: -18.8229
Done? False




In [59]:
def translate(model, text, max_length=50, temperature=0.0):
    """Translate a given sentence from English to Portuguese

    Args:
        model (tf.keras.Model): The trained translator
        text (string): The sentence to translate
        max_length (int, optional): The maximum length of the translation. Defaults to 50.
        temperature (float, optional): The temperature that controls the randomness of the predicted tokens. Defaults to 0.0.

    Returns:
        tuple(str, np.float, tf.Tensor): The translation, logit that predicted <EOS> token and the tokenized translation
    """
    tokens, logits = [], []
    text = tf.convert_to_tensor([text])[tf.newaxis]
    context = english_vectorizer(text).to_tensor()
    context = model.encoder(context)
    next_token = tf.fill((1, 1), sos_id)
    state = [tf.zeros((1, UNITS)), tf.zeros((1, UNITS))]
    done = False

    for _ in range(max_length):
        try:
            next_token, logit, state, done = generate_next_token(
                decoder=model.decoder,
                context=context,
                next_token=next_token,
                done=done,
                state=state,
                temperature=temperature
            )
        except:
             raise Exception("Problem generating the next token")
        if done:
            break
    
        tokens.append(next_token)
        logits.append(logit)
    
    tokens = tf.concat(tokens, axis=-1)
    
    translation = tf.squeeze(tokens_to_text(tokens, id_to_word))
    translation = translation.numpy().decode()
    
    return translation, logits[-1], tokens

In [60]:
temp = 0.0 
original_sentence = "I love languages"

translation, logit, tokens = translate(trained_translator, original_sentence, temperature=temp)

print(f"Temperature: {temp}\n\nOriginal sentence: {original_sentence}\nTranslation: {translation}\nTranslation tokens:{tokens}\nLogit: {logit:.3f}")

Temperature: 0.0

Original sentence: I love languages
Translation: eu adoro as lnguas de segunda .
Translation tokens:[[   9  563   43 1032   11  695    4]]
Logit: -0.733




In [61]:
temp = 0.7
original_sentence = "I love languages"

translation, logit, tokens = translate(trained_translator, original_sentence, temperature=temp)

print(f"Temperature: {temp}\n\nOriginal sentence: {original_sentence}\nTranslation: {translation}\nTranslation tokens:{tokens}\nLogit: {logit:.3f}")

Temperature: 0.7

Original sentence: I love languages
Translation: eu adoro as lnguas de segunda .
Translation tokens:[[   9  563   43 1032   11  695    4]]
Logit: -1.047


In [62]:
def generate_samples(model, text, n_samples=4, temperature=0.6):
    samples, log_probs = [], []
    for _ in range(n_samples):
        _, logp, sample = translate(model, text, temperature=temperature)
        samples.append(np.squeeze(sample.numpy()).tolist())
        log_probs.append(logp)
                
    return samples, log_probs

In [63]:
samples, log_probs = generate_samples(trained_translator, 'I love languages')

for s, l in zip(samples, log_probs):
    print(f"Translated tensor: {s} has logit: {l:.3f}")

Translated tensor: [101, 11, 850, 618, 4] has logit: -0.435
Translated tensor: [9, 9, 563, 850, 11, 850, 4] has logit: -0.501
Translated tensor: [9, 101, 11, 1032, 1032, 4] has logit: -0.581
Translated tensor: [9, 563, 43, 1032, 11, 2704, 12, 191, 217, 2779, 4, 9, 521, 8, 416, 4, 9, 9, 1, 4, 9, 521, 9, 9, 521, 4] has logit: -2.763


In [64]:
def jaccard_similarity(candidate, reference):
    candidate_set = set(candidate)
    reference_set = set(reference)
    
    common_tokens = candidate_set.intersection(reference_set)
    all_tokens = candidate_set.union(reference_set)
    overlap = len(common_tokens) / len(all_tokens)
        
    return overlap

In [65]:
l1 = [1, 2, 3]
l2 = [1, 2, 3, 4]

js = jaccard_similarity(l1, l2)

print(f"jaccard similarity between lists: {l1} and {l2} is {js:.3f}")

jaccard similarity between lists: [1, 2, 3] and [1, 2, 3, 4] is 0.750


In [66]:
def rouge1_similarity(candidate, reference):
    """Computes the ROUGE 1 score between two token lists

    Args:
        candidate (list[int]): Tokenized candidate translation
        reference (list[int]): Tokenized reference translation

    Returns:
        float: Overlap between the two token lists
    """
    candidate_word_counts = Counter(candidate)
    reference_word_counts = Counter(reference)
    overlap = 0
    
    for token in candidate_word_counts.keys():
        token_count_candidate = candidate_word_counts[token]
        token_count_reference = reference_word_counts[token]
        overlap += min(token_count_candidate, token_count_reference)
    precision = overlap / len(candidate)
    recall = overlap / len(reference)
    
    if precision + recall != 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
        return f1_score
    
    return 0

In [67]:
l1 = [1, 2, 3]
l2 = [1, 2, 3, 4]

r1s = rouge1_similarity(l1, l2)

print(f"rouge 1 similarity between lists: {l1} and {l2} is {r1s:.3f}")

rouge 1 similarity between lists: [1, 2, 3] and [1, 2, 3, 4] is 0.857


In [68]:
def average_overlap(samples, similarity_fn):
    """Computes the arithmetic mean of each candidate sentence in the samples

    Args:
        samples (list[list[int]]): Tokenized version of translated sentences
        similarity_fn (Function): Similarity function used to compute the overlap

    Returns:
        dict[int, float]: A dictionary mapping the index of each translation to its score
    """
    scores = {}
    
    for index_candidate, candidate in enumerate(samples):    
        
        overlap = 0
        for index_sample, sample in enumerate(samples):
            if index_candidate == index_sample:
                continue
            sample_overlap = similarity_fn(candidate, sample)
            overlap += sample_overlap

        score = overlap / (len(samples) - 1)

        score = round(score, 3)
        
        scores[index_candidate] = score
        
    return scores

In [69]:
l1 = [1, 2, 3]
l2 = [1, 2, 4]
l3 = [1, 2, 4, 5]

avg_ovlp = average_overlap([l1, l2, l3], jaccard_similarity)

print(f"average overlap between lists: {l1}, {l2} and {l3} using Jaccard similarity is:\n\n{avg_ovlp}")

average overlap between lists: [1, 2, 3], [1, 2, 4] and [1, 2, 4, 5] using Jaccard similarity is:

{0: 0.45, 1: 0.625, 2: 0.575}


In [70]:
l1 = [1, 2, 3]
l2 = [1, 4]
l3 = [1, 2, 4, 5]
l4 = [5,6]

avg_ovlp = average_overlap([l1, l2, l3, l4], rouge1_similarity)

print(f"average overlap between lists: {l1}, {l2}, {l3} and {l4} using Rouge1 similarity is:\n\n{avg_ovlp}")

average overlap between lists: [1, 2, 3], [1, 4], [1, 2, 4, 5] and [5, 6] using Rouge1 similarity is:

{0: 0.324, 1: 0.356, 2: 0.524, 3: 0.111}


In [71]:
def weighted_avg_overlap(samples, log_probs, similarity_fn):
    
    scores = {}
    for index_candidate, candidate in enumerate(samples):
        overlap, weight_sum = 0.0, 0.0

        for index_sample, (sample, logp) in enumerate(zip(samples, log_probs)):        
            if index_candidate == index_sample:
                continue
            sample_p = float(np.exp(logp))
            weight_sum += sample_p
            sample_overlap = similarity_fn(candidate, sample)
            overlap += sample_p * sample_overlap
        score = overlap / weight_sum
        score = round(score, 3)
        scores[index_candidate] = score
    
    return scores

In [72]:
l1 = [1, 2, 3]
l2 = [1, 2, 4]
l3 = [1, 2, 4, 5]
log_probs = [0.4, 0.2, 0.5]

w_avg_ovlp = weighted_avg_overlap([l1, l2, l3], log_probs, jaccard_similarity)

print(f"weighted average overlap using Jaccard similarity is:\n\n{w_avg_ovlp}")

weighted average overlap using Jaccard similarity is:

{0: 0.443, 1: 0.631, 2: 0.558}


In [73]:
def mbr_decode(model, text, n_samples=5, temperature=0.6, similarity_fn=jaccard_similarity):

    samples, log_probs = generate_samples(model, text, n_samples=n_samples, temperature=temperature)
    scores = weighted_avg_overlap(samples, log_probs, similarity_fn)
    decoded_translations = [tokens_to_text(s, id_to_word).numpy().decode('utf-8') for s in samples]
    max_score_key = max(scores, key=lambda k: scores[k])
    translation = decoded_translations[max_score_key]
    
    return translation, decoded_translations

In [74]:
english_sentence = "I love languages"

translation, candidates = mbr_decode(trained_translator, english_sentence, n_samples=10, temperature=0.6)

print("Translation candidates:")
for c in candidates:
    print(c)

print(f"\nSelected translation: {translation}")

Translation candidates:
eu adoro as flores de segunda .
eu adoro idiomas de segunda .
eu adoro idiomas .
eu adoro idiomas .
eu adoro lnguas roupas .
eu adoro as lnguas dgua .
adoro lnguas so idiomas .
eu adoro as lnguas de atraso .
eu adoro as idiomas de segunda .
eu adoro lnguas estrangeiras .

Selected translation: eu adoro as lnguas de atraso .
