## 0. Notebook Preparation

In [1]:
!pip install -q nltk

In [2]:
import nltk
import numpy as np
import re
import shutil
import tensorflow as tf
import os
import unicodedata
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction



2023-10-22 12:20:36.267296: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
tf.random.set_seed(123)
np.random.seed(123)

## 1. Neural Machine Translation Without Attention Mechanism

In [4]:
def preprocess_sentence(sent):
    # Normalize the input sentence to decompose accented characters into their base form
    sent = "".join([c for c in unicodedata.normalize("NFD", sent) if unicodedata.category(c) != "Mn"])
    
    # Add space before punctuation marks like '.', '!', or '?' to tokenize them separately
    sent = re.sub(r"([!.?])", r" \1", sent)
    
    # Replace any sequence of characters that are not letters or punctuation with a single space
    sent = re.sub(r"[^a-zA-Z!.?]+", r" ", sent)
    
    # Replace multiple consecutive spaces with a single space
    sent = re.sub(r"\s+", " ", sent)
    
    # Convert the sentence to lowercase to ensure consistency in case
    sent = sent.lower()
    return sent

In [5]:
def read_data(num_sent_pairs =20000):
    en_sents, fr_sents_in, fr_sents_out = [], [], []
    local_file = os.path.join("datasets", "fra.txt")
    with open(local_file, "r") as fin:
        for i, line in enumerate(fin):
            en_sent, fr_sent, _ = line.strip().split('\t')
            en_sent = [w for w in preprocess_sentence(en_sent).split()]
            fr_sent = preprocess_sentence(fr_sent)
            fr_sent_in = [w for w in ("BOS " + fr_sent).split()]
            fr_sent_out = [w for w in (fr_sent + " EOS").split()]
            en_sents.append(en_sent)
            fr_sents_in.append(fr_sent_in)
            fr_sents_out.append(fr_sent_out)
            if i >= num_sent_pairs - 1:
                break
    return en_sents, fr_sents_in, fr_sents_out

In [6]:
NUM_SENT_PAIRS = 1000
sents_en, sents_fr_in, sents_fr_out = read_data(NUM_SENT_PAIRS)

In [7]:
print(sents_en[0:5])

[['go', '.'], ['hi', '.'], ['hi', '.'], ['run', '!'], ['run', '!']]


In [8]:
print(sents_fr_in[0:5])

[['BOS', 'va', '!'], ['BOS', 'salut', '!'], ['BOS', 'salut', '.'], ['BOS', 'cours', '!'], ['BOS', 'courez', '!']]


In [9]:
print(sents_fr_out[0:5])

[['va', '!', 'EOS'], ['salut', '!', 'EOS'], ['salut', '.', 'EOS'], ['cours', '!', 'EOS'], ['courez', '!', 'EOS']]


In [10]:
# We declare the tokenizers and use them to transform texts to sequences of indices

tokenizer_en = tf.keras.preprocessing.text.Tokenizer(filters="", lower=False)
print(f'tokenizer_en: {tokenizer_en}')
tokenizer_en.fit_on_texts(sents_en)
data_en = tokenizer_en.texts_to_sequences(sents_en)
print(f'data en: {data_en[0:5]}')
data_en = tf.keras.preprocessing.sequence.pad_sequences(data_en, padding="post")
print(f'data en: \n{data_en[0:5]}')

tokenizer_fr = tf.keras.preprocessing.text.Tokenizer(filters="", lower=False)
tokenizer_fr.fit_on_texts(sents_fr_in)
tokenizer_fr.fit_on_texts(sents_fr_out)
data_fr_in = tokenizer_fr.texts_to_sequences(sents_fr_in)
data_fr_in = tf.keras.preprocessing.sequence.pad_sequences(data_fr_in, padding="post")
data_fr_out = tokenizer_fr.texts_to_sequences(sents_fr_out)
data_fr_out = tf.keras.preprocessing.sequence.pad_sequences(data_fr_out, padding="post")

tokenizer_en: <keras.preprocessing.text.Tokenizer object at 0x7fdb50770130>
data en: [[7, 1], [130, 1], [130, 1], [75, 3], [75, 3]]
data en: 
[[  7   1   0   0   0]
 [130   1   0   0   0]
 [130   1   0   0   0]
 [ 75   3   0   0   0]
 [ 75   3   0   0   0]]


In [11]:
# We then build up dictionaries and vocabularies

vocab_size_en = len(tokenizer_en.word_index)
vocab_size_fr = len(tokenizer_fr.word_index)

word2idx_en = tokenizer_en.word_index
idx2word_en = {v:k for k, v in word2idx_en.items()}
word2idx_fr = tokenizer_fr.word_index
idx2word_fr = {v:k for k, v in word2idx_fr.items()}

print("vocab size (en): {:d}, vocab size (fr): {:d}".format(vocab_size_en, vocab_size_fr))
maxlen_en = data_en.shape[1]
maxlen_fr = data_fr_out.shape[1]
print("seqlen (en): {:d}, (fr): {:d}".format(maxlen_en, maxlen_fr))

print(f'\nword2idx_en: \n\n{word2idx_en}')

vocab size (en): 380, vocab size (fr): 709
seqlen (en): 5, (fr): 10

word2idx_en: 

{'.': 1, 'i': 2, '!': 3, 'm': 4, 'it': 5, 's': 6, 'go': 7, 'tom': 8, '?': 9, 'me': 10, 'up': 11, 'you': 12, 'be': 13, 'll': 14, 'get': 15, 'this': 16, 'lost': 17, 'we': 18, 'come': 19, 'let': 20, 'back': 21, 'take': 22, 'stay': 23, 'down': 24, 'on': 25, 'won': 26, 'they': 27, 'ok': 28, 'saw': 29, 'here': 30, 'that': 31, 'away': 32, 'he': 33, 'is': 34, 'mine': 35, 'try': 36, 'calm': 37, 'us': 38, 'stop': 39, 'no': 40, 'nice': 41, 'how': 42, 'look': 43, 'see': 44, 'in': 45, 'out': 46, 'am': 47, 'a': 48, 'fun': 49, 'over': 50, 'who': 51, 'help': 52, 'got': 53, 'way': 54, 'fair': 55, 'keep': 56, 'still': 57, 'him': 58, 'she': 59, 'call': 60, 'hold': 61, 'off': 62, 'grab': 63, 'must': 64, 'ahead': 65, 'good': 66, 'job': 67, 'did': 68, 'use': 69, 'don': 70, 't': 71, 'lie': 72, 'excuse': 73, 'forget': 74, 'run': 75, 'left': 76, 'home': 77, 'hurry': 78, 'sure': 79, 'leave': 80, 'what': 81, 'sorry': 82, 'fell': 

In [12]:
# Convert to dataset format

batch_size = 64
dataset = tf.data.Dataset.from_tensor_slices((data_en, data_fr_in, data_fr_out))
dataset = dataset.shuffle(10000)
test_size = NUM_SENT_PAIRS // 4
test_dataset = dataset.take(test_size).batch(batch_size, drop_remainder=True)
train_dataset = dataset.skip(test_size).batch(batch_size, drop_remainder=True)

print(train_dataset)

<_BatchDataset element_spec=(TensorSpec(shape=(64, 5), dtype=tf.int32, name=None), TensorSpec(shape=(64, 10), dtype=tf.int32, name=None), TensorSpec(shape=(64, 10), dtype=tf.int32, name=None))>


In [14]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, num_timesteps, encoder_dim, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.encoder_dim = encoder_dim
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=num_timesteps)
        self.rnn = tf.keras.layers.GRU(encoder_dim, return_sequences=False, return_state=True)
    
    def call(self, x, state):
        x = self.embedding(x)
        x, state = self.rnn(x, initial_state=state) # x is output, and state is hidden state
        return x, state
    
    def init_state(self, batch_size):
        return tf.zeros((batch_size, self.encoder_dim))

In [13]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, num_timesteps, decoder_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.decoder_dim = decoder_dim
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length= num_timesteps)
        self.rnn = tf.keras.layers.GRU(decoder_dim, return_sequences=True, return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, x, state):
        x = self.embedding(x)
        x, state = self.rnn(x, state)
        x = self.dense(x) # only return logits
        return x, state

In [15]:
embedding_dim = 256
encoder_dim, decoder_dim = 1024, 1024

# vocab size + 1 for word that is not in the library usually called as out-of-vocabulary (OOV)
encoder = Encoder(vocab_size_en+1, embedding_dim, maxlen_en, encoder_dim)
decoder = Decoder(vocab_size_fr+1, embedding_dim, maxlen_fr, decoder_dim)

In [16]:
encoder_in, decoder_in, decoder_out = next(iter(train_dataset))

encoder_state = encoder.init_state(batch_size)
encoder_out, encoder_state = encoder(encoder_in, encoder_state)
decoder_state = encoder_state
decoder_pred, decoder_state = decoder(decoder_in, decoder_state)

print("encoder input :", encoder_in.shape)
print("encoder output :", encoder_out.shape, "state:", encoder_state.shape)
print("decoder output (logits):", decoder_pred.shape, "state:", decoder_state.shape)
print("decoder output (labels):", decoder_out.shape)

2023-10-22 12:49:59.019898: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype int32 and shape [1000,10]
	 [[{{node Placeholder/_2}}]]


encoder input : (64, 5)
encoder output : (64, 1024) state: (64, 1024)
decoder output (logits): (64, 10, 710) state: (64, 1024)
decoder output (labels): (64, 10)


In [17]:
def loss_fn(ytrue, ypred):
    scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    mask = tf.math.logical_not(tf.math.equal(ytrue, 0))
    mask = tf.cast(mask, dtype=tf.int64)
    loss = scce(ytrue, ypred, sample_weight=mask)
    return loss

In [18]:
@tf.function
def train_step(encoder_in, decoder_in, decoder_out, encoder_state):
    with tf.GradientTape() as tape:
        encoder_out, encoder_state = encoder(encoder_in, encoder_state)
        decoder_state = encoder_state
        decoder_pred, decoder_state = decoder(decoder_in, decoder_state)
        loss = loss_fn(decoder_out, decoder_pred)
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return loss

In [19]:
def predict(encoder, decoder, batch_size, sents_en, data_en, sents_fr_out, word2idx_fr, idx2word_fr):
    random_id = np.random.choice(len(sents_en))
    print("input : ", " ".join(sents_en[random_id]))
    print("label : ", " ".join(sents_fr_out[random_id]))
    encoder_in = tf.expand_dims(data_en[random_id], axis=0)
    decoder_out = tf.expand_dims(sents_fr_out[random_id], axis=0)
    
    encoder_state = encoder.init_state(1)
    encoder_out, encoder_state = encoder(encoder_in, encoder_state)
    
    decoder_state = encoder_state
    decoder_in = tf.expand_dims(tf.constant([word2idx_fr["BOS"]]), axis=0)
    
    pred_sent_fr = []
    decoding_step = 0
    while decoding_step < maxlen_fr:
        decoder_pred, decoder_state = decoder(decoder_in, decoder_state)
        decoder_pred = tf.argmax(decoder_pred, axis=-1)
        pred_word = idx2word_fr[decoder_pred.numpy()[0][0]]
        pred_sent_fr.append(pred_word)
        if pred_word == "EOS":
            break
        decoder_in = decoder_pred
        decoding_step += 1
    print("predicted: ", " ".join(pred_sent_fr))

In [None]:
checkpoint_dir = "./checkpoints"
optimizer = tf.keras.optimizers.Adam()
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)
num_epochs = 25
eval_scores = []
for e in range(num_epochs):
    encoder_state = encoder.init_state(batch_size)
    for batch, data in enumerate(train_dataset):
        encoder_in, decoder_in, decoder_out = data
        # print(encoder_in.shape, decoder_in.shape, decoder_out.shape)
        loss = train_step(encoder_in, decoder_in, decoder_out, encoder_state)
        # print("Batch {}: loss = {}".format(batch, loss))
    print("Epoch: {}, Loss: {:.4f}".format(e + 1, loss.numpy()))
    if e % 10 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)
    predict(encoder, decoder, batch_size, sents_en, data_en, sents_fr_out, word2idx_fr, idx2word_fr)

checkpoint.save(file_prefix=checkpoint_prefix)

2023-10-22 12:57:17.221632: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-10-22 12:57:17.223254: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-10-22 12:57:17.224492: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch: 1, Loss: 2.1057
input :  birds fly .
label :  les oiseaux volent . EOS
predicted:  je ! ! ! ! ! ! ! ! !
Epoch: 2, Loss: 2.0041
input :  i m free .
label :  je suis libre . EOS
predicted:  je . EOS
Epoch: 3, Loss: 1.8604
input :  i m sick .
label :  je suis malade . EOS
predicted:  je suis . EOS
Epoch: 4, Loss: 1.5524
input :  i looked .
label :  j ai regarde . EOS
predicted:  je suis . EOS
Epoch: 5, Loss: 1.4870
input :  tom spoke .
label :  tom a parle . EOS
predicted:  je suis . EOS
Epoch: 6, Loss: 1.4353
input :  call me .
label :  appelez moi ! EOS
predicted:  c est . EOS
Epoch: 7, Loss: 1.2998
input :  i m early .
label :  je suis en avance . EOS
predicted:  je suis suis . EOS
Epoch: 8, Loss: 1.2780
input :  go on .
label :  poursuivez . EOS
predicted:  nous ! EOS
Epoch: 9, Loss: 1.0698
input :  he s lazy .
label :  il est paresseux . EOS
predicted:  c est . EOS
Epoch: 10, Loss: 1.1802
input :  come on .
label :  allez ! EOS
predicted:  venez ! EOS
Epoch: 11, Loss: 1.1607
i