In [1]:
import numpy as np
import re
import tensorflow as tf
import os
import unicodedata

In [2]:
tf.random.set_seed(6789)
np.random.seed(6789)

In [3]:
def preprocess_sentence(sent):
    sent = "".join([c for c in unicodedata.normalize("NFD", sent) if unicodedata.category(c) != "Mn"])
    sent = re.sub(r"([!.?])", r" \1", sent)
    sent = re.sub(r"[^a-zA-Z!.?]+", r" ", sent)
    sent = re.sub(r"\s+", " ", sent)
    sent = sent.lower()
    return sent

In [4]:
def read_data(num_sent_pairs =20000):
    en_sents, fr_sents_in, fr_sents_out = [], [], []
    local_file = "/content/fra.txt"
    with open(local_file, "r") as fin:
        for i, line in enumerate(fin):
            en_sent, fr_sent, _ = line.strip().split('\t')
            en_sent = [w for w in preprocess_sentence(en_sent).split()]
            fr_sent = preprocess_sentence(fr_sent)
            fr_sent_in = [w for w in ("BOS " + fr_sent).split()]
            fr_sent_out = [w for w in (fr_sent + " EOS").split()]
            en_sents.append(en_sent)
            fr_sents_in.append(fr_sent_in)
            fr_sents_out.append(fr_sent_out)
            if i >= num_sent_pairs - 1:
                break
    return en_sents, fr_sents_in, fr_sents_out

In [5]:
NUM_SENT_PAIRS = 1000
sents_en, sents_fr_in, sents_fr_out = read_data(NUM_SENT_PAIRS)

In [7]:
sents_en[0:5]

[['go', '.'], ['hi', '.'], ['hi', '.'], ['run', '!'], ['run', '!']]

In [8]:
tokenizer_en = tf.keras.preprocessing.text.Tokenizer(filters="", lower=False)
tokenizer_en.fit_on_texts(sents_en)
data_en = tokenizer_en.texts_to_sequences(sents_en)
data_en = tf.keras.preprocessing.sequence.pad_sequences(data_en, padding="post")

tokenizer_fr = tf.keras.preprocessing.text.Tokenizer(filters="", lower=False)
tokenizer_fr.fit_on_texts(sents_fr_in)
tokenizer_fr.fit_on_texts(sents_fr_out)

data_fr_in = tokenizer_fr.texts_to_sequences(sents_fr_in)
data_fr_in = tf.keras.preprocessing.sequence.pad_sequences(data_fr_in, padding="post")
data_fr_out = tokenizer_fr.texts_to_sequences(sents_fr_out)
data_fr_out = tf.keras.preprocessing.sequence.pad_sequences(data_fr_out, padding="post")

In [9]:
data_en[0:5]

array([[  7,   1,   0,   0,   0],
       [130,   1,   0,   0,   0],
       [130,   1,   0,   0,   0],
       [ 75,   3,   0,   0,   0],
       [ 75,   3,   0,   0,   0]], dtype=int32)

In [10]:
vocab_size_en = len(tokenizer_en.word_index)
vocab_size_fr = len(tokenizer_fr.word_index)
word2idx_en = tokenizer_en.word_index
idx2word_en = {v:k for k, v in word2idx_en.items()}
word2idx_fr = tokenizer_fr.word_index
idx2word_fr = {v:k for k, v in word2idx_fr.items()}
print("vocab size (en): {:d}, vocab size (fr): {:d}".format(vocab_size_en, vocab_size_fr))
maxlen_en = data_en.shape[1]
maxlen_fr = data_fr_out.shape[1]
print("seqlen (en): {:d}, (fr): {:d}".format(maxlen_en, maxlen_fr))

vocab size (en): 380, vocab size (fr): 709
seqlen (en): 5, (fr): 10


In [11]:
batch_size = 64
dataset = tf.data.Dataset.from_tensor_slices((data_en, data_fr_in, data_fr_out))
dataset = dataset.shuffle(10000)
test_size = NUM_SENT_PAIRS // 4
test_dataset = dataset.take(test_size).batch(batch_size, drop_remainder=True)
train_dataset = dataset.skip(test_size).batch(batch_size, drop_remainder=True)

In [12]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, num_timesteps, encoder_dim, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.encoder_dim = encoder_dim
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=num_timesteps)
        self.rnn = tf.keras.layers.GRU(encoder_dim, return_sequences=True, return_state=True)

    def call(self, x, state):
        x = self.embedding(x)
        x, state = self.rnn(x, initial_state=state)
        return x, state

    def init_state(self, batch_size):
        return tf.zeros((batch_size, self.encoder_dim))

In [13]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, num_units, phase= 'training'):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(num_units)
        self.W2 = tf.keras.layers.Dense(num_units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, d_state, e_states):
        # d_sate is the decoder state at time step j of decoder RNN
        # d_state.shape: [batch_size, decoder_dim]
        # e_states are encoder states at every timestes
        # e_states.shape: [batch_size, num_timesteps, encoder_dim]
        # add time axis to query: [batch_size, 1, decoder_dim]
        d_state_extend = tf.expand_dims(d_state, axis=1)
        # compute score [batch_size, num_timesteps, 1]
        attention_score = self.V(tf.keras.activations.tanh(self.W1(e_states) + self.W2(d_state_extend)))
        # compute weight [batch_size, num_timesteps, 1]
        attention_weight = tf.nn.softmax(attention_score, axis=1)
        # compute context [batch_size, decoder_dim]
        context = tf.reduce_sum(tf.linalg.matmul(tf.transpose(attention_weight, perm=[0,2,1]), e_states), axis=1)
        return context, attention_weight

Note:

1. tf.transpose(attention_weight, perm=[0,2,1]) changes the shape of attention_weight from [batch_size, num_timesteps, 1] to [batch_size, 1, num_timesteps]. This is done because you want to perform a batched matrix multiplication next, and the dimensions must be aligned properly for this operation.

2. tf.linalg.matmul(..., e_states) performs a batched matrix multiplication between the transposed attention_weight and e_states. Given that attention_weight is now of shape [batch_size, 1, num_timesteps] and e_states is of shape [batch_size, num_timesteps, encoder_dim], the result of this multiplication is a tensor of shape [batch_size, 1, encoder_dim]. Each element in the resulting tensor is the weighted sum of the encoder states, with the weights specified by attention_weight. In other words, it's a sum of the encoder states, but where each state is scaled by how much attention the decoder is paying to that particular state.

In [14]:
class LuongAttention(tf.keras.layers.Layer):
    def __init__(self, num_units):
        super(LuongAttention, self).__init__()
        self.W = tf.keras.layers.Dense(num_units)

    def call(self, d_state, e_states):
        # d_sate is the decoder state at time step j of decoder RNN
        # d_state.shape: [batch_size, decoder_dim]
        # e_states are encoder states at every timestep
        # e_states.shape: [batch_size, num_timesteps, encoder_dim]
        # add time axis to query: [batch_size, 1, decoder_dim]
        d_state_extend = tf.expand_dims(d_state, axis=1)
        # compute score [batch_size, num_timesteps, 1]
        attention_score = tf.linalg.matmul(self.W(e_states), tf.transpose(d_state_extend, perm=(0,2,1)))
        # compute softmax attention_weight
        attention_weight = tf.nn.softmax(attention_score, axis=1)
        # compute attended output
        context = tf.linalg.matmul(tf.transpose(attention_weight, perm=[0,2,1]), e_states)
        context = tf.squeeze(context, axis=1) # The tf.squeeze function in TensorFlow is used to remove dimensions of size 1 from a tensor
        return context, attention_weight

1. e_states: This tensor contains the encoder states and has a shape of [batch_size, num_timesteps, encoder_dim].

2. self.W: This is a dense layer that projects the encoder states down to a new dimension, which is num_units. When the e_states tensor is passed through this layer, the resulting tensor has a shape of [batch_size, num_timesteps, num_units].

3. d_state: This tensor is the decoder state and has a shape of [batch_size, decoder_dim].

4. d_state_extend: By using tf.expand_dims(d_state, axis=1), a time axis is added to the decoder state, resulting in a shape of [batch_size, 1, decoder_dim].

5. tf.transpose(d_state_extend, perm=(0,2,1)): This operation transposes the d_state_extend tensor, resulting in a shape of [batch_size, decoder_dim, 1].

6. attention_score: The tf.linalg.matmul(self.W(e_states), tf.transpose(d_state_extend, perm=(0,2,1))) operation performs a batched matrix multiplication between the projected encoder states and the transposed extended decoder state. The shapes involved in this operation are:

7. self.W(e_states): [batch_size, num_timesteps, num_units]
tf.transpose(d_state_extend, perm=(0,2,1)): [batch_size, decoder_dim, 1]
Assuming that decoder_dim is equal to num_units (i.e., the decoder's hidden state size is the same as the projection size of the encoder states), the resulting attention_score tensor will have a shape of [batch_size, num_timesteps, 1]. This tensor contains the raw attention scores for each encoder state, which are then passed through a softmax layer to produce the normalized attention_weight.

8. If decoder_dim is not equal to num_units, the self.W layer won't be able to directly compute the scores because the inner dimensions won't match for matrix multiplication. In this case, the decoder states should also be projected to a dimension of num_units, or the num_units should be set equal to decoder_dim when initializing the LuongAttention layer.







In [None]:
batch_size = 64
num_timesteps = 100
num_units = 200
d_state = np.random.random(size=(batch_size, num_units)).astype(np.float32)
e_states = np.random.random(size=(batch_size, num_timesteps, num_units)).astype(np.float32)
# check out dimensions for Bahdanau attention
b_attn = BahdanauAttention(num_units)
context, attention_weight = b_attn(d_state, e_states)
print("Bahdanau: context.shape:", context.shape, "attention_weight.shape:", attention_weight.shape)
# check out dimensions for Luong attention
l_attn = LuongAttention(num_units)
context, attention_weights = l_attn(d_state, e_states)
print("Luong: context.shape:", context.shape, "attention_weight:", attention_weight.shape)

Bahdanau: context.shape: (64, 200) attention_weight.shape: (64, 100, 1)
Luong: context.shape: (64, 200) attention_weight: (64, 100, 1)


In [15]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, num_timesteps, decoder_dim, attention_type= 'Bahdanau', **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.decoder_dim = decoder_dim
        if attention_type == 'Bahdanau':
            self.attention = BahdanauAttention(decoder_dim)
        else:
            self.attention = LuongAttention(decoder_dim)
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=num_timesteps)
        self.rnn = tf.keras.layers.GRU(decoder_dim, return_sequences=False, return_state=True)
        self.Wc = tf.keras.layers.Dense(decoder_dim, activation="tanh")
        self.Ws = tf.keras.layers.Dense(vocab_size)

    def call(self, x, state, encoder_out):
        x = self.embedding(x)
        h, state = self.rnn(x, state)
        context, attention_weight = self.attention(h, encoder_out)
        h= tf.concat([h, context], axis=1)
        h = self.Wc(h)
        logits = self.Ws(h)
        return logits, state,attention_weight

In [16]:
embedding_dim = 256
encoder_dim, decoder_dim = 100, 100
encoder = Encoder(vocab_size_en+1, embedding_dim, maxlen_en, encoder_dim)
decoder = Decoder(vocab_size_fr+1, embedding_dim, maxlen_fr, decoder_dim, 'Bahdanau')

In [17]:
def loss_fn(ytrue, ypred):
    scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    mask = tf.math.logical_not(tf.math.equal(ytrue, 0))
    mask = tf.cast(mask, dtype=tf.int64)
    loss = scce(ytrue, ypred, sample_weight=mask)
    return loss

In [18]:
optimizer = tf.keras.optimizers.Adam()

@tf.function
def train_step(encoder_in, decoder_in, decoder_out, encoder_state):
    with tf.GradientTape() as tape:
        encoder_out, encoder_state = encoder(encoder_in, encoder_state)
        decoder_state = encoder_state
        loss = 0
        for t in range(decoder_out.shape[1]):
            decoder_in_t = decoder_in[:, t]  # the t-th word of sentences in current batch
            decoder_in_t = tf.reshape(decoder_in_t, [-1,1])
            decoder_pred_t, decoder_state, _ = decoder(decoder_in_t, decoder_state, encoder_out)
            loss += loss_fn(decoder_out[:, t], decoder_pred_t)
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return loss / decoder_out.shape[1]

In [19]:
def predict(encoder, decoder, batch_size, sents_en, data_en, sents_fr_out, word2idx_fr, idx2word_fr):
    random_id = np.random.choice(len(sents_en))
    print("input : ", " ".join(sents_en[random_id]))
    print("label : ", " ".join(sents_fr_out[random_id]))
    encoder_in = tf.expand_dims(data_en[random_id], axis=0)
    decoder_out = tf.expand_dims(sents_fr_out[random_id], axis=0)
    encoder_state = encoder.init_state(1)
    encoder_out, encoder_state = encoder(encoder_in, encoder_state)
    decoder_state = encoder_state
    decoder_in = tf.expand_dims(tf.constant([word2idx_fr["BOS"]]), axis=0)
    pred_sent_fr = []
    decoding_step = 0
    while decoding_step < maxlen_fr:
        decoder_pred, decoder_state,_ = decoder(decoder_in, decoder_state, encoder_out)
        decoder_pred = tf.argmax(decoder_pred, axis=-1)
        pred_word = idx2word_fr[decoder_pred.numpy()[0]]
        pred_sent_fr.append(pred_word)
        if pred_word == "EOS":
            break
        decoder_in = tf.reshape(decoder_pred, [1,-1])
        decoding_step += 1
    print("predicted: ", " ".join(pred_sent_fr))

In [20]:
def train_all():
    num_epochs = 25
    for e in range(num_epochs):
        encoder_state = encoder.init_state(batch_size)
        for batch, data in enumerate(train_dataset):
            encoder_in, decoder_in, decoder_out = data
            # print(encoder_in.shape, decoder_in.shape, decoder_out.shape)
            loss = train_step(encoder_in, decoder_in, decoder_out, encoder_state)
            # print("Batch {}: loss = {}".format(batch, loss))
        print("Epoch: {}, Loss: {:.4f}".format(e + 1, loss.numpy()))
        predict(encoder, decoder, batch_size, sents_en, data_en, sents_fr_out, word2idx_fr, idx2word_fr)

train_all()

Epoch: 1, Loss: 3.1847
input :  call me .
label :  appelez moi ! EOS
predicted:  EOS
Epoch: 2, Loss: 2.1612
input :  let s see .
label :  voyons voir ! EOS
predicted:  EOS
Epoch: 3, Loss: 2.0937
input :  it stinks .
label :  ca pue . EOS
predicted:  EOS
Epoch: 4, Loss: 1.9888
input :  open up .
label :  ouvre moi ! EOS
predicted:  je . EOS
Epoch: 5, Loss: 1.9875
input :  i m right .
label :  j ai raison . EOS
predicted:  je je EOS
Epoch: 6, Loss: 1.7733
input :  they fell .
label :  ils sont tombes . EOS
predicted:  je je . EOS
Epoch: 7, Loss: 1.6487
input :  see you !
label :  a la prochaine ! EOS
predicted:  je je . EOS
Epoch: 8, Loss: 1.8946
input :  i retired .
label :  j ai pris ma retraite . EOS
predicted:  je suis je . EOS
Epoch: 9, Loss: 1.7535
input :  it was ok .
label :  c etait ok . EOS
predicted:  je suis je . EOS
Epoch: 10, Loss: 1.5737
input :  i m home .
label :  je suis chez moi . EOS
predicted:  je suis je . EOS
Epoch: 11, Loss: 1.5606
input :  i phoned .
label :  j a

In [None]:
embedding_dim = 256
encoder_dim, decoder_dim = 100, 100
encoder = Encoder(vocab_size_en+1, embedding_dim, maxlen_en, encoder_dim)
decoder = Decoder(vocab_size_fr+1, embedding_dim, maxlen_fr, decoder_dim, 'Luong')
optimizer = tf.keras.optimizers.Adam()

@tf.function
def train_step(encoder_in, decoder_in, decoder_out, encoder_state):
    with tf.GradientTape() as tape:
        encoder_out, encoder_state = encoder(encoder_in, encoder_state)
        decoder_state = encoder_state
        loss = 0
        for t in range(decoder_out.shape[1]):
            decoder_in_t = decoder_in[:, t]  # the t-th word of sentences in current batch
            decoder_in_t = tf.reshape(decoder_in_t, [-1,1])
            decoder_pred_t, decoder_state, _ = decoder(decoder_in_t, decoder_state, encoder_out)
            loss += loss_fn(decoder_out[:, t], decoder_pred_t)
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return loss / decoder_out.shape[1]

train_all()

Epoch: 1, Loss: 2.9254
input :  i m right .
label :  j ai raison . EOS
predicted:  EOS
Epoch: 2, Loss: 2.2139
input :  back off .
label :  recule ! EOS
predicted:  EOS
Epoch: 3, Loss: 1.9798
input :  i m well .
label :  je me porte bien . EOS
predicted:  EOS
Epoch: 4, Loss: 2.0428
input :  jump .
label :  saute . EOS
predicted:  EOS
Epoch: 5, Loss: 2.0269
input :  it s hers .
label :  c est le sien . EOS
predicted:  EOS
Epoch: 6, Loss: 1.9922
input :  it poured .
label :  il pleuvait a verse . EOS
predicted:  . EOS
Epoch: 7, Loss: 1.9475
input :  don t die .
label :  ne mourez pas ! EOS
predicted:  je je EOS
Epoch: 8, Loss: 1.6496
input :  i m angry .
label :  je suis enerve . EOS
predicted:  je . EOS
Epoch: 9, Loss: 1.7760
input :  i relaxed .
label :  je me suis detendu . EOS
predicted:  je je . EOS
Epoch: 10, Loss: 1.7943
input :  he s sexy .
label :  il est sexy . EOS
predicted:  je je . EOS
Epoch: 11, Loss: 1.6851
input :  take care !
label :  soyez prudente ! EOS
predicted:  je s