<a href="https://colab.research.google.com/github/UKJaagadhep/Data-science-and-machine-learning/blob/main/Neural_Machine_Translation/Neural_Machine_translation_(English_to_French)_with_Bahdanau_Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Input, GRU, Embedding, Bidirectional, Dropout, Dense, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt

In [24]:
# Download the English-French sentence pairs from manythings.org and extract them.
# -nc skips the download when fra-eng.zip already exists, and -o overwrites
# previously extracted files without prompting, so this cell can be re-run safely.
!wget -nc https://www.manythings.org/anki/fra-eng.zip
!unzip -o "/content/fra-eng.zip" -d "/content/dataset/"

--2024-05-11 22:20:09--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7943074 (7.6M) [application/zip]
Saving to: ‘fra-eng.zip’


2024-05-11 22:20:11 (6.38 MB/s) - ‘fra-eng.zip’ saved [7943074/7943074]

Archive:  /content/fra-eng.zip
  inflating: /content/dataset/_about.txt  
  inflating: /content/dataset/fra.txt  


# **Data Preparation**

In [25]:
# Read the extracted file line by line; each line is one tab-separated record.
text_dataset = tf.data.TextLineDataset('/content/dataset/fra.txt')

In [26]:
def selector(input_text):
  """Split one tab-separated line into decoder-training inputs and target.

  Args:
    input_text: scalar string whose first two tab-separated fields are the
      English sentence and its French translation.

  Returns:
    A tuple `(inputs, target)`. `inputs` is a dict with 'input_1' (the English
    sentence) and 'input_2' ('starttoken ' + French sentence, i.e. the shifted
    sequence that is fed into the French decoder). `target` is the French
    sentence + ' endtoken', i.e. what the decoder is expected to output.
  """
  split_text = tf.strings.split(input_text, '\t')
  # Slice with [i:i+1] instead of indexing with [i] so that each value stays a
  # length-1 vector rather than collapsing to a scalar.
  english = split_text[0:1]
  french = split_text[1:2]
  return {'input_1' : english, 'input_2' : 'starttoken ' + french}, french + ' endtoken'
# Sanity check: run selector on one sample tab-separated line.
print(selector('Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)'))

({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Va !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ! endtoken'], dtype=object)>)


In [27]:
# Split every line into model inputs and target output
# (at inference time only input_1 is given by the user).
split_dataset = text_dataset.map(selector)

In [28]:
def separator(input_text):
  """Return the English field and the French field wrapped in start/end tokens."""
  fields = tf.strings.split(input_text, '\t')
  english_part = fields[0:1]
  french_part = fields[1:2]
  wrapped_french = 'starttoken ' + french_part + ' endtoken'
  return english_part, wrapped_french

In [29]:
# Split the dataset into English and French text (French wrapped in starttoken
# and endtoken) so that a vocabulary can be built for each of the two languages.
init_dataset = text_dataset.map(separator)

In [30]:
# Shared hyperparameters for text vectorization and batching.
vocabulary_size = 20000  # max_tokens of both TextVectorization layers
english_sequence_length = 64  # output_sequence_length of the English vectorizer
french_sequence_length = 64  # output_sequence_length of the French vectorizer
embedding_dimension = 300  # NOTE(review): reassigned to 256 before the models are built
batch_size = 64

In [31]:
# English side: lowercase, strip punctuation and map each sentence to a
# fixed-length sequence of integer token ids.
english_vectorization_layer = TextVectorization(
    max_tokens = vocabulary_size,
    standardize = 'lower_and_strip_punctuation',
    output_mode = 'int',
    output_sequence_length = english_sequence_length,
)

In [32]:
# French side: same standardization and integer output as the English layer,
# with its own vocabulary and sequence length.
french_vectorization_layer = TextVectorization(
    max_tokens = vocabulary_size,
    standardize = 'lower_and_strip_punctuation',
    output_mode = 'int',
    output_sequence_length = french_sequence_length,
)

In [33]:
# Build the English vocabulary from the English half of every sentence pair.
english_training_data = init_dataset.map(lambda english, french : english)
english_vectorization_layer.adapt(english_training_data)

In [34]:
# Build the French vocabulary from the French half of every sentence pair
# (including the starttoken / endtoken markers).
french_training_data = init_dataset.map(lambda english, french : french)
french_vectorization_layer.adapt(french_training_data)

In [35]:
def vectorizer(inputs, output):
  """Convert the raw strings of one example into integer token sequences.

  'input_1' goes through the English vectorizer; 'input_2' and the target
  `output` go through the French vectorizer.
  """
  encoder_tokens = english_vectorization_layer(inputs['input_1'])
  decoder_tokens = french_vectorization_layer(inputs['input_2'])
  target_tokens = french_vectorization_layer(output)
  return {'input_1' : encoder_tokens, 'input_2' : decoder_tokens}, target_tokens

In [36]:
# Turn every (inputs, target) string example into integer token sequences.
dataset = split_dataset.map(vectorizer)

In [37]:
# Checking the vocabulary indices of starttoken and endtoken (entries 2 and 3)
print(*french_vectorization_layer.get_vocabulary()[2:4], sep = '\n')

starttoken
endtoken


In [38]:
# Shuffle once and keep that order on every pass over the data. With the
# default reshuffle_each_iteration=True the order changes on each iteration, so
# the take()/skip() based train/val/test splits built from this dataset would
# exchange examples between epochs.
# unbatch() drops the leading length-1 axis of each example before re-batching.
dataset = (
    dataset
    .shuffle(2048, reshuffle_each_iteration = False)
    .unbatch()
    .batch(batch_size)
    .prefetch(buffer_size = tf.data.AUTOTUNE)
)

In [39]:
# Number of batches used to size the splits.
# NOTE(review): 200000 is a hard-coded estimate of the number of sentence
# pairs - confirm it against the actual size of the downloaded file.
num_batches = 200000 // batch_size
print(num_batches)

3125


In [40]:
# The first 88% of the batches are used for training; the remaining batches are
# divided between validation (67% of the hold-out) and test (the rest).
# The validation size has to be derived from the hold-out size, not from
# num_batches: taking int(0.67 * num_batches) batches would consume the whole
# hold-out and leave test_dataset empty.
num_train_batches = int(0.88 * num_batches)
num_val_batches = int(0.67 * (num_batches - num_train_batches))

train_dataset = dataset.take(num_train_batches)
temp_dataset = dataset.skip(num_train_batches)
val_dataset = temp_dataset.take(num_val_batches)
test_dataset = temp_dataset.skip(num_val_batches)

In [41]:
# Display the element spec of the full batched dataset.
dataset

<_PrefetchDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

In [42]:
# Display the element spec of the training split.
train_dataset

<_TakeDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

In [43]:
# Display the element spec of the validation split.
val_dataset

<_TakeDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

In [44]:
# Display the element spec of the test split.
test_dataset

<_SkipDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

# **Modelling**

In [45]:
class Encoder(tf.keras.Model):
  """Embeds source token ids and runs them through an LSTM.

  The LSTM returns its output at every time step, so the result holds one
  vector of size `units` per input position.
  """

  def __init__(self, vocabulary_size, embedding_dimension, units):
    super().__init__()
    self.vocabulary_size = vocabulary_size
    self.embedding_dimension = embedding_dimension
    self.units = units

  def build(self, input_shape):
    # The sub-layers are created when the model is built.
    self.embedding = Embedding(self.vocabulary_size, self.embedding_dimension)
    self.lstm = LSTM(self.units, return_sequences = True)

  def call(self, x):
    embedded_tokens = self.embedding(x)
    return self.lstm(embedded_tokens)

In [46]:
# Model sizes used from here on (this replaces the earlier embedding_dimension value).
embedding_dimension = 256
hidden_units = 256

# Smoke test: run the encoder on a dummy batch and print the output shape.
encoder = Encoder(vocabulary_size, embedding_dimension, hidden_units)
encoder_output = encoder(tf.zeros([128, 64])) #128 here is batch_size and 64 is sequence_length
print(encoder_output.shape)

(128, 64, 256)


In [47]:
# List the encoder's layers and parameter counts.
encoder.summary()

Model: "encoder_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     multiple                  5120000   
                                                                 
 lstm_2 (LSTM)               multiple                  525312    
                                                                 
Total params: 5645312 (21.54 MB)
Trainable params: 5645312 (21.54 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [48]:
#Additive attention; refer to the formulas in the research paper for a clearer understanding
class BahdanauAttention(tf.keras.layers.Layer):
  """Scores every encoder state against the previous decoder state.

  call() returns the attention-weighted sum of the encoder states (the context
  vector) together with the attention weights themselves.
  """

  def __init__(self, units):
    super().__init__()
    self.units = units

  def build(self, input_shape):
    self.w_1 = Dense(self.units)  # projects the previous decoder state
    self.w_2 = Dense(self.units)  # projects the encoder states
    self.w = Dense(1)             # reduces each position to a single score

  def call(self, prev_dec_state, enc_states):
    # Insert a length-1 sequence axis so the 2D decoder state broadcasts
    # against every encoder position.
    projected_state = self.w_1(tf.expand_dims(prev_dec_state, -2))
    projected_encoder = self.w_2(enc_states)
    scores = self.w(tf.nn.tanh(projected_state + projected_encoder)) #shape = [batch_size, sequence_length, 1]

    # Normalise the scores over the sequence axis.
    attention_weights = tf.nn.softmax(scores, axis = 1)
    # Weighted sum of the encoder states over the sequence axis.
    weighted_states = attention_weights * enc_states #shape = [batch_size, sequence_length, encoder_state_size]
    context_vector = tf.reduce_sum(weighted_states, axis = 1) #shape = [batch_size, encoder_state_size]

    return context_vector, attention_weights

In [49]:
# Smoke test on dummy tensors: batch of 128, 8 encoder positions, state size 32.
bahdanau_attention = BahdanauAttention(256)
context_vector, attention_weights = bahdanau_attention(
    tf.zeros([128, 32]),
    tf.zeros([128, 8, 32]),
)
print(context_vector.shape)
print(attention_weights.shape)

(128, 32)
(128, 8, 1)


In [57]:
class Decoder(tf.keras.Model):
  """GRU decoder that attends over the encoder outputs at every time step.

  At step t the previous decoder state is scored against the encoder outputs,
  the resulting context vector is added to the embedding of the t-th shifted
  target token, and the sum is fed to the GRU, which continues from the
  previous decoder state.
  """

  def __init__(self, vocabulary_size, embedding_dimension, decoder_units, sequence_length):
    super().__init__()
    self.vocabulary_size = vocabulary_size
    self.embedding_dimension = embedding_dimension
    self.decoder_units = decoder_units
    self.sequence_length = sequence_length

  def build(self, input_shape):
    self.gru = GRU(self.decoder_units, return_sequences = True, return_state = True)
    #return_sequences = True to output french words, return_state = True to pass the
    #hidden state (S[i-1]) on to the attention block and to the next decoding step
    self.dense = Dense(self.vocabulary_size, activation = 'softmax')
    self.embedding = Embedding(self.vocabulary_size, self.embedding_dimension) #To embed shifted target
    self.attention = BahdanauAttention(self.decoder_units)

  def call(self, x, previous_hidden, shifted_target):
    """Decode `sequence_length` steps using the shifted target as input.

    Args:
      x: encoder outputs, shape [batch_size, source_length, encoder_units].
      previous_hidden: initial decoder state, shape [batch_size, decoder_units]
        or [1, decoder_units] (broadcast over the batch).
      shifted_target: target token ids beginning with the start token,
        shape [batch_size, sequence_length].

    Returns:
      outputs: softmax probabilities, shape
        [batch_size, sequence_length, vocabulary_size].
      attention_weights: the weights of the final decoding step, shape
        [batch_size, source_length, 1].
    """
    shifted_target = self.embedding(shifted_target)

    # The GRU needs one state row per example, so expand a [1, decoder_units]
    # initial state to the batch size of the encoder output.
    num_examples = tf.shape(x)[0]
    previous_hidden = tf.broadcast_to(previous_hidden, [num_examples, self.decoder_units])

    outputs = []
    for t in range(self.sequence_length):
      context_vector, attention_weights = self.attention(previous_hidden, x) #x is encoder output
      # The addition requires the encoder state size to equal embedding_dimension.
      decoder_input = context_vector + shifted_target[:, t] #block t
      # Continue from the previous state; without initial_state the GRU would
      # restart from zeros at every step and carry no memory between steps.
      output, previous_hidden = self.gru(tf.expand_dims(decoder_input, 1),
                                         initial_state = previous_hidden)
      outputs.append(output[:, 0]) #output shape = [batch_size, decoder_units]

    # Stack the per-step outputs along a new time axis:
    # [batch_size, sequence_length, decoder_units]
    outputs = tf.stack(outputs, axis = 1)

    outputs = self.dense(outputs)
    return outputs, attention_weights

In [58]:
# Smoke test: decode a dummy batch against the encoder output computed earlier
# and print the shapes of the predictions and attention weights.
decoder = Decoder(vocabulary_size, embedding_dimension, hidden_units, french_sequence_length)
outputs, attention_weights = decoder(encoder_output,tf.zeros([128, hidden_units]), tf.zeros([128,64]))
print(outputs.shape)
print(attention_weights.shape)

(128, 64, 20000)
(128, 64, 1)


In [59]:
### ENCODER
# `shape` must be a tuple: (english_sequence_length) without a trailing comma is
# just an int. The input is named encoder_input so the builtin input() is not shadowed.
# The layer names "input_1" / "input_2" match the keys of the dataset dictionaries.
encoder_input = Input(shape = (english_sequence_length,), dtype = "int64", name = "input_1")
encoder = Encoder(vocabulary_size, embedding_dimension, hidden_units)
encoder_output = encoder(encoder_input)

### DECODER
shifted_target = Input(shape = (french_sequence_length,), dtype = "int64", name = "input_2")
decoder = Decoder(vocabulary_size, embedding_dimension, hidden_units, french_sequence_length)
decoder_output, attention_weightss = decoder(encoder_output, tf.zeros([1, hidden_units]), shifted_target) #tf.zeros([1, hidden_units]) is initialized hidden state

### OUTPUT
bahdanau = Model([encoder_input, shifted_target], decoder_output)

bahdanau.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 64)]                 0         []                            
                                                                                                  
 encoder_5 (Encoder)         (None, 64, 256)              5645312   ['input_1[0][0]']             
                                                                                                  
 input_2 (InputLayer)        [(None, 64)]                 0         []                            
                                                                                                  
 decoder_6 (Decoder)         ((None, 64, 20000),          1078659   ['encoder_5[0][0]',           
                              (None, 64, 1))              3          'input_2[0][0]']       

In [72]:
class BLEU(tf.keras.metrics.Metric):
    """Position-wise token match rate over the non-padding predictions.

    NOTE: despite its name this is not the standard BLEU score (no n-grams,
    no clipping, no brevity penalty). It counts the positions where the
    predicted token id equals the reference token id, considering only the
    positions where the prediction is not the padding id 0, and divides by the
    number of such positions.
    """

    def __init__(self, name='bleu_score', **kwargs):
        super().__init__(name=name, **kwargs)
        self.total_matches = self.add_weight(name='total_matches', initializer='zeros')
        self.total_words = self.add_weight(name='total_words', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate matches and predicted words; `sample_weight` is ignored."""
        y_pred = tf.argmax(y_pred, axis=-1)
        # Compare ids in a common dtype regardless of how the labels arrive.
        y_true = tf.cast(y_true, y_pred.dtype)
        mask = tf.cast(tf.not_equal(y_pred, 0), tf.float32)

        # Compute total matches
        matches = tf.reduce_sum(tf.cast(tf.equal(y_pred, y_true), tf.float32) * mask)
        self.total_matches.assign_add(matches)

        # Compute total words
        words = tf.reduce_sum(mask)
        self.total_words.assign_add(words)

    def result(self):
        # Yields 0 instead of NaN while no non-padding token has been predicted.
        return tf.math.divide_no_nan(self.total_matches, self.total_words)


In [76]:
# The decoder's final Dense layer already applies softmax, so the loss is
# constructed with its default settings.
bahdanau.compile(
    optimizer = Adam(5e-4),
    loss = SparseCategoricalCrossentropy(),
    metrics = [BLEU()],
    # run_eagerly=True  (uncomment to debug step by step)
    )

In [77]:
# Keep only the best model seen so far, judged by the lowest validation loss.
# NOTE(review): Encoder and Decoder define no get_config(), so restoring the
# full model from this HDF5 file may not work - consider
# save_weights_only=True; verify before relying on the checkpoint.
checkpoint_filepath = '/content/bahdanau_attention.h5'
model_checkpoint_callback = ModelCheckpoint(
    checkpoint_filepath,
    save_best_only = True,
    monitor = 'val_loss',
    mode = 'min',
)

In [None]:
# Train for two epochs, validating after each epoch and letting the checkpoint
# callback save the model; `history` keeps the per-epoch loss and metric values.
history = bahdanau.fit(
    train_dataset,
    validation_data = val_dataset,
    epochs = 2,
    callbacks=[model_checkpoint_callback])

Epoch 1/2
    376/Unknown - 3293s 8s/step - loss: 1.1946 - bleu_score: 0.0000e+00

In [None]:
# Training vs. validation loss per epoch.
for curve in ('loss', 'val_loss'):
  plt.plot(history.history[curve])
plt.title('model_loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'val'], loc = 'upper left')
plt.show()

In [None]:
# The model is compiled with the custom metric named 'bleu_score', so the
# history holds 'bleu_score' / 'val_bleu_score'. There is no 'accuracy' entry,
# and looking it up would raise a KeyError.
plt.plot(history.history['bleu_score'])
plt.plot(history.history['val_bleu_score'])
plt.title('model_bleu_score')
plt.ylabel('bleu_score')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc = 'upper left')
plt.show()

In [None]:
# Evaluate the compiled loss and metric on the test split.
bahdanau.evaluate(test_dataset)

# **TESTING**

In [None]:
# Lookup table: French token id -> word.
index_to_word = dict(enumerate(french_vectorization_layer.get_vocabulary()))

In [None]:
# Reverse lookup table: French word -> token id.
word_to_index = {word : index for index, word in enumerate(french_vectorization_layer.get_vocabulary())}

In [None]:
def translator(english_sentence):
  """Greedily translate an English sentence into French.

  The decoder input starts as 'starttoken'. At step i the model is run on the
  words produced so far and the most probable token at position i is appended.
  Decoding stops at 'endtoken', at a padding prediction, or after
  french_sequence_length steps.

  Args:
    english_sentence: the English text to translate.

  Returns:
    The predicted French words joined by single spaces (without the start and
    end tokens); an empty string if nothing was produced.
  """
  tokenized_english_sentence = english_vectorization_layer([english_sentence])
  translated_words = []

  for i in range(french_sequence_length):
    shifted_target = ' '.join(['starttoken'] + translated_words)
    tokenized_shifted_target = french_vectorization_layer([shifted_target])
    # verbose = 0 keeps predict() from printing a progress bar on every step.
    output = bahdanau.predict([tokenized_english_sentence, tokenized_shifted_target], verbose = 0)
    french_word_index = int(tf.argmax(output, axis=-1)[0][i].numpy())
    current_word = index_to_word[french_word_index]
    # '' is the padding entry of the vocabulary. Appending it would leave the
    # decoder input unchanged while the read position i keeps advancing, so a
    # padding prediction is treated as the end of the sentence as well.
    if current_word in ('endtoken', ''):
      break
    translated_words.append(current_word)
  return ' '.join(translated_words)

In [None]:
# Try the trained model on a sample English sentence.
translator('Everyone should water his or her tomato plants')