<a href="https://colab.research.google.com/github/UKJaagadhep/Data-science-and-machine-learning/blob/main/Neural_Machine_Translation/English_to_French_Neural_Machine_Translation_using_Transformers(from_scratch).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [86]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Layer, Embedding, MultiHeadAttention, Dense, LayerNormalization, Input, Dropout, TextVectorization
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import LearningRateSchedule
from tensorflow.keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt

In [32]:
!wget https://www.manythings.org/anki/fra-eng.zip
!unzip "/content/fra-eng.zip" -d "/content/dataset/"

--2024-05-15 11:08:30--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7943074 (7.6M) [application/zip]
Saving to: ‘fra-eng.zip.1’


2024-05-15 11:08:31 (15.9 MB/s) - ‘fra-eng.zip.1’ saved [7943074/7943074]

Archive:  /content/fra-eng.zip
replace /content/dataset/_about.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

# **DATA PREPARATION**

In [33]:
# Stream the extracted corpus one line at a time; the lines are split on tabs by `selector`/`separator`.
text_dataset = tf.data.TextLineDataset('/content/dataset/fra.txt')

In [34]:
def selector(input_text):
  '''Split one tab-separated corpus line into model inputs and the training target.

  Returns a tuple (inputs, target):
    inputs['input_1'] : the English sentence (first field), fed to the encoder.
    inputs['input_2'] : 'starttoken ' + French sentence (second field), fed to the decoder.
    target            : French sentence + ' endtoken', i.e. the decoder input shifted by one token.
  '''
  split_text = tf.strings.split(input_text,'\t')
  #Slicing with [0:1] / [1:2] instead of [0] / [1] keeps a length-1 vector rather than a scalar
  english_sentence = split_text[0:1]
  french_sentence = split_text[1:2]
  return {'input_1' : english_sentence, 'input_2' : 'starttoken ' + french_sentence}, french_sentence + ' endtoken'

print(selector('Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)'))

({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Va !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ! endtoken'], dtype=object)>)


In [35]:
split_dataset = text_dataset.map(selector) #Split every line into model inputs and target (at inference time only input_1 is given by the user)

In [36]:
def separator(input_text):
  '''Return (English sentence, 'starttoken ' + French sentence + ' endtoken') for one corpus line.

  Both elements are length-1 string vectors because the fields are taken with slices.
  '''
  fields = tf.strings.split(input_text, '\t')
  english_sentence = fields[0:1]
  wrapped_french_sentence = 'starttoken ' + fields[1:2] + ' endtoken'
  return english_sentence, wrapped_french_sentence

In [37]:
init_dataset = text_dataset.map(separator)
#Split every line into English and French (French wrapped in starttoken/endtoken); used below to adapt the two vocabularies

In [38]:
vocabulary_size = 20000 # max_tokens of both TextVectorization layers; also the embedding/output vocabulary size
english_sequence_length = 64 # output_sequence_length of the English vectorizer
french_sequence_length = 64 # output_sequence_length of the French vectorizer
embedding_dimension = 300 # NOTE: reassigned to 512 in the TRANSFORMER section before the model is built
batch_size = 64

In [39]:
# Turns English text into integer token ids: lowercases, strips punctuation, keeps at most
# `vocabulary_size` tokens and emits sequences of length `english_sequence_length`.
english_vectorization_layer = TextVectorization(
    standardize = 'lower_and_strip_punctuation',
    max_tokens = vocabulary_size,
    output_sequence_length = english_sequence_length,
    output_mode = 'int'
)

In [40]:
# French counterpart of the English vectorizer, with the same settings; it is adapted on text
# wrapped in starttoken/endtoken so that both markers become vocabulary entries.
french_vectorization_layer = TextVectorization(
    standardize = 'lower_and_strip_punctuation',
    max_tokens = vocabulary_size,
    output_sequence_length = french_sequence_length,
    output_mode = 'int'
)

In [41]:
# Build the English vocabulary from the English half of every (english, french) pair.
english_training_data = init_dataset.map(lambda x, y : x)
english_vectorization_layer.adapt(english_training_data)

In [42]:
# Build the French vocabulary from the French half (which includes starttoken/endtoken).
french_training_data = init_dataset.map(lambda x, y : y)
french_vectorization_layer.adapt(french_training_data)

In [43]:
def vectorizer(inputs, output):
  '''Replace the raw strings of one (inputs, output) example with integer token ids.

  The English text goes through the English vectorizer; the decoder input and the
  target both go through the French vectorizer. Dictionary keys are kept unchanged.
  '''
  encoder_tokens = english_vectorization_layer(inputs['input_1'])
  decoder_tokens = french_vectorization_layer(inputs['input_2'])
  target_tokens = french_vectorization_layer(output)
  return {'input_1' : encoder_tokens, 'input_2' : decoder_tokens}, target_tokens

In [44]:
# Convert the string inputs/targets of every example to integer token ids.
dataset = split_dataset.map(vectorizer)

In [45]:
#Checking the vocabulary indices of starttoken and endtoken
print(french_vectorization_layer.get_vocabulary()[2])
print(french_vectorization_layer.get_vocabulary()[3])

starttoken
endtoken


In [46]:
# Shuffle with a fixed order. With the default `reshuffle_each_iteration = True` the order changes on
# every pass over the data, so the take()/skip() based train/val/test split below would hand
# different examples to each subset every epoch (validation/test examples leaking into training).
# unbatch() drops the length-1 leading axis of every example before re-batching.
dataset = dataset.shuffle(2048, reshuffle_each_iteration = False).unbatch().batch(batch_size).prefetch(buffer_size = tf.data.AUTOTUNE)

In [47]:
# Number of batches assumed for the split below.
# NOTE(review): 200000 looks like a hard-coded estimate of the number of sentence pairs — confirm against the actual corpus size.
num_batches = int(200000/batch_size)
print(num_batches)

3125


In [48]:
# Split by batch count: 88% train, 6.7% validation, remainder test.
# The validation share must be smaller than what is left after the training split; taking
# 0.67 * num_batches from the ~0.12 * num_batches remainder consumed all of it and left the test set empty.
train_batches = int(0.88 * num_batches)
val_batches = int(0.067 * num_batches)

train_dataset = dataset.take(train_batches)
temp_dataset = dataset.skip(train_batches)
val_dataset = temp_dataset.take(val_batches)
test_dataset = temp_dataset.skip(val_batches)

In [49]:
# Display the element spec of the batched dataset.
dataset

<_PrefetchDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

In [50]:
# Display the element spec of the training split.
train_dataset

<_TakeDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

In [51]:
# Display the element spec of the validation split.
val_dataset

<_TakeDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

In [52]:
# Display the element spec of the test split.
test_dataset

<_SkipDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

# **MODELLING**

## **POSITIONAL ENCODING**

In [53]:
def positional_encoding(d_model, sequence_length):
  '''Build the sinusoidal positional-encoding table.

  For position `pos` and feature index `i`:
    even i : sin(pos / 10000 ** (i / d_model))
    odd  i : cos(pos / 10000 ** ((i - 1) / d_model))

  Args:
    d_model: number of features per position.
    sequence_length: number of positions.

  Returns:
    A float32 tensor of shape [1, sequence_length, d_model].
  '''
  positions = np.arange(sequence_length)[:, np.newaxis] #shape = [sequence_length, 1]
  feature_index = np.arange(d_model)[np.newaxis, :] #shape = [1, d_model]
  #An odd index shares its exponent with the even index just before it
  paired_index = feature_index - (feature_index % 2)
  angles = positions / np.power(10000.0, paired_index / d_model) #shape = [sequence_length, d_model]
  table = np.where(feature_index % 2 == 0, np.sin(angles), np.cos(angles))
  return tf.cast(table[np.newaxis, ...], dtype = tf.float32) #shape = [1, sequence_length, d_model]

In [54]:
# Sanity check: the table has shape (1, sequence_length, d_model).
print(positional_encoding(256, 64).shape)

(1, 64, 256)


## **EMBEDDINGS**

In [55]:
class Embeddings(Layer):
  '''Token embedding plus fixed sinusoidal positional encoding.

  Args:
    sequence_length: length of the input token sequences.
    vocabulary_size: number of rows of the embedding table.
    embedding_dimension: size of every embedding vector.
    **kwargs: forwarded to the Keras `Layer` constructor (e.g. `name`).

  `compute_mask` marks every non-zero token id as a real (non-padding) position.
  '''
  def __init__(self, sequence_length, vocabulary_size, embedding_dimension, **kwargs):
    super(Embeddings, self).__init__(**kwargs)
    self.embedding = Embedding(vocabulary_size, embedding_dimension)
    self.vocabulary_size = vocabulary_size
    self.embedding_dimension = embedding_dimension
    self.sequence_length = sequence_length
    #The positional table depends only on the constructor arguments, so build it once instead of on every call
    self.embedded_positions = positional_encoding(embedding_dimension, sequence_length)

  def call(self, inputs):
    embedded_tokens = self.embedding(inputs)
    #The [1, sequence_length, embedding_dimension] table broadcasts over the batch axis
    return embedded_tokens + self.embedded_positions

  def compute_mask(self, inputs, mask = None):
    #True where the token id is not 0, False at padding positions, so that padding can be ignored downstream
    return tf.math.not_equal(inputs, 0)

  def get_config(self):
    #Expose the constructor arguments so the layer can be serialized and re-created
    config = super(Embeddings, self).get_config()
    config.update({
        'sequence_length' : self.sequence_length,
        'vocabulary_size' : self.vocabulary_size,
        'embedding_dimension' : self.embedding_dimension,
    })
    return config

In [56]:
# Smoke test: one sequence of 8 token ids whose last two entries are padding (0).
test_input=tf.constant([[2,4,7,21,3,5,0,0]])
emb=Embeddings(8,20000,512)
emb_out=emb(test_input)
print(emb_out.shape)

(1, 8, 512)


In [66]:
# Boolean padding mask of the test input: False where the token id is 0.
mask = emb.compute_mask(test_input)
print(mask)

tf.Tensor([[ True  True  True  True  True  True False False]], shape=(1, 8), dtype=bool)


In [58]:
# Convert True/False to 1/0 (rebinds `mask`).
mask = tf.cast(mask, dtype = tf.int32)
mask

<tf.Tensor: shape=(1, 8), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 0, 0]], dtype=int32)>

In [59]:
#Adding batch dimension
mask = mask[:, tf.newaxis, :]
print(mask)

tf.Tensor([[[1 1 1 1 1 1 0 0]]], shape=(1, 1, 8), dtype=int32)


In [60]:
#To repeat mask across the decoder sequence length so that each timestep has a mask
mask = tf.repeat(mask, 4, axis = 1)
mask

<tf.Tensor: shape=(1, 4, 8), dtype=int32, numpy=
array([[[1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0]]], dtype=int32)>

## **TRANSFORMER ENCODER**

In [84]:
class TransformerEncoder(Layer):
  '''One encoder layer: self-attention and a feed-forward block, each followed by add & layer-norm.

  Args:
    embedding_dimension: feature size of the inputs and outputs; also used as `key_dim`.
    inner_dense_dimension: width of the hidden feed-forward layer.
    num_heads: number of attention heads.
  '''
  def __init__(self, embedding_dimension, inner_dense_dimension, num_heads):
    super(TransformerEncoder, self).__init__()

    #num_heads is the number of attention heads in the multi head attention
    self.mh_attention = MultiHeadAttention(num_heads = num_heads, key_dim = embedding_dimension)

    self.norm1 = LayerNormalization()
    self.norm2 = LayerNormalization()

    self.feed_forward = Sequential([
        Dense(inner_dense_dimension, activation = 'relu'),
        Dense(embedding_dimension)
    ])

  def call(self, inputs, mask = None):
    #Without a mask, attend to every position (previously `padding_mask` was left undefined in that case)
    padding_mask = None
    if mask is not None: #mask shape = [batch_size, sequence_length]
      mask = tf.cast(mask, dtype = tf.int32) #Converting from True and False to 1 and 0
      mask = mask[:, tf.newaxis, :] #shape = [batch_size, 1, sequence_length]
      sequence_length = mask.shape[2]
      if sequence_length is None: #Sequence length unknown at graph-construction time
        sequence_length = tf.shape(mask)[2]
      #shape = [batch_size, sequence_length, sequence_length], the shape of the attention weight matrix
      padding_mask = tf.repeat(mask, sequence_length, axis = 1)

    attention_output = self.mh_attention(key = inputs, query = inputs, value = inputs, attention_mask = padding_mask)

    add_norm_and_feed_forward_input = self.norm1(inputs + attention_output)

    feed_forward_output = self.feed_forward(add_norm_and_feed_forward_input)

    return self.norm2(add_norm_and_feed_forward_input + feed_forward_output)

In [62]:
# Smoke test: run the embedded test sequence through one encoder layer and print the output shape.
encoder = TransformerEncoder(512, 2048, 8)
encoder_outputs = encoder(emb_out)
print(encoder_outputs.shape)

(1, 8, 512)


## **TRANSFORMER DECODER**

In [63]:
# All-ones [1, 8, 8] matrix: the starting point of the causal mask shown in the next cell.
print(tf.ones([1, 8, 8], dtype = tf.int32))

tf.Tensor(
[[[1 1 1 1 1 1 1 1]
  [1 1 1 1 1 1 1 1]
  [1 1 1 1 1 1 1 1]
  [1 1 1 1 1 1 1 1]
  [1 1 1 1 1 1 1 1]
  [1 1 1 1 1 1 1 1]
  [1 1 1 1 1 1 1 1]
  [1 1 1 1 1 1 1 1]]], shape=(1, 8, 8), dtype=int32)


In [64]:
# band_part(x, -1, 0) keeps the lower triangle (diagonal included) and zeroes everything above it,
# so row t only has ones in columns 0..t.
print(tf.linalg.band_part(
        tf.ones([1, 8, 8],dtype = tf.int32), -1, 0))

tf.Tensor(
[[[1 0 0 0 0 0 0 0]
  [1 1 0 0 0 0 0 0]
  [1 1 1 0 0 0 0 0]
  [1 1 1 1 0 0 0 0]
  [1 1 1 1 1 0 0 0]
  [1 1 1 1 1 1 0 0]
  [1 1 1 1 1 1 1 0]
  [1 1 1 1 1 1 1 1]]], shape=(1, 8, 8), dtype=int32)


In [65]:
class TransformerDecoder(Layer):
  '''One decoder layer: masked self-attention, cross-attention over the encoder outputs and a
  feed-forward block, each followed by add & layer-norm.

  Args:
    embedding_dimension: feature size of the inputs and outputs; also used as `key_dim`.
    inner_dense_dimension: width of the hidden feed-forward layer.
    num_heads: number of attention heads.
  '''
  def __init__(self, embedding_dimension, inner_dense_dimension, num_heads):
    super(TransformerDecoder, self).__init__()

    self.mh_attention = MultiHeadAttention(num_heads = num_heads, key_dim = embedding_dimension)
    self.masked_mh_attention = MultiHeadAttention(num_heads = num_heads, key_dim = embedding_dimension)

    self.norm1 = LayerNormalization()
    self.norm2 = LayerNormalization()
    self.norm3 = LayerNormalization()

    self.feed_forward = Sequential([
        Dense(inner_dense_dimension, activation = 'relu'),
        Dense(embedding_dimension)
    ])

  def call(self, inputs, encoder_outputs, encoder_mask, mask = None):
    '''Apply the decoder layer.

    Args:
      inputs: embedded decoder tokens, [batch_size, target_length, embedding_dimension].
      encoder_outputs: encoder output sequence attended to by the cross-attention.
      encoder_mask: padding mask of the encoder tokens, [batch_size, source_length], or None.
      mask: padding mask of the decoder tokens, [batch_size, target_length], or None.
    '''
    #Number of decoder timesteps; prefer the static value and fall back to the dynamic one
    target_length = inputs.shape[1]
    if target_length is None:
      target_length = tf.shape(inputs)[1]

    #Causal mask (lower triangular matrix): timestep t may only attend to timesteps <= t.
    #It is always applied, whether or not a padding mask is available.
    combined_mask = tf.linalg.band_part(
        tf.ones([tf.shape(inputs)[0], #batch_size
                 tf.shape(inputs)[1], #sequence_length
                 tf.shape(inputs)[1]], dtype = tf.int32),
        -1, 0
    )
    if mask is not None:
      #Padding mask repeated for every query timestep: [batch_size, target_length, target_length]
      padding_mask = tf.cast(mask, dtype = tf.int32)
      padding_mask = padding_mask[:, tf.newaxis, :]
      padding_mask = tf.repeat(padding_mask, target_length, axis = 1)
      #A position is visible only if both the causal and the padding mask allow it
      combined_mask = tf.minimum(combined_mask, padding_mask)

    #Cross-attention mask: hide padded encoder positions from every decoder timestep
    cross_attn_mask = None
    if encoder_mask is not None:
      enc_mask = tf.cast(encoder_mask[:, tf.newaxis, :], dtype = "int32")
      cross_attn_mask = tf.repeat(enc_mask, target_length, axis = 1) #[batch_size, target_length, source_length]

    masked_attention_output = self.masked_mh_attention(key = inputs, query = inputs, value = inputs, attention_mask = combined_mask)
    add_norm_1 = self.norm1(inputs + masked_attention_output)
    attention_2_output, scores = self.mh_attention(query = add_norm_1, key = encoder_outputs, value = encoder_outputs, attention_mask = cross_attn_mask, return_attention_scores = True)
    add_norm_2 = self.norm2(add_norm_1 + attention_2_output)
    feed_forward_output = self.feed_forward(add_norm_2)
    return self.norm3(add_norm_2 + feed_forward_output) #, scores #Output scores to visualize attention

In [67]:
# Smoke test for one decoder layer.
# Recompute the (batch, sequence_length) padding mask here instead of reusing `mask`: the demo cells
# above rebind `mask` to a (1, 4, 8) integer tensor, which is not a valid encoder mask on a top-to-bottom run.
enc_mask = emb.compute_mask(test_input)
decoder_outputs = TransformerDecoder(512, 2048, 4)(emb_out, encoder_outputs, enc_mask)
print(decoder_outputs.shape)

(1, 8, 512)


## **TRANSFORMER**

In [68]:
embedding_dimension = 512 # replaces the value of 300 set in the data-preparation section
feed_forward_inner_dimension = 2048 # hidden width of the feed-forward blocks
num_heads = 8
num_layers = 1 # number of stacked encoder layers and of stacked decoder layers
num_epochs = 10 # NOTE(review): not used by the fit() call in this notebook, which passes epochs = 2

In [85]:
# ----- Encoder -----
# The input names match the dictionary keys produced by `selector`/`vectorizer`.
encoder_inputs = Input(shape = (english_sequence_length,), dtype = "int64", name = "input_1")
embedding = Embeddings(english_sequence_length, vocabulary_size, embedding_dimension)
x = embedding(encoder_inputs)
# Padding mask of the English tokens, passed explicitly to every decoder layer below.
encoder_mask = embedding.compute_mask(encoder_inputs)

for i in range(num_layers):
  x = TransformerEncoder(embedding_dimension, feed_forward_inner_dimension, num_heads)(x)
encoder_outputs = x

# ----- Decoder -----
decoder_inputs = Input(shape = (french_sequence_length,), dtype = "int64", name = "input_2")
x = Embeddings(french_sequence_length, vocabulary_size, embedding_dimension)(decoder_inputs)
for j in range(num_layers):
  x = TransformerDecoder(embedding_dimension, feed_forward_inner_dimension, num_heads)(x, encoder_outputs, encoder_mask)

x = Dropout(0.5)(x)

# Softmax over the vocabulary at every decoder position.
decoder_outputs = Dense(vocabulary_size, activation = "softmax")(x)

Transformer = Model([encoder_inputs, decoder_inputs], decoder_outputs, name="transformer")

Transformer.summary()


Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 64)]                 0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 64)]                 0         []                            
                                                                                                  
 embeddings_20 (Embeddings)  (None, 64, 512)              1024000   ['input_1[0][0]']             
                                                          0                                       
                                                                                                  
 embeddings_21 (Embeddings)  (None, 64, 512)              1024000   ['input_2[0][0]']   

## **Custom Self Attention**

In [None]:
class CustomSelfAttention(Layer):
  '''Scaled dot-product attention with a 0/1 mask.

  `model_size` is the value whose square root divides the raw scores.
  '''
  def __init__(self, model_size):
    super(CustomSelfAttention, self).__init__()
    self.model_size = model_size

  def call(self, query, key, value, masking):
    #Raw scores (query . key^T) divided by sqrt(model_size)
    scale = tf.math.sqrt(tf.cast(self.model_size, tf.float32))
    logits = tf.matmul(query, key, transpose_b = True) / scale

    #(1 - mask) is 1 exactly where the mask is 0; adding -1e10 there drives the softmax of those entries to ~0
    keep = tf.cast(masking, dtype = tf.float32)
    logits = logits + (1. - keep) * -1e10

    #Multiply by the mask again: in a fully masked row every entry is equally negative, so softmax
    #would spread the weight uniformly instead of giving zeros
    weights = tf.nn.softmax(logits, axis = -1) * keep

    #Weighted sum of the values
    return tf.matmul(weights, value)

## **Custom Multi Head Attention**

In [None]:
class CustomMultiHeadAttention(Layer):
  '''Multi-head attention built from `num_heads` independent CustomSelfAttention heads.

  Args:
    num_heads: number of heads.
    key_dim: total model width; every head projects query/key/value to `key_dim // num_heads`
      features and the concatenated heads are projected back to `key_dim`.
  '''
  def __init__(self, num_heads, key_dim):
    super(CustomMultiHeadAttention, self).__init__()

    self.num_heads = num_heads
    head_dim = key_dim // num_heads
    self.dense_q = [Dense(head_dim) for _ in range(num_heads)]
    self.dense_k = [Dense(head_dim) for _ in range(num_heads)]
    self.dense_v = [Dense(head_dim) for _ in range(num_heads)]
    self.dense_o = Dense(key_dim)
    #The dot products inside a head run over `head_dim` features, so the scores are scaled by
    #sqrt(head_dim) rather than sqrt(key_dim)
    self.self_attention = CustomSelfAttention(head_dim)

  def call(self, query, key, value, attention_mask):
    heads = []

    for i in range(self.num_heads):
      head = self.self_attention(self.dense_q[i](query), self.dense_k[i](key),
                              self.dense_v[i](value), attention_mask)
      heads.append(head)
    heads = tf.concat(heads, axis = 2) #Concatenate the heads along the feature axis
    heads = self.dense_o(heads)
    return heads

# **TRAINING**

In [None]:
class BLEU(tf.keras.metrics.Metric):
    '''Average per-sentence unigram precision of the predicted tokens (a simplified BLEU).

    For every sentence, each predicted token (up to the first padding id 0) that also occurs in
    the reference counts as a match; a reference token can be matched only once. The sentence
    score is matches / number of non-zero predicted tokens.

    The implementation uses Python control flow over tensor values, so the model must be
    compiled with `run_eagerly = True` for this metric to work.
    '''
    def __init__(self,name='bleu_score', **kwargs):
        super(BLEU,self).__init__(name=name, **kwargs)
        self.bleu_score=0 #Sum of the sentence scores since the last reset
        self.num_sentences=0 #Number of sentences scored since the last reset

    def update_state(self,y_true,y_pred,sample_weight=None):
        y_pred=tf.argmax(y_pred,-1)
        for i,j in zip(y_pred,y_true):
            self.num_sentences+=1
            total_words=tf.math.count_nonzero(i)
            if total_words==0:
                #Nothing predicted: contributes a score of 0 instead of dividing by zero
                continue
            total_matches=0
            for word in i:
                if word==0:
                    break
                for q in range(len(j)):
                    if j[q]==0:
                        break
                    if word==j[q]:
                        total_matches+=1
                        #Remove the matched reference token so it cannot be matched twice
                        j=tf.boolean_mask(j,[False if y==q else True for y in range(len(j))])
                        break

            self.bleu_score+=total_matches/total_words

    def result(self):
        #Average over the sentences actually seen (the last batch may be smaller than the others)
        if self.num_sentences==0:
            return 0.0
        return self.bleu_score/self.num_sentences

    def reset_state(self):
        self.bleu_score=0
        self.num_sentences=0

In [89]:
class Scheduler(LearningRateSchedule):
  '''Warm-up learning-rate schedule:

    lr(step) = d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

  The rate grows linearly for `warmup_steps` steps and then decays with the inverse
  square root of the step.
  '''
  def __init__(self, d_model, warmup_steps):
    super(Scheduler, self).__init__()
    #Keep the raw arguments so the schedule can be serialized by get_config()
    self._d_model = d_model
    self._warmup_steps = warmup_steps
    self.d_model = tf.cast(d_model, tf.float64)
    self.warmup_steps = tf.cast(warmup_steps, dtype = tf.float64)

  def __call__(self, step):
    step = tf.cast(step, dtype = tf.float64) #Since we raise to fractional powers like -0.5 and -1.5
    return (self.d_model ** (-0.5)) * tf.math.minimum(step ** (-0.5), step * (self.warmup_steps ** -1.5))

  def get_config(self):
    #Needed to save an optimizer that uses this schedule; the base class raises NotImplementedError
    return {'d_model' : self._d_model, 'warmup_steps' : self._warmup_steps}

In [90]:
# Learning-rate schedule instance: warm up for 4000 steps, scaled by the model width.
warmup_steps = 4000
lr_scheduled = Scheduler(embedding_dimension, warmup_steps)

In [92]:
# The model ends in a softmax, so the loss takes probabilities (the default from_logits = False).
# 'accuracy' is tracked because the plotting cell below reads history.history['accuracy'] / ['val_accuracy'].
# To also track the custom BLEU metric, add BLEU() to `metrics` and pass run_eagerly = True.
Transformer.compile(
    loss = SparseCategoricalCrossentropy(),
    optimizer = Adam(lr_scheduled, beta_1 = 0.9, beta_2 = 0.98, epsilon = 1e-9),
    metrics = ['accuracy'])

In [93]:
# Keep the weights of the epoch with the lowest validation loss.
# Weights-only checkpoints avoid having to serialize the custom layers and the custom
# learning-rate schedule into the HDF5 file.
checkpoint_filepath = '/content/transformers.h5'
model_checkpoint_callback = ModelCheckpoint(
    filepath = checkpoint_filepath,
    monitor = 'val_loss',
    mode = 'min',
    save_best_only = True,
    save_weights_only = True)

In [None]:
# Train for 2 epochs, validating after each epoch and checkpointing through the callback.
history = Transformer.fit(
    train_dataset,
    validation_data = val_dataset,
    epochs = 2,
    callbacks=[model_checkpoint_callback])

Epoch 1/2
    395/Unknown - 10864s 27s/step - loss: 3.1833

In [None]:
# Save the final weights (after the last epoch) to a separate file.
Transformer.save_weights('/content/transformers_weights.h5')

In [None]:
# Training vs. validation loss per epoch.
fig, ax = plt.subplots()
ax.plot(history.history['loss'])
ax.plot(history.history['val_loss'])
ax.set_title('model_loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'val'], loc = 'upper left')
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'])
ax.plot(history.history['val_accuracy'])
ax.set_title('model_accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['train', 'val'], loc = 'upper left')
plt.show()

In [None]:
# Evaluate the trained model on the held-out test split.
Transformer.evaluate(test_dataset)

# **TESTING**

In [94]:
# French vocabulary lookup: token id -> word.
index_to_word = dict(enumerate(french_vectorization_layer.get_vocabulary()))

In [95]:
# French vocabulary lookup: word -> token id.
word_to_index = {word: index for index, word in enumerate(french_vectorization_layer.get_vocabulary())}

In [96]:
def translator(english_sentence):
  '''Greedily translate one English sentence into French.

  Starting from 'starttoken', the decoder input is extended by the most probable next
  word until 'endtoken' (or padding) is predicted or `french_sequence_length` words
  have been produced.

  Args:
    english_sentence: the sentence to translate, as a Python string.

  Returns:
    The translation as a space-separated string, without the start/end markers.
  '''
  start_token = 'starttoken'
  tokenized_english_sentence = english_vectorization_layer([english_sentence])
  shifted_target = start_token

  for i in range(french_sequence_length):
    tokenized_shifted_target = french_vectorization_layer([shifted_target])
    #Calling the model directly avoids the per-call overhead of predict() for a single small batch
    output = Transformer([tokenized_english_sentence, tokenized_shifted_target], training = False)
    #Position i holds the prediction for the word that follows the i + 1 tokens fed so far
    french_word_index = int(tf.argmax(output[0, i], axis = -1))
    current_word = index_to_word[french_word_index]
    #The padding token is the empty string: appending it would add no token and shift every later position
    if current_word == 'endtoken' or current_word == '':
      break
    shifted_target += ' ' + current_word
  #Drop the leading 'starttoken ' marker
  return shifted_target[len(start_token) + 1:]

In [None]:
# Example translation.
translator('Everyone should water his or her tomato plants')

## **VISUALIZING ATTENTION WEIGHTS**

In [None]:
def visualize(english_sentence):
  '''Return the attention weights for `english_sentence` paired with a fixed French target.

  The French decoder input is hard-coded to 'starttoken je lai fait très bien'.
  '''
  #Use the vectorization layers defined in the data-preparation section
  tokenized_english_sentence=english_vectorization_layer([english_sentence])
  shifted_target='starttoken je lai fait très bien'

  tokenized_shifted_target=french_vectorization_layer([shifted_target])
  #NOTE(review): `attention_score_model` is not defined in the visible part of this notebook; presumably
  #it is a model that also outputs the decoder attention scores — define it before running this cell
  attention_weights=attention_score_model.predict([tokenized_english_sentence,
                                                   tokenized_shifted_target])

  return attention_weights

out=visualize('I did it very well')


In [None]:
# Shape of the attention weights stored under 'decoder_layer1_block2' for the first sentence.
print(out['decoder_layer1_block2'][0].shape)

In [None]:
plt.figure(figsize = (12,12))

# One heat map per attention head (first 10 x 10 positions); the 2 x 4 grid holds up to 8 heads.
# `num_heads` is the notebook's hyperparameter; no `NUM_HEADS` constant is defined here.
for i in range(num_heads):
  ax = plt.subplot(2,4, i+1)

  plt.imshow(out['decoder_layer1_block2'][0][i][0:10,0:10])
  plt.title("Attention Scores for head:->"+str(i+1))