<a href="https://colab.research.google.com/github/UKJaagadhep/Data-science-and-machine-learning/blob/main/Neural_Machine_Translation/English_to_French_translation_with_RNNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Input, GRU, Embedding, Bidirectional, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt

# **Data Downloading**

In [None]:
# Download the English-French sentence-pair archive and unpack it into /content/dataset/
!wget https://www.manythings.org/anki/fra-eng.zip
!unzip "/content/fra-eng.zip" -d "/content/dataset/"

--2024-05-09 14:27:05--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7943074 (7.6M) [application/zip]
Saving to: ‘fra-eng.zip’


2024-05-09 14:27:06 (19.9 MB/s) - ‘fra-eng.zip’ saved [7943074/7943074]

Archive:  /content/fra-eng.zip
  inflating: /content/dataset/_about.txt  
  inflating: /content/dataset/fra.txt  


# **Data Preparation**

In [None]:
# One dataset element per line of fra.txt; each line is a tab-separated record (English, French, attribution)
text_dataset = tf.data.TextLineDataset('/content/dataset/fra.txt')

In [None]:
# Peek at the first five raw lines of the file
for raw_line in text_dataset.take(5):
  print(raw_line)

tf.Tensor(b'Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)', shape=(), dtype=string)
tf.Tensor(b'Go.\tMarche.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)', shape=(), dtype=string)
tf.Tensor(b'Go.\tEn route !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)', shape=(), dtype=string)
tf.Tensor(b'Go.\tBouge !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #9022935 (Micsmithel)', shape=(), dtype=string)
tf.Tensor(b'Hi.\tSalut !\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)', shape=(), dtype=string)


In [None]:
# Average number of whitespace-separated tokens per raw line (the attribution column is included),
# estimated on the first 1000 lines. The sum is divided by the number of lines actually read.
len_count = 0
num_lines = 0
for i in text_dataset.take(1000):
  len_count += len(tf.strings.split(i, ""))
  num_lines += 1
print(len_count/max(num_lines, 1)) #max(..., 1) guards against an empty dataset

141.85


In [None]:
def selector(input_text):
  '''Split one raw line into the model inputs and the decoder target.

  Args:
    input_text: scalar string holding one tab-separated line (English, French, attribution).

  Returns:
    A tuple (inputs, output) where
      inputs['input_1'] is the English sentence (fed to the encoder),
      inputs['input_2'] is 'starttoken ' + French sentence (fed to the decoder),
      output is French sentence + ' endtoken' (what the decoder must predict).
    So the decoder input is the target shifted right by one token.
  '''
  split_text = tf.strings.split(input_text,'\t')
  #We slice with [0:1] / [1:2] instead of indexing with [0] / [1] to get a vector of shape (1,) and not a scalar
  english = split_text[0:1]
  french = split_text[1:2]
  return {'input_1' : english, 'input_2' : 'starttoken ' + french}, french + ' endtoken'
print(selector('Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)'))

({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Va !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ! endtoken'], dtype=object)>)


In [None]:
split_dataset = text_dataset.map(selector) #Split every line into model inputs and the target output (at inference time only input_1 is given by the user)

In [None]:
def separator(input_text):
  '''Return the English sentence and the French sentence wrapped in starttoken/endtoken for one raw line.

  Both values are sliced (not indexed) out of the tab-split line, so each is a string vector rather than a scalar.
  '''
  fields = tf.strings.split(input_text, '\t')
  english_sentence = fields[0:1]
  french_sentence = 'starttoken ' + fields[1:2] + ' endtoken'
  return english_sentence, french_sentence

In [None]:
init_dataset = text_dataset.map(separator)
#Split the dataset into English and French sentences (French wrapped in starttoken and endtoken) so that a vocabulary can be built for each language

In [None]:
# Sanity check of separator on the first line of the file
print(separator('Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)'))

(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Va ! endtoken'], dtype=object)>)


In [None]:
vocabulary_size = 20000 #max_tokens of both TextVectorization layers; also the input size of both Embedding layers and the width of the output Dense layer
english_sequence_length = 64 #English sentences are padded/truncated to this many token ids
french_sequence_length = 64 #French sentences are padded/truncated to this many token ids
embedding_dimension = 300 #size of each word embedding vector
batch_size = 64 #number of sentence pairs per batch

In [None]:
# Maps English text to a fixed-length sequence of integer token ids (lower-cased, punctuation stripped)
english_vectorization_layer = TextVectorization(
    standardize = 'lower_and_strip_punctuation',
    max_tokens = vocabulary_size,
    output_sequence_length = english_sequence_length,
    output_mode = 'int'
)

In [None]:
# Maps French text to a fixed-length sequence of integer token ids (lower-cased, punctuation stripped).
# Because punctuation is stripped, the special tokens are spelled as plain words: 'starttoken' and 'endtoken'.
french_vectorization_layer = TextVectorization(
    standardize = 'lower_and_strip_punctuation',
    max_tokens = vocabulary_size,
    output_sequence_length = french_sequence_length,
    output_mode = 'int'
)

In [None]:
# Build the English vocabulary from the English half of every sentence pair
english_training_data = init_dataset.map(lambda x, y : x)
english_vectorization_layer.adapt(english_training_data)

In [None]:
# Build the French vocabulary from the French half (including starttoken/endtoken) of every sentence pair
french_training_data = init_dataset.map(lambda x, y : y)
french_vectorization_layer.adapt(french_training_data)

In [None]:
# Number of tokens each layer actually learned (at most vocabulary_size)
print(len(english_vectorization_layer.get_vocabulary()))
print(len(french_vectorization_layer.get_vocabulary()))

16952
20000


In [None]:
def vectorizer(inputs, output):
  '''Convert the string inputs and target produced by selector into integer token id sequences.

  inputs['input_1'] goes through the English layer; inputs['input_2'] and output go through the French layer.
  '''
  encoder_ids = english_vectorization_layer(inputs['input_1'])
  decoder_ids = french_vectorization_layer(inputs['input_2'])
  target_ids = french_vectorization_layer(output)
  return {'input_1' : encoder_ids, 'input_2' : decoder_ids}, target_ids

In [None]:
# Turn every (inputs, output) pair of strings into integer token id sequences
dataset = split_dataset.map(vectorizer)

In [None]:
# Inspect the first four vectorized examples
for example in dataset.take(4):
  print(example)

({'input_1': <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[45,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])>, 'input_2': <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[  2, 104,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])>}, <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[104,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,

In [None]:
#Checking the vocabulary indices of starttoken and endtoken
print(french_vectorization_layer.get_vocabulary()[2])
print(french_vectorization_layer.get_vocabulary()[3])

starttoken
endtoken


In [None]:
# reshuffle_each_iteration = False keeps the shuffled order fixed across epochs. With the default (True) the
# take()/skip() based train/val/test split would select different examples on every pass over the data,
# leaking training examples into validation and test.
# unbatch() drops the leading dimension of 1 that every element carries before re-batching.
# NOTE(review): the file appears to be ordered from short to long sentences and a 2048-element buffer only
# shuffles locally - consider a larger buffer so that all splits see similar sentence lengths.
dataset = dataset.shuffle(2048, reshuffle_each_iteration = False).unbatch().batch(batch_size).prefetch(buffer_size = tf.data.AUTOTUNE)

In [None]:
# Approximate number of batches, assuming roughly 200000 sentence pairs in the file - TODO confirm against the actual line count
num_batches = int(200000/batch_size)
print(num_batches)

3125


In [None]:
# 88% of the batches are used for training; the remaining batches are split 67% / 33% into validation and test.
# The validation size has to be computed from the remainder: taking 0.67 * num_batches from it would
# consume every remaining batch and leave test_dataset empty.
num_train_batches = int(0.88 * num_batches)
num_val_batches = int(0.67 * (num_batches - num_train_batches))

train_dataset = dataset.take(num_train_batches)
temp_dataset = dataset.skip(num_train_batches)
val_dataset = temp_dataset.take(num_val_batches)
test_dataset = temp_dataset.skip(num_val_batches)

In [None]:
dataset

<_PrefetchDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

In [None]:
train_dataset

<_TakeDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

In [None]:
val_dataset

<_TakeDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

In [None]:
test_dataset

<_SkipDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

# **Modelling**

In [None]:
num_units = 256 #units of each direction of the encoder GRU; the decoder GRU uses num_units * 2

In [None]:
#ENCODER
#shape must be a tuple, hence the trailing comma: (english_sequence_length) without it is just an integer
english_input = Input(shape = (english_sequence_length,), dtype = 'int64', name = 'input_1')
english_embedding = Embedding(vocabulary_size, embedding_dimension, name = "english_embedding")(english_input)
#Only the final state of each direction is kept; the two are concatenated into a vector of size num_units * 2
encoded_input = Bidirectional(GRU(num_units), name = "encoded_input")(english_embedding)

#DECODER
right_shifted_target_french_input = Input(shape = (french_sequence_length,), dtype = 'int64', name = "input_2")
french_input_embedding = Embedding(vocabulary_size, embedding_dimension, name = "french_input_embedding")(right_shifted_target_french_input)
encoded_french_input = GRU(num_units * 2, return_sequences = True, name = "encoded_french_input")(french_input_embedding, initial_state = encoded_input)
#We have num_units * 2 units here because encoded_input is bidirectional with each direction GRU having num_units units

#OUTPUT
dropout = Dropout(0.5)(encoded_french_input)
outputs = Dense(vocabulary_size, activation = "softmax")(dropout)
#At each of the french_sequence_length positions, we want to choose one word out of vocabulary_size words

seq2seq_gru_model = Model([english_input, right_shifted_target_french_input], outputs)

seq2seq_gru_model.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 64)]                 0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 64)]                 0         []                            
                                                                                                  
 english_embedding (Embeddi  (None, 64, 300)              6000000   ['input_1[0][0]']             
 ng)                                                                                              
                                                                                                  
 french_input_embedding (Em  (None, 64, 300)              6000000   ['input_2[0][0]']       

In [None]:
class BLEU(tf.keras.metrics.Metric):
  '''Simplified BLEU-style score: clipped unigram precision averaged over sentences.

  For every sentence, the predicted words up to the first padding id (0) are compared with the reference
  words up to its first padding id. A predicted word counts as a match only as many times as it occurs in
  the reference. The sentence score is matches / number of predicted words, and the metric reports the
  mean sentence score over all batches seen since the last reset. There is no n-gram (n > 1) term and no
  brevity penalty.

  The computation is fully vectorized and the running totals are stored in metric weights, so the metric
  works in graph mode and accumulates correctly over an epoch.
  '''

  def __init__(self, name = 'bleu_score', **kwargs):
    super(BLEU, self).__init__(name = name, **kwargs)
    #Running sum of sentence scores and number of sentences scored so far
    self.total_score = self.add_weight(name = 'total_score', initializer = 'zeros')
    self.num_sentences = self.add_weight(name = 'num_sentences', initializer = 'zeros')

  def update_state(self, y_true, y_pred, sample_weight = None):
    '''Accumulate the scores of one batch.

    Args:
      y_true: integer token ids of the reference sentences, one row per sentence.
      y_pred: per-position scores over the vocabulary; the last axis is reduced with argmax.
      sample_weight: accepted for API compatibility and ignored.
    '''
    predicted = tf.argmax(y_pred, axis = -1, output_type = tf.int64)
    reference = tf.cast(y_true, tf.int64)

    #1.0 for every position before the first padding id, 0.0 from the first padding id onwards
    predicted_valid = tf.math.cumprod(tf.cast(tf.not_equal(predicted, 0), tf.float32), axis = -1)
    reference_valid = tf.math.cumprod(tf.cast(tf.not_equal(reference, 0), tf.float32), axis = -1)

    #reference_count[b, i]: how often predicted word i occurs among the valid reference words of sentence b
    same_as_reference = tf.cast(tf.equal(predicted[:, :, tf.newaxis], reference[:, tf.newaxis, :]), tf.float32)
    reference_count = tf.reduce_sum(same_as_reference * reference_valid[:, tf.newaxis, :], axis = -1)

    #earlier_occurrences[b, i]: how often predicted word i already appeared at earlier predicted positions
    positions = tf.range(tf.shape(predicted)[-1])
    earlier = tf.cast(tf.less(positions[tf.newaxis, :], positions[:, tf.newaxis]), tf.float32)
    same_as_predicted = tf.cast(tf.equal(predicted[:, :, tf.newaxis], predicted[:, tf.newaxis, :]), tf.float32)
    earlier_occurrences = tf.reduce_sum(same_as_predicted * earlier[tf.newaxis, :, :], axis = -1)

    #A valid predicted word matches while the reference still has an unused copy of it (clipping)
    matched = predicted_valid * tf.cast(tf.less(earlier_occurrences, reference_count), tf.float32)

    total_matches = tf.reduce_sum(matched, axis = -1)
    total_words = tf.reduce_sum(predicted_valid, axis = -1)
    #divide_no_nan gives 0 for a sentence whose prediction starts with padding
    sentence_scores = tf.math.divide_no_nan(total_matches, total_words)

    self.total_score.assign_add(tf.reduce_sum(sentence_scores))
    self.num_sentences.assign_add(tf.cast(tf.shape(predicted)[0], tf.float32))

  def result(self):
    '''Mean sentence score over everything accumulated since the last reset (0 if nothing was seen).'''
    return tf.math.divide_no_nan(self.total_score, self.num_sentences)

In [None]:
# 'accuracy' is tracked next to the BLEU metric because the plotting cells read
# history.history['accuracy'] and history.history['val_accuracy']
seq2seq_gru_model.compile(metrics = ['accuracy', BLEU()], loss = SparseCategoricalCrossentropy(), optimizer = Adam(1e-4), #run_eagerly = True

                          )

In [None]:
# Save the model to checkpoint_filepath, overwriting it only when the validation loss reaches a new minimum
checkpoint_filepath = '/content/seq2seq_gru_model.h5'
model_checkpoint_callback = ModelCheckpoint(
    filepath = checkpoint_filepath,
    monitor = 'val_loss',
    mode = 'min',
    save_best_only = True)

In [None]:
# The returned History object is read by the plotting cells as `history`
history = seq2seq_gru_model.fit(train_dataset, epochs = 10, validation_data = val_dataset, callbacks = [model_checkpoint_callback])

Epoch 1/10
    441/Unknown - 3527s 8s/step - loss: 1.4588 - accuracy: 0.9231

In [None]:
# Training vs. validation loss per epoch
for curve in ('loss', 'val_loss'):
  plt.plot(history.history[curve])
plt.title('model_loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation accuracy per epoch
for curve in ('accuracy', 'val_accuracy'):
  plt.plot(history.history[curve])
plt.title('model_accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Loss and compiled metrics on the held-out test batches
seq2seq_gru_model.evaluate(test_dataset)

# **Testing**

In [None]:
# French token id -> word, in vocabulary order
index_to_word = dict(enumerate(french_vectorization_layer.get_vocabulary()))

In [None]:
# French word -> token id, the inverse of the vocabulary list
word_to_index = {word: index for index, word in enumerate(french_vectorization_layer.get_vocabulary())}

In [None]:
def translator(english_sentence):
  '''Greedily translate an English sentence into French with seq2seq_gru_model.

  The decoder input starts as just starttoken. At step i the model is run on everything generated so far
  and the most likely word at position i is appended. Decoding stops at endtoken, at a predicted padding
  id (0), or after french_sequence_length steps.

  Args:
    english_sentence: the English sentence as a Python string.

  Returns:
    The generated French words joined by single spaces (without starttoken/endtoken); '' if nothing was generated.
  '''
  tokenized_english_sentence = english_vectorization_layer([english_sentence])

  #Decoding works on token ids directly, so the decoder input always holds exactly i + 1 tokens at step i
  decoder_token_ids = [word_to_index['starttoken']]
  translated_words = []

  for i in range(french_sequence_length):
    #Right-pad with 0 to the fixed length the model expects
    padded_ids = decoder_token_ids + [0] * (french_sequence_length - len(decoder_token_ids))
    tokenized_shifted_target = tf.constant([padded_ids], dtype = tf.int64)

    #training = False keeps Dropout disabled during inference
    output = seq2seq_gru_model([tokenized_english_sentence, tokenized_shifted_target], training = False)

    french_word_index = int(tf.argmax(output[0, i], axis = -1).numpy())
    current_word = index_to_word[french_word_index]

    #A padding prediction means the sentence is over, just like endtoken
    if french_word_index == 0 or current_word == 'endtoken':
      break
    decoder_token_ids.append(french_word_index)
    translated_words.append(current_word)
  return ' '.join(translated_words)



In [None]:
# Try the trained model on a sample sentence
translator('How is the weather today?')