<a href="https://colab.research.google.com/github/Singhsansar/Machine-Translator/blob/main/Machine_Translator_French_to_English.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**Machine Translation from Hungarian to English**

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive. The checkpoint, saved-model and
# tokenizer paths used later in this notebook all live under
# /content/drive/MyDrive.
drive.mount('/content/drive')


In [None]:
# Download the training set. Note that these files are the Hungarian-English
# sentence pairs (hun_eng_pairs) from the nlp-demystified repository.
!wget https://raw.githubusercontent.com/futuremojo/nlp-demystified/main/datasets/hun_eng_pairs/hun_eng_pairs_train.txt
# Download validation set pairs.
!wget https://raw.githubusercontent.com/futuremojo/nlp-demystified/main/datasets/hun_eng_pairs/hun_eng_pairs_val.txt
# Retrieve the test dataset.
!wget https://raw.githubusercontent.com/futuremojo/nlp-demystified/main/datasets/hun_eng_pairs/hun_eng_pairs_test.txt

In [None]:
import io
import json
import numpy as np
import pandas as pd
import random
import re
import tensorflow as tf
import unicodedata
from google.colab import files
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

##**Recurrence-based Seq2Seq Neural machine translation without Attention**

In [None]:
# Read the training pairs, one "<source><sep><target>" pair per line, dropping
# trailing whitespace. The encoding is given explicitly so the accented
# characters decode the same way regardless of the platform's default locale
# (the notebook already writes its JSON files with encoding='utf-8').
with open('hun_eng_pairs_train.txt', encoding='utf-8') as file:
  train = [line.rstrip() for line in file]

In [None]:
train[:2]

In [None]:
# A total of 88647 sentence pairs are available to train the model.
len(train)

In [None]:
separator = '<sep>'
# Split every line on the separator and transpose the result: one list with the
# text before the separator (model input) and one with the text after it
# (translation target).
train_input, train_input_target = (
    list(side) for side in zip(*(pair.split(separator) for pair in train))
)

In [None]:
train_input[:4]

In [None]:
train_input_target[:3]

In [None]:
def normalize_unicode(s):
  """Return `s` with combining marks removed.

  The string is decomposed to NFD so that accented letters become a base
  character followed by combining marks; every character whose Unicode
  category is 'Mn' (nonspacing mark) is then dropped.
  """
  decomposed = unicodedata.normalize('NFD', s)
  kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept)

In [None]:
def preprocess_sentence(s):
  """Normalise one sentence before tokenisation.

  Steps, in order:
    1. strip diacritics via normalize_unicode;
    2. surround each of ? . ! , ¿ with spaces so they split off as tokens;
    3. collapse every run of spaces and/or double quotes into one space
       (the character class in the second pattern contains both `"` and ` `);
    4. trim leading and trailing whitespace.
  """
  without_accents = normalize_unicode(s)
  punctuation_spaced = re.sub(r"([?.!,¿])", r" \1 ", without_accents)
  collapsed = re.sub(r'[" "]+', " ", punctuation_spaced)
  return collapsed.strip()

In [None]:
# Apply the same sentence normalisation to both sides of every training pair.
train_data = list(map(preprocess_sentence, train_input))
train_data_target = list(map(preprocess_sentence, train_input_target))

In [None]:
train_data_target[:3]

In [None]:
train_data[:3]

In [None]:
def tag_target_sentence(sentences):
  """Wrap every sentence as '<start> sentence <end>' and return them as a list."""
  return [' '.join(('<start>', sentence, '<end>')) for sentence in sentences]

In [None]:
# Add the <start>/<end> markers to every preprocessed target sentence; the
# decoder inputs and targets are derived from these tagged sentences.
train_data_target_tagged = tag_target_sentence(train_data_target)

In [None]:
train_data_target_tagged[:3]

In [None]:
# Tokenize the source sentences.
# Words not seen during fitting are mapped to the '<unk>' token. The custom
# filter string does not contain ! , . ? < or >, so the punctuation that
# preprocess_sentence spaced out survives as tokens.
source_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<unk>', filters='"#$%&()*+-/:;=@[\\]^_`{|}~\t\n')
source_tokenizer.fit_on_texts(train_data)
source_tokenizer.get_config()

In [None]:
# One more than the number of known words: this value is used as the input
# dimension of the encoder embedding, which also has to cover id 0
# (the padded sequences are filled with 0 and the embedding uses mask_zero=True).
source_vocab_size = len(source_tokenizer.word_index)+1
source_vocab_size

In [None]:
# Tokenize the target sentences.
# The tokenizer must be fitted on the *tagged* sentences so that '<start>' and
# '<end>' get their own vocabulary ids. Fitted on the untagged text, both
# markers are encoded as '<unk>' in the decoder inputs/targets, and the
# translation step fails with KeyError when it looks up
# target_tokenizer.word_index['<start>'].
# The filter string does not contain < or >, so the markers stay intact.
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<unk>', filters='"#$%&()*+-/:;=@[\\]^_`{|}~\t\n')
target_tokenizer.fit_on_texts(train_data_target_tagged)
target_tokenizer.get_config()

In [None]:
# One more than the number of known target words; used as the size of the
# decoder embedding and of the softmax output layer, which must also cover id 0.
target_vocab_size = len(target_tokenizer.word_index) + 1
print(target_vocab_size)

##**Train the encoder Input**

In [None]:
# Convert every preprocessed source sentence into a list of integer token ids.
train_encoder_inputs = source_tokenizer.texts_to_sequences(train_data)

In [None]:
print(train_encoder_inputs[:3])

In [None]:
print(source_tokenizer.sequences_to_texts(train_encoder_inputs[:3]))

In [None]:
def generate_decoder_inputs_targets(sentences,tokenizer):
  """Build teacher-forcing input/target id sequences for the decoder.

  Each sentence is converted to token ids with `tokenizer`. The decoder input
  is that sequence without its last id, and the decoder target is the same
  sequence without its first id (i.e. shifted one step to the left).

  Returns:
    (decoder_input, decoder_target): two lists of id lists, aligned by index.
  """
  decoder_input, decoder_target = [], []
  for token_ids in tokenizer.texts_to_sequences(sentences):
    decoder_input.append(token_ids[:-1])
    decoder_target.append(token_ids[1:])
  return decoder_input, decoder_target


In [None]:
# Decoder inputs are the tagged target ids without the last token; decoder
# targets are the same ids without the first token.
train_decoder_inputs, train_decoder_targets = generate_decoder_inputs_targets(train_data_target_tagged,target_tokenizer)

In [None]:
print(train_decoder_inputs[0], train_decoder_targets[0])
print(target_tokenizer.sequences_to_texts(train_decoder_inputs[:1]),
      target_tokenizer.sequences_to_texts(train_decoder_targets[:1]))

In [None]:
# Length (in tokens) of the longest source sequence in the training set;
# all encoder inputs are padded to this length.
max_encoding_len = max(len(seq) for seq in train_encoder_inputs)
max_encoding_len

In [None]:
# Length (in tokens) of the longest decoder input sequence in the training set;
# decoder inputs and targets are padded to this length.
max_decoding_len = max(len(seq) for seq in train_decoder_inputs)
max_decoding_len

In [None]:
# Bring every sequence to a fixed length. padding='post' appends the fill
# value after the real tokens and truncating='post' cuts overly long sequences
# at the end, so the beginning of each sentence is always kept.
padded_train_encoder_inputs = pad_sequences(train_encoder_inputs, max_encoding_len, padding='post', truncating='post')
padded_train_decoder_inputs = pad_sequences(train_decoder_inputs, max_decoding_len, padding='post', truncating='post')
padded_train_decoder_targets = pad_sequences(train_decoder_targets, max_decoding_len, padding='post', truncating='post')

In [None]:
print(padded_train_encoder_inputs[5])
print(padded_train_decoder_inputs[5])
print(padded_train_decoder_targets[5])

In [None]:
# Decode the first padded decoder input back to text as a sanity check.
# NOTE(review): ids that have no vocabulary entry appear to be rendered as the
# OOV token '<unk>' by sequences_to_texts -- this presumably includes the
# trailing padding ids; confirm against the displayed output.
target_tokenizer.sequences_to_texts([padded_train_decoder_inputs[0]])

#**All preprocessing together for the validation set**

In [None]:
# Read the validation pairs, one per line, dropping trailing whitespace.
# The encoding is given explicitly so the accented characters decode the same
# way regardless of the platform's default locale.
with open('hun_eng_pairs_val.txt', encoding='utf-8') as file:
  val = [line.rstrip() for line in file]

In [None]:
def process_dataset(dataset):
  """Turn raw "<source><sep><target>" lines into padded model inputs.

  Runs the same pipeline that was applied to the training data, reusing the
  already fitted `source_tokenizer` / `target_tokenizer` and the training-set
  lengths `max_encoding_len` / `max_decoding_len`.

  Args:
    dataset: iterable of strings, each holding a source and a target sentence
      joined by the module-level `separator`.

  Returns:
    A tuple (padded_encoder_inputs, padded_decoder_inputs,
    padded_decoder_targets) as produced by pad_sequences.
  """
  def _pad(sequences, length):
    # Fill and, where necessary, cut at the end of each sequence.
    return pad_sequences(sequences, length, padding='post', truncating='post')

  # Separate the two sides of every pair into their own lists.
  split_pairs = [pair.split(separator) for pair in dataset]
  source_sentences, target_sentences = map(list, zip(*split_pairs))

  # Unicode normalization and spaces around punctuation, for both sides.
  cleaned_sources = [preprocess_sentence(s) for s in source_sentences]
  cleaned_targets = [preprocess_sentence(s) for s in target_sentences]

  # Wrap the target sentences in the <start> / <end> markers.
  tagged_targets = tag_target_sentence(cleaned_targets)

  # Vectorize the encoder side, then derive the shifted decoder input/target ids.
  encoder_ids = source_tokenizer.texts_to_sequences(cleaned_sources)
  decoder_input_ids, decoder_target_ids = generate_decoder_inputs_targets(
      tagged_targets, target_tokenizer)

  return (_pad(encoder_ids, max_encoding_len),
          _pad(decoder_input_ids, max_decoding_len),
          _pad(decoder_target_ids, max_decoding_len))


In [None]:
# Process the validation dataset with the tokenizers and maximum lengths that
# were derived from the training set.
padded_val_encoder_inputs, padded_val_decoder_inputs, padded_val_decoder_targets = process_dataset(val)

In [None]:
padded_val_encoder_inputs[:4]

##**Model Building**

In [None]:
# Model and training hyperparameters.
embedding_dim = 128   # output size of both embedding layers
hidden_dim = 256      # number of units in the encoder and decoder LSTMs
default_dropout=0.2   # dropout passed to both LSTM layers
batch_size = 32       # batch size passed to model.fit
# NOTE(review): `epochs` is not read by the training cell, which passes
# epochs=20 to model.fit directly.
epochs = 30

###***Encoder***

In [None]:
# Encoder: token ids -> embeddings -> LSTM. Only the LSTM's final states are
# used downstream, as the initial state of the decoder.

# Variable-length sequences of source token ids.
encoder_inputs = layers.Input(shape=[None], name ='encoder_inputs')
# mask_zero=True makes the embedding emit a mask for id 0, the padding value.
encoder_embeddings = layers.Embedding(source_vocab_size,embedding_dim,mask_zero=True, name = 'encoder_embedding')

# Pass the encoder inputs through the embedding layer.
encoder_embedding_output = encoder_embeddings(encoder_inputs)

# return_state=True makes the layer return its final hidden and cell states in
# addition to its output.
encoder_lstm = layers.LSTM(hidden_dim,return_state= True, dropout= default_dropout, name ='encoder_lstm')

encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding_output)
# The (hidden, cell) pair that seeds the decoder LSTM.
encoder_states = (state_h, state_c)


###***Decoder***

In [None]:
# Decoder: target token ids -> embeddings -> LSTM (seeded with the encoder
# states) -> softmax over the target vocabulary at every time step.
decoder_inputs = layers.Input(shape=[None], name='decoder_inputs')

# NOTE: the layer name is spelled 'decoder_embeddin'; the inference cell looks
# the layer up by this exact string, so the two must be kept in sync.
decoder_embeddings = layers.Embedding(target_vocab_size, embedding_dim, mask_zero= True , name='decoder_embeddin')

# Pass the decoder inputs through the decoder embedding layer.
decoder_embedding_output = decoder_embeddings(decoder_inputs)

# return_sequences=True yields an output for every time step; the returned
# states are discarded here but are needed when the layer is reused for
# step-by-step inference.
decoder_lstm = layers.LSTM(hidden_dim, return_sequences=True,
                           return_state =True,
                           dropout = default_dropout,
                           name='decoder_lstm')
decoder_outputs,_,_ = decoder_lstm(decoder_embedding_output,initial_state=encoder_states)

decoder_dense = layers.Dense(target_vocab_size, activation='softmax',name= 'decoder_dense')

# The probability distribution over the target vocabulary for each output step.
y_proba = decoder_dense(decoder_outputs)

###***Model***

In [None]:
# Define the training model: it takes the encoder and decoder inputs together
# and outputs the per-step probability distribution over the target vocabulary.
model = tf.keras.Model([encoder_inputs,decoder_inputs],y_proba,name='French_to_English_Without_attention')
# The targets are integer ids, hence the sparse loss and metric. `metrics` is
# passed as a list, which is the documented form; newer Keras versions reject a
# bare string here.
model.compile(optimizer='adam',loss = 'sparse_categorical_crossentropy',metrics=['sparse_categorical_accuracy'])
model.summary()

In [None]:
# plot_model is imported here and reused later to plot the inference encoder.
from tensorflow.keras.utils import plot_model
# Render the training graph, including tensor shapes and layer names, to a PNG.
plot_model(model, to_file='hun_eng_seq2seq_nmt_no_attention.png', show_shapes=True, show_layer_names=True)

In [None]:
# Print, for reference, the input and output dimensions of every layer,
# expressed with the configured batch size, the maximum sequence lengths and
# the embedding / hidden / vocabulary sizes defined above.
print('encoder_inputs layer\n input dimension {}\n output dimension: {}'.format((batch_size, max_encoding_len), (batch_size, max_encoding_len)))
print()
print('encoder_embeddings layer\n input dimension {}\n output dimension: {}'.format((batch_size, max_encoding_len), (batch_size, max_encoding_len, embedding_dim)))
print()
# The encoder LSTM returns its output plus the final hidden and cell states.
print('encoder_lstm layer\n input dimension {}\n output dimension: {}'.format((batch_size, max_encoding_len, embedding_dim), [(batch_size, hidden_dim), (batch_size, hidden_dim), (batch_size, hidden_dim)]))
print()
print()
print('decoder_inputs layer\n input dimension {}\n output dimension: {}'.format((batch_size, max_decoding_len), (batch_size, max_decoding_len)))
print()
print('decoder_embeddings layer\n input dimension {}\n output dimension: {}'.format((batch_size, max_decoding_len), (batch_size, max_decoding_len, embedding_dim)))
print()
# The decoder LSTM additionally receives the two encoder states as its initial state.
print('decoder_lstm layer\n input dimension {}\n output dimension: {}'.format([(batch_size, max_decoding_len, embedding_dim), (batch_size, hidden_dim), (batch_size, hidden_dim)], [(batch_size, max_decoding_len, hidden_dim), (batch_size, hidden_dim), (batch_size, hidden_dim)]))
print()
print('decoder_dense layer(softmax)\n input dimension {}\n output dimension: {}'.format((batch_size, max_decoding_len, hidden_dim), (batch_size, max_decoding_len, target_vocab_size)))

In [None]:
# Checkpoint location on the mounted Google Drive. The path is fixed, so the
# checkpoint written after each epoch replaces the previous one.
filepath="/content/drive/MyDrive/NLP/HunEngNMTNoAttention/training1/cp.ckpt"

# Create a callback that saves the model's weights (weights only, not the full
# model) and logs a message each time it does so.
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=filepath,
                                                 save_weights_only=True,
                                                 verbose=1)

In [None]:
# Stop training early when the validation loss has not improved for 3 epochs.
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

# Train with teacher forcing: the model receives the encoder inputs and the
# decoder inputs and is fitted against the decoder targets. The validation set
# is evaluated after every epoch and drives the early-stopping callback.
history = model.fit([padded_train_encoder_inputs, padded_train_decoder_inputs], padded_train_decoder_targets,
                     batch_size=batch_size,
                     epochs=20, # hard-coded; the `epochs` hyperparameter defined earlier is not used here
                     validation_data=([padded_val_encoder_inputs, padded_val_decoder_inputs], padded_val_decoder_targets),
                     callbacks=[cp_callback, es_callback])

Epoch 1/20
Epoch 1: saving model to /content/drive/MyDrive/NLP/HunEngNMTNoAttention/training1/cp.ckpt
Epoch 2/20
Epoch 2: saving model to /content/drive/MyDrive/NLP/HunEngNMTNoAttention/training1/cp.ckpt
Epoch 3/20

In [None]:
# Save the trained model to Google Drive; the testing section later loads it
# back from this same path.
model.save("/content/drive/MyDrive/NLP/French_to_English_Without_attention")

In [None]:
#### Zip and download the model.
!zip -r /content/drive/MyDrive/NLP/English_to_French_Without_attention.zip /content/drive/MyDrive/NLP/English_to_French_Without_attention
files.download("/content/drive/MyDrive/NLP/French_to_English_Without_attention")


##### Save the tokenizers as JSON files. can be used now
source_tokenizer_json = source_tokenizer.to_json()
with io.open('source_tokenizer.json', 'w', encoding='utf-8') as f:
  f.write(json.dumps(source_tokenizer_json, ensure_ascii=False))

target_tokenizer_json = target_tokenizer.to_json()
with io.open('target_tokenizer.json', 'w', encoding='utf-8') as f:
  f.write(json.dumps(target_tokenizer_json, ensure_ascii=False))

In [None]:
##### Save a second copy of the tokenizers as JSON files on Google Drive, for
##### later use. The format is the same as the copies written to the working
##### directory: the JSON string from to_json(), encoded once more by json.dumps().
source_tokenizer_json = source_tokenizer.to_json()
with io.open('/content/drive/MyDrive/NLP/source_tokenizer.json', 'w', encoding='utf-8') as f:
  f.write(json.dumps(source_tokenizer_json, ensure_ascii=False))

target_tokenizer_json = target_tokenizer.to_json()
with io.open('/content/drive/MyDrive/NLP/target_tokenizer.json', 'w', encoding='utf-8') as f:
  f.write(json.dumps(target_tokenizer_json, ensure_ascii=False))

In [None]:
def translate_without_attention(sentence: str,
                                source_tokenizer, encoder,
                                target_tokenizer, decoder,
                                max_translated_len = 30):
  """Translate one preprocessed sentence with greedy, step-by-step decoding.

  The start and end markers are '<start>' and '<end>', the strings that
  tag_target_sentence puts around every target sentence. '<sos>' / '<eos>'
  never occur in the target text, so looking them up raises KeyError.
  A later cell defines a function with this same name.

  Args:
    sentence: preprocessed source sentence.
    source_tokenizer: tokenizer used to vectorize the source sentence.
    encoder: model whose predict() returns the list [hidden_state, cell_state].
    target_tokenizer: tokenizer providing word_index / index_word for the
      target vocabulary.
    decoder: model whose predict() takes [token, hidden_state, cell_state] and
      returns (probabilities, hidden_state, cell_state).
    max_translated_len: maximum number of words to generate.

  Returns:
    (tokenized_source, translation): the source sentence as the tokenizer sees
    it (unknown words show up as the OOV token) and the decoded words joined
    by spaces.

  Raises:
    KeyError: if '<start>' is missing from target_tokenizer.word_index, or the
      predicted id has no entry in target_tokenizer.index_word.
  """
  # Vectorize the source sentence and run it through the encoder.
  input_seq = source_tokenizer.texts_to_sequences([sentence])

  # Get the tokenized sentence to see if there are any unknown tokens.
  tokenized_sentence = source_tokenizer.sequences_to_texts(input_seq)

  states = encoder.predict(input_seq)

  current_word = '<start>'
  decoded_sentence = []

  while len(decoded_sentence) < max_translated_len:

    # Feed the previously produced word (initially the start marker) as a
    # batch of one sequence of length one.
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = target_tokenizer.word_index[current_word]

    # Greedy choice: take the most probable word of the last time step.
    target_y_proba, h, c = decoder.predict([target_seq] + states)
    target_token_index = np.argmax(target_y_proba[0, -1, :])
    current_word = target_tokenizer.index_word[target_token_index]

    # The end marker terminates the translation and is not part of the output.
    if (current_word == '<end>'):
      break

    decoded_sentence.append(current_word)
    # Carry the decoder states over to the next step.
    states = [h, c]

  return tokenized_sentence[0], ' '.join(decoded_sentence)


#**Testing the Model Prediction**

In [None]:
# Reload both tokenizers from the JSON files in the working directory.
# json.load() returns the tokenizer's JSON as a string (the files were written
# with json.dumps() applied to the output of to_json()), and that string is
# what tokenizer_from_json() receives.
with open('source_tokenizer.json') as f:
    data = json.load(f)
    source_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)

with open('target_tokenizer.json') as f:
    data = json.load(f)
    target_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)

In [None]:
# Load the trained model from the Google Drive path it was saved to after
# training. This rebinds `model`.
model = tf.keras.models.load_model('/content/drive/MyDrive/NLP/French_to_English_Without_attention')

###The *test* dataset contains sentences (and most certainly words) unseen by the model.

In [None]:
# Read the test pairs, one per line, dropping trailing whitespace.
# The encoding is given explicitly so the accented characters decode the same
# way regardless of the platform's default locale.
with open('hun_eng_pairs_test.txt', encoding='utf-8') as file:
  test = [line.rstrip() for line in file]

In [None]:
test[:3]

In [None]:
# Preprocess the test dataset with the same pipeline (tokenizers and maximum
# lengths) that was used for the training and validation sets.
padded_test_encoder_inputs, padded_test_decoder_inputs, padded_test_decoder_targets = process_dataset(test)

In [None]:
# Evaluate the model on the test set. The decoder is fed the reference decoder
# inputs (teacher forcing), so this measures per-token loss and accuracy, not
# the quality of freely generated translations.
model.evaluate([padded_test_encoder_inputs, padded_test_decoder_inputs], padded_test_decoder_targets)

In [None]:
# These are the layers of our trained model.
[layer.name for layer in model.layers]

In [None]:
# Rebuild a stand-alone encoder from the trained layers of the loaded model.
# The layers are fetched by the names given when the training graph was built.
encoder_inputs = model.get_layer('encoder_inputs').input

encoder_embedding_layer = model.get_layer('encoder_embedding')
# NOTE: this rebinds `encoder_embeddings`, which named the Embedding layer in
# the model-building cell, to the layer's output tensor.
encoder_embeddings = encoder_embedding_layer(encoder_inputs)

encoder_lstm = model.get_layer('encoder_lstm')

# Only the final hidden and cell states are needed; the LSTM output is dropped.
_, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embeddings)

encoder_states = [encoder_state_h, encoder_state_c]

# Our stand-alone encoder model. encoder_inputs is the input to the encoder,
# and encoder_states is the expected output.
encoder_model_no_attention = tf.keras.Model(encoder_inputs, encoder_states)

In [None]:
# Render the stand-alone encoder graph to a PNG. plot_model is imported in the
# cell that plots the training model.
plot_model(encoder_model_no_attention, to_file='encoder_model_no_attention_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
# Rebuild a stand-alone decoder from the trained layers so it can be run one
# step at a time, with its LSTM states passed in and returned explicitly.
decoder_inputs = model.get_layer('decoder_inputs').input

# The lookup string matches the (misspelled) name given to the layer when the
# training graph was built.
decoder_embedding_layer = model.get_layer('decoder_embeddin')
# NOTE: this rebinds `decoder_embeddings` from the Embedding layer to its
# output tensor.
decoder_embeddings = decoder_embedding_layer(decoder_inputs)

# Inputs to represent the decoder's LSTM hidden and cell states. We'll populate
# these manually using the encoder's output for the initial state.
decoder_input_state_h = tf.keras.Input(shape=(hidden_dim,), name='decoder_input_state_h')
decoder_input_state_c = tf.keras.Input(shape=(hidden_dim,), name='decoder_input_state_c')
decoder_input_states = [decoder_input_state_h, decoder_input_state_c]

decoder_lstm = model.get_layer('decoder_lstm')

# Unlike in the training graph, the returned states are kept here.
decoder_sequence_outputs, decoder_output_state_h, decoder_output_state_c = decoder_lstm(
    decoder_embeddings, initial_state=decoder_input_states
)

# Update hidden and cell states for the next time step.
decoder_output_states = [decoder_output_state_h, decoder_output_state_c]

decoder_dense = model.get_layer('decoder_dense')
y_proba = decoder_dense(decoder_sequence_outputs)

# Inputs: [token ids, hidden state, cell state].
# Outputs: [word probabilities, new hidden state, new cell state].
decoder_model_no_attention = tf.keras.Model(
    [decoder_inputs] + decoder_input_states,
    [y_proba] + decoder_output_states
)


In [None]:
def translate_without_attention(sentence: str,
                                source_tokenizer, encoder,
                                target_tokenizer, decoder,
                                max_translated_len = 30):
  """Greedily decode a translation for one preprocessed source sentence.

  The encoder turns the sentence into an initial pair of LSTM states. The
  decoder is then called one word at a time, starting from '<start>', and the
  most probable word of each step is fed back in until '<end>' is produced or
  `max_translated_len` words have been generated.

  Returns:
    (tokenized_source, translation): the source sentence as reconstructed from
    its token ids, and the generated words joined by single spaces.
  """
  # Source sentence -> ids, and back to text to expose any unknown tokens.
  source_ids = source_tokenizer.texts_to_sequences([sentence])
  tokenized_sentence = source_tokenizer.sequences_to_texts(source_ids)

  states = encoder.predict(source_ids)

  decoded_sentence = []
  previous_word = '<start>'

  # Every pass either adds exactly one word or stops at the end marker.
  for _ in range(max_translated_len):
    # A batch containing one sequence with one token: the previous word.
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = target_tokenizer.word_index[previous_word]

    target_y_proba, h, c = decoder.predict([target_seq] + states)

    # Pick the highest-probability entry of the last time step.
    best_index = np.argmax(target_y_proba[0, -1, :])
    previous_word = target_tokenizer.index_word[best_index]
    if previous_word == '<end>':
      break

    decoded_sentence.append(previous_word)
    states = [h, c]

  return tokenized_sentence[0], ' '.join(decoded_sentence)


In [None]:
# Seed the random module so the same 15 test pairs are sampled on every run.
random.seed(1)
# Each sampled element is still a raw "<source><sep><target>" line.
sentences = random.sample(test, 15)
sentences

In [None]:
def translate_sentences(sentences, translation_func, source_tokenizer, encoder,
                        target_tokenizer, decoder):
  """Translate raw "<source><sep><target>" pairs and collect the results.

  For every pair, the source side is preprocessed and handed to
  `translation_func` together with the tokenizers and the encoder/decoder
  models; the target side is kept unchanged as the reference translation.

  Returns:
    A dict of three equally long lists under the keys 'Tokenized Original',
    'Reference' and 'Translation'.
  """
  tokenized_originals, references, outputs = [], [], []

  for pair in sentences:
    raw_source, reference = pair.split(separator)
    tokenized_sentence, translated = translation_func(
        preprocess_sentence(raw_source), source_tokenizer, encoder,
        target_tokenizer, decoder)

    tokenized_originals.append(tokenized_sentence)
    references.append(reference)
    outputs.append(translated)

  return {'Tokenized Original': tokenized_originals,
          'Reference': references,
          'Translation': outputs}

In [None]:
# Translate the sampled test pairs with the stand-alone encoder and decoder
# models and put source, reference and translation side by side in a DataFrame.
translations_no_attention = pd.DataFrame(translate_sentences(sentences, translate_without_attention,
                                                             source_tokenizer, encoder_model_no_attention,
                                                             target_tokenizer, decoder_model_no_attention))

In [None]:
translations_no_attention