In [14]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import io
import json
import numpy as np
import pandas as pd
import random
import re
import tensorflow as tf
import unicodedata

from google.colab import files
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [15]:
#Download the trining set, dataset is from the french to english
!wget https://raw.githubusercontent.com/futuremojo/nlp-demystified/main/datasets/hun_eng_pairs/hun_eng_pairs_train.txt
# Download validation set pairs.
!wget https://raw.githubusercontent.com/futuremojo/nlp-demystified/main/datasets/hun_eng_pairs/hun_eng_pairs_val.txt
# Retrieve the test dataset.
!wget https://raw.githubusercontent.com/futuremojo/nlp-demystified/main/datasets/hun_eng_pairs/hun_eng_pairs_test.txt

--2023-11-23 16:43:19--  https://raw.githubusercontent.com/futuremojo/nlp-demystified/main/datasets/hun_eng_pairs/hun_eng_pairs_train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5518306 (5.3M) [text/plain]
Saving to: ‘hun_eng_pairs_train.txt’


2023-11-23 16:43:19 (78.2 MB/s) - ‘hun_eng_pairs_train.txt’ saved [5518306/5518306]

--2023-11-23 16:43:19--  https://raw.githubusercontent.com/futuremojo/nlp-demystified/main/datasets/hun_eng_pairs/hun_eng_pairs_val.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 646226 (631K) 

In [16]:
# The corpus contains accented Hungarian text, so decode it explicitly as UTF-8
# instead of relying on the platform's default encoding.
with open('hun_eng_pairs_train.txt', encoding='utf-8') as file:
  train = [line.rstrip() for line in file]

In [18]:
train[:4]

["Teszek rá, mit mondasz!<sep>I don't care what you say.",
 'Több olyan ember kell nekünk a csapatba, mint amilyen te vagy.<sep>We need more people like you on our team.',
 'Vigyázz a gyerekeimre!<sep>Take care of my children.',
 'Miért van szüksége önöknek két kerékpárra?<sep>Why do you need two bicycles?']

In [19]:
# Every line has the form '<source><sep><target>'; split the pairs and regroup
# them into two parallel lists (source sentences, target sentences).
separator = '<sep>'
split_pairs = (pair.split(separator) for pair in train)
train_input, train_input_target = map(list, zip(*split_pairs))

In [20]:
train_input[:4]

['Teszek rá, mit mondasz!',
 'Több olyan ember kell nekünk a csapatba, mint amilyen te vagy.',
 'Vigyázz a gyerekeimre!',
 'Miért van szüksége önöknek két kerékpárra?']

In [21]:
train_input_target[:3]

["I don't care what you say.",
 'We need more people like you on our team.',
 'Take care of my children.']

In [22]:
def normalize_unicode(s):
  """Strip accents from `s`.

  The string is decomposed with NFD so that each accented character becomes a
  base character followed by combining marks; every character whose Unicode
  category is 'Mn' (nonspacing mark) is then dropped.
  """
  decomposed = unicodedata.normalize('NFD', s)
  kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept)

In [23]:
def preprocess_sentence(s):
  """Return `s` with accents removed and punctuation separated by spaces.

  Steps, in order: accent stripping via normalize_unicode, a space inserted on
  both sides of each of ? . ! , ¿, whitespace clean-up, and a final strip().
  """
  without_accents = normalize_unicode(s)
  # Surround each listed punctuation mark with spaces so it becomes its own token.
  spaced = re.sub(r"([?.!,¿])", r" \1 ", without_accents)
  # The character class holds a space and a double quote, so any run of
  # spaces and/or double-quote characters collapses to a single space.
  collapsed = re.sub(r'[" "]+', " ", spaced)
  return collapsed.strip()

In [24]:
# Apply the same sentence preprocessing to the source and target sides of the corpus.
train_data = list(map(preprocess_sentence, train_input))
train_data_target = list(map(preprocess_sentence, train_input_target))

In [25]:
train_data_target[:3]

["I don't care what you say .",
 'We need more people like you on our team .',
 'Take care of my children .']

In [26]:
train_data[:3]

['Teszek ra , mit mondasz !',
 'Tobb olyan ember kell nekunk a csapatba , mint amilyen te vagy .',
 'Vigyazz a gyerekeimre !']

In [27]:
def tag_target_sentence(sentences):
  """Wrap every sentence in start/end markers.

  Each string becomes '<sos> ' + sentence + ' <eos>'; the result is a new list
  in the same order as `sentences`.
  """
  return [' '.join(('<sos>', sentence, '<eos>')) for sentence in sentences]

In [28]:
train_data_target_tagged = tag_target_sentence(train_data_target)

In [29]:
train_data_target_tagged[:3]

["<sos> I don't care what you say . <eos>",
 '<sos> We need more people like you on our team . <eos>',
 '<sos> Take care of my children . <eos>']

In [32]:
# Fit a word-level tokenizer on the preprocessed source sentences.
# Words not seen during fitting are mapped to the '<unk>' token. The custom
# `filters` string leaves out ? . ! , so the punctuation that preprocessing
# split off is kept as separate tokens.
source_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<unk>', filters='"#$%&()*+-/:;=@[\\]^_`{|}~\t\n')
source_tokenizer.fit_on_texts(train_data)

In [33]:
source_vocab_size = len(source_tokenizer.word_index)+1
source_vocab_size

38539

In [35]:
# Fit a word-level tokenizer on the tagged target sentences.
# The custom `filters` string leaves out '<' and '>' so the '<sos>' / '<eos>'
# markers survive tokenization intact, and leaves out ? . ! , so punctuation
# stays as separate tokens. Unseen words are mapped to '<unk>'.
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<unk>', filters='"#$%&()*+-/:;=@[\\]^_`{|}~\t\n')
target_tokenizer.fit_on_texts(train_data_target_tagged)

In [36]:
target_vocab_size = len(target_tokenizer.word_index) + 1
print(target_vocab_size)

10556


In [37]:
train_encoder_inputs = source_tokenizer.texts_to_sequences(train_data)

In [38]:
print(train_encoder_inputs[:3])

[[1395, 91, 4, 27, 1080, 10], [153, 56, 145, 17, 152, 3, 7232, 4, 45, 1670, 44, 23, 2], [1026, 3, 8933, 10]]


In [39]:
print(source_tokenizer.sequences_to_texts(train_encoder_inputs[:3]))

['teszek ra , mit mondasz !', 'tobb olyan ember kell nekunk a csapatba , mint amilyen te vagy .', 'vigyazz a gyerekeimre !']


In [40]:
def generate_decoder_inputs_targets(sentences,tokenizer):
  """Build teacher-forcing pairs from tagged target sentences.

  Each sentence is converted to token ids with `tokenizer.texts_to_sequences`.
  The decoder input is that sequence without its last id and the decoder
  target is the same sequence without its first id, so target[i] is the token
  that follows input[i].

  Returns:
    (decoder_input, decoder_target): two lists of id lists, aligned by index.
  """
  decoder_input, decoder_target = [], []
  for token_ids in tokenizer.texts_to_sequences(sentences):
    decoder_input.append(token_ids[:-1])
    decoder_target.append(token_ids[1:])
  return decoder_input, decoder_target


In [41]:
train_decoder_inputs, train_decoder_targets = generate_decoder_inputs_targets(train_data_target_tagged,target_tokenizer)

In [56]:
print(train_decoder_inputs[0], train_decoder_targets[0])
print(target_tokenizer.sequences_to_texts(train_decoder_inputs[:1]),
      target_tokenizer.sequences_to_texts(train_decoder_targets[:1]))

[2, 5, 23, 306, 28, 7, 151, 4] [5, 23, 306, 28, 7, 151, 4, 3]
["<sos> i don't care what you say ."] ["i don't care what you say . <eos>"]


In [153]:
# Length of the longest tokenized source sentence; encoder inputs are padded to this length.
max_encoding_len = max(map(len, train_encoder_inputs))
max_encoding_len

37

In [154]:
# Length of the longest decoder input sequence; decoder inputs and targets are padded to this length.
max_decoding_len = max(map(len, train_decoder_inputs))
max_decoding_len

34

In [155]:
# Bring every sequence to a fixed length: zeros are appended at the end
# (padding='post') and any overflow would also be cut from the end (truncating='post').
padded_train_encoder_inputs = pad_sequences(
    train_encoder_inputs, maxlen=max_encoding_len, padding='post', truncating='post')
padded_train_decoder_inputs = pad_sequences(
    train_decoder_inputs, maxlen=max_decoding_len, padding='post', truncating='post')
padded_train_decoder_targets = pad_sequences(
    train_decoder_targets, maxlen=max_decoding_len, padding='post', truncating='post')

In [156]:
print(padded_train_encoder_inputs[0])
print(padded_train_decoder_inputs[0])
print(padded_train_decoder_targets[0])

[1395   91    4   27 1080   10    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0]
[  2   5  23 306  28   7 151   4   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
[  5  23 306  28   7 151   4   3   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]


In [157]:
# Padding id 0 has no vocabulary entry, so sequences_to_texts renders every padded position as <unk>.
target_tokenizer.sequences_to_texts([padded_train_decoder_inputs[0]])

["<sos> i don't care what you say . <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>"]

#**All preprocessing together for the validation set**

In [158]:
# Decode explicitly as UTF-8 (the data contains accented Hungarian text) rather
# than relying on the platform's default encoding.
with open('hun_eng_pairs_val.txt', encoding='utf-8') as file:
  val = [line.rstrip() for line in file]

In [159]:
def process_dataset(dataset):
  """Run the training-time preprocessing pipeline on raw '<source><sep><target>' lines.

  Uses the notebook-level `separator`, the already-fitted `source_tokenizer` and
  `target_tokenizer`, and the `max_encoding_len` / `max_decoding_len` computed
  from the training set, so the output has the same layout as the training arrays.

  Returns:
    (padded_encoder_inputs, padded_decoder_inputs, padded_decoder_targets)
  """
  # Split each line on the separator and regroup into source / target lists.
  split_pairs = [pair.split(separator) for pair in dataset]
  source_sentences, target_sentences = map(list, zip(*split_pairs))

  # Accent stripping and punctuation spacing on both sides.
  source_clean = [preprocess_sentence(s) for s in source_sentences]
  target_clean = [preprocess_sentence(s) for s in target_sentences]

  # Only the target side is wrapped in <sos> ... <eos>.
  target_tagged = tag_target_sentence(target_clean)

  # Text -> token ids; the decoder side is also shifted into input/target pairs.
  encoder_ids = source_tokenizer.texts_to_sequences(source_clean)
  decoder_in_ids, decoder_target_ids = generate_decoder_inputs_targets(target_tagged,
                                                                       target_tokenizer)

  def pad_to(sequences, length):
    # Zero-pad (and, if needed, cut) at the end of each sequence.
    return pad_sequences(sequences, length, padding='post', truncating='post')

  return (pad_to(encoder_ids, max_encoding_len),
          pad_to(decoder_in_ids, max_decoding_len),
          pad_to(decoder_target_ids, max_decoding_len))


In [160]:
# Process validation dataset
padded_val_encoder_inputs, padded_val_decoder_inputs, padded_val_decoder_targets = process_dataset(val)

In [161]:
padded_val_encoder_inputs[:2]

array([[   1,    4,   38, 2948,    2,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0],
       [1056,   74,    2,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0]], dtype=int32)

##**Model Building**

In [162]:
# Model and training hyperparameters shared by the cells below.
embedding_dim = 128  # width of the token embeddings in the encoder and decoder
hidden_dim = 256  # number of LSTM units (also the attention projection width)
default_dropout=0.2  # NOTE(review): not referenced by any code visible in this notebook
batch_size = 32  # examples per batch when the tf.data.Dataset is built
epochs = 30  # NOTE(review): reassigned to 20 in a later cell

###***Encoder***

##**Attention_Mechanism_for_the_Machine_Translation**

In [210]:
class Encoder(tf.keras.Model):
    """Embedding + LSTM encoder that returns its output at every timestep.

    Args:
        vocab_size: Number of distinct token ids the embedding layer accepts.
        embedding_dim: Width of each token embedding.
        hidden_dim: Number of LSTM units.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(Encoder, self).__init__()

        # No masking here. We'll handle it ourselves.
        # The table is sized from the `vocab_size` argument (previously the
        # notebook global `source_vocab_size` was used and the argument ignored).
        self.embedding = layers.Embedding(vocab_size,
                                          embedding_dim,
                                          name='encoder_embedding_layer')

        # return_sequences=True exposes the per-timestep outputs that the
        # attention layer scores against; return_state=True also returns the
        # final hidden and cell states.
        self.lstm = layers.LSTM(hidden_dim,
                                return_sequences=True,
                                return_state=True,
                                name='encoder_lstm')

    def call(self, input):
        """Encode a batch of token-id sequences.

        Returns:
            (output_seq, state_h, state_c): the LSTM output for every timestep
            plus the final hidden and cell states.
        """
        embeddings = self.embedding(input)

        # output_seq will hold the encoder's hidden states from each time step.
        output_seq, state_h, state_c = self.lstm(embeddings)

        return output_seq, state_h, state_c

In [211]:
# Instantiate our own Encoder class so its outputs can be inspected below.
test_encoder = Encoder(source_vocab_size, embedding_dim, hidden_dim)

In [212]:
test_encoder_batch = padded_train_encoder_inputs[:3]
test_encoder_batch

array([[1395,   91,    4,   27, 1080,   10,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0],
       [ 153,   56,  145,   17,  152,    3, 7232,    4,   45, 1670,   44,
          23,    2,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0],
       [1026,    3, 8933,   10,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0]], dtype=int32)

In [213]:
print(test_encoder_batch.shape)

(3, 37)


In [214]:
test_encoder_outputs, state_h, state_c = test_encoder(test_encoder_batch)

In [215]:
print(test_encoder_outputs.shape)
print(state_h.shape)
print(state_c.shape)

(3, 37, 256)
(3, 256)
(3, 256)


##**Hands on Attention**

In [216]:
# Sample encoder LSTM output for single sequence of length 4., hidden state from the encoder
encoder_out = tf.constant([[1., 2., 3.],
                           [2., 3., 4.],
                           [3., 4., 5.],
                           [4., 5. ,6.]])

In [217]:
encoder_out

<tf.Tensor: shape=(4, 3), dtype=float32, numpy=
array([[1., 2., 3.],
       [2., 3., 4.],
       [3., 4., 5.],
       [4., 5., 6.]], dtype=float32)>

In [218]:
print('encoder_out shape: {}'.format(encoder_out.shape))
print('Number of timesteps: {}'.format(encoder_out.shape[0]))
print('Number of hidden dimensions: {}'.format(encoder_out.shape[1]))

encoder_out shape: (4, 3)
Number of timesteps: 4
Number of hidden dimensions: 3


In [219]:
# Sample decoder LSTM output for a single timestep. hidden state from decoder at any timestep
decoder_out = tf.constant([[1., 3., 5.]])

In [220]:
print('decoder_out shape: {}'.format(decoder_out.shape))
print('Number of timesteps: {}'.format(decoder_out.shape[0]))
print('Number of hidden dimensions: {}'.format(decoder_out.shape[1]))

decoder_out shape: (1, 3)
Number of timesteps: 1
Number of hidden dimensions: 3


In [221]:
tf.transpose(encoder_out)

<tf.Tensor: shape=(3, 4), dtype=float32, numpy=
array([[1., 2., 3., 4.],
       [2., 3., 4., 5.],
       [3., 4., 5., 6.]], dtype=float32)>

In [222]:
# tf.matmul can perform the transpose and the dot product in one step to yield the attention scores.
attention_scores = tf.matmul(decoder_out, encoder_out, transpose_b=True)
print(attention_scores)

tf.Tensor([[22. 31. 40. 49.]], shape=(1, 4), dtype=float32)


In [223]:
attention_weights = tf.keras.activations.softmax(attention_scores, axis=-1)
print(attention_weights)

tf.Tensor([[1.8792971e-12 1.5228100e-08 1.2339458e-04 9.9987662e-01]], shape=(1, 4), dtype=float32)


In [224]:
attention_weights = tf.keras.activations.softmax(attention_scores, axis=-1)
print(attention_weights)

tf.Tensor([[1.8792971e-12 1.5228100e-08 1.2339458e-04 9.9987662e-01]], shape=(1, 4), dtype=float32)


In [225]:
context = tf.matmul(attention_weights, encoder_out)
print(context)

tf.Tensor([[3.9998767 4.9998765 5.999877 ]], shape=(1, 3), dtype=float32)


In [226]:
class LuongAttention(tf.keras.Model):
  """Multiplicative attention over the encoder outputs.

  Scores are the dot product between the decoder output and a learned dense
  projection of the encoder outputs; the context is the softmax-weighted sum
  of the (unprojected) encoder outputs.
  """

  def __init__(self, hidden_dim):
    super(LuongAttention, self).__init__()
    # Learned projection applied to the encoder outputs before scoring.
    self.w = layers.Dense(hidden_dim, name='encoder_outputs_dense')

  def call(self, inputs):
    """Take `[encoder_output_seq, decoder_output]`; return `(attention_weights, context)`."""
    encoder_output_seq, decoder_output = inputs

    projected_encoder_seq = self.w(encoder_output_seq)
    # transpose_b=True lines the projected encoder outputs up for the dot product.
    scores = tf.matmul(decoder_output, projected_encoder_seq, transpose_b=True)
    # Normalise over the last axis so the weights for each query sum to 1.
    weights = tf.keras.activations.softmax(scores, axis=-1)

    return weights, tf.matmul(weights, encoder_output_seq)


In [227]:
class Decoder(tf.keras.Model):
  """Single-step LSTM decoder with attention over the encoder outputs.

  Args:
    vocab_size: Size of the target vocabulary (embedding rows and logit width).
    embedding_dim: Width of each token embedding.
    hidden_dim: Number of LSTM units; also the width of the attended layer.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim):
    super(Decoder, self).__init__()

    self.embedding_layer = layers.Embedding(
        vocab_size, embedding_dim, name='decoder_embedding_layer')

    self.lstm = layers.LSTM(
        hidden_dim, return_sequences=True, return_state=True, name='decoder_lstm')

    self.attention = LuongAttention(hidden_dim)

    # tanh layer applied to the concatenated [context, decoder output] vector.
    self.w = tf.keras.layers.Dense(hidden_dim, activation='tanh', name='attended_outputs_dense')

    # Final projection to one logit per target-vocabulary entry.
    self.dense = layers.Dense(vocab_size, name='decoder_dense')


  def call(self, inputs):
    """Run one decoding step.

    `inputs` is `[decoder_input, encoder_output_seq, lstm_state]`, where
    `lstm_state` is the (hidden, cell) pair used as the LSTM's initial state.

    Returns:
      (logits, state_h, state_c, weights): vocabulary logits, the updated LSTM
      states, and the attention weights.
    """
    decoder_input, encoder_output_seq, lstm_state = inputs

    token_embeddings = self.embedding_layer(decoder_input)
    decoder_output, state_h, state_c = self.lstm(token_embeddings, initial_state=lstm_state)

    weights, context = self.attention([encoder_output_seq, decoder_output])

    # Axis 1 is squeezed out of both tensors before they are joined on the last axis.
    context_vector = tf.squeeze(context, 1)
    step_output = tf.squeeze(decoder_output, 1)
    attended_input = tf.concat([context_vector, step_output], -1)

    logits = self.dense(self.w(attended_input))

    return logits, state_h, state_c, weights

In [228]:
test_decoder = Decoder(target_vocab_size, embedding_dim, hidden_dim)

In [229]:
test_decoder_batch = padded_train_decoder_inputs[:3]
print(test_decoder_batch.shape)
test_decoder_batch

(3, 34)


array([[  2,   5,  23, 306,  28,   7, 151,   4,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0],
       [  2,  25,  55, 117, 144,  33,   7,  35, 139, 794,   4,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0],
       [  2, 105, 306,  17,  24, 225,   4,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0]], dtype=int32)

In [230]:
test_decoder_batch[:, 1]

array([  5,  25, 105], dtype=int32)

In [231]:
next_decoder_inputs = tf.expand_dims(test_decoder_batch[:, 1], 1)
next_decoder_inputs

<tf.Tensor: shape=(3, 1), dtype=int32, numpy=
array([[  5],
       [ 25],
       [105]], dtype=int32)>

In [232]:
# Initial values for state_h and state_c are from the encoder.
test_decoder_logits, state_h, state_c, test_decoder_weights = test_decoder(
    [
      next_decoder_inputs,
      test_encoder_outputs,
      [state_h, state_c]
    ])

In [233]:
print(test_decoder_logits.shape)
print(test_decoder_weights.shape)

(3, 10556)
(3, 1, 37)


In [234]:
def loss_func(targets, logits):
  """Sparse categorical cross-entropy on logits that ignores padded targets.

  Positions whose target id is 0 receive a sample weight of 0.0, every other
  position a weight of 1.0.
  """
  # 1.0 where the target is a real token, 0.0 where it is the padding id 0.
  padding_mask = tf.cast(tf.math.not_equal(targets, 0), tf.float32)

  cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  return cross_entropy(targets, logits, sample_weight=padding_mask)

In [235]:
# Slice the three padded arrays into aligned (encoder input, decoder input,
# decoder target) examples and group them into batches of `batch_size`.
# drop_remainder=True discards a final partial batch so every batch has the same size.
dataset = tf.data.Dataset.from_tensor_slices((padded_train_encoder_inputs,
                                              padded_train_decoder_inputs,
                                              padded_train_decoder_targets)).batch(batch_size, drop_remainder=True)

In [242]:
class TranslatorTrainer(tf.keras.Model):
  """Wraps an encoder/decoder pair with a custom, step-by-step training loop.

  Args:
    encoder: Model called as `encoder(encoder_input_seq)`; must return
      `(output_seq, state_h, state_c)`.
    decoder: Model called with `[next_input, encoder_output_seq, (state_h, state_c)]`;
      must return `(logits, state_h, state_c, attention_weights)`.
  """

  def __init__(self, encoder, decoder):
    super(TranslatorTrainer, self).__init__()

    self.encoder = encoder
    self.decoder = decoder

  # This method will be called by model.fit for each batch.
  @tf.function
  def train_step(self, inputs):
      """Run one teacher-forced training step and return the mean per-timestep loss."""
      loss = 0.

      encoder_input_seq, decoder_input_seq, decoder_target_seq = inputs
      num_steps = decoder_target_seq.shape[1]

      with tf.GradientTape() as tape:
          encoder_output_seq, state_h, state_c = self.encoder(encoder_input_seq)

          # The decoder handles one timestep per call, so walk the target sequence.
          for i in range(num_steps):

              # Input to the decoder must have shape of (batch_size, length)
              # so we need to expand one dimension (just like in the previous example).
              next_decoder_input = tf.expand_dims(decoder_input_seq[:, i], 1)
              logits, state_h, state_c, _ = self.decoder(
                  [next_decoder_input, encoder_output_seq, (state_h, state_c)])

              # Accumulate the loss over all timesteps of the sequence.
              loss += self.loss(decoder_target_seq[:, i], logits)

      # Take gradients over the models owned by this trainer (not over the
      # notebook globals named `encoder` / `decoder`), then update them.
      variables = self.encoder.trainable_variables + self.decoder.trainable_variables
      gradients = tape.gradient(loss, variables)
      self.optimizer.apply_gradients(zip(gradients, variables))

      return {'loss': loss / num_steps}

In [255]:
# Build fresh encoder/decoder models sized for the source and target vocabularies.
encoder = Encoder(source_vocab_size, embedding_dim, hidden_dim)
decoder = Decoder(target_vocab_size, embedding_dim, hidden_dim)
optimizer = tf.keras.optimizers.Adam()

# Wrap both models in the custom trainer; `loss_func` masks padded target positions.
translator_trainer = TranslatorTrainer(encoder, decoder)
translator_trainer.compile(optimizer=optimizer, loss=loss_func)

In [256]:
epochs = 20

In [257]:
!wget https://github.com/futuremojo/nlp-demystified/raw/main/models/nmt_with_attention/attention_weights.zip
!unzip -o attention_weights.zip

--2023-11-23 18:40:09--  https://github.com/futuremojo/nlp-demystified/raw/main/models/nmt_with_attention/attention_weights.zip
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/futuremojo/nlp-demystified/main/models/nmt_with_attention/attention_weights.zip [following]
--2023-11-23 18:40:09--  https://raw.githubusercontent.com/futuremojo/nlp-demystified/main/models/nmt_with_attention/attention_weights.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 36984175 (35M) [application/zip]
Saving to: ‘attention_weights.zip.1’


2023-11-23 18:40:09 (252 MB/s) - ‘attention_weights.zip.1’ saved [36984175/

In [249]:
# #saving the weights for the use in drive
# encoder.save_weights('/content/drive/MyDrive/NLP/attention_encoder_weights_with_dropout_ckpt')
# decoder.save_weights('/content/drive/MyDrive/NLP/attention_decoder_weights_with_dropout_ckpt')
# !zip -r ./content/drive/MyDrive/NLP/attention_weights.zip ./content/drive/MyDrive/NLP/attention_weights
# files.download('/content/drive/MyDrive/NLP/attention_weights.zip')

In [258]:
# Save the current encoder/decoder weights as local checkpoints.
# NOTE(review): no training call is visible before this cell, so these look like
# freshly initialised weights, and the "with_dropout" filenames do not match any
# dropout layer in the models above; confirm intent.
encoder.save_weights('attention_encoder_weights_with_dropout_ckpt')
decoder.save_weights('attention_decoder_weights_with_dropout_ckpt')

In [259]:
# Replace the models' weights with the checkpoints under attention_weights/
# (presumably the contents of the attention_weights.zip downloaded above; verify).
encoder.load_weights('attention_weights/attention_encoder_weights_ckpt')
decoder.load_weights('attention_weights/attention_decoder_weights_ckpt')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7914080d2d10>

In [286]:
def translate_with_attention(sentence: str,
                             source_tokenizer, encoder,
                             target_tokenizer, decoder,
                             max_translated_len = 30):
    """Greedily translate one preprocessed source sentence.

    Args:
        sentence: Source sentence, already passed through preprocess_sentence.
        source_tokenizer: Fitted tokenizer for the source language.
        encoder: Encoder model returning (output_seq, state_h, state_c).
        target_tokenizer: Fitted tokenizer for the target language; its
            vocabulary must contain '<sos>' and '<eos>'.
        decoder: Single-step decoder returning (logits, state_h, state_c, weights).
        max_translated_len: Upper bound on the number of generated tokens.

    Returns:
        (tokenized_source, translation): the source sentence as the tokenizer
        sees it, and the generated words joined by single spaces.
    """
    input_seq = source_tokenizer.texts_to_sequences([sentence])
    tokenized = source_tokenizer.sequences_to_texts(input_seq)

    # Pad and truncate at the end, exactly as the training data was prepared.
    # pad_sequences cuts from the front by default, which would drop the start
    # of an over-long sentence instead of its tail.
    input_seq = pad_sequences(input_seq, maxlen=max_encoding_len,
                              padding='post', truncating='post')
    encoder_output, state_h, state_c  = encoder.predict(input_seq)

    current_word = '<sos>'
    decoded_sentence = []

    while len(decoded_sentence) < max_translated_len:
        # Feed the previously generated token back in as a (1, 1) batch.
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = target_tokenizer.word_index[current_word]

        logits, state_h, state_c, _ = decoder.predict([target_seq, encoder_output, (state_h, state_c)])
        current_token_index = np.argmax(logits[0])

        # Id 0 is the padding id and has no entry in index_word, so a plain
        # [] lookup would raise KeyError. Treat it like <eos> and stop decoding.
        current_word = target_tokenizer.index_word.get(current_token_index)

        if current_word is None or current_word == '<eos>':
          break

        decoded_sentence.append(current_word)

    return tokenized[0], ' '.join(decoded_sentence)

In [287]:
def translate_sentences(sentences, translation_func, source_tokenizer, encoder,
                        target_tokenizer, decoder):
  """Translate raw '<source><sep><reference>' lines and collect the results.

  For each line the source half is preprocessed and handed to
  `translation_func(source, source_tokenizer, encoder, target_tokenizer, decoder)`,
  which must return `(tokenized_source, translation)`. The reference half is
  kept exactly as it appears in the line.

  Returns:
    A dict of three equally long lists keyed 'Tokenized Original',
    'Reference' and 'Translation'.
  """
  tokenized_originals, references, model_outputs = [], [], []

  for pair in sentences:
    source, reference = pair.split(separator)
    tokenized_sentence, translated = translation_func(
        preprocess_sentence(source), source_tokenizer, encoder,
        target_tokenizer, decoder)

    tokenized_originals.append(tokenized_sentence)
    references.append(reference)
    model_outputs.append(translated)

  return {'Tokenized Original': tokenized_originals,
          'Reference': references,
          'Translation': model_outputs}

##Testing the model

In [288]:
# Decode explicitly as UTF-8 (the data contains accented Hungarian text) rather
# than relying on the platform's default encoding.
with open('hun_eng_pairs_test.txt', encoding='utf-8') as file:
  test = [line.rstrip() for line in file]

In [289]:
# Preprocess test dataset
padded_test_encoder_inputs, padded_test_decoder_inputs, padded_test_decoder_targets = process_dataset(test)

In [290]:
# random.seed is just here to re-create results.
random.seed(1)
sentences = random.sample(test, 25)
sentences

['Csinálom.<sep>I got it.',
 'Mondd el nekem.<sep>Let me know.',
 'Ritkán járok oda.<sep>I rarely go there.',
 "Mi a döntésed?<sep>What's your decision?",
 "Hol van a legközelebbi étterem?<sep>Where's the closest restaurant?",
 'Mégis csak van egy megoldás.<sep>There is a solution though.',
 'Csak pár diák maradt az osztályteremben.<sep>There were few students left in the classroom.',
 'Nagyra értékelem a segítségedet ebben.<sep>I appreciate your help on this.',
 'Ez az utolsó vonat.<sep>This is the last train.',
 'Milyen gyakran jönnek a buszok?<sep>How often do buses come?',
 "A boldogság nem tart örökké.<sep>Happiness doesn't last forever.",
 "Azért vagyok itt, hogy bocsánatot kérjek.<sep>I'm here to apologize.",
 "Tom szörnyű szakács, ugye?<sep>Tom is a terrible cook, isn't he?",
 "Gondolod, hogy ma esni fog?<sep>Do you think it'll rain today?",
 'Tom ismeri a járást.<sep>Tom knows the way.',
 'Egy régi barátom jött el hozzám vendégségbe.<sep>An old friend came to my house for a vi

In [292]:
# Translate the sampled test pairs and tabulate source, reference and model output.
shorter_translations_w_attention = pd.DataFrame(
    translate_sentences(sentences, translate_with_attention,
                        source_tokenizer, encoder,
                        target_tokenizer, decoder))



In [276]:
shorter_translations_w_attention

Unnamed: 0,Tokenized Original,Reference,Translation
0,csinalom .,I got it.,i'm doing it .
1,mondd el nekem .,Let me know.,tell me .
2,ritkan jarok oda .,I rarely go there.,i rarely go there .
3,mi a dontesed ?,What's your decision?,what is your decision ?
4,hol van a legkozelebbi etterem ?,Where's the closest restaurant?,where's the nearest package ?
5,megis csak van egy megoldas .,There is a solution though.,there is no one solution .
6,csak par diak maradt az osztalyteremben .,There were few students left in the classroom.,there were only a few students left in the cla...
7,nagyra ertekelem a segitsegedet ebben .,I appreciate your help on this.,i appreciate your help .
8,ez az utolso vonat .,This is the last train.,this is the last train .
9,milyen gyakran jonnek a buszok ?,How often do buses come?,how often do the buses come ?


In [293]:
# Order the training lines by character length (of the whole '<src><sep><tgt>' line)
# without touching `train`, then keep the ten longest.
pairs = sorted(train, key=len)
longer_sentences = pairs[-10:]
longer_sentences

['- Tegnap este mikor mentél aludni? - 4 órakor. - Micsoda? Mit csináltál olyan sokáig? - Telefonon beszélgettem a volt barátommal.<sep>"When\'d you go to sleep last night?" "4 o\'clock." "What? What were you doing up so late?" "Talking on the phone with my ex-boyfriend."',
 'Mi az öregség? Először a neveket felejted el, majd az arcokat, utána elfelejted felhúzni a cipzáradat, aztán elfelejted lehúzni.<sep>What is old age? First you forget names, then you forget faces, then you forget to pull your zipper up, then you forget to pull it down.',
 "Bár a Föld felületének csupán két százalékát borítják őserdők, ott él a világon fellelhető állat-, növény- és rovarfaj fele.<sep>Although rainforests make up only two percent of the earth's surface, over half the world's wild plant, animal and insect species live there.",
 "Amikor gyerek volt Tom, összegyűjtötte az apja cigarettacsikkjeit, míg elég nem lett a dohány, hogy sodorjon egy szál cigarettát magának.<sep>When Tom was a kid, he used to c

In [294]:
longer_translations_with_attention = pd.DataFrame(translate_sentences(longer_sentences, translate_with_attention,
                                                                      source_tokenizer, encoder,
                                                                      target_tokenizer, decoder))
longer_translations_with_attention



Unnamed: 0,Tokenized Original,Reference,Translation
0,tegnap este mikor mentel aludni ? 4 orakor . m...,"""When'd you go to sleep last night?"" ""4 o'cloc...",when'd were you out of last night ? 4 o'clock ...
1,mi az oregseg ? eloszor a neveket felejted el ...,"What is old age? First you forget names, then ...","what is old age ? first , then you forget to f..."
2,bar a fold feluletenek csupan ket szazalekat b...,Although rainforests make up only two percent ...,though looking only million detectives and 100...
3,"amikor gyerek volt tom , osszegyujtotte az apj...","When Tom was a kid, he used to collect his fat...","when tom was a child , he burned his father's ..."
4,"nemreg , ahogy setaltam a jardan , egy auto sz...","Earlier, as I was walking down the sidewalk, a...","earlier , as i was walking down the sidewalk ,..."
5,"sok fiatal , akinek piercingje van , egesz ele...",Many young people with piercings stay together...,many young people we ate the whole memories we...
6,ne adj kolcson konyveket senki nem adja oket v...,Don't lend books; no one gives them back. The ...,don't lend books not to return here . the only...
7,a tatoeba korpusban a hibak szamanak csokkente...,One way to lower the number of errors in the T...,one way in the tatoeba number should be able t...
8,"a tatoeba projekt , amely megtalalhato az inte...","The Tatoeba Project, which can be found online...","the tatoeba project , which can i recommend no..."
9,"a busz , amelyik most erkezik , az egyes termi...",The bus now arriving is going to Domestic Term...,"the bus now arriving on domestic terminal , pl..."
