Reference : https://towardsdatascience.com/how-to-build-an-encoder-decoder-translation-model-using-lstm-with-python-and-keras-a31e9d864b9b

# Import Dependencies

In [3]:
!pip install keras_preprocessing

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras_preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 KB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras_preprocessing
Successfully installed keras_preprocessing-1.1.2


In [4]:
import string
import numpy as np

from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
from keras_preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import LSTM,Input,TimeDistributed,Dense,Activation,RepeatVector,Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

# Loading data

In [5]:
# Path to translation file
path_to_data = '/content/drive/MyDrive/AI_Research/spa.txt'

In [27]:
# Read File
translation_file = open(path_to_data,"r", encoding='utf-8') 
raw_data = translation_file.read()
translation_file.close()

# Data Preprocessing

In [28]:
# parse data
raw_data = raw_data.split('\n')
pairs = [sentence.split('\t') for sentence in raw_data]
pairs = pairs[1000:]

In [29]:
pairs[10]

['Sit tight.',
 'Tú mantente.',
 'CC-BY 2.0 (France) Attribution: tatoeba.org #40196 (CM) & #5769260 (arh)']

In [30]:
# Remove Capital 

def clean_sentence(sentence):
  # Lower case the sentence
  lower_case_sent = sentence.lower()

  # Strip punctuations
  string_punctuation = string.punctuation+'i'+ '¿'
  clean_sentence = lower_case_sent.translate(str.maketrans('','',string_punctuation))

  return clean_sentence

# Create Tokenizer

In [31]:
def tokenize(sentences):
    # Create tokenizer
    text_tokenizer = Tokenizer()
    # Fit texts
    text_tokenizer.fit_on_texts(sentences)
    return text_tokenizer.texts_to_sequences(sentences), text_tokenizer

In [32]:
# Clean sentences
english_sentences = [clean_sentence(pair[0]) for pair in pairs]
spanish_sentences = [clean_sentence(pair[1]) for pair in pairs]

# Tokenize words
spa_text_tokenized,spa_text_tokenizer = tokenize(spanish_sentences)
eng_text_tokenized,eng_text_tokenizer = tokenize(english_sentences)


IndexError: ignored

# Maximum Sentence Lengths

In [None]:
# Finding Maximum Lengths of Sentences
print('Maximum length spanish sentence: {}'.format(len(max(spa_text_tokenized,key=len))))
print('Maximum length english sentence: {}'.format(len(max(eng_text_tokenized,key=len))))

# Vocabulary Size

In [None]:
# Find Vocab Length
spanish_vocab = max(spa_text_tokenizer.word_index.values())
english_vocab = max(eng_text_tokenizer.word_index.values())

print("Spanish vocabulary is of {} unique words".format(spanish_vocab))
print("English vocabulary is of {} unique words".format(english_vocab))

# Padding

In [None]:
max_spanish_len = int(len(max(spa_text_tokenized,key=len)))
max_english_len = int(len(max(eng_text_tokenized,key=len)))

In [None]:
spa_pad_sentence = pad_sequences(spa_text_tokenized, max_spanish_len, padding = "post")
eng_pad_sentence = pad_sequences(eng_text_tokenized, max_english_len, padding = "post")

In [None]:
# Reshape data
spa_pad_sentence = spa_pad_sentence.reshape(*spa_pad_sentence.shape, 1)
eng_pad_sentence = eng_pad_sentence.reshape(*eng_pad_sentence.shape, 1)

# Model Creation

## Encoder

In [33]:
input_sequence = Input(shape=(max_spanish_len,))
embedding = Embedding(input_dim=spanish_vocab, output_dim=128,)(input_sequence)
encoder = LSTM(64, return_sequences=False)(embedding)
r_vec = RepeatVector(max_english_len)(encoder) # to have the hidden state vector to be passed in each time step of the decoder

## Decoder

In [34]:
decoder = LSTM(64, return_sequences=True, dropout=0.2)(r_vec)
output = TimeDistributed(Dense(english_vocab,activation='relu'))(decoder) # TimeDistributed is used to apply the same Dense layer for each time step

In [35]:
enc_dec_model =  Model(input_sequence, output)

In [46]:
# Compile the model
enc_dec_model.compile(loss=sparse_categorical_crossentropy,
                     optimizer='adam',
                     metrics = ['accuracy'])
enc_dec_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 9)]               0         
                                                                 
 embedding_1 (Embedding)     (None, 9, 128)            939776    
                                                                 
 lstm_2 (LSTM)               (None, 64)                49408     
                                                                 
 repeat_vector_1 (RepeatVect  (None, 5, 64)            0         
 or)                                                             
                                                                 
 lstm_3 (LSTM)               (None, 5, 64)             33024     
                                                                 
 time_distributed_1 (TimeDis  (None, 5, 3705)          240825    
 tributed)                                                 

In [47]:
# Create early stopping callback
import tensorflow as tf
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='accuracy', min_delta=0.05,
                                                           patience=3, restore_best_weights=True)

In [48]:
# Train the Model

model_results = enc_dec_model.fit(spa_pad_sentence, eng_pad_sentence, batch_size=30, epochs=100,callbacks=early_stopping)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100


# Making Translation

In [44]:
def logits_to_sentence(logits, tokenizer):

    index_to_words = {idx: word for word, idx in tokenizer.word_index.items()}
    index_to_words[0] = '<empty>' 

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

In [45]:
index = 14
print("The english sentence is: {}".format(english_sentences[index]))
print("The spanish sentence is: {}".format(spanish_sentences[index]))
print('The predicted sentence is :')
print(logits_to_sentence(enc_dec_model.predict(spa_pad_sentence[index:index+1])[0], eng_text_tokenizer))

The english sentence is: stay away
The spanish sentence is: aléjate
The predicted sentence is :
<empty> <empty> <empty> <empty> <empty>
