# Import Libraries

In [1]:
import tensorflow as tf
import numpy as np
from read_data import read_file
from english_text_normalization import text_normalization
from text_processing import TextProcessing
import random
from tensorflow.keras.layers import LSTM, GRU, Attention, AdditiveAttention, MultiHeadAttention # type: ignore
from encoder_decoder_model import EncoderDecoderWithAttention, EncoderDecoderWithoutAttention
from tensorflow.keras.callbacks import EarlyStopping # type: ignore
import warnings
import os
# Seed Python, NumPy and TensorFlow with the same value so runs are repeatable
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Ignore Warnings

In [2]:
# Ask TensorFlow's native logger to stay quiet and silence Python warnings for the rest of the notebook.
# NOTE(review): `tensorflow` is already imported in the first cell, so this variable is set
# after the import. The usual guidance is to set it before `import tensorflow` — confirm it still takes effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
warnings.filterwarnings(action='ignore') # Suppress every Python warning category

# Set Early Stop For Encoder-Decoder Model

In [3]:
# One callback shared by every model below: it monitors the training loss
# (lower is better) with a patience of 5 epochs.
early_stops = EarlyStopping(
    monitor='loss',
    mode='min',
    patience=5,
)

# Read Data

In [4]:
# Load the question/answer pairs, passing the project's normalization function to the reader
Corpus = read_file(
    file_path='Question_Answering_Pairs.txt',
    text_normalization=text_normalization,
)

# Split The Data And Add Start And End Token

In [5]:
# Each corpus entry is split on '<sep>'; the parts are then regrouped column-wise into two lists
qa_pairs = [pair.split('<sep>') for pair in Corpus]
encoder_input, encoder_output = map(list, zip(*qa_pairs))
# Wrap every output line in start/end tokens
encoder_output = [" ".join(("<sos>", line, "<eos>")) for line in encoder_output]

# Process The Data 

**Input Data**

In [6]:
text_processing_input = TextProcessing(data=encoder_input) # Text pipeline for the encoder (input) side
all_words_input, words_input = text_processing_input.data_words(filter='[!"#$&*+,/:;=@[\\]^_`{|}~]') # Set With Unique Words And Find Number Of Unique Words
words_to_index_input = text_processing_input.words_to_index_(words=words_input) # Convert Word To Integer Index
index_to_words_input = text_processing_input.index_to_word_(words=words_input) # Reverse lookup (presumably integer index -> word; verify against TextProcessing)
input_sequences = text_processing_input.text_to_sequence(word_index=words_to_index_input) # Convert Text Into Sequences Of Integers
# Fix: measure the longest *token sequence*, not the longest raw string.
# `encoder_input` holds text, so len() on its items counts characters and made the padded
# encoder length far larger than any real sequence. The decoder side already measures
# its token sequences, so this keeps both sides consistent.
encoder_max_length = max(len(sequence) for sequence in input_sequences) # Find Max Length (in tokens)
pad_encoder_input = text_processing_input.sequences_padding(input_sequence=input_sequences, max_length=encoder_max_length) # Zero Padding (Add Zeros To End Of Sequence In Input Sequences To Make All Sequences In Same Length)
word_count_input = text_processing_input.word_counts() # Number Of Occurrences Of Each Word

**Output Data**

In [7]:
# Text pipeline for the decoder (output) side; the lines already carry <sos>/<eos>
text_processing_output = TextProcessing(data=encoder_output)

# Unique words of the output text and how many there are
all_words_output, words_output = text_processing_output.data_words(
    filter='[!"#$%&()*+,./:;=@[\\]^_`{|}~]'
)

# Word -> integer index lookup
words_to_index_output = text_processing_output.words_to_index_(words=words_output)

# Output text expressed as sequences of integers
output_sequences = text_processing_output.text_to_sequence(
    word_index=words_to_index_output
)

# Number of occurrences of each word
word_count_output = text_processing_output.word_counts()

In [8]:
# Show the output-side vocabulary size; it is reused below as the decoder `input_dim_decoder` and `unit`
all_words_output

609

# Generate Decoder Input And Output

In [9]:
def generate_decoder_inputs_targets(sequence):
  """Build shifted (decoder input, decoder target) pairs from token sequences.

  For every item in ``sequence`` the decoder input is the item without its
  last element and the decoder target is the item without its first element,
  so the target is the input shifted one step ahead.

  Args:
    sequence: Iterable of sliceable token sequences (e.g. lists of ints).
      It is traversed exactly once, so one-shot iterators also work
      (iterating twice would leave the targets empty for such inputs).

  Returns:
    Tuple ``(decoder_inputs, decoder_targets)`` of two lists, each aligned
    by position with ``sequence``.
  """
  decoder_inputs, decoder_targets = [], []
  for tokens in sequence:
    decoder_inputs.append(tokens[:-1])   # everything except the final token
    decoder_targets.append(tokens[1:])   # everything except the first token
  return decoder_inputs, decoder_targets
# Derive the shifted decoder inputs/targets from the integer output sequences
decoder_inputs, decoder_output = generate_decoder_inputs_targets(output_sequences)
# Longest decoder input, in tokens
decoder_max_length = max(map(len, decoder_inputs))
decoder_max_length

13

# Padding Decoder Data

In [10]:
# Pad decoder inputs and decoder targets to the same length (decoder_max_length)
pad_decoder_input, pad_decoder_output = (
    text_processing_output.sequences_padding(input_sequence=sequences, max_length=decoder_max_length)
    for sequences in (decoder_inputs, decoder_output)
)

# Encoder-Decoder Without Attention

**LSTM**

In [11]:
# Recurrent layers handed to the encoder-decoder wrapper (128 units each)
model_encoder_lstm = LSTM(units=128, return_sequences=True, return_state=True)
model_decoder_lstm = LSTM(units=128, return_sequences=True, return_state=True)

# Encoder-decoder without attention, built around the two LSTM layers
encoder_decoder_lstm = EncoderDecoderWithoutAttention(
    input_dim_encoder=all_words_input,
    input_dim_decoder=all_words_output,
    output_dim_encoder=100,
    output_dim_decoder=100,
    input_length_encoder=encoder_max_length,
    input_length_decoder=decoder_max_length,
    model_encoder=model_encoder_lstm,
    model_decoder=model_decoder_lstm,
    unit=all_words_output,
)

# Build the encoder first, then the decoder from the encoder states.
# NOTE: this rebinds `decoder_inputs`, which until now held the list of decoder token sequences.
encoder_inputs, encoder_states = encoder_decoder_lstm.encoder()
decoder_inputs, decoder_outputs = encoder_decoder_lstm.decoder(encoder_states=encoder_states)
model_lstm_1 = encoder_decoder_lstm.build_model()

encoder_decoder_lstm.model_compile(
    optimizer=tf.keras.optimizers.legacy.Adam(),
    loss='sparse_categorical_crossentropy',
    model=model_lstm_1,
    metrics=['sparse_categorical_accuracy'],
)

# Train on the padded data; the shared early-stopping callback may end training before epoch 100
history = encoder_decoder_lstm.model_fit(
    model=model_lstm_1,
    epochs=100,
    early_stop=early_stops,
    batch_size=32,
    encoder_input=pad_encoder_input,
    decoder_input=pad_decoder_input,
    decoder_output=pad_decoder_output,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

**GRU**

In [12]:
# Recurrent layers for the GRU variant of the attention-free encoder-decoder (128 units each)
model_encoder_gru = GRU(units=128, return_sequences=True, return_state=True)
model_decoder_gru = GRU(units=128, return_sequences=True, return_state=True)

# Fix: pass the GRU layers created above. The cell previously passed
# `model_encoder_lstm` / `model_decoder_lstm`, so the "GRU" model was really built
# on the LSTM layers of the previous cell and the GRU layers were never used.
encoder_decoder = EncoderDecoderWithoutAttention(
    input_dim_encoder=all_words_input,
    input_dim_decoder=all_words_output,
    output_dim_encoder=100,
    output_dim_decoder=100,
    input_length_encoder=encoder_max_length,
    input_length_decoder=decoder_max_length,
    model_encoder=model_encoder_gru,
    model_decoder=model_decoder_gru,
    unit=all_words_output,
)

# Build the encoder first, then the decoder from the encoder states
encoder_inputs, encoder_states = encoder_decoder.encoder()
decoder_inputs, decoder_outputs = encoder_decoder.decoder(encoder_states=encoder_states)
model_gru_1 = encoder_decoder.build_model()

encoder_decoder.model_compile(
    optimizer=tf.keras.optimizers.legacy.Adam(),
    loss='sparse_categorical_crossentropy',
    model=model_gru_1,
    metrics=['sparse_categorical_accuracy'],
)

# Train on the padded data; the shared early-stopping callback may end training before epoch 100
history = encoder_decoder.model_fit(
    model=model_gru_1,
    epochs=100,
    early_stop=early_stops,
    batch_size=32,
    encoder_input=pad_encoder_input,
    decoder_input=pad_decoder_input,
    decoder_output=pad_decoder_output,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

# Encoder-Decoder With Attention

**1-Luong Attention**

*LSTM*

In [13]:
# Fresh LSTM layers for the attention model (128 units each)
model_encoder_lstm = LSTM(units=128, return_sequences=True, return_state=True)
model_decoder_lstm = LSTM(units=128, return_sequences=True, return_state=True)
# Keras dot-product attention layer, used here as the Luong-style attention
attention_layer = Attention()

encoder_decoder = EncoderDecoderWithAttention(
    input_dim_encoder=all_words_input,
    input_dim_decoder=all_words_output,
    output_dim_encoder=100,
    output_dim_decoder=100,
    input_length_encoder=encoder_max_length,
    input_length_decoder=decoder_max_length,
    model_encoder=model_encoder_lstm,
    model_decoder=model_decoder_lstm,
    unit=all_words_output,
    attention_layer=attention_layer,
)

# The decoder receives both the encoder states and the encoder outputs.
# NOTE: this rebinds `encoder_output`, which earlier held the <sos>/<eos>-wrapped output text.
encoder_output, encoder_inputs, encoder_states = encoder_decoder.encoder_()
decoder_inputs, decoder_outputs = encoder_decoder.decoder_(
    encoder_states=encoder_states,
    encoder_outputs=encoder_output,
)
model_lstm_2 = encoder_decoder.build_model()

encoder_decoder.model_compile(
    optimizer=tf.keras.optimizers.legacy.Adam(),
    loss='sparse_categorical_crossentropy',
    model=model_lstm_2,
    metrics=['sparse_categorical_accuracy'],
)

history = encoder_decoder.model_fit(
    model=model_lstm_2,
    epochs=100,
    early_stop=early_stops,
    batch_size=32,
    encoder_input=pad_encoder_input,
    decoder_input=pad_decoder_input,
    decoder_output=pad_decoder_output,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

*GRU*

In [14]:
# Fresh GRU layers for the attention model (128 units each)
model_encoder_gru = GRU(units=128, return_sequences=True, return_state=True)
model_decoder_gru = GRU(units=128, return_sequences=True, return_state=True)
# Keras dot-product attention layer, used here as the Luong-style attention
attention_layer = Attention()

encoder_decoder = EncoderDecoderWithAttention(
    input_dim_encoder=all_words_input,
    input_dim_decoder=all_words_output,
    output_dim_encoder=100,
    output_dim_decoder=100,
    input_length_encoder=encoder_max_length,
    input_length_decoder=decoder_max_length,
    model_encoder=model_encoder_gru,
    model_decoder=model_decoder_gru,
    unit=all_words_output,
    attention_layer=attention_layer,
)

# The decoder receives both the encoder states and the encoder outputs
encoder_output, encoder_inputs, encoder_states = encoder_decoder.encoder_()
decoder_inputs, decoder_outputs = encoder_decoder.decoder_(
    encoder_states=encoder_states,
    encoder_outputs=encoder_output,
)
model_gru_2 = encoder_decoder.build_model()

encoder_decoder.model_compile(
    optimizer=tf.keras.optimizers.legacy.Adam(),
    loss='sparse_categorical_crossentropy',
    model=model_gru_2,
    metrics=['sparse_categorical_accuracy'],
)

history = encoder_decoder.model_fit(
    model=model_gru_2,
    epochs=100,
    early_stop=early_stops,
    batch_size=32,
    encoder_input=pad_encoder_input,
    decoder_input=pad_decoder_input,
    decoder_output=pad_decoder_output,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

**2-Bahdanau Attention**

*LSTM*

In [15]:
# Fresh LSTM layers for the attention model (128 units each)
model_encoder_lstm = LSTM(units=128, return_sequences=True, return_state=True)
model_decoder_lstm = LSTM(units=128, return_sequences=True, return_state=True)
# Keras additive attention layer, used here as the Bahdanau-style attention
attention_layer = AdditiveAttention()

encoder_decoder = EncoderDecoderWithAttention(
    input_dim_encoder=all_words_input,
    input_dim_decoder=all_words_output,
    output_dim_encoder=100,
    output_dim_decoder=100,
    input_length_encoder=encoder_max_length,
    input_length_decoder=decoder_max_length,
    model_encoder=model_encoder_lstm,
    model_decoder=model_decoder_lstm,
    unit=all_words_output,
    attention_layer=attention_layer,
)

# The decoder receives both the encoder states and the encoder outputs
encoder_output, encoder_inputs, encoder_states = encoder_decoder.encoder_()
decoder_inputs, decoder_outputs = encoder_decoder.decoder_(
    encoder_states=encoder_states,
    encoder_outputs=encoder_output,
)
model_lstm_3 = encoder_decoder.build_model()

encoder_decoder.model_compile(
    optimizer=tf.keras.optimizers.legacy.Adam(),
    loss='sparse_categorical_crossentropy',
    model=model_lstm_3,
    metrics=['sparse_categorical_accuracy'],
)

history = encoder_decoder.model_fit(
    model=model_lstm_3,
    epochs=100,
    early_stop=early_stops,
    batch_size=32,
    encoder_input=pad_encoder_input,
    decoder_input=pad_decoder_input,
    decoder_output=pad_decoder_output,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

*GRU*

In [16]:
# Fresh GRU layers for the attention model (128 units each)
model_encoder_gru = GRU(units=128, return_sequences=True, return_state=True)
model_decoder_gru = GRU(units=128, return_sequences=True, return_state=True)
# Fix: this section is the Bahdanau (additive) experiment, so use AdditiveAttention.
# The cell previously created `Attention()` (dot-product), which made it a silent
# repeat of the Luong GRU run instead of the additive-attention counterpart of the LSTM cell.
attention_layer = AdditiveAttention()

encoder_decoder = EncoderDecoderWithAttention(
    input_dim_encoder=all_words_input,
    input_dim_decoder=all_words_output,
    output_dim_encoder=100,
    output_dim_decoder=100,
    input_length_encoder=encoder_max_length,
    input_length_decoder=decoder_max_length,
    model_encoder=model_encoder_gru,
    model_decoder=model_decoder_gru,
    unit=all_words_output,
    attention_layer=attention_layer,
)

# The decoder receives both the encoder states and the encoder outputs
encoder_output, encoder_inputs, encoder_states = encoder_decoder.encoder_()
decoder_inputs, decoder_outputs = encoder_decoder.decoder_(
    encoder_states=encoder_states,
    encoder_outputs=encoder_output,
)
model_gru_3 = encoder_decoder.build_model()

encoder_decoder.model_compile(
    optimizer=tf.keras.optimizers.legacy.Adam(),
    loss='sparse_categorical_crossentropy',
    model=model_gru_3,
    metrics=['sparse_categorical_accuracy'],
)

# Train on the padded data; the shared early-stopping callback may end training before epoch 100
history = encoder_decoder.model_fit(
    model=model_gru_3,
    epochs=100,
    early_stop=early_stops,
    batch_size=32,
    encoder_input=pad_encoder_input,
    decoder_input=pad_decoder_input,
    decoder_output=pad_decoder_output,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

**3-MultiHead Attention**

*LSTM*

In [17]:
# Fresh LSTM layers for the attention model (128 units each)
model_encoder_lstm = LSTM(units=128, return_sequences=True, return_state=True)
model_decoder_lstm = LSTM(units=128, return_sequences=True, return_state=True)
# Multi-head attention with 5 heads and a key dimension of 64
attention_layer = MultiHeadAttention(num_heads=5, key_dim=64)

encoder_decoder = EncoderDecoderWithAttention(
    input_dim_encoder=all_words_input,
    input_dim_decoder=all_words_output,
    output_dim_encoder=100,
    output_dim_decoder=100,
    input_length_encoder=encoder_max_length,
    input_length_decoder=decoder_max_length,
    model_encoder=model_encoder_lstm,
    model_decoder=model_decoder_lstm,
    unit=all_words_output,
    attention_layer=attention_layer,
)

# The decoder receives both the encoder states and the encoder outputs
encoder_output, encoder_inputs, encoder_states = encoder_decoder.encoder_()
decoder_inputs, decoder_outputs = encoder_decoder.decoder_(
    encoder_states=encoder_states,
    encoder_outputs=encoder_output,
)
model_lstm_4 = encoder_decoder.build_model()

encoder_decoder.model_compile(
    optimizer=tf.keras.optimizers.legacy.Adam(),
    loss='sparse_categorical_crossentropy',
    model=model_lstm_4,
    metrics=['sparse_categorical_accuracy'],
)

# Training for this model is explicitly placed on the CPU device
with tf.device('cpu'):
    history = encoder_decoder.model_fit(
        model=model_lstm_4,
        epochs=100,
        early_stop=early_stops,
        batch_size=32,
        encoder_input=pad_encoder_input,
        decoder_input=pad_decoder_input,
        decoder_output=pad_decoder_output,
    )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100


*GRU*

In [18]:
# Fresh GRU layers for the attention model (128 units each)
model_encoder_gru = GRU(units=128, return_sequences=True, return_state=True)
model_decoder_gru = GRU(units=128, return_sequences=True, return_state=True)
# Multi-head attention with 5 heads and a key dimension of 64
attention_layer = MultiHeadAttention(num_heads=5, key_dim=64)

encoder_decoder = EncoderDecoderWithAttention(
    input_dim_encoder=all_words_input,
    input_dim_decoder=all_words_output,
    output_dim_encoder=100,
    output_dim_decoder=100,
    input_length_encoder=encoder_max_length,
    input_length_decoder=decoder_max_length,
    model_encoder=model_encoder_gru,
    model_decoder=model_decoder_gru,
    unit=all_words_output,
    attention_layer=attention_layer,
)

# The decoder receives both the encoder states and the encoder outputs
encoder_output, encoder_inputs, encoder_states = encoder_decoder.encoder_()
decoder_inputs, decoder_outputs = encoder_decoder.decoder_(
    encoder_states=encoder_states,
    encoder_outputs=encoder_output,
)
model_gru_4 = encoder_decoder.build_model()

encoder_decoder.model_compile(
    optimizer=tf.keras.optimizers.legacy.Adam(),
    loss='sparse_categorical_crossentropy',
    model=model_gru_4,
    metrics=['sparse_categorical_accuracy'],
)

history = encoder_decoder.model_fit(
    model=model_gru_4,
    epochs=100,
    early_stop=early_stops,
    batch_size=32,
    encoder_input=pad_encoder_input,
    decoder_input=pad_decoder_input,
    decoder_output=pad_decoder_output,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
