# Import Libraries

In [1]:
import tensorflow as tf
import numpy as np
from src.read_data import read_file
from src.text_normalization import english_text_normalization
from src.text_processing import data_words, words_to_index_, index_to_word_, word_counts, text_to_sequence, sequences_padding, word_sequence_to_text
import random
from tensorflow.keras.layers import Attention, AdditiveAttention, MultiHeadAttention # type: ignore
from tensorflow.keras.layers import LSTM, GRU, Bidirectional# type: ignore
from tensorflow.keras.callbacks import EarlyStopping # type: ignore
from src.data_split import input_output_split, decoder_input_output
from src.encoder import encoder_without_attention, encoder_with_attention
from src.decoder import decoder_without_attention, decoder_with_attention
from src.encoder_decoder_model import build_model, model_compile, model_fit, save_model, summary
from src.transformers import transformers
import warnings
import os
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# Ignore Warnings

In [2]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
warnings.filterwarnings(action='ignore')

# Set Early Stop For Encoder-Decoder Model

In [3]:
early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

# Read Data

In [5]:
Corpus = read_file(file_path='Question Answer Data.txt')
len(Corpus)

3838

In [6]:
random.seed(44)
Corpus = random.sample(Corpus, len(Corpus))

# Split The Data Into Encoder Input And Output

In [7]:
context, question, answer = input_output_split(data=Corpus, text_normalization1=english_text_normalization, text_normalization2=english_text_normalization, text_normalization3 =english_text_normalization)

# Text Processing

**1-Word To Integer (Tokenizer)**

In [8]:
all_words , words = data_words(filter='"#$&*+/:=@[\\]^_{|}~', data=(context + question + answer)) # Set With Unique Words And Find Number Of Unique Words
words_to_index = words_to_index_(words=words) # Convert Word To Integer Index
index_to_words = index_to_word_(words=words)

In [9]:
all_words

10876

**2-Data To Sequence**

In [10]:
context_sequences = text_to_sequence(word_index=words_to_index, data=context) # Convert Text Into Squences Of Integer
question_sequences = text_to_sequence(word_index=words_to_index, data=question) # Convert Text Into Squences Of Integer
answer_sequences = text_to_sequence(word_index=words_to_index, data=answer) # Convert Text Into Squences Of Integer

**3-Max Length**

In [11]:
context_max_length = len(max(context_sequences, key = len)) # Find Max Length
question_max_length = len(max(question_sequences, key = len))
max_length = max(context_max_length, question_max_length)
answer_max_length = len(max(answer_sequences, key=len))# Find Max Length

**4-Encoder Zero Padding**

In [12]:
pad_context = sequences_padding(input_sequence=context_sequences, max_length=max_length) # Zero Padding (Add Zeros To End Of Sequence In Input Sequences To Make All Sequences In Same Length)
pad_question = sequences_padding(input_sequence=question_sequences, max_length=max_length) # Zero Padding (Add Zeros To End Of Sequence In Input Sequences To Make All Sequences In Same Length)

**5-Generate Decoder Input And Output**

In [13]:
decoder_input, decoder_output = decoder_input_output(output_data= answer_sequences)

**6-Decoder Zero Padding**

In [14]:
pad_decoder_input = sequences_padding(input_sequence=decoder_input, max_length=answer_max_length) # Zero Padding (Add Zeros To End Of Sequence In Input Sequences To Make All Sequences In Same Length)
pad_decoder_output = sequences_padding(input_sequence=decoder_output, max_length=answer_max_length) # Zero Padding (Add Zeros To End Of Sequence In Input Sequences To Make All Sequences In Same Length)

**7-Word Occurrence**

In [15]:
word_count = word_counts(data=(context + question + answer)) # Number Of Occurrence Each Word

# Encoder-Decoder Without Attention

In [16]:
early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

**1-LSTM**

In [17]:
encoder_model_1 = LSTM(units=128, return_sequences=True, return_state=True, seed=33)
context_inputs, question_inputs, encoder_states = encoder_without_attention(input_dim_encoder = all_words, output_dim_encoder = 100, encoder_model= encoder_model_1)

decoder_model_1 = LSTM(units=128, return_sequences=True, return_state=True, seed=33)
decoder_inputs, decoder_outputs  = decoder_without_attention(encoder_states = encoder_states, input_dim_decoder = all_words, output_dim_decoder = 128, unit = all_words,  decoder_model=decoder_model_1)

model_1 = build_model(context_input= context_inputs, question_input= question_inputs, decoder_inputs = decoder_inputs, decoder_output= decoder_outputs)
summary(model=model_1)

In [18]:
model_compile(model = model_1, optimizer = tf.keras.optimizers.Adam(), loss = 'sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
history = model_fit(model = model_1, epochs = 120, batch_size = 32, context_data =  pad_context, question_data=pad_question, decoder_input = pad_decoder_input, decoder_output = pad_decoder_output, early_stop = early_stopping)
save_model(model=model_1,name="LSTM_1")

Epoch 1/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 191ms/step - loss: 7.7110 - sparse_categorical_accuracy: 0.0434
Epoch 2/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 246ms/step - loss: 5.7003 - sparse_categorical_accuracy: 0.0455
Epoch 3/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 193ms/step - loss: 5.4950 - sparse_categorical_accuracy: 0.0482
Epoch 4/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 166ms/step - loss: 5.3429 - sparse_categorical_accuracy: 0.0493
Epoch 5/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 177ms/step - loss: 5.2454 - sparse_categorical_accuracy: 0.0495
Epoch 6/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 319ms/step - loss: 5.1660 - sparse_categorical_accuracy: 0.0495
Epoch 7/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 224ms/step - loss: 5.0905 - sparse_categorical_accuracy: 0.0495

In [19]:
early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

**2-GRU**

In [20]:
encoder_model_2 = GRU(units=128, return_sequences=True, return_state=True, seed=33)
context_inputs, question_inputs, encoder_states = encoder_without_attention(input_dim_encoder = all_words, output_dim_encoder = 100, encoder_model= encoder_model_2)

decoder_model_2 = GRU(units=128, return_sequences=True, return_state=True, seed=33)
decoder_inputs, decoder_outputs  = decoder_without_attention(encoder_states = encoder_states, input_dim_decoder = all_words, output_dim_decoder = 128, unit = all_words,  decoder_model=decoder_model_2)

model_2 = build_model(context_input= context_inputs, question_input= question_inputs, decoder_inputs = decoder_inputs, decoder_output= decoder_outputs)
summary(model=model_2)

In [21]:
model_compile(model = model_2, optimizer = tf.keras.optimizers.Adam(), loss = 'sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
history = model_fit(model = model_2, epochs = 120, batch_size = 32, context_data =  pad_context, question_data=pad_question, decoder_input = pad_decoder_input, decoder_output = pad_decoder_output, early_stop = early_stopping)
save_model(model=model_2, name="GRU_1")

Epoch 1/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 146ms/step - loss: 7.7364 - sparse_categorical_accuracy: 0.0426
Epoch 2/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 141ms/step - loss: 5.7934 - sparse_categorical_accuracy: 0.0459
Epoch 3/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 145ms/step - loss: 5.5389 - sparse_categorical_accuracy: 0.0487
Epoch 4/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 140ms/step - loss: 5.3986 - sparse_categorical_accuracy: 0.0492
Epoch 5/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 140ms/step - loss: 5.2777 - sparse_categorical_accuracy: 0.0493
Epoch 6/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 148ms/step - loss: 5.1630 - sparse_categorical_accuracy: 0.0495
Epoch 7/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 138ms/step - loss: 5.0242 - sparse_categorical_accuracy: 0.0494

In [22]:
early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

**3-Bidirectional**

In [23]:
encoder_model_3 = Bidirectional(LSTM(units=128, return_sequences=True, return_state=True, seed=33))
context_inputs, question_inputs, encoder_states = encoder_without_attention(input_dim_encoder = all_words, output_dim_encoder = 100, encoder_model= encoder_model_3)

decoder_model_3 = LSTM(units=256, return_sequences=True, return_state=True, seed=33)
decoder_inputs, decoder_outputs  = decoder_without_attention(encoder_states = encoder_states, input_dim_decoder = all_words, output_dim_decoder = 128, unit = all_words,  decoder_model=decoder_model_3)

model_3 =  build_model(context_input= context_inputs, question_input= question_inputs, decoder_inputs = decoder_inputs, decoder_output= decoder_outputs)
summary(model=model_3)

In [24]:
model_compile(model = model_3, optimizer = tf.keras.optimizers.Adam(), loss = 'sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
history = model_fit(model = model_3, epochs = 120, batch_size = 32, context_data =  pad_context, question_data=pad_question, decoder_input = pad_decoder_input, decoder_output = pad_decoder_output, early_stop = early_stopping)
save_model(model=model_3,name="Bidirectional_1")

Epoch 1/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 246ms/step - loss: 7.4406 - sparse_categorical_accuracy: 0.0434
Epoch 2/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 307ms/step - loss: 5.6273 - sparse_categorical_accuracy: 0.0465
Epoch 3/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 273ms/step - loss: 5.3938 - sparse_categorical_accuracy: 0.0486
Epoch 4/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 267ms/step - loss: 5.2318 - sparse_categorical_accuracy: 0.0493
Epoch 5/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 284ms/step - loss: 5.1143 - sparse_categorical_accuracy: 0.0494
Epoch 6/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 266ms/step - loss: 5.0063 - sparse_categorical_accuracy: 0.0495
Epoch 7/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 284ms/step - loss: 4.8886 - sparse_categorical_accuracy: 0.0496

# Encoder-Decoder With Attention

**1-Luong Attension**

In [25]:
early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

*A-LSTM*

In [26]:
encoder_model_4 = LSTM(units=128, return_sequences=True, return_state=True, seed=33)
context_inputs, question_inputs, encoder_states, encoder_outputs = encoder_with_attention(input_dim_encoder = all_words, output_dim_encoder = 100, encoder_model= encoder_model_4)

attention_layer = Attention(use_scale=True)
decoder_model_4 = LSTM(units=128, return_sequences=True, return_state=True, seed=33)
decoder_inputs, decoder_outputs  = decoder_with_attention(encoder_states = encoder_states, input_dim_decoder = all_words, output_dim_decoder = 128, unit = all_words,  decoder_model=decoder_model_4, attention_layer=attention_layer, encoder_outputs=encoder_outputs)

model_4 = build_model(context_input= context_inputs, question_input= question_inputs, decoder_inputs = decoder_inputs, decoder_output= decoder_outputs)
summary(model=model_4)

In [27]:
model_compile(model = model_4, optimizer = tf.keras.optimizers.Adam(), loss = 'sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
history = model_fit(model = model_4, epochs = 120, batch_size = 32, context_data =  pad_context, question_data=pad_question, decoder_input = pad_decoder_input, decoder_output = pad_decoder_output, early_stop = early_stopping)
save_model(model=model_4, name="LSTM_2")

Epoch 1/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 199ms/step - loss: 3.7715 - sparse_categorical_accuracy: 0.8170
Epoch 2/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 198ms/step - loss: 0.9443 - sparse_categorical_accuracy: 0.8892
Epoch 3/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 195ms/step - loss: 0.9076 - sparse_categorical_accuracy: 0.8924
Epoch 4/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 199ms/step - loss: 0.8749 - sparse_categorical_accuracy: 0.8926
Epoch 5/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 195ms/step - loss: 0.8517 - sparse_categorical_accuracy: 0.8931
Epoch 6/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 233ms/step - loss: 0.8348 - sparse_categorical_accuracy: 0.8934
Epoch 7/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 241ms/step - loss: 0.8212 - sparse_categorical_accuracy: 0.8934

*B-GRU*

In [28]:
early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

In [29]:
encoder_model_5 = GRU(units=128, return_sequences=True, return_state=True, seed=33)
context_inputs, question_inputs, encoder_states, encoder_outputs = encoder_with_attention(input_dim_encoder = all_words, output_dim_encoder = 100, encoder_model= encoder_model_5)

attention_layer = Attention(use_scale=True)
decoder_model_5 = GRU(units=128, return_sequences=True, return_state=True, seed=33)
decoder_inputs, decoder_outputs  = decoder_with_attention(encoder_states = encoder_states, input_dim_decoder = all_words, output_dim_decoder = 128, unit = all_words,  decoder_model=decoder_model_5, attention_layer=attention_layer, encoder_outputs=encoder_outputs)

model_5 = build_model(context_input= context_inputs, question_input= question_inputs, decoder_inputs = decoder_inputs, decoder_output= decoder_outputs)
summary(model=model_5)

In [30]:
model_compile(model = model_5, optimizer = tf.keras.optimizers.Adam(), loss = 'sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
history = model_fit(model = model_5, epochs = 120, batch_size = 32, context_data =  pad_context, question_data=pad_question, decoder_input = pad_decoder_input, decoder_output = pad_decoder_output, early_stop = early_stopping)
save_model(model=model_5, name="GRU_2")

Epoch 1/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 190ms/step - loss: 3.7114 - sparse_categorical_accuracy: 0.8213
Epoch 2/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 179ms/step - loss: 0.9399 - sparse_categorical_accuracy: 0.8897
Epoch 3/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 180ms/step - loss: 0.8817 - sparse_categorical_accuracy: 0.8922
Epoch 4/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 192ms/step - loss: 0.8419 - sparse_categorical_accuracy: 0.8930
Epoch 5/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 184ms/step - loss: 0.8096 - sparse_categorical_accuracy: 0.8936
Epoch 6/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 181ms/step - loss: 0.7831 - sparse_categorical_accuracy: 0.8942
Epoch 7/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 259ms/step - loss: 0.7596 - sparse_categorical_accuracy: 0.8945

*C-Bidirectional*

In [31]:
early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

In [32]:
encoder_model_6 = Bidirectional(LSTM(units=128, return_sequences=True, return_state=True, seed=33))
context_inputs, question_inputs, encoder_states, encoder_outputs = encoder_with_attention(input_dim_encoder = all_words, output_dim_encoder = 100, encoder_model= encoder_model_6)

attention_layer = Attention(use_scale=True)
decoder_model_6 = LSTM(units=256, return_sequences=True, return_state=True, seed=33)
decoder_inputs, decoder_outputs  = decoder_with_attention(encoder_states = encoder_states, input_dim_decoder = all_words, output_dim_decoder = 128, unit = all_words,  decoder_model=decoder_model_6, attention_layer=attention_layer, encoder_outputs=encoder_outputs)

model_6 = build_model(context_input= context_inputs, question_input= question_inputs, decoder_inputs = decoder_inputs, decoder_output= decoder_outputs)
summary(model=model_6)

In [33]:
model_compile(model = model_6, optimizer = tf.keras.optimizers.Adam(), loss = 'sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
history = model_fit(model = model_6, epochs = 120, batch_size = 32, context_data =  pad_context, question_data=pad_question, decoder_input = pad_decoder_input, decoder_output = pad_decoder_output, early_stop = early_stopping)
save_model(model=model_6, name="Bidirectional_2")

Epoch 1/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 354ms/step - loss: 3.0896 - sparse_categorical_accuracy: 0.8193
Epoch 2/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 471ms/step - loss: 0.9439 - sparse_categorical_accuracy: 0.8908
Epoch 3/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 409ms/step - loss: 0.9045 - sparse_categorical_accuracy: 0.8925
Epoch 4/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 403ms/step - loss: 0.8573 - sparse_categorical_accuracy: 0.8930
Epoch 5/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 452ms/step - loss: 0.8185 - sparse_categorical_accuracy: 0.8934
Epoch 6/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 429ms/step - loss: 0.7968 - sparse_categorical_accuracy: 0.8934
Epoch 7/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 403ms/step - loss: 0.7809 - sparse_categorical_accuracy: 0.8936

**2-Bahdanau Attension**

*A-LSTM*

In [34]:
early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

In [35]:
encoder_model_7 = LSTM(units=128, return_sequences=True, return_state=True, seed=33)
context_inputs, question_inputs, encoder_states, encoder_outputs = encoder_with_attention(input_dim_encoder = all_words, output_dim_encoder = 100, encoder_model= encoder_model_7)

attention_layer = AdditiveAttention(use_scale=True)
decoder_model_7 = LSTM(units=128, return_sequences=True, return_state=True, seed=33)
decoder_inputs, decoder_outputs  = decoder_with_attention(encoder_states = encoder_states, input_dim_decoder = all_words, output_dim_decoder = 128, unit = all_words,  decoder_model=decoder_model_7, attention_layer=attention_layer, encoder_outputs=encoder_outputs)

model_7 = build_model(context_input= context_inputs, question_input= question_inputs, decoder_inputs = decoder_inputs, decoder_output= decoder_outputs)
summary(model=model_7)

In [36]:
model_compile(model = model_7, optimizer = tf.keras.optimizers.Adam(), loss = 'sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
history = model_fit(model = model_7, epochs = 120, batch_size = 32, context_data =  pad_context, question_data=pad_question, decoder_input = pad_decoder_input, decoder_output = pad_decoder_output, early_stop = early_stopping)
save_model(model=model_7, name="LSTM_3")

Epoch 1/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 276ms/step - loss: 3.8254 - sparse_categorical_accuracy: 0.8149
Epoch 2/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 203ms/step - loss: 0.9530 - sparse_categorical_accuracy: 0.8891
Epoch 3/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 207ms/step - loss: 0.9181 - sparse_categorical_accuracy: 0.8904
Epoch 4/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 200ms/step - loss: 0.8780 - sparse_categorical_accuracy: 0.8925
Epoch 5/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 199ms/step - loss: 0.8426 - sparse_categorical_accuracy: 0.8932
Epoch 6/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 210ms/step - loss: 0.8199 - sparse_categorical_accuracy: 0.8933
Epoch 7/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 289ms/step - loss: 0.7977 - sparse_categorical_accuracy: 0.8932

*B-GRU*

In [37]:
early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

In [38]:
encoder_model_8 = GRU(units=128, return_sequences=True, return_state=True, seed=33)
context_inputs, question_inputs, encoder_states, encoder_outputs = encoder_with_attention(input_dim_encoder = all_words, output_dim_encoder = 100, encoder_model= encoder_model_8)

attention_layer = Attention(use_scale=True)
decoder_model_8 = GRU(units=128, return_sequences=True, return_state=True, seed=33)
decoder_inputs, decoder_outputs  = decoder_with_attention(encoder_states = encoder_states, input_dim_decoder = all_words, output_dim_decoder = 128, unit = all_words,  decoder_model=decoder_model_8, attention_layer=attention_layer, encoder_outputs=encoder_outputs)

model_8 = build_model(context_input= context_inputs, question_input= question_inputs, decoder_inputs = decoder_inputs, decoder_output= decoder_outputs)
summary(model=model_8)

In [39]:
model_compile(model = model_8, optimizer = tf.keras.optimizers.Adam(), loss = 'sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
history = model_fit(model = model_8, epochs = 120, batch_size = 32, context_data =  pad_context, question_data=pad_question, decoder_input = pad_decoder_input, decoder_output = pad_decoder_output, early_stop = early_stopping)
save_model(model=model_8, name="GRU_3")

Epoch 1/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 228ms/step - loss: 3.6251 - sparse_categorical_accuracy: 0.8205
Epoch 2/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 226ms/step - loss: 0.9429 - sparse_categorical_accuracy: 0.8895
Epoch 3/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 179ms/step - loss: 0.8934 - sparse_categorical_accuracy: 0.8923
Epoch 4/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 180ms/step - loss: 0.8464 - sparse_categorical_accuracy: 0.8927
Epoch 5/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 226ms/step - loss: 0.8121 - sparse_categorical_accuracy: 0.8935
Epoch 6/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 224ms/step - loss: 0.7821 - sparse_categorical_accuracy: 0.8939
Epoch 7/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 212ms/step - loss: 0.7569 - sparse_categorical_accuracy: 0.8942

*C-Bidirectional*

In [40]:
early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

In [41]:
encoder_model_9 = Bidirectional(LSTM(units=128, return_sequences=True, return_state=True, seed=33))
context_inputs, question_inputs, encoder_states, encoder_outputs = encoder_with_attention(input_dim_encoder = all_words, output_dim_encoder = 100, encoder_model= encoder_model_9)

attention_layer = AdditiveAttention(use_scale=True)
decoder_model_9 = LSTM(units=256, return_sequences=True, return_state=True, seed=33)
decoder_inputs, decoder_outputs  = decoder_with_attention(encoder_states = encoder_states, input_dim_decoder = all_words, output_dim_decoder = 128, unit = all_words,  decoder_model=decoder_model_9, attention_layer=attention_layer, encoder_outputs=encoder_outputs)

model_9 = build_model(context_input= context_inputs, question_input= question_inputs, decoder_inputs = decoder_inputs, decoder_output= decoder_outputs)
summary(model=model_9)

In [42]:
model_compile(model = model_9, optimizer = tf.keras.optimizers.Adam(), loss = 'sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
history = model_fit(model = model_9, epochs = 120, batch_size = 32, context_data =  pad_context, question_data=pad_question, decoder_input = pad_decoder_input, decoder_output = pad_decoder_output, early_stop = early_stopping)
save_model(model=model_9, name="Bidirectional_3")

Epoch 1/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 462ms/step - loss: 3.0767 - sparse_categorical_accuracy: 0.8190
Epoch 2/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 431ms/step - loss: 0.9419 - sparse_categorical_accuracy: 0.8904
Epoch 3/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 428ms/step - loss: 0.8960 - sparse_categorical_accuracy: 0.8925
Epoch 4/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 533ms/step - loss: 0.8484 - sparse_categorical_accuracy: 0.8932
Epoch 5/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 440ms/step - loss: 0.8047 - sparse_categorical_accuracy: 0.8936
Epoch 6/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 439ms/step - loss: 0.7739 - sparse_categorical_accuracy: 0.8939
Epoch 7/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 448ms/step - loss: 0.7448 - sparse_categorical_accuracy: 0.8937

**3-MultiHead Attesnion**

*A-LSTM*

In [43]:
early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

In [44]:
encoder_model_10 = LSTM(units=128, return_sequences=True, return_state=True, seed=33)
context_inputs, question_inputs, encoder_states, encoder_outputs = encoder_with_attention(input_dim_encoder = all_words, output_dim_encoder = 100, encoder_model= encoder_model_10)

attention_layer = MultiHeadAttention(num_heads=4, key_dim=32)
decoder_model_10 = LSTM(units=128, return_sequences=True, return_state=True, seed=33)
decoder_inputs, decoder_outputs  = decoder_with_attention(encoder_states = encoder_states, input_dim_decoder = all_words, output_dim_decoder = 128, unit = all_words,  decoder_model= decoder_model_10, attention_layer=attention_layer, encoder_outputs=encoder_outputs)

model_10 = build_model(context_input= context_inputs, question_input= question_inputs, decoder_inputs = decoder_inputs, decoder_output= decoder_outputs)
summary(model=model_10)

In [45]:
model_compile(model = model_10, optimizer = tf.keras.optimizers.Adam(), loss = 'sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
history = model_fit(model = model_10, epochs = 120, batch_size = 32, context_data =  pad_context, question_data=pad_question, decoder_input = pad_decoder_input, decoder_output = pad_decoder_output, early_stop = early_stopping)
save_model(model=model_10, name="LSTM_4")

Epoch 1/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 189ms/step - loss: 3.5829 - sparse_categorical_accuracy: 0.8153
Epoch 2/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 191ms/step - loss: 0.9177 - sparse_categorical_accuracy: 0.8896
Epoch 3/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 249ms/step - loss: 0.8683 - sparse_categorical_accuracy: 0.8926
Epoch 4/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 226ms/step - loss: 0.8385 - sparse_categorical_accuracy: 0.8930
Epoch 5/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 209ms/step - loss: 0.8251 - sparse_categorical_accuracy: 0.8934
Epoch 6/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 193ms/step - loss: 0.8036 - sparse_categorical_accuracy: 0.8935
Epoch 7/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 227ms/step - loss: 0.7841 - sparse_categorical_accuracy: 0.8935

*B-GRU*

In [46]:
early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

In [47]:
encoder_model_11= GRU(units=128, return_sequences=True, return_state=True, seed=33)
context_inputs, question_inputs, encoder_states, encoder_outputs = encoder_with_attention(input_dim_encoder = all_words, output_dim_encoder = 100, encoder_model= encoder_model_11)

attention_layer = MultiHeadAttention(num_heads=4, key_dim=32)
decoder_model_11 = GRU(units=128, return_sequences=True, return_state=True, seed=33)
decoder_inputs, decoder_outputs  = decoder_with_attention(encoder_states = encoder_states, input_dim_decoder = all_words, output_dim_decoder = 128, unit = all_words,  decoder_model=decoder_model_11, attention_layer=attention_layer, encoder_outputs=encoder_outputs)

model_11 = build_model(context_input= context_inputs, question_input= question_inputs, decoder_inputs = decoder_inputs, decoder_output= decoder_outputs)
summary(model=model_11)

In [48]:
model_compile(model = model_11, optimizer = tf.keras.optimizers.Adam(), loss = 'sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
history = model_fit(model = model_11, epochs = 120, batch_size = 32, context_data =  pad_context, question_data=pad_question, decoder_input = pad_decoder_input, decoder_output = pad_decoder_output, early_stop = early_stopping)
save_model(model=model_11, name="GRU_4")

Epoch 1/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 240ms/step - loss: 3.4416 - sparse_categorical_accuracy: 0.8290
Epoch 2/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 270ms/step - loss: 0.9085 - sparse_categorical_accuracy: 0.8922
Epoch 3/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 210ms/step - loss: 0.8529 - sparse_categorical_accuracy: 0.8922
Epoch 4/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 201ms/step - loss: 0.8257 - sparse_categorical_accuracy: 0.8925
Epoch 5/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 191ms/step - loss: 0.8097 - sparse_categorical_accuracy: 0.8931
Epoch 6/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 187ms/step - loss: 0.7859 - sparse_categorical_accuracy: 0.8933
Epoch 7/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 242ms/step - loss: 0.7660 - sparse_categorical_accuracy: 0.8933

*C-Bidirectional*

In [49]:
early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

In [50]:
encoder_model_12 = Bidirectional(LSTM(units=128, return_sequences=True, return_state=True, seed=33))
context_inputs, question_inputs, encoder_states, encoder_outputs = encoder_with_attention(input_dim_encoder = all_words, output_dim_encoder = 100, encoder_model= encoder_model_12)

attention_layer = MultiHeadAttention(num_heads=4, key_dim=32)
decoder_model_12 = LSTM(units=256, return_sequences=True, return_state=True, seed=33)
decoder_inputs, decoder_outputs  = decoder_with_attention(encoder_states = encoder_states, input_dim_decoder = all_words, output_dim_decoder = 128, unit = all_words,  decoder_model=decoder_model_12, attention_layer=attention_layer, encoder_outputs=encoder_outputs)

model_12 = build_model(context_input= context_inputs, question_input= question_inputs, decoder_inputs = decoder_inputs, decoder_output= decoder_outputs)
summary(model=model_12)

In [51]:
model_compile(model = model_12, optimizer = tf.keras.optimizers.Adam(), loss = 'sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
history = model_fit(model = model_12, epochs = 120, batch_size = 32, context_data =  pad_context, question_data=pad_question, decoder_input = pad_decoder_input, decoder_output = pad_decoder_output, early_stop = early_stopping)
save_model(model=model_12, name="Bidirectional_4")

Epoch 1/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 413ms/step - loss: 3.0288 - sparse_categorical_accuracy: 0.8166
Epoch 2/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 407ms/step - loss: 0.9073 - sparse_categorical_accuracy: 0.8913
Epoch 3/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 408ms/step - loss: 0.8500 - sparse_categorical_accuracy: 0.8924
Epoch 4/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 437ms/step - loss: 0.8077 - sparse_categorical_accuracy: 0.8929
Epoch 5/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 391ms/step - loss: 0.7843 - sparse_categorical_accuracy: 0.8932
Epoch 6/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 451ms/step - loss: 0.7592 - sparse_categorical_accuracy: 0.8934
Epoch 7/120
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 401ms/step - loss: 0.7398 - sparse_categorical_accuracy: 0.8935

# Transformers

In [1]:
model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'
context = 'new york city has over 28,000 acres (110 km2) of municipal parkland and 14 miles (23 km) of public beaches. parks in new york city include central park, prospect park, flushing meadows–corona park, forest park, and washington square park. the largest municipal park in the city is pelham bay park with 2,700 acres (1,093 ha)'
question = 'new york has approximately how many acres of parks?'
transformers(model_name=model_name, context=context, question=question)

All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

All the weights of TFBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.



Context: new york city has over 28,000 acres (110 km2) of municipal parkland and 14 miles (23 km) of public beaches. parks in new york city include central park, prospect park, flushing meadows–corona park, forest park, and washington square park. the largest municipal park in the city is pelham bay park with 2,700 acres (1,093 ha)
Question: new york has approximately how many acres of parks?
Answer: 28 , 000
