# LSTM MODEL

In [1]:
import json
import numpy as np
import re
import io
import nltk
import h5py
from keras import backend as K
from keras.layers.embeddings import Embedding
from keras.layers import Input, Dense, Dropout, RepeatVector, Activation, merge, Lambda, Flatten, Reshape
from keras.layers import LSTM, Bidirectional, TimeDistributed, GRU
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras import optimizers
from keras.optimizers import Adam, RMSprop
from keras.layers import concatenate

Using TensorFlow backend.


In [2]:
embeddings_index = {}
f = open( 'glove.6B.100d.txt',encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [3]:
context = h5py.File('context.h5','r')
questions = h5py.File('questions.h5','r')
answers = h5py.File('answers.h5','r')
ans_begin = h5py.File('begin.h5','r')
ans_end = h5py.File('end.h5','r')

In [4]:
c_data = context['context'][:]
qn_data = questions['questions'][:]
ans_data = answers['answers'][:]

In [5]:
begin_ans = ans_begin['begin'][:]
end_ans = ans_end['end'][:]

In [6]:
# loding vocabulary
word_index = np.load('words.npy').item()

In [7]:
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [8]:
vocab_size = len(word_index) + 1
#embedding_vector_length = 50
batch = 128
max_span_begin = np.amax(begin_ans)
max_span_end = np.amax(end_ans)
slce = 10000


In [9]:
print("Vocab Size")
vocab_size

Vocab Size


119616

In [11]:
context_input = Input(shape=(700, ), dtype='int32', name='c_data')
context_embed = Embedding(input_dim=vocab_size, output_dim=100, weights=[embedding_matrix], 
              input_length=700, trainable=False)(context_input)
#lstm_out = (LSTM(256, return_sequences=True, implementation=2))(x)
drop_1 = Dropout(0.5)(context_embed)
#drop_1 = Dropout(0.5)(lstm_out)

In [12]:
ques_input = Input(shape=(100, ), dtype='int32', name='qn_data')
question_embed = Embedding(input_dim=vocab_size, output_dim=100, weights=[embedding_matrix], 
              input_length=100, trainable=False)(ques_input)
#lstm_out = (LSTM(256, return_sequences=True, implementation=2))(x)
drop_2 = Dropout(0.5)(question_embed)
#drop_2 = Dropout(0.5)(lstm_out)

In [13]:
merge_layer = concatenate([drop_1, drop_2], axis=1)
lstm_layer = (LSTM(512, implementation=2))(merge_layer)
drop_3 =  Dropout(0.5)(lstm_layer)
softmax_1 = Dense(max_span_begin, activation='softmax')(lstm_layer)
softmax_2 = Dense(max_span_end, activation='softmax')(lstm_layer)
model = Model(inputs=[context_input, ques_input], outputs=[softmax_1, softmax_2])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
c_data (InputLayer)             (None, 700)          0                                            
__________________________________________________________________________________________________
qn_data (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 700, 100)     11961600    c_data[0][0]                     
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 100, 100)     11961600    qn_data[0][0]                    
__________________________________________________________________________________________________
dropout_1 

In [14]:
model_history = model.fit([c_data[:slce], qn_data[:slce]],
                        [begin_ans[:slce], end_ans[:slce]], verbose=2,
                         batch_size=batch, epochs=10)

Epoch 1/10
 - 4212s - loss: 14.0835 - dense_1_loss: 7.0029 - dense_2_loss: 7.0806 - dense_1_acc: 0.0272 - dense_2_acc: 0.0045
Epoch 2/10
 - 3066s - loss: 13.2727 - dense_1_loss: 6.5953 - dense_2_loss: 6.6774 - dense_1_acc: 0.0279 - dense_2_acc: 0.0055
Epoch 3/10
 - 2787s - loss: 13.2328 - dense_1_loss: 6.5770 - dense_2_loss: 6.6557 - dense_1_acc: 0.0279 - dense_2_acc: 0.0064
Epoch 4/10
 - 2660s - loss: 13.2182 - dense_1_loss: 6.5696 - dense_2_loss: 6.6486 - dense_1_acc: 0.0279 - dense_2_acc: 0.0062
Epoch 5/10
 - 2697s - loss: 13.2027 - dense_1_loss: 6.5616 - dense_2_loss: 6.6410 - dense_1_acc: 0.0279 - dense_2_acc: 0.0066
Epoch 6/10
 - 2669s - loss: 13.1863 - dense_1_loss: 6.5533 - dense_2_loss: 6.6330 - dense_1_acc: 0.0279 - dense_2_acc: 0.0068
Epoch 7/10
 - 2668s - loss: 13.1771 - dense_1_loss: 6.5484 - dense_2_loss: 6.6287 - dense_1_acc: 0.0279 - dense_2_acc: 0.0071
Epoch 8/10
 - 2770s - loss: 13.1697 - dense_1_loss: 6.5452 - dense_2_loss: 6.6246 - dense_1_acc: 0.0279 - dense_2_acc:

# PREDICTIONS USING TEST DATA

In [28]:
t_context = h5py.File('context_test.h5','r')
t_questions = h5py.File('questions_test.h5','r')
t_answers = h5py.File('answers_test.h5','r')
t_ans_begin = h5py.File('begin_test.h5','r')
t_ans_end = h5py.File('end_test.h5','r')

In [16]:
t_c_data = t_context['context'][:]
t_qn_data = t_questions['questions'][:]
t_ans_data = t_answers['answers'][:]
t_begin_ans = t_ans_begin['begin'][:]
t_end_ans = t_ans_end['end'][:]

In [29]:
index_train = np.load('indxes.npy')
index_test = np.load('indxes_test.npy')

In [17]:
predictions = model.predict([t_c_data,t_qn_data], batch_size=128)

In [18]:
print(predictions[0].shape, predictions[1].shape)

(20302, 3126) (20302, 3136)


In [20]:
ansBegin = np.zeros((predictions[0].shape[0],), dtype=np.int32)
ansEnd = np.zeros((predictions[0].shape[0],),dtype=np.int32) 

In [22]:
for i in range(predictions[0].shape[0]):
	ansBegin[i] = predictions[0][i, :].argmax()
	ansEnd[i] = predictions[1][i, :].argmax()
print(ansBegin.min(), ansBegin.max(), ansEnd.min(), ansEnd.max())

0 1614 7 2106
