In [0]:
#Imports
import pandas as pd
import numpy as np
from keras.models import Sequential, Model
from keras.layers import Embedding, Dropout, Dense, Activation
from keras.layers import LSTM, Bidirectional, Input
from keras.layers import concatenate,merge
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

finaldf=pd.read_csv('google_nq_final_processed.csv')

Using TensorFlow backend.


In [0]:
import re

# Remove all tags
def removeTags(sent):
    s = sent.strip().split()
    
    stop_words = ['<P>', '</P>', '<Table>', '</Table>', '<Tr>', '</Tr>', '<Ul>', '<Ol>', '<Dl>', '</Ul>', '</Ol>', \
             '</Dl>', '<Li>', '<Dd>', '<Dt>', '</Li>', '</Dd>', '</Dt>', '<H1>', '</H1>', '<H2>', '</H2>', '<H3>', '</H3>',
            'wikipedia', '</Td>', '<Th>', '<H4>', '</H4>', '</Th>', '<Td>', 'Jump up', 'Jump to']
    s = [w for w in s if w not in stop_words]
    
    s = " ".join(s)
    # replace unnecessary characters with space
    s = re.sub(r"[^A-Za-z0-9]", " ", str(s).lower())    
    
    return s

In [0]:
new_doc=[]
new_ans=[]
for i in range(len(finaldf)):
    new_doc.append(removeTags(finaldf.iloc[i]['document_text']))
    new_ans.append(removeTags(finaldf.iloc[i]['answer']))

In [0]:
finaldf['document']=new_doc
finaldf['ans']=new_ans
finaldf=finaldf.drop(columns=['document_text','answer'])
len(finaldf)

33347

In [0]:
count_df=finaldf['document'].apply(lambda x: len(x))
df_small = finaldf[count_df <= 3000]
df_small.reset_index(inplace=True, drop=True)

In [0]:
len(df_small)

1655

In [0]:
train=df_small[:1400]
test=df_small[1400:]
X_train,y_train=train[['document','question_text']],train[['start','end']]
X_test,y_test=test[['document','question_text']],test[['start','end']]


In [0]:
tokenizer = Tokenizer() 
tokenizer.fit_on_texts(X_train['document'])
vocab_size = len(tokenizer.word_index) + 1

In [0]:
vocab_size

31720

In [0]:
lenght_list=[]
for l in X_train.question_text:
    lenght_list.append(len(l.split(' ')))
max_ques= np.max(lenght_list)
max_ques

lenght_list=[]
for l in X_train.document:
    lenght_list.append(len(l.split(' ')))
max_doc= np.max(lenght_list)
max_doc


1066

In [0]:
# helper to load Glove embeddings
def loadGloveModel(gloveFile):
    f = open(gloveFile,'r')
    embedding_index = {}
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs 
    f.close()
    print('Found %s word vectors.' % len(embedding_index))
    return embedding_index

print('Preparing embedding matrix.')
embeddings_index = loadGloveModel('glove.6B.100d.txt')

nb_words = vocab_size
EMBEDDING_DIM = 100
MAX_SEQUENCE_LENGTH = max_doc

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(nb_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)
print('Embedding matrix completed.')

Preparing embedding matrix.
Found 400000 word vectors.
Embedding matrix completed.


In [0]:
''' 
To perform vectorization of text records, 
this function is taken from 'https://github.com/wentaozhu/recurrent-attention-for-QA-SQUAD-based-on-keras' with necessary modifications.

Vectorize the words to their respective index and pad context to max context length and question to max question length.
Answers vectors are padded to the max context length as well.
'''
def vectorizeData(xContext, xQuestion, xAnswerBeing, xAnswerEnd, word_index, context_maxlen, question_maxlen):
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in range(len(xContext)):
        x = [word_index[w] if w in tokenizer.word_index else tokenizer.word_index['a']  for w in xContext[i]]
        xq = [word_index[w] if w in tokenizer.word_index else tokenizer.word_index['a'] for w in xQuestion[i] ]
        # map the first and last words of answer span to one-hot representations
        y_Begin =  np.zeros(len(xContext[i]))
        y_Begin[xAnswerBeing[i]] = 1
        y_End = np.zeros(len(xContext[i]))
        y_End[xAnswerEnd[i]] = 1
        X.append(x)
        Xq.append(xq)
        YBegin.append(y_Begin)
        YEnd.append(y_End)
    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post'), pad_sequences(YBegin, maxlen=context_maxlen, padding='post'), pad_sequences(YEnd, maxlen=context_maxlen, padding='post')

In [0]:
tX, tXq, tYBegin, tYEnd = vectorizeData(X_train['document'],X_train['question_text'],y_train['start'],y_train['end'],tokenizer.word_index,max_doc,max_ques)

In [0]:
tX.shape, tXq.shape, tYBegin.shape, tYEnd.shape 

((1400, 1066), (1400, 18), (1400, 1066), (1400, 1066))

In [0]:
from keras import optimizers
import keras

question_input = Input(shape=(max_ques,), dtype='int32', name='question_input')
context_input = Input(shape=(max_doc,), dtype='int32', name='context_input')

questionEmbd = Embedding(output_dim=EMBEDDING_DIM, input_dim=vocab_size,
                         weights=[embedding_matrix], 
                         input_length=max_ques, trainable=False)(question_input) #mask_zero=True, 
contextEmbd = Embedding(output_dim=EMBEDDING_DIM, input_dim=vocab_size,
                         weights=[embedding_matrix], 
                         input_length=max_doc, trainable=False)(context_input) #mask_zero=True, 
Q_h = Bidirectional(LSTM(100, return_sequences=True))(questionEmbd)
C_h = Bidirectional(LSTM(100, return_sequences=True))(contextEmbd)

merge1 = concatenate([Q_h, C_h], axis=1)

QC_bilstm = Bidirectional(LSTM(100, return_sequences=True))(merge1)
merge2 = concatenate([QC_bilstm, C_h], axis=1)

start_bilstm =  Bidirectional(LSTM(100, return_sequences=True))(merge2)
end_bilstm =  Bidirectional(LSTM(100, return_sequences=True))(merge2)

start_token_dense = LSTM(max_doc, activation='softmax')(start_bilstm)
end_token_dense = LSTM(max_doc, activation='softmax')(end_bilstm)

model = Model(input=[context_input, question_input], output=[start_token_dense, end_token_dense])
adam = optimizers.Adam(lr=0.003)
model.compile(optimizer=adam, loss='categorical_crossentropy')
model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
question_input (InputLayer)     (None, 18)           0                                            
__________________________________________________________________________________________________
context_input (InputLayer)      (None, 1066)         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 18, 100)      3172000     question_input[0][0]             
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1066, 100)    3172000     context_input[0][0]              
___________



In [0]:
EPOCHS=100
es1=keras.callbacks.EarlyStopping(monitor='val_dense_3_loss', min_delta=0.01, patience=7, verbose=1, mode="auto")
es2=keras.callbacks.EarlyStopping(monitor='val_dense_4_loss', min_delta=0.01, patience=7, verbose=1, mode="auto")

callbacks_list=[es1,es2]


model.fit([tX,tXq],[tYBegin,tYEnd],epochs=EPOCHS,shuffle=False,validation_split=0.2,callbacks=callbacks_list,verbose=1,use_multiprocessing=True,
          batch_size=128)


Train on 1120 samples, validate on 280 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 00008: early stopping


<keras.callbacks.callbacks.History at 0x7fc4604c98d0>

In [0]:
# !pip install h5py
import h5py

model.save_weights('my_model_weights.h5')


In [0]:
''' 
To perform vectorization of text records, 
this function is taken from 'https://github.com/wentaozhu/recurrent-attention-for-QA-SQUAD-based-on-keras' with necessary modifications.

Vectorize the words to their respective index and pad context to max context length and question to max question length.
Answers vectors are padded to the max context length as well.
'''
def vectorizeValData(xContext, xQuestion, word_index, context_maxlen, question_maxlen):
    X = []
    Xq = []
    YBegin = []
    YEnd = []
    for i in range(len(xContext)):
        x = [word_index[w] if w in tokenizer.word_index else tokenizer.word_index['a']  for w in xContext[i]]
        xq = [word_index[w] if w in tokenizer.word_index else tokenizer.word_index['a'] for w in xQuestion[i] ]
        
        X.append(x)
        Xq.append(xq)

    return pad_sequences(X, maxlen=context_maxlen, padding='post'), pad_sequences(Xq, maxlen=question_maxlen, padding='post')

# vX,vXq=vectorizeValData(X_test['document'], X_test['question_text'], tokenizer.word_index, max_doc,max_ques)
vX, vXq, vStart, vEnd = vectorizeData(X_test['document'],X_test['question_text'],y_test['start'],y_test['end'],tokenizer.word_index,max_doc,max_ques)

In [0]:
from keras import models
# trained_model = models.load_weights('nn_model.h5')

predictions = model.predict([vX, vXq], batch_size=128)

print(predictions[0].shape, predictions[1].shape)

ansBegin = np.zeros((predictions[0].shape[0],), dtype=np.int32)
ansEnd = np.zeros((predictions[0].shape[0],),dtype=np.int32) 
for i in range(predictions[0].shape[0]):
    ansBegin[i] = predictions[0][i, :].argmax()
    ansEnd[i] = predictions[1][i, :].argmax()

In [0]:
# F1-Score calculation
from sklearn.metrics import f1_score

start_f1 = f1_score(vStart ,ansBegin, average="weighted")
end_f1 = f1_score(vEnd ,ansEnd, average="weighted")

#F1
(start_f1 + end_f1) / 2

  'precision', 'predicted', average, warn_for)


0.0035677428866283355

- Because F1 score=0, which is wrong, We didn't calculate EM score.