In [1]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D
from keras.layers import Convolution1D, Embedding
from keras.layers.merge import Concatenate

import json
import numpy as np
import string
import re
import h5py

Using TensorFlow backend.


In [2]:
# Parse the SQuAD training file into a Python object.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which is not UTF-8 on every system, while JSON
# files are normally UTF-8 encoded.
with open('train-v1.1.json', encoding='utf-8') as json_data:
    d = json.load(json_data)

In [3]:
# Keep only the 'data' entry of the parsed JSON. The encoding cell further down
# iterates over it as a sequence of articles, each holding 'paragraphs'.
dataset = d['data']

In [6]:
def normalize_answer(s):
    """Return `s` lower-cased, without punctuation, articles or surplus whitespace.

    The steps run in this order:
      1. lower-case the text;
      2. drop every character listed in `string.punctuation`;
      3. replace the whole words "a", "an" and "the" with a space;
      4. collapse runs of whitespace to single spaces and trim both ends.
    """
    lowered = s.lower()

    punctuation = set(string.punctuation)
    kept_chars = [char for char in lowered if char not in punctuation]
    without_punctuation = ''.join(kept_chars)

    # Articles are replaced by a space (not deleted) so neighbouring words never fuse.
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation)

    # str.split() with no argument splits on any whitespace run and ignores the ends.
    return ' '.join(without_articles.split())


In [7]:
# Character vocabulary: the space is id 0, 'a'-'z' are ids 1-26 and '0'-'9' are ids 27-36.
char_dict = {
    symbol: index
    for index, symbol in enumerate(' ' + string.ascii_lowercase + string.digits)
}

In [8]:
# Encode every (context, question) pair as a list of character ids.
# Exactly one entry is appended to BOTH lists per answer, so position k of
# context_list and position k of question_list describe the same example.
context_list = []
question_list = []


def _encode_chars(text, max_len):
    """Normalize `text`, keep its first `max_len` characters and map them to ids.

    Characters without an entry in `char_dict` are skipped, so the result may be
    shorter than `max_len`.
    """
    normalized = normalize_answer(text)[:max_len]
    return [char_dict[ch] for ch in normalized if ch in char_dict]


for article in dataset:
    for paragraph in article['paragraphs']:
        # The context is identical for every question of a paragraph: encode it once.
        context_ids = _encode_chars(paragraph['context'], 1000)
        for qa in paragraph['qas']:
            question_ids = _encode_chars(qa['question'], 50)
            # Repeat the pair once per answer. The append happens once per answer,
            # not once per character, so the two lists always have equal length.
            # Repeated entries share one list object; the padding cell only reads them.
            for _ in qa['answers']:
                context_list.append(context_ids)
                question_list.append(question_ids)

In [None]:
# Pack the variable-length id lists into fixed-width integer matrices
# (1000 columns for contexts, 50 for questions), padding on the right with 0.
# Note that 0 is also the id char_dict assigns to the space character.
# The builtin `int` replaces the `np.int` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24; both select the same dtype.
context_array = np.zeros((len(context_list), 1000), dtype=int)
question_array = np.zeros((len(question_list), 50), dtype=int)

# One slice assignment per row instead of one Python-level store per element.
for row, ids in enumerate(context_list):
    context_array[row, :len(ids)] = ids

for row, ids in enumerate(question_list):
    question_array[row, :len(ids)] = ids

### The following model can be combined with GloVe embeddings to get better results, but character-level embeddings are very expensive to compute

In [3]:
# Hyper-parameters for the character-level CNN built in the next cell.
embedding_dim = 50  # width of each character embedding vector
filter_size = 3     # kernel_size of the 1-D convolution
num_filters = 10    # number of convolution filters
dropout = 0.5       # rate given to the Dropout layer
hidden_dims = 50    # NOTE(review): not used by any cell visible in this notebook
vocab = 27          # same value as the Embedding input size in the model cell;
                    # NOTE(review): char_dict assigns ids 0-36 (37 symbols) - confirm 27 is intended

### This model outputs a 1024-dimensional vector, as can be seen in the model summary below; this vector can be used as an embedding


In [4]:
# Character-level CNN encoder:
# embedding -> 1-D convolution -> max-pooling -> flatten -> dropout -> dense(1024).
# The final Dense layer has no activation, so the 1024 outputs are linear.
model = Sequential([
    Embedding(27, embedding_dim, input_length=5000),
    Convolution1D(
        filters=num_filters,
        kernel_size=filter_size,
        activation='relu',
        strides=1,
    ),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dropout(dropout),
    Dense(1024),
])

In [5]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 5000, 50)          1350      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 4998, 10)          1510      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 2499, 10)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 24990)             0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 24990)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              25590784  
Total params: 25,593,644.0
Trainable params: 25,593,644.0
Non-trainable params: 0.0
__________________________________________________________

## Glove testing

In [10]:
# Open the GloVe vectors file; the parsing cell below reads and closes this handle.
# The encoding is explicit so decoding does not depend on the platform's locale default.
f = open('glove.6B.50d.txt', encoding='utf-8')

In [11]:
import numpy as np

# Build a word -> vector lookup from the GloVe text file. Each line is split on
# whitespace: the first field is the word, the remaining fields are its coefficients.
embeddings_index = {}
# `with f:` closes the handle opened in the previous cell even when a line
# fails to parse; the original closed it only after a fully successful loop.
with f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [12]:
# Number of distinct words read from the GloVe file.
len(embeddings_index)

400000

In [13]:
# Load the saved word -> row-index mapping. `.item()` unwraps the single object
# held by the loaded array (used as a dict via `.items()` in the next cell).
# NumPy >= 1.16.3 refuses to load object arrays unless allow_pickle=True is given.
# SECURITY: unpickling can execute arbitrary code - only load this file if it
# comes from a trusted source.
word_index = np.load('word_to_indx.npy', allow_pickle=True).item()

In [14]:
# One 50-wide row per index in word_index, plus one extra row (hence the +1).
# Words that have no GloVe vector keep their all-zero row.
embedding_matrix = np.zeros((len(word_index) + 1, 50))
for token, row in word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

In [15]:
# Sanity check: (len(word_index) + 1, 50) is expected.
embedding_matrix.shape

(119690, 50)

In [16]:
import h5py

# Persist the embedding matrix to HDF5 under the dataset name 'embed'.
with h5py.File('embeddings_50.h5', 'w') as h5_out:
    h5_out.create_dataset('embed', data=embedding_matrix)

In [17]:
# Read the 'embed' dataset back from the HDF5 file. The [:] slice copies it into
# memory, so the array stays usable after the file is closed.
with h5py.File('embeddings_50.h5', 'r') as h5_in:
    embedding_matrix = h5_in['embed'][:]

In [18]:
# Confirm the matrix reloaded from HDF5 has the same shape as the one that was saved.
embedding_matrix.shape

(119690, 50)