In [11]:
from __future__ import print_function

import keras
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate, Dot, Merge, Lambda, multiply
from keras.layers import LSTM
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
from sklearn import cross_validation, metrics
from functools import reduce
from itertools import chain
from keras import backend as K
from keras import optimizers


import tarfile
import numpy as np
import re
import os


def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.

    The sentence is split on runs of non-word characters. Because the pattern
    has a capturing group, the separators are kept in the result; every piece
    is stripped and pieces that are empty after stripping are dropped.

    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    # The group must not be optional: a pattern that can match the empty
    # string makes re.split() cut between every character on Python 3.7+ and
    # emit None for the unmatched group, which breaks the .strip() below.
    return [x.strip() for x in re.split(r'(\W+)', sent) if x.strip()]


def load_task(data_dir, task_id, only_supporting=False):
    '''Load the nth task. There are 20 tasks in total.

    Args:
        data_dir: directory containing the per-task files.
        task_id: task number, 1 to 20 inclusive.
        only_supporting: forwarded to get_stories().

    Returns a tuple containing the training and testing data for the task.

    Raises:
        AssertionError: if task_id is outside 1..20.
        IndexError: if no matching train or test file is found in data_dir.
    '''
    assert task_id > 0 and task_id < 21

    s = 'qa{}_'.format(task_id)
    # Match on the bare file name only: testing the joined path would let a
    # directory component containing 'train' or 'test' select the wrong file.
    # Sorting makes the pick deterministic regardless of os.listdir() order.
    names = sorted(name for name in os.listdir(data_dir) if s in name)
    train_file = [os.path.join(data_dir, name) for name in names if 'train' in name][0]
    test_file = [os.path.join(data_dir, name) for name in names if 'test' in name][0]
    train_data = get_stories(train_file, only_supporting)
    test_data = get_stories(test_file, only_supporting)
    return train_data, test_data

def parse_stories(lines, only_supporting=False):
    '''Parse stories provided in the bAbI tasks format.

    Each line starts with an integer id followed by a space. An id of 1 starts
    a new story. A line containing tabs is a question of the form
    "question<TAB>answer<TAB>supporting ids"; any other line is a statement.

    Args:
        lines: iterable of text lines.
        only_supporting: if true, only the sentences that support the answer
            are kept in each returned story.

    Returns:
        A list of (substory, question, answer) tuples. substory is a list of
        token lists, question is a token list without a trailing "?", and
        answer is a one-element list holding the raw (lower-cased) answer
        string.
    '''
    data = []
    story = []
    for line in lines:
        # Tolerate blank lines (e.g. a trailing newline at the end of a file).
        if not line.strip():
            continue
        nid, line = line.lower().split(' ', 1)
        nid = int(nid)
        if nid == 1:
            story = []
        if '\t' in line:  # question
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            # answer is one vocab word even if it's actually multiple words
            a = [a]

            # remove question marks
            if q and q[-1] == "?":
                q = q[:-1]

            if only_supporting:
                # Supporting ids are 1-based line ids within the story; the ''
                # placeholder appended for every question keeps them aligned.
                substory = [story[int(i) - 1] for i in supporting.split()]
            else:
                # Provide all statements, skipping the question placeholders
                substory = [x for x in story if x]

            data.append((substory, q, a))
            story.append('')
        else:  # regular sentence
            sent = tokenize(line)
            # remove periods
            if sent and sent[-1] == ".":
                sent = sent[:-1]
            story.append(sent)
    return data


def get_stories(f, only_supporting=False):
    '''Read the file named by `f` and parse its lines with parse_stories().

    `only_supporting` is forwarded unchanged to parse_stories(); the value
    returned is whatever parse_stories() returns for the file's lines.
    '''
    with open(f) as story_file:
        lines = story_file.readlines()
    return parse_stories(lines, only_supporting=only_supporting)
    
def vectorize_data(data, word_idx, sentence_size, memory_size):
    """
    Vectorize stories, queries and answers.

    Every sentence and query is mapped to word indices and brought to exactly
    sentence_size entries: shorter ones are padded with 0's, longer ones are
    truncated to their first sentence_size tokens so the result is always
    rectangular.

    Only the memory_size most recent sentences of a story are kept. If a story
    has fewer sentences it is padded with empty memories, i.e. lists of
    sentence_size 0's.

    Args:
        data: iterable of (story, query, answer) tuples, where story is a list
            of token lists, query is a token list and answer is a sequence
            whose first element is the answer word.
        word_idx: mapping from word to integer index (0 is used for padding).
        sentence_size: number of token slots per sentence / query.
        memory_size: number of sentence slots per story.

    Returns:
        (S, Q, A) numpy arrays. S holds the stories, Q the queries and A the
        index of each answer word (an integer label, not a one-hot vector).

    Raises:
        KeyError: if a word is missing from word_idx.
    """
    def encode(words):
        # Map words to ids, then clip / zero-pad to exactly sentence_size.
        ids = [word_idx[w] for w in words][:sentence_size]
        return ids + [0] * (sentence_size - len(ids))

    S = []
    Q = []
    A = []
    for story, query, answer in data:
        ss = [encode(sentence) for sentence in story]

        # take only the most recent sentences that fit in memory
        ss = ss[max(0, len(ss) - memory_size):]

        # pad to memory_size (range() of a non-positive count is empty)
        for _ in range(memory_size - len(ss)):
            ss.append([0] * sentence_size)

        S.append(ss)
        Q.append(encode(query))
        A.append(word_idx[answer[0]])
    return np.array(S), np.array(Q), np.array(A)

# Root directory of the extracted tasks_1-20_v1-2 data set. Set the
# BABI_DATA_DIR environment variable to point somewhere else; the default is
# the location this notebook was originally written against.
_babi_root = os.environ.get('BABI_DATA_DIR', '/home/ro/dataset/tasks_1-20_v1-2')
# Trailing '' keeps the trailing path separator the original constants had.
path_1k = os.path.join(_babi_root, 'en', '')
path_10k = os.path.join(_babi_root, 'en-10k', '')

In [18]:
# task data
def prepossessing(task_id, data_dir=None, max_memory_size=320):
    '''Load one task, build its vocabulary and vectorize train and test data.

    Args:
        task_id: task number passed to load_task().
        data_dir: directory to load from; defaults to path_1k.
        max_memory_size: upper bound on the number of sentences kept per story.

    Returns:
        (trainS, trainQ, trainA, testS, testQ, testA, memory_size,
        sentence_size, vocab_size). memory_size is the number of sentence
        slots actually present in trainS/testS: the longest story length,
        capped at max_memory_size. It is what a model consuming these arrays
        must use as its story length.
    '''
    if data_dir is None:
        data_dir = path_1k
    train, test = load_task(data_dir, task_id)
    data = train + test

    # Vocabulary over every story sentence, question and answer.
    words = set()
    for s, q, a in data:
        words.update(chain.from_iterable(s), q, a)
    vocab = sorted(words)
    word_idx = dict((c, i + 1) for i, c in enumerate(vocab))

    story_lengths = [len(s) for s, _, _ in data]
    max_story_size = max(story_lengths)
    mean_story_size = int(np.mean(story_lengths))
    sentence_size = max(len(sent) for s, _, _ in data for sent in s)
    query_size = max(len(q) for _, q, _ in data)
    memory_size = min(max_memory_size, max_story_size)

    vocab_size = len(word_idx) + 1 # +1 for nil word
    # Questions share the sentence slots, so size them for the longer of the two.
    sentence_size = max(query_size, sentence_size)

    # train/test sets
    trainS, trainQ, trainA = vectorize_data(train, word_idx, sentence_size, memory_size)
    testS, testQ, testA = vectorize_data(test, word_idx, sentence_size, memory_size)

    print("task_id", task_id)
    print("Vocab size", vocab_size)
    print("Longest sentence length", sentence_size)
    print("Longest story length", max_story_size)
    print("Average story length", mean_story_size)
    print("Query size", query_size)

    print('-')
    print('trainS shape:', trainS.shape)
    print('testS shape:', testS.shape)
    print('-')
    print('trainQ shape:', trainQ.shape)
    print('testQ shape:', testQ.shape)
    print('-')
    print('trainA shape:', trainA.shape)
    print('testA shape:', testA.shape)
    print('-')

    # Return memory_size rather than max_story_size: the arrays above are built
    # with memory_size, so the two differ whenever a story exceeds the cap and
    # a model sized with max_story_size would not accept them.
    return trainS, trainQ, trainA, testS, testQ, testA, memory_size, sentence_size, vocab_size

In [49]:
def get_model(max_story_size, sentence_size, vocab_size, d, k_hop):
    """Build and compile a k-hop memory network over bag-of-words embeddings.

    Args:
        max_story_size: number of sentence slots per story input.
        sentence_size: number of token slots per sentence and per question.
        vocab_size: size of the embedding vocabulary and of the output.
        d: embedding dimension.
        k_hop: number of memory hops.

    Returns:
        A compiled Model taking [story, question] with input shapes
        (max_story_size, sentence_size) and (sentence_size,) and producing a
        softmax over vocab_size. It is compiled with SGD and
        sparse_categorical_crossentropy, so targets are integer word indices.
        The model summary is printed as a side effect.
    """
    # placeholders
    story = Input(shape=(max_story_size, sentence_size,))
    question = Input(shape=(sentence_size,))

    # k_hop + 1 embeddings: emb_A[0] encodes the question and the first-hop
    # memories; emb_A[i + 1] is the output embedding of hop i and also the
    # memory embedding of hop i + 1.
    emb_A = [
        Embedding(input_dim=vocab_size, output_dim=d, embeddings_initializer='random_normal')
        for _ in range(k_hop + 1)
    ]

    def bag_of_words(embedding, tokens, word_axis):
        # Embed every token, then sum the word vectors along word_axis.
        return Lambda(lambda x: K.sum(x, axis=word_axis))(embedding(tokens))

    m = bag_of_words(emb_A[0], story, 2)     # (batch, max_story_size, d)
    u = bag_of_words(emb_A[0], question, 1)  # (batch, d)

    for hop in range(k_hop):
        # Attention over memories: softmax of the dot product of u with each memory.
        u_temp = Lambda(lambda x: K.expand_dims(x, 1))(u)
        u_temp = Lambda(lambda x: K.tile(x, (1, max_story_size, 1)))(u_temp)

        probs = multiply([m, u_temp])
        probs = Lambda(lambda x: K.sum(x, axis=2))(probs)
        probs = Activation('softmax')(probs)
        probs_temp = Lambda(lambda x: K.expand_dims(x, 2))(probs)
        probs_temp = Lambda(lambda x: K.tile(x, (1, 1, d)))(probs_temp)

        # Response vector: attention-weighted sum of the output memories.
        c = bag_of_words(emb_A[hop + 1], story, 2)
        o_weight = multiply([c, probs_temp])
        o_weight = Lambda(lambda x: K.sum(x, axis=1))(o_weight)

        u = add([o_weight, u])

        # The next hop's memories use the same embedding of the same input as
        # c, so reuse the tensor instead of recomputing it.
        m = c

    # Output logits: u . E^T with E the last embedding matrix (vocab_size, d),
    # giving (batch, vocab_size). Use the layer's live weight variable rather
    # than a NumPy copy from get_weights(): the copy is a constant frozen at
    # build time, and summing the tiled product over axis 1 produced a
    # (batch, d) tensor instead of one score per vocabulary word.
    output_embedding = emb_A[-1]
    answer = Lambda(lambda x: K.dot(x, K.transpose(output_embedding.embeddings)))(u)

    answer = Activation('softmax')(answer)

    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model = Model([story, question], answer)
    model.compile(optimizer=sgd, loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model.summary()
    return model

In [50]:
# Hyper-parameters passed to get_model(): d is used as the embedding output
# dimension and k_hop as the number of memory hops.
d = 13
k_hop = 3
# Task id handed to prepossessing().
i = 1

# Load and vectorize the selected task; also yields the sizes needed to build the model.
trainS, trainQ, trainA, testS, testQ, testA, max_story_size, sentence_size, vocab_size = prepossessing(i)

# Disabled: would add a trailing axis to the integer answer labels.
#trainA = np.expand_dims(trainA,-1)
#testA = np.expand_dims(testA,-1)

# Build and compile the model (get_model also prints its summary).
model = get_model(max_story_size, sentence_size, vocab_size, d, k_hop)
# Training is currently disabled, so `model` is left with its initial weights.
#model.fit([trainS, trainQ], trainA,batch_size=32,epochs=100,validation_data=([testS, testQ], testA))

  return _compile(pattern, flags).split(string, maxsplit)


task_id 1
Vocab size 20
Longest sentence length 6
Longest story length 10
Average story length 6
Query size 3
-
trainS shape: (1000, 10, 6)
testS shape: (1000, 10, 6)
-
trainQ shape: (1000, 6)
testQ shape: (1000, 6)
-
trainA shape: (1000,)
testA shape: (1000,)
-
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_41 (InputLayer)           (None, 10, 6)        0                                            
__________________________________________________________________________________________________
input_42 (InputLayer)           (None, 6)            0                                            
__________________________________________________________________________________________________
embedding_81 (Embedding)        multiple             260         input_41[0][0]                   
                                            

In [10]:
from keras.utils import plot_model
# Write a diagram of `model` to model.png, with tensor shapes shown.
plot_model(model, to_file='model.png', show_shapes = True)

In [None]:
# Print a task id and evaluate on the test arrays, once per loop iteration.
# NOTE(review): `model` is bound elsewhere in this notebook to the single Model
# returned by get_model(), so `model[i]` presumably expects a list of per-task
# models built in a cell that is not shown here -- confirm before running.
# NOTE(review): range(20) yields 0..19 while load_task() asserts task ids in
# 1..20, and testS/testQ/testA are not reloaded per task inside this loop.
for i in range(20) :
    print('task_id', i)
    model[i].evaluate([testS, testQ], testA, batch_size=32)