In [13]:
# This is an implementation of the End-to-End Memory Network as defined by Sukhbaatar, et al. 
# We use k=1, i.e. have only one computational step in the network

import numpy as np
import theano
import theano.tensor as T
from triple_reader import triple_reader
from question_reader import question_reader
from gensim.models import word2vec
from nltk.corpus import stopwords
import sys

In [2]:
# initialize word2vec model
word_model = word2vec.Word2Vec.load_word2vec_format('word2vec.bin', binary=True)

In [5]:
# read in file as tensors
text_file = ("/Users/SaahilM/Documents/Princeton/Academics/Thesis/"
    "Senior Thesis Code/ModifiedEntityGraph/prod/MCTest/production/MCTest/OCR_text/1/Triples/1-long1.txt")
    
tr = triple_reader(text_file)
# print tr.tripleList
tensor = tr.tensor

enMap = tr.enMap
relMap = tr.relMap

R = len(tensor)
N = len(tensor[0])
# dimension for encoding is arbitrary, we pick 20 here
d = 20

In [6]:
tensor_stack = np.hstack(tuple(tensor))
print(tensor_stack.shape)
print(N, N*R)

X = T.lmatrix('X')
q = T.dmatrix('q')
y = T.lmatrix('y')

(89, 2759)
(89, 2759)


In [14]:
# Find the word in entity that's most similar to given word
# default topsim to lowest possible python int
def findSimEn(word):
    topsim = None
    topEn = None
    for en in enMap:
        if type(en) == int:
            continue
        try:
            sim = word_model.similarity(en, word)
        except KeyError as e:
            sim = None
        if sim > topsim:
            topsim = sim
            topEn = en
    return [topEn, topsim]

def findSimRel(word):
    topsim = None
    topRel = None
    for rel in relMap:
        if type(rel) == int:
            continue
        try:
            sim = word_model.similarity(rel, word)
        except KeyError as e:
            sim = None
        if sim > topsim:
            topsim = sim
            topRel = rel
    return [topRel, topsim]

print findSimEn('critic')
print findSimRel('go')

# Return top similarity, sim score, and isEn boolean
def findTopEnOrRel(word):
#     try:
    [topEn, topEnsim] = findSimEn(word)
    [topRel, topRelsim] = findSimRel(word)
    if topEnsim > topRelsim:
        return (topEn, topEnsim, True, word)
    else:
        return (topRel, topRelsim, False, word)
        
    # if can't find similarity, ignore it
#     except KeyError as e:
#         return [None, 0, False, word]
    
# Return top 2 sims for an array of words
def findTopEnOrRelArr(wordArr):
    topArrs = []
    for word in wordArr:
        top = findTopEnOrRel(word)
#         print top
        topArrs.append(top)
#     print topArrs
#     print topArrs
    sortedTop = sorted(topArrs, key=lambda x: -x[1] if x[1] is not None else sys.maxint)
#     print sortedTop
    top1 = sortedTop[0]
    top2 = None
    # if the top is a relation, we have to pick an entity
    if top1[2] == False:
        curIndex = 1
        while curIndex < len(sortedTop):
            cur = sortedTop[curIndex]
            if cur[2] == False:
                curIndex += 1
                continue
            else:
                top2 = cur
                break
    else:
        top2 = sortedTop[1]
    if top2 == None:
        top2 = [0, 0, True]
    return [top1, top2]

print findTopEnOrRelArr(["critic", "went", "hi", "boat", "paint"])

['author', 0.31762739449640298]
['start', 0.41564052326381096]
[('was', 0.47963733397180919, False, 'went'), ('author', 0.31762739449640298, True, 'critic')]


In [26]:
q_file = ("/Users/SaahilM/Documents/Princeton/Academics/Thesis/Senior Thesis Code/"
"ModifiedEntityGraph/prod/MCTest/production/MCTest/OCR_text/1/1-long1-q.txt")

qr = question_reader(q_file)

# print qr.numQuestions

numTrainQ = int(qr.numQuestions*(float(2)/3))
numTestQ = qr.numQuestions - numTrainQ
# print numTrainQ
# print numTestQ

CHOICES_PER_Q = 5

train_Q = []
# VECTORIZE BY FINDING TOP TWO SIMILAR TO EACH QUESTION, AND MAKING A 2-HOT VECTOR OF LENGTH N+R
questions = qr.questionCombos
for i in xrange(numTrainQ):
    question = questions[i]
    question_words = question[0].split(" ")
    # remove stopwords
    question_words = [word for word in question_words if word not in stopwords.words('english')]
    [top1, top2] = findTopEnOrRelArr(question_words)
    print([top1, top2])
    
    # if word in question doesn't match, move on
    if top1[0] == None:
        continue
    
    # Now vectorize it to be of length N+R
    if top1[2] == True:
        index1 = enMap[top1[0]]
    else:
        index1 = N + relMap[top1[0]]
    if top2[2] == True:
        index2 = enMap[top2[0]]
    else:
        index2 = N + relMap[top2[0]]
    curQVec = np.zeros(N+R)
    curQVec[index1] = 1
    curQVec[index2] = 1
    train_Q.append(curQVec)
train_Q = np.array(train_Q)
train_Q = train_Q.T
print(train_Q.shape)

numQuestions  12
[('It', 0.60106851750939438, True, u'The'), ('available', 0.35666711035229737, False, u'used')]
[('requires', 0.53755015740882195, False, u'means'), ('more', 0.43554002125196029, True, u'nearly')]
[('available', 0.35666711035229737, False, u'used'), ('It', 0.3241844469421169, True, u'In')]
[('It', 0.60106851750939438, True, u'The'), ('examines', 0.44328536315913181, False, u'refers')]
[('It', 0.60106851750939438, True, u'The'), ('misunderstood', 0.43356134230254451, True, u'referred')]
[('cultures', 0.99999999999999978, True, u'cultures'), ('examines', 0.31791871184703069, False, u'describe')]
[('start', 0.68539480759901428, False, u'begin'), ('It', 0.60106851750939438, True, u'The')]
[('author', 0.99999999999999989, True, u'author'), ('historians', 0.99999999999999978, True, u'historians')]
(120, 8)


In [27]:
# Create a weight matrix of given size. 
# The matrix is initialized randomly with Gaussian distribution 
# with mean=0 and \sigma=0.1
def initializeWeightMatrix(in_size, out_size):
    return theano.shared(0.1 * np.random.randn(in_size, out_size))

# Create a bias vector of all zeros of given size
def initializeBiasVector(size):
    return theano.shared(np.zeros(size))

In [28]:
# Initialize all our parameters, given our dimensions.
# Input matrix has shape Nx(N*R)
# Query matrix has shape 5xnumQ
# A is the first matrix used to embed our input. It has size dxN
# B is the matrix used to embed the query. It has size dx(N+R)
# C is the next matrix used to embed our input. It has size dxN
# W is the final matrix. Takes output O and produces result w_embedded. It has size 5xd

def initializeParams(d, N):
    A = initializeWeightMatrix(d,N)
    B = initializeWeightMatrix(d,N+R)
    C = initializeWeightMatrix(d,N)
    W = initializeWeightMatrix(CHOICES_PER_Q,d)
    
#     A = theano.shared(initializeWeightMatrix(d, V))
#     B = theano.shared(initializeWeightMatrix(d, V))
#     C = theano.shared(initializeWeightMatrix(d, V))
#     W = theano.shared(initializeWeightMatrix(V, d))
    return A, B, C, W

A, B, C, W = initializeParams(d, N)
weightMatrices = [A, B, C, W]
print(W.shape.eval())

[ 5 20]


In [29]:
# Define the computational step
# Given input matrix X, query q, and weight matrices, we perform a computational step,
# also known as a "hop". Let M be the number of sentences
def hopComputation(X, q, A, B, C, W):
    #m_i = Ax_i
    mem_matrix = A.dot(X) #dimension (dxN) x (Nx(NxR)) = dx(N*R)
    #u = Bq
    u = B.dot(q) #dimension (dx(N+R)) x ((N+R)xnumQ) = dxnumQ
    #p_i = softmax(u^T m_i)
    probs = T.nnet.softmax(u.T.dot(mem_matrix)) #dimension(numQxd)x(dx(N*R)) = numQx(N*R)
    #C_i = Cx_i
    c = C.dot(X) #dimension (dxN) x (Nx(NxR)) = dx(N*R)
    o = c.dot(probs.T) #dimension (dx(N*R))x((N*R)xnumQ) = dxnumQ
    
    #w_embedded = Wo
    w_embedded = W.dot(o).T #dimension (5xd)x(dxnumQ) = 5xnumQ.T = numQx5
    
    result = T.nnet.softmax(w_embedded)
    return result
    
    #output = sum of c_matrix * probs
#     o = (probs * c_embedded).sum(axis = 0)
    #result = 

In [30]:
y_hat = hopComputation(X, q, A, B, C, W)
loss = T.nnet.categorical_crossentropy(y_hat, y).mean()

In [31]:
from __future__ import print_function

def inspect_inputs(i, node, fn):
    print(i, node, "input(s) value(s):", fn.inputs, end='')

def inspect_outputs(i, node, fn):
    print(" output(s) value(s):", fn.outputs)
    
def detect_nan(i, node, fn):
    for output in fn.outputs:
        if (not isinstance(output[0], np.random.RandomState) and
            np.isnan(output[0]).any()):
            print('*** NaN detected ***')
            theano.printing.debugprint(node)
            print('Inputs : %s' % [input[0] for input in fn.inputs])
            print('Outputs: %s' % [output[0] for output in fn.outputs])
            break

In [32]:
# Learning rate (chosen to be 0.01)
epsilon = 0.4

# This function trains our neural net, using stochastic gradient descent.
def train_MemNN(loss, X, q, y, y_hat):
    update_weights = []
    for weightMatrix in weightMatrices:
        update = T.grad(loss, weightMatrix)
        update_weights.append((weightMatrix, weightMatrix - update * epsilon))
    train_MemNN_func = theano.function(inputs=[X,q,y], outputs=[loss,y_hat], updates=update_weights, 
                        mode=theano.compile.MonitorMode(
#                             pre_func=inspect_inputs,
                            post_func=detect_nan))
    return train_MemNN_func

train_MemNN_func = train_MemNN(loss, X, q, y, y_hat)

In [33]:
def train_model(in_vect, question, answers, epochs=100):
    train_errors = []
    y_hats = []
    for i in xrange(epochs):
        error = 0
        [cur_loss, cur_yhat] = train_MemNN_func(in_vect, question, answers)
        error += cur_loss
#         print(error)
        train_errors.append(error)
        y_hats.append(cur_yhat)
    return [train_errors, y_hats]

In [34]:
in_vect = tensor_stack.astype(int)
question = train_Q.astype(int)
answers = np.array([np.array([0,0,0,0,1]),
                    np.array([0,0,0,1,0]),
                    np.array([0,1,0,0,0]),
                    np.array([1,0,0,0,0]),
                    np.array([1,0,0,0,0]),
                    np.array([0,0,1,0,0]),
                    np.array([1,0,0,0,0]),
                    np.array([0,1,0,0,0])
                   ]).astype(int)

# print(in_vect)
# print(question)
# print(answers)
# print(type(in_vect[0][0]))
[train_errors, y_hats] = train_model(in_vect, question, answers)
print(train_errors)
print(y_hats[-1])

[1.6094239725397508, 1.6094234496784199, 1.6094229268016771, 1.6094224039085616, 1.6094218809981129, 1.6094213580693701, 1.6094208351213721, 1.6094203121531585, 1.609419789163768, 1.6094192661522402, 1.6094187431176141, 1.6094182200589289, 1.6094176969752227, 1.6094171738655361, 1.6094166507289067, 1.6094161275643737, 1.609415604370976, 1.6094150811477532, 1.6094145578937424, 1.6094140346079839, 1.6094135112895154, 1.609412987937376, 1.6094124645506038, 1.6094119411282373, 1.6094114176693153, 1.6094108941728758, 1.6094103706379568, 1.6094098470635969, 1.6094093234488345, 1.6094087997927067, 1.6094082760942521, 1.6094077523525088, 1.609407228566514, 1.6094067047353056, 1.6094061808579212, 1.6094056569333983, 1.6094051329607746, 1.6094046089390874, 1.6094040848673734, 1.6094035607446706, 1.6094030365700156, 1.6094025123424449, 1.609401988060996, 1.6094014637247054, 1.6094009393326096, 1.6094004148837453, 1.6093998903771487, 1.6093993658118559, 1.6093988411869038, 1.6093983165013275, 1.60