In [1]:
# This is an implementation of the End-to-End Memory Network as defined by Sukhbaatar, et al. 
# We use k=1, i.e. have only one computational step in the network

import numpy as np
import theano
import theano.tensor as T
from triple_reader import triple_reader
from question_reader import question_reader
from gensim.models import word2vec
from nltk.corpus import stopwords
import sys

numQuestions  4
[[u'In lines 2-8, the author of Passage 1 mentions activities that suggest dolphins', u'are unusually sensitive to their environment', u'do not generally thrive in captivity', u'have a unique type of intelligence', u'are uncommonly playful animals', u'have skills usually associated with humans'], [u'The author of Passage 2 would most likely respond to the last sentence of Passage 1 by', u'suggesting that intelligence in animals is virtually impossible to measure', u'observing that intelligence does not mean the same thing for every species', u'questioning the objectivity of the studies already conducted', u'noting that dolphin activities do not require a high level of intelligence', u'arguing that little is actually known about dolphin social behavior'], [u'The two passages differ in their views of dolphin intelligence in that Passage 1 states that dolphins', u'share a sophisticated culture; while Passage 2 contends that dolphin intelligence is roughly equal to human in

In [2]:
# initialize word2vec model
word_model = word2vec.Word2Vec.load_word2vec_format('word2vec.bin', binary=True)

In [3]:
# read in file as tensors
text_file = ("/Users/SaahilM/Documents/Princeton/Academics/Thesis/"
    "Senior Thesis Code/ModifiedEntityGraph/prod/MCTest/production/MCTest/OCR_text/1/Triples/1-long2.txt")
    
tr = triple_reader(text_file)
# print tr.tripleList
tensor = tr.tensor

enMap = tr.enMap
relMap = tr.relMap

R = len(tensor)
N = len(tensor[0])
# dimension for encoding is arbitrary, we pick 20 here
d = 20

In [4]:
tensor_stack = np.hstack(tuple(tensor))
print(tensor_stack.shape)
print(N, N*R)

X = T.lmatrix('X')
q = T.dmatrix('q')
y = T.lmatrix('y')

(110, 6050)
(110, 6050)


In [5]:
# Find the word in entity that's most similar to given word
# default topsim to lowest possible python int
def findSimEn(word):
    topsim = None
    topEn = None
    for en in enMap:
        if type(en) == int:
            continue
        try:
            sim = word_model.similarity(en, word)
        except KeyError as e:
            sim = None
        if sim > topsim:
            topsim = sim
            topEn = en
    return [topEn, topsim]

def findSimRel(word):
    topsim = None
    topRel = None
    for rel in relMap:
        if type(rel) == int:
            continue
        try:
            sim = word_model.similarity(rel, word)
        except KeyError as e:
            sim = None
        if sim > topsim:
            topsim = sim
            topRel = rel
    return [topRel, topsim]

# print findSimEn('critic')
# print findSimRel('go')

# Return top similarity, sim score, and isEn boolean
def findTopEnOrRel(word):
#     try:
    [topEn, topEnsim] = findSimEn(word)
    [topRel, topRelsim] = findSimRel(word)
    if topEnsim > topRelsim:
        return (topEn, topEnsim, True, word)
    else:
        return (topRel, topRelsim, False, word)
        
    # if can't find similarity, ignore it
#     except KeyError as e:
#         return [None, 0, False, word]
    
# Return top 2 sims for an array of words
def findTopEnOrRelArr(wordArr):
    topArrs = []
    for word in wordArr:
        top = findTopEnOrRel(word)
#         print top
        topArrs.append(top)
#     print topArrs
#     print topArrs
    sortedTop = sorted(topArrs, key=lambda x: -x[1] if x[1] is not None else sys.maxint)
#     print sortedTop
    top1 = sortedTop[0]
    top2 = None
    # if the top is a relation, we have to pick an entity
    if top1[2] == False:
        curIndex = 1
        while curIndex < len(sortedTop):
            cur = sortedTop[curIndex]
            if cur[2] == False:
                curIndex += 1
                continue
            else:
                top2 = cur
                break
    else:
        top2 = sortedTop[1]
    if top2 == None:
        top2 = [0, 0, True]
    return [top1, top2]

print(findTopEnOrRelArr(["critic", "went", "hi", "boat", "paint"]))

[('saw', 0.52656813492727217, False, 'went'), ('distaste', 0.26808481582806276, True, 'critic')]


In [6]:
q_file = ("/Users/SaahilM/Documents/Princeton/Academics/Thesis/Senior Thesis Code/"
"ModifiedEntityGraph/prod/MCTest/production/MCTest/OCR_text/1/1-long2-q.txt")

qr = question_reader(q_file)

# print qr.numQuestions

numTrainQ = int(qr.numQuestions*(float(2)/3))
numTestQ = qr.numQuestions - numTrainQ
# print numTrainQ
# print numTestQ

CHOICES_PER_Q = 5

train_Q = []
test_Q = []
# VECTORIZE BY FINDING TOP TWO SIMILAR TO EACH QUESTION, AND MAKING A 2-HOT VECTOR OF LENGTH N+R
questions = qr.questionCombos
for i in xrange(qr.numQuestions):
    question = questions[i]
    question_words = question[0].split(" ")
    # remove stopwords
    question_words = [word for word in question_words if word not in stopwords.words('english')]
    [top1, top2] = findTopEnOrRelArr(question_words)
    print([top1, top2])
    
    # if word in question doesn't match, move on
    if top1[0] == None:
        continue
    
    # Now vectorize it to be of length N+R
    if top1[2] == True:
        index1 = enMap[top1[0]]
    else:
        index1 = N + relMap[top1[0]]
    if top2[2] == True:
        index2 = enMap[top2[0]]
    else:
        index2 = N + relMap[top2[0]]
    curQVec = np.zeros(N+R)
    curQVec[index1] = 1
    curQVec[index2] = 1
    if i < numTrainQ:
        train_Q.append(curQVec)
    else:
        test_Q.append(curQVec)
train_Q = np.array(train_Q)
train_Q = train_Q.T
test_Q = np.array(test_Q)
test_Q = test_Q.T
print(train_Q.shape)
print(test_Q.shape)

numQuestions  13
[('Jerry', 1.0, True, u'Jerry'), ('narrator', 1.0, True, u'narrator')]
[('Jerry', 1.0, True, u'Jerry'), ('narrator', 1.0, True, u'narrator')]
[('This', 0.7025192721804896, True, u'The'), ('means', 0.62676256128193342, False, u'implies')]
[('means', 1.0, False, u'means'), ('This', 0.42479429618445708, True, u'In')]
[('seems', 0.50263214831747294, True, u'suggests'), ('This', 0.42479429618445708, True, u'In')]
[('Jerry', 1.0, True, u'Jerry'), ('life', 0.99999999999999989, True, u'life')]
[('narrator', 1.0, True, u'narrator'), ('This', 0.7025192721804896, True, u'The')]
[('changed', 0.45613785554949016, False, u'changes'), ('This', 0.42479429618445708, True, u'In')]
[('This', 0.7025192721804896, True, u'The'), ('seems', 0.50263214831747294, True, u'suggests')]
[('means', 0.43558764504275427, False, u'indicates'), ('This', 0.42479429618445708, True, u'In')]
[('means', 1.0, False, u'means'), ('This', 0.42479429618445708, True, u'In')]
[('What', 0.74367242975250147, True, u'

In [7]:
# Create a weight matrix of given size. 
# The matrix is initialized randomly with Gaussian distribution 
# with mean=0 and \sigma=0.1
def initializeWeightMatrix(in_size, out_size):
    return theano.shared(0.1 * np.random.randn(in_size, out_size))

# Create a bias vector of all zeros of given size
def initializeBiasVector(size):
    return theano.shared(np.zeros(size))

In [8]:
# Initialize all our parameters, given our dimensions.
# Input matrix has shape Nx(N*R)
# Query matrix has shape 5xnumQ
# A is the first matrix used to embed our input. It has size dxN
# B is the matrix used to embed the query. It has size dx(N+R)
# C is the next matrix used to embed our input. It has size dxN
# W is the final matrix. Takes output O and produces result w_embedded. It has size 5xd

def initializeParams(d, N):
    A = initializeWeightMatrix(d,N)
    B = initializeWeightMatrix(d,N+R)
    C = initializeWeightMatrix(d,N)
    W = initializeWeightMatrix(CHOICES_PER_Q,d)
    
#     A = theano.shared(initializeWeightMatrix(d, V))
#     B = theano.shared(initializeWeightMatrix(d, V))
#     C = theano.shared(initializeWeightMatrix(d, V))
#     W = theano.shared(initializeWeightMatrix(V, d))
    return A, B, C, W

A, B, C, W = initializeParams(d, N)
weightMatrices = [A, B, C, W]
print(W.shape.eval())

[ 5 20]


In [9]:
# Define the computational step
# Given input matrix X, query q, and weight matrices, we perform a computational step,
# also known as a "hop". Let M be the number of sentences
def hopComputation(X, q, A, B, C, W):
    #m_i = Ax_i
    mem_matrix = A.dot(X) #dimension (dxN) x (Nx(NxR)) = dx(N*R)
    #u = Bq
    u = B.dot(q) #dimension (dx(N+R)) x ((N+R)xnumQ) = dxnumQ
    #p_i = softmax(u^T m_i)
    probs = T.nnet.softmax(u.T.dot(mem_matrix)) #dimension(numQxd)x(dx(N*R)) = numQx(N*R)
    #C_i = Cx_i
    c = C.dot(X) #dimension (dxN) x (Nx(NxR)) = dx(N*R)
    o = c.dot(probs.T) #dimension (dx(N*R))x((N*R)xnumQ) = dxnumQ
    
    #w_embedded = Wo
    w_embedded = W.dot(o).T #dimension (5xd)x(dxnumQ) = 5xnumQ.T = numQx5
    
    result = T.nnet.softmax(w_embedded)
    return result
    
    #output = sum of c_matrix * probs
#     o = (probs * c_embedded).sum(axis = 0)
    #result = 

In [10]:
y_hat = hopComputation(X, q, A, B, C, W)
loss = T.nnet.categorical_crossentropy(y_hat, y).mean()

In [11]:
from __future__ import print_function

def inspect_inputs(i, node, fn):
    print(i, node, "input(s) value(s):", fn.inputs, end='')

def inspect_outputs(i, node, fn):
    print(" output(s) value(s):", fn.outputs)
    
def detect_nan(i, node, fn):
    for output in fn.outputs:
        if (not isinstance(output[0], np.random.RandomState) and
            np.isnan(output[0]).any()):
            print('*** NaN detected ***')
            theano.printing.debugprint(node)
            print('Inputs : %s' % [input[0] for input in fn.inputs])
            print('Outputs: %s' % [output[0] for output in fn.outputs])
            break

In [12]:
# Learning rate (chosen to be 0.01)
epsilon = 0.1

# This function trains our neural net, using stochastic gradient descent.
def train_MemNN(loss, X, q, y, y_hat):
    update_weights = []
    for weightMatrix in weightMatrices:
        update = T.grad(loss, weightMatrix)
        update_weights.append((weightMatrix, weightMatrix - update * epsilon))
    train_MemNN_func = theano.function(inputs=[X,q,y], outputs=[loss,y_hat], updates=update_weights, 
                        mode=theano.compile.MonitorMode(
#                             pre_func=inspect_inputs,
                            post_func=detect_nan))
    return train_MemNN_func

train_MemNN_func = train_MemNN(loss, X, q, y, y_hat)

In [13]:
def train_model(in_vect, question, answers, epochs=100):
    train_errors = []
    y_hats = []
    for i in xrange(epochs):
        error = 0
        [cur_loss, cur_yhat] = train_MemNN_func(in_vect, question, answers)
        error += cur_loss
#         print(error)
        train_errors.append(error)
        y_hats.append(cur_yhat)
    return [train_errors, y_hats]

def test_model(in_vect, question):
    # use a stub answers matrix, it doesn't really matter
    print(question.shape)
    [loss, y_hat] = train_MemNN_func(in_vect, question, np.zeros((len(question[0]), 5)).astype(int))
    return y_hat

In [16]:
in_vect = tensor_stack.astype(int)
question = train_Q.astype(int)
answers = np.array([np.array([0,1,0,0,0]),
                    np.array([1,0,0,0,0]),
                    np.array([0,0,0,1,0]),
                    np.array([0,0,0,0,1]),
                    np.array([1,0,0,0,0]),
                    np.array([1,0,0,0,0]),
                    np.array([0,0,0,1,0]),
                    np.array([0,0,0,1,0])
                   ]).astype(int)

# print(in_vect)
# print(question)
# print(answers)
# print(type(in_vect[0][0]))
[train_errors, y_hats] = train_model(in_vect, question, answers, 1000)
print(train_errors)
print(y_hats[-1])

[1.6085349726977189, 1.6085338863987018, 1.6085327992566743, 1.6085317112704904, 1.6085306224390035, 1.6085295327610643, 1.6085284422355235, 1.6085273508612292, 1.6085262586370297, 1.6085251655617707, 1.6085240716342981, 1.6085229768534541, 1.6085218812180817, 1.6085207847270226, 1.6085196873791163, 1.6085185891732008, 1.6085174901081134, 1.60851639018269, 1.6085152893957655, 1.6085141877461726, 1.6085130852327434, 1.6085119818543088, 1.6085108776096975, 1.6085097724977375, 1.6085086665172557, 1.6085075596670775, 1.6085064519460264, 1.6085053433529251, 1.6085042338865951, 1.608503123545856, 1.6085020123295268, 1.6085009002364246, 1.6084997872653646, 1.6084986734151625, 1.6084975586846306, 1.6084964430725814, 1.6084953265778248, 1.6084942091991705, 1.6084930909354254, 1.6084919717853972, 1.6084908517478889, 1.6084897308217063, 1.6084886090056503, 1.6084874862985226, 1.6084863626991224, 1.6084852382062473, 1.6084841128186946, 1.6084829865352603, 1.6084818593547368, 1.6084807312759182, 1.

In [17]:
res = test_model(in_vect, test_Q.astype(int))
for q in res:
    print(q)

(165, 5)
[ 0.20076408  0.19983459  0.19912184  0.20092103  0.19935846]
[ 0.20081901  0.19982212  0.19906183  0.20097998  0.19931705]
[ 0.20081901  0.19982212  0.19906183  0.20097998  0.19931705]
[ 0.20062046  0.19986605  0.19928817  0.2007506   0.19947472]
[ 0.2006447   0.19986138  0.19926182  0.20077518  0.19945692]
