In [2]:
# This is an implementation of the End-to-End Memory Network as defined by Sukhbaatar, et al. 
# We use k=1, i.e. have only one computational step in the network

import numpy as np
import theano
import theano.tensor as T

In [3]:
import fbTaskReader as taskReader
reload(taskReader)

# Get the input matrices for the facebook tasks
taskFilePaths = ["/Users/SaahilM/Documents/Princeton/Academics/Thesis/Data/tasks_1-20_v1-2/en/qa1_single-supporting-fact"]
task1 = taskReader.VectorizeTask(taskFilePaths[0])
# task1.printStory()
qaPairs = task1.getTextQuestionPairs()
# print task1.getTextQuestionPairs()

In [4]:
# theano.config.compute_test_value = 'warn'

# Number of training examples
N = task1.getNumTrainingExamples()
# Number of words in our dictionary
V = task1.getNumWords()
# Dimension to encode our information. They use 20 for independent training
d = 50

print(N,V,d)

# Input X is the collection of sentences, Vx(text_length) matrix
# Input q is our query, a vector of size Vx1
# The actual result is y, a vector of size Vx1
X = T.lmatrix('X')
q = T.lrow('q')
y = T.lvector('y')

X.tag.test_value = np.zeros((2, V), dtype=np.int64)
q.tag.test_value = np.zeros(V, dtype=np.int64)

(1000, 19, 50)


In [5]:
# Create a weight matrix of given size. 
# The matrix is initialized randomly with Gaussian distribution 
# with mean=0 and \sigma=0.1
def initializeWeightMatrix(in_size, out_size):
    return theano.shared(0.1 * np.random.randn(in_size, out_size))

# Create a bias vector of all zeros of given size
def initializeBiasVector(size):
    return theano.shared(np.zeros(size))

In [6]:
# Initialize all our parameters, given our dimensions.
# A is the first matrix used to embed our input. It has size dxV
# B is the matrix used to embed the query. It has size dxV
# C is the next matrix used to embed our input. It has size dxV
# W is the final matrix. Takes output O and produces result R. It has size dxd
def initializeParams(d, V):
    A = initializeWeightMatrix(V,d)
    B = initializeWeightMatrix(V,d)
    C = initializeWeightMatrix(V,d)
    W = initializeWeightMatrix(d,V)
#     A = theano.shared(initializeWeightMatrix(d, V))
#     B = theano.shared(initializeWeightMatrix(d, V))
#     C = theano.shared(initializeWeightMatrix(d, V))
#     W = theano.shared(initializeWeightMatrix(V, d))
    return A, B, C, W

A, B, C, W = initializeParams(d, V)
weightMatrices = [A, B, C, W]
print(W.shape.eval())

[50 19]


In [7]:
# Define the computational step
# Given input matrix X, query q, and weight matrices, we perform a computational step,
# also known as a "hop". Let M be the number of sentences
def hopComputation(X, q, A, B, C, W):
    # m_i = Ax_i
    mem_matrix = X.dot(A) #dimension (MxV)x(Vxd) = Mxd
    # u = Bq
    u = q.dot(B) #dimension (1xV)x(Vxd) = 1xd
    # p_i = softmax(u^T m_i)
    probs = T.nnet.softmax(mem_matrix.dot(u.T)) #dimension (Mxd)x(dx1) = Mx1
    # C_i = Cx_i
    c_embedded = X.dot(C) #dimension (MxV)x(Vxd) = Mxd
    # output = sum of c_matrix * probs
    o = (probs * c_embedded).sum(axis = 0) #dimension = 1xd
    # result = softmax(W(o+u))
    w_embedded = (o + u).dot(W) #dimension (1xd)x(dxV) = 1xV
    
    result = T.nnet.softmax(w_embedded)
    return result
    
#     # m_i = Ax_i
#     mem_matrix = A.dot(X)  #dimension dxM
#     # u = Bq
#     u = B.dot(q) #dimension dx1
#     # p_i = softmax(u^T m_i)
#     probs = T.nnet.softmax(u.T.dot(mem_matrix)) #dimension 
#     # C_i = Cx_i
#     c_embedded = C.dot(X)
#     # output = sum of c_matrix * probs
#     o = (c_embedded * probs).sum(axis=1)
#     # result = softmax(W(o+u))
#     w_embedded = W.dot(o)
#     w_summed = (w_embedded + u).T
    
#     # NOTE: MUST TRANSPOSE B/C IN THE PAPER THEY HAVE IT AS A COLUMN VECTOR BUT THEANO
#     # NEEDS A ROW VECTOR. TOOK FOREVER TO FIGURE THIS OUT
#     result = T.nnet.softmax(w_embedded.T)
#     return result

In [8]:
y_hat = hopComputation(X, q, A, B, C, W)
y_hat.tag.test_value = np.random.randn(1,V)
loss = T.nnet.categorical_crossentropy(y_hat, y).mean()

In [16]:
from __future__ import print_function

def inspect_inputs(i, node, fn):
    print(i, node, "input(s) value(s):", fn.inputs, end='')

def inspect_outputs(i, node, fn):
    print(" output(s) value(s):", fn.outputs)
    
def detect_nan(i, node, fn):
    for output in fn.outputs:
        if (not isinstance(output[0], np.random.RandomState) and
            np.isnan(output[0]).any()):
            print('*** NaN detected ***')
            theano.printing.debugprint(node)
            print('Inputs : %s' % [input[0] for input in fn.inputs])
            print('Outputs: %s' % [output[0] for output in fn.outputs])
            break

# Learning rate (chosen to be 0.01)
epsilon = 0.01

# This function trains our neural net, using stochastic gradient descent.
def train_MemNN(loss, X, q, y):
    update_weights = []
    for weightMatrix in weightMatrices:
        update = T.grad(loss, weightMatrix)
        update_weights.append((weightMatrix, weightMatrix - update * epsilon))
    train_MemNN_func = theano.function(inputs=[X,q,y], outputs=loss, updates=update_weights, 
                        mode=theano.compile.MonitorMode(
#                             pre_func=inspect_inputs,
                            post_func=detect_nan))
    return train_MemNN_func

train_MemNN_func = train_MemNN(loss, X, q, y)

In [17]:
def train_model(train_data, epochs=100):
    train_errors = []
    for i in xrange(epochs):
        error = 0
#         print(train_data[0])
        for textQuestionPair in train_data:
            train_X = textQuestionPair["text"]
#             print(train_X.shape)
            train_q = textQuestionPair["question"]
#             print(train_q.shape)
            train_y = textQuestionPair["answer"]
#             print(train_y.shape)
#             print train_y.dtype
#             train_X = np.array(textQuestionPair["text"])
#             train_q = np.array(textQuestionPair["question"])
#             train_y = np.array(textQuestionPair["answer"])
            cur_loss = train_MemNN_func(train_X, train_q, train_y)
            error += cur_loss
        train_errors.append(error)
    return train_errors

train_errors = train_model(qaPairs)
print(train_errors)

[1451.4784311230917, 1442.0706640654698, 1440.5655623315288, 1439.7801899730835, 1439.3234919014317, 1439.0423984872073, 1438.8616768891281, 1438.7413176637874, 1438.6589226290826, 1438.6013788093883, 1438.5606850993593, 1438.5317684158449, 1438.5112955040249, 1438.4970025203079, 1438.4873038911555, 1438.4810570878656, 1438.4774172495777, 1438.4757450760656, 1438.4755471074002, 1438.4764361342834, 1438.478104369196, 1438.4803048522433, 1438.4828382612118, 1438.4855433272128, 1438.4882896969798, 1438.4909724852603, 1438.4935080172556, 1438.4958304265738, 1438.4978888819062, 1438.499645286666, 1438.501072342907, 1438.5021519024831, 1438.5028735498042, 1438.5032333752215, 1438.5032329081869, 1438.5028781864744, 1438.5021789428206, 1438.5011478939373, 1438.4998001195088, 1438.4981525209253, 1438.4962233507765, 1438.4940318055187, 1438.4915976745579, 1438.4889410397611, 1438.4860820200099, 1438.4830405559969, 1438.4798362308627, 1438.4764881226652, 1438.4730146850932, 1438.4694336530372, 14

In [None]:
'''
Notes from Google Brain Talk

First research done with convolutional neural net (Hinton 2012)
NEW RESEARCH
    - Each layer is differentiable function (potentially non-linear)
    - Last layer is softmax and loss is cross-entropy
    - Trained by mini-batch SGD
    - Several tricks: batch normalization (Ioffe 2015), use ReLU for non-linearities, 
        several layers of convolutions at multiple scales, max-pooling, full-connect, 
        GPU for everything, and multiple replicas (50-100) talking to parameter server

Representing Words - Classical View
    - Classical way is one-hot word vectors (think of this as everything being vertices of the hypercube)
        - this way, every pair of words is equally far apart 
    - Better way is to put them inside the hypercube (word2vec, for example)
    - How to get there? Train word embeddings from text corpus
        - Online pass over text corpus
            - word2vec
            - randomly pick a word in the corpus and a nearby word in the text 
            (like locational nearby, not vector nearby)
            - move the corresponding embeddings nearer
            - uses skip-gram stuff
        - Collect co-occurrence statistics
            - GloVe
            - Compute pointwise mutual information between words
            - Move embeddings to estimate them by dot product

Language Modeling with Deep Learning
    - Given a sequence of tokens, maximize its joint likelihood p(y_1, y_2, ..., y_T)
    - Factorize like this: \product_t p(y_t | y_1,...,y_{T-1})
    - Classical approach: use n-grams to simplify and just count them! 
        - but this doesn't take into account long-term dependencies and can't generalize to word combos
        that can occur frequently but u haven't seen
    - Instead: condition on some function h of previous input and use RNNs
        \product_t p(y_t | h_t) with h_t = p (y_t | y_1,...,y_{t-1})
        - Here, words are not one-hot encoded but have real word embeddings
    - LSTMs work better for our long-term dependencies
    
    - sequence-to-sequence framework by Sutskever
    
Image Captioning Experiments
    - A recent dataset started it all: MS-COCO dataset
        - 75k training images
        - 5k evaluation images
        - each image has 5 different captions
    - Image model Google LeNet(winner of 2014 challenge)
    - Caption model is single-layer LSTM with 512 hidden units
    - Words have embeddings of size 512
    - Small dictionary of 8857 words
    - Evaluate results: must compare word counts

One More Trick: Scheduled Sampling
    - Statistical Machine Translation (same model can be used for image captioning)
    - Inference of Sequence Prediction Models with RNNs
        - Can use beam search for sampling, but beam size must be small (< 20) for RNNs
    - A Sampling approach for training RNNs (sometimes you should show it the true word) instead
    of the previous predicted word. i.e. at time t instead of showing sample(t-1) show it true(t-1)
    
    - but must be very careful when you do this
    
Conclusions
    - Image caption is one more application of the sequence-to-sequence framework
    - It is important, during training, to expose the model to diverse situations it can encounter at inference time
    - Sampling from the model provides this diversity
    - Only sampling from the model during training is too hard
    - "Curriculum learning" is a reasonable approach to go from completely guided mode towards a 
    mode that is similar to inference
    - Good performance on a few tasks
        - for the image caption competition, a few of these models were ensembled and they won!
    
NOTE: LSTMs are bad after ~50 or so. Also read about "hyperparameter training"
  
'''