In [8]:
# This is an implementation of the End-to-End Memory Network as defined by Sukhbaatar, et al. 
# We use k=1, i.e. have only one computational step in the network

import numpy as np
import theano
import theano.tensor as T

In [20]:
# Placeholder problem sizes (all set to 1 for now):
#   N - number of training examples
#   M - number of sentences in the input text
#   V - number of words in our dictionary
N = M = V = 1
# Dimension used to encode our information. They use 20 for independent training
d = 20

# Symbolic inputs of the network:
#   X - the collection of input sentences
#   q - the query
#   y - the actual result (vector of int64 values)
# NOTE(review): X and q are both declared as 2-D matrices. The earlier note
# described X as NxVxM and q as NxV; a 2-D X cannot hold three axes, so the
# intended layout (single example vs. batch) needs to be confirmed.
X = T.matrix(name='X')
q = T.matrix(name='q')
y = T.lvector(name='y')

In [12]:
def initializeWeightMatrix(in_size, out_size):
    """Create a randomly initialised weight matrix of shape (in_size, out_size).

    Every entry is drawn from a Gaussian distribution with mean 0 and
    standard deviation 0.1.
    """
    sigma = 0.1
    standard_normal = np.random.randn(in_size, out_size)
    return sigma * standard_normal

def initializeBiasVector(size):
    """Create a bias vector of the given size with every entry set to zero.

    Args:
        size: length of the vector (anything `np.zeros` accepts as a shape).

    Returns:
        A NumPy array of zeros.
    """
    # NumPy is imported as `np` in this notebook; the bare name `numpy` is
    # never bound, so the previous `numpy.zeros` call raised NameError.
    return np.zeros(size)

In [13]:
# Initialize all our parameters, given our dimensions.
# A is the first matrix used to embed our input. It has size dxV
# B is the matrix used to embed the query. It has size dxV
# C is the next matrix used to embed our input. It has size dxV
# W is the final matrix. Takes output O and produces result R. It maps the
#   d-dimensional output back onto the dictionary, so it has size Vxd
def initializeParams(d, V):
    """Create the four trainable matrices as named Theano shared variables.

    Args:
        d: embedding dimension.
        V: number of words in the dictionary.

    Returns:
        The tuple (A, B, C, W). A, B and C are d x V; W is V x d.
    """
    A = theano.shared(initializeWeightMatrix(d, V), name='A')
    B = theano.shared(initializeWeightMatrix(d, V), name='B')
    C = theano.shared(initializeWeightMatrix(d, V), name='C')
    # W was previously created as d x d, which leaves the prediction in the
    # embedding space instead of giving one score per dictionary word.
    W = theano.shared(initializeWeightMatrix(V, d), name='W')
    return A, B, C, W

# Build the parameter matrices once; keep them both as a list (for iterating
# during training) and under their individual names.
weightMatrices = list(initializeParams(d, V))
A, B, C, W = weightMatrices

In [18]:
# Define the computational step
# Given input matrix X, query q, and weight matrices, we perform a computational step,
# also known as a "hop"
def hopComputation(X, q, A, B, C, W):
    """Run a single hop and return the unnormalised answer scores.

    Args:
        X: symbolic matrix of input sentences; it is left-multiplied by A and C.
            NOTE(review): assumed to hold one sentence per column -- confirm.
        q: symbolic query matrix; it is left-multiplied by B.
            NOTE(review): only its first column / a single query is supported
            here -- confirm against the caller.
        A: matrix embedding the sentences into memory vectors.
        B: matrix embedding the query.
        C: matrix embedding the sentences into output vectors.
        W: final matrix applied to the combined output.

    Returns:
        Symbolic vector W(o + u). No softmax is applied here.
    """
    # m_i = A x_i : one memory column per sentence
    memories = A.dot(X)
    # u = B q
    u = B.dot(q)
    # p_i = softmax(u^T m_i) : a one-row matrix of attention weights
    attention = T.nnet.softmax(u.T.dot(memories))
    # c_i = C x_i
    outputs = C.dot(X)
    # o = sum_i p_i c_i, written as a matrix-vector product. The previous
    # element-wise `c_matrix * probs` needed the one-row `probs` to broadcast
    # against every row of `c_matrix`; Theano only broadcasts dimensions that
    # are declared broadcastable, so that form fails at run time.
    o = outputs.dot(attention[0])
    # result = W(o + u) : the query embedding is added to the memory output
    # before the final projection, as in the model being implemented.
    return W.dot(o + u[:, 0])

In [23]:
# hopComputation returns a 1-D vector of raw scores. categorical_crossentropy
# expects probabilities, and with an integer label vector it expects one row
# of probabilities per label. So the scores are turned into a one-row matrix
# and normalised with softmax first. (Passing the raw 1-D scores made Theano
# treat y as a dense distribution and take the log of unnormalised values.)
# NOTE: with a single row, y must hold exactly one label.
y_hat = T.nnet.softmax(hopComputation(X, q, A, B, C, W).dimshuffle('x', 0))
loss = T.nnet.categorical_crossentropy(y_hat, y).mean()

In [26]:
# Step size used by the gradient-descent updates (chosen to be 0.01)
epsilon = 1e-2

def train_MemNN(loss, X, q, y):
    """Compile the training function for the network.

    Each matrix in the module-level `weightMatrices` list is moved against
    the gradient of `loss` with respect to it, scaled by the module-level
    learning rate `epsilon`.

    Args:
        loss: symbolic loss to minimise.
        X, q, y: symbolic inputs of the compiled function, in that order.

    Returns:
        A Theano function taking (X, q, y) that returns the loss and applies
        the parameter updates on every call.
    """
    sgd_updates = [
        (param, param - T.grad(loss, param) * epsilon)
        for param in weightMatrices
    ]
    return theano.function(inputs=[X, q, y], outputs=loss, updates=sgd_updates)

# Compile the training step for the loss and symbolic inputs defined above.
train_MemNN_func = train_MemNN(loss=loss, X=X, q=q, y=y)

In [None]:
'''
Notes from Google Brain Talk

First research done with convolutional neural net (Hinton 2012)
NEW RESEARCH
    - Each layer is differentiable function (potentially non-linear)
    - Last layer is softmax and loss is cross-entropy
    - Trained by mini-batch SGD
    - Several tricks: batch normalization (Ioffe 2015), use ReLU for non-linearities, 
        several layers of convolutions at multiple scales, max-pooling, full-connect, 
        GPU for everything, and multiple replicas (50-100) talking to parameter server

Representing Words - Classical View
    - Classical way is one-hot word vectors (think of this as everything being vertices of the hypercube)
        - this way, every pair of words is equally far apart 
    - Better way is to put them inside the hypercube (word2vec, for example)
    - How to get there? Train word embeddings from text corpus
        - Online pass over text corpus
            - word2vec
            - randomly pick a word in the corpus and a nearby word in the text 
            (like locational nearby, not vector nearby)
            - move the corresponding embeddings nearer
            - uses skip-gram stuff
        - Collect co-occurrence statistics
            - GloVe
            - Compute pointwise mutual information between words
            - Move embeddings to estimate them by dot product

Language Modeling with Deep Learning
    - Given a sequence of tokens, maximize its joint likelihood p(y_1, y_2, ..., y_T)
    - Factorize like this: \prod_t p(y_t | y_1,...,y_{t-1})
    - Classical approach: use n-grams to simplify and just count them! 
        - but this doesn't take into account long-term dependencies and can't generalize to word combos
        that can occur frequently but that you haven't seen
    - Instead: condition on some function h of previous input and use RNNs
        \prod_t p(y_t | h_t) with h_t = h(y_1,...,y_{t-1})
        - Here, words are not one-hot encoded but have real word embeddings
    - LSTMs work better for our long-term dependencies
    
    - sequence-to-sequence framework by Sutskever
    
Image Captioning Experiments
    - A recent dataset started it all: MS-COCO dataset
        - 75k training images
        - 5k evaluation images
        - each image has 5 different captions
    - Image model: GoogLeNet (winner of the 2014 challenge)
    - Caption model is single-layer LSTM with 512 hidden units
    - Words have embeddings of size 512
    - Small dictionary of 8857 words
    - Evaluate results: must compare word counts

One More Trick: Scheduled Sampling
    - Statistical Machine Translation (same model can be used for image captioning)
    - Inference of Sequence Prediction Models with RNNs
        - Can use beam search for sampling, but beam size must be small (< 20) for RNNs
    - A sampling approach for training RNNs: sometimes you should show it the true word instead
    of the previously predicted word, i.e. at time t, instead of showing sample(t-1), show it true(t-1)
    
    - but must be very careful when you do this
    
Conclusions
    - Image caption is one more application of the sequence-to-sequence framework
    - It is important, during training, to expose the model to diverse situations it can encounter at inference time
    - Sampling from the model provides this diversity
    - Only sampling from the model during training is too hard
    - "Curriculum learning" is a reasonable approach to go from completely guided mode towards a 
    mode that is similar to inference
    - Good performance on a few tasks
        - for the image caption competition, a few of these models were ensembled and they won!
    
NOTE: LSTMs are bad after ~50 or so. Also read about "hyperparameter training"
  
'''