In [6]:
# end-to-end memory network

import numpy as np
from gensim import corpora
import math

def softmax(x):
    ex = np.exp(x - np.max(x))
    return ex / ex.sum()

def lj(j, J, k, d): 
    return (1-j/J)-(k/d)*(1-2*j/J)

if __name__=="__main__":
    # load input data
    #data = open('input1.txt', 'r').read() # input text file: sentences separated by .

    # hyperparameters
    d_embed = 10 # embedding dimension
    lr = 0.1 # learning rate
    
    # input to memory embedding mi = A * xi + tA
        #stoplist = set('for a of the and to in'.split())
    stoplist = []
    doc_raw = 'sam walks into the kitchen. sam picks up an apple. sam walks into the bedroom. sam drops the apple.'
    query_raw = 'where is sam'
    document = doc_raw.split('.')
    document.append(query_raw)
    senttoken = [ [word for word in sentence.lower().split(' ') if word not in stoplist] for sentence in document ]
    dictionary = corpora.Dictionary(senttoken)
    print(dictionary.token2id)
    document.pop(len(document)-1) # query at the end of document
    d_embed = len(dictionary)
    #voc = 0 # size of vocabulary
    #for d in document:
        #print(d)
        #if len(d) == 0:
            #document.remove(d)
        #voc = max(voc, len(d.split()))
    voc = d_embed
    n_memories = len(document)
    #print(voc)
    #print(document)

    # initiate weigth matrices
    A = np.random.randn(d_embed, voc)*0.1 # input to memory embedding
    tA = np.zeros(d_embed) # temporal encoding A
    B = np.random.randn(d_embed, voc)*0.1 # query embedding
    tB = np.zeros(d_embed) # temporal encoding B
    C = np.random.randn(d_embed, voc)*0.1 # output to memory embedding
    tC = np.zeros(d_embed) # temporal encoding C
    W = np.random.randn(voc, d_embed)*0.1 # final weight matrix
    
    # memory for Adagrad
    mA = np.zeros_like(A)
    mtA = np.zeros_like(tA)
    mB = np.zeros_like(B)
    mtB = np.zeros_like(tB)
    mC = np.zeros_like(C)
    mtC = np.zeros_like(tC)
    mW = np.zeros_like(W)
    
    for iterctr in range(10000):

        # forward pass

        # embedding simple: m_i = A_ij * x_ij + T_A_j
        x = np.zeros((n_memories, voc))
        m = np.zeros((n_memories, d_embed))
        for i in range(n_memories):
            thissent = document[i].split()
            for j in range(len(thissent)):
                x[i][j] = dictionary.token2id[thissent[j]]
        for i in range(n_memories):
            m[i] = np.dot(A, x[i].T) + tA # simple embedding
            #m[i][j] = lj(j,len(document[i]),j,d_embed) * A[i][j] * x[i][j] + tA[j] # with positional encoding
        #print(x)
        #print(m)

        # query embedding u = B * q + tB
        q = np.zeros(voc)
        u = np.zeros(d_embed)
        thissent = query_raw.split()
        for j in range(len(thissent)):
            q[j] = dictionary.token2id[thissent[j]]
        u = np.dot(B, q) + tB

        # match of query with memory p = softmax(u * mi) for all i
        p = np.zeros((n_memories, d_embed))
        p = softmax(np.dot(u, m.T))

        # output corresponding to input xi: ci = C * xi + tC
        c = np.zeros((n_memories, d_embed))
        for i in range(n_memories):
            c[i] = np.dot(C, x[i].T) + tC

        # response vector from memory o = sum pi * ci
        o = np.zeros(d_embed)
        o = np.dot(p.T, c)

        # predicted label a = softmax( W * (o + u))
        a_predict = softmax(np.dot(W, (o + u)))
        #print(a_predict)

        # backpropagation

        dA = np.zeros_like(A)
        dB = np.zeros_like(B)
        dC = np.zeros_like(C)
        dW = np.zeros_like(W)
        dtA = np.zeros_like(tA)
        dtB = np.zeros_like(tB)
        dtC = np.zeros_like(tC)

        truth = np.zeros_like(tA)
        truth[10] = 1 # bedroom
        dy = a_predict - truth
        # dA = dy a_predict * (1-a_predict) W sumi p[i] (1-p[i]) ( u.T * 1A * x[i]) c[i]
        #print('V: %d' % (voc))
        #print('d: %d', (d_embed))
        ABCunit = np.pad(np.identity(voc), ((0,d_embed-voc),(0,0)), 'constant', constant_values=(0))
        Wunit = np.pad(np.identity(voc), ((0,0), (0,d_embed-voc)), 'constant', constant_values=(0))
        tunit = np.ones_like(tA)

        dEAtmp = 0.
        for i in range(n_memories):
            dEAtmp += p[i]*(1.-p[i])*np.dot(np.dot(u, np.dot(ABCunit, x[i].T)), c[i])
        dEAtmp = W * dEAtmp
        dA = (np.dot(dy, a_predict * (1-a_predict)) * dEAtmp).T
        #print(dA)

        # dB = dy a_predict * (1-a_predict) W ( sumi p[i] (1-p[i]) ((1B q).T m[i]) c[i] + 1B q)
        dEAtmp = 0.
        for i in range(n_memories):
            dEAtmp += p[i]*(1.-p[i])*np.dot(np.dot(np.dot(ABCunit, q), m[i]), c[i])
        dEAtmp = W * dEAtmp
        dB = (np.dot(dy, a_predict*(1-a_predict)) * dEAtmp).T
        #print(dB)

        # dC = dy a_predict * (1-a_predict) W sumi p[i] 1C x[i]
        dEAtmp = 0.
        for i in range(n_memories):
            dEAtmp += p[i]*np.dot(ABCunit, x[i])
        dEAtmp = W * dEAtmp
        dC = (np.dot(dy, a_predict*(1-a_predict)) * dEAtmp).T
        #print(dC)

        # dW = dy a_predict * (1-a_predict) (o + u)
        dW = (np.dot(dy, a_predict*(1-a_predict)) * Wunit * (o + u))
        #print(dW)   

        # dtA = dy a_predict * (1-a_predict) W ( sumi p[i] (1-p[i]) (u.T 1tA) c[i])
        dEAtmp = 0.
        for i in range(n_memories):
            dEAtmp += p[i]*(1.-p[i])*np.dot(np.dot(u.T, tunit), c[i])
        dtA = (np.dot(dy * a_predict * (1-a_predict), W) * dEAtmp).T
        #print(dtA)

        # dtB = dy a_predict * (1-a_predict) W ( sumi p[i] (1-p[i]) ((1tB).T m[i] c[i]) + 1tB)
        dEAtmp = 0.
        for i in range(n_memories):
            dEAtmp += p[i]*(1.-p[i])*(np.dot(np.dot(tunit.T, m[i]), c[i]) + tunit)
        dtB = (np.dot(dy * a_predict * (1-a_predict), W) * dEAtmp).T
        #print(dtB)

        # dtC = dy a_predict * (1-a_predict) W ( sumi p[i] 1tC )
        dEAtmp = 0.
        for i in range(n_memories):
            dEAtmp += p[i]*tunit
        dtC = (np.dot(dy * a_predict * (1-a_predict), W) * dEAtmp).T
        #print(dtC)

        # maybe clip ?

        # update weights with Adagrad
        for weights, dweights, memwghts in zip([A,B,C,W,tA,tB,tC], [dA,dB,dC,dW,dtA,dtB,dtC], [mA,mB,mC,mW,mtA,mtB,mtC]):
            memwghts += dweights * dweights
            weights += -lr * dweights / np.sqrt(memwghts + 1.e-8)

        #print(A)
    print(a_predict)
    #print(np.argmax(a_predict))
    print(dictionary[np.argmax(a_predict)])

{'into': 0, 'kitchen': 1, 'sam': 2, 'the': 3, 'walks': 4, '': 5, 'an': 6, 'apple': 7, 'picks': 8, 'up': 9, 'bedroom': 10, 'drops': 11, 'is': 12, 'where': 13}
[1.31351085e-03 1.60122327e-07 7.42432013e-04 2.38615087e-03
 7.28085913e-06 1.34016459e-06 3.14062288e-05 7.39954981e-04
 7.87630566e-07 2.22681567e-03 9.90902630e-01 1.64277869e-03
 2.23272785e-07 4.52829329e-06]
bedroom


In [36]:
# end-to-end memory network

import numpy as np
from gensim import corpora
import math

def softmax(x):
    ex = np.exp(x - np.max(x))
    return ex / ex.sum()

def lj(j, J, k, d): 
    return (1-j/J)-(k/d)*(1-2*j/J)

if __name__=="__main__":
    # load input data
    #data = open('input1.txt', 'r').read() # input text file: sentences separated by .

    # hyperparameters
    d_embed = 0 # embedding dimension
    lr = 1.8 # learning rate
    
    # input to memory embedding mi = A * xi + tA
        #stoplist = set('for a of the and to in'.split())
    stoplist = []
    doc_raw = 'Mary moved to the bathroom. John went to the hallway.'
    query_raw = 'Where is Mary?'
    document = doc_raw.lower().split('.')
    document.append(query_raw.lower())
    senttoken = [ [word for word in sentence.lower().split(' ') if word not in stoplist] for sentence in document ]
    dictionary = corpora.Dictionary(senttoken)
    print(dictionary.token2id)
    document.pop(len(document)-1) # query at the end of document
    d_embed = len(dictionary)
    #voc = 0 # size of vocabulary
    #for d in document:
        #print(d)
        #if len(d) == 0:
            #document.remove(d)
        #voc = max(voc, len(d.split()))
    voc = d_embed
    n_memories = len(document)
    #print(voc)
    #print(document)

    # initiate weigth matrices
    A = np.random.randn(d_embed, voc)*0.01 # input to memory embedding
    tA = np.random.randn(d_embed)*0.01 # temporal encoding A
    B = np.random.randn(d_embed, voc)*0.01 # query embedding
    tB = np.random.randn(d_embed)*0.01 # temporal encoding B
    C = np.random.randn(d_embed, voc)*0.01 # output to memory embedding
    tC = np.random.randn(d_embed)*0.01 # temporal encoding C
    W = np.random.randn(voc, d_embed)*0.01 # final weight matrix
    
    # memory for Adagrad
    mA = np.zeros_like(A)
    mtA = np.zeros_like(tA)
    mB = np.zeros_like(B)
    mtB = np.zeros_like(tB)
    mC = np.zeros_like(C)
    mtC = np.zeros_like(tC)
    mW = np.zeros_like(W)

    x = np.zeros((n_memories, voc))
    for i in range(n_memories):
        thissent = document[i].lower().split()
        for j in range(len(thissent)):
            x[i][j] = dictionary.token2id[thissent[j]]
         
    for iterctr in range(1000):

        # forward pass
        
        # embedding simple: m_i = A_ij * x_ij + T_A_j
        m = np.zeros((n_memories, d_embed))
        for i in range(n_memories):
            m[i] = np.dot(A, x[i].T) + tA # simple embedding
            #m[i][j] = lj(j,len(document[i]),j,d_embed) * A[i][j] * x[i][j] + tA[j] # with positional encoding

        # query embedding u = B * q + tB
        q = np.zeros(voc)
        u = np.zeros(d_embed)
        thissent = query_raw.lower().split()
        for j in range(len(thissent)):
            q[j] = dictionary.token2id[thissent[j]]
        u = np.dot(B, q) + tB

        # match of query with memory p = softmax(u * mi) for all i
        p = np.zeros((n_memories, d_embed))
        p = softmax(np.dot(u, m.T))

        # output corresponding to input xi: ci = C * xi + tC
        c = np.zeros((n_memories, d_embed))
        for i in range(n_memories):
            c[i] = np.dot(C, x[i].T) + tC

        # response vector from memory o = sum pi * ci
        o = np.zeros(d_embed)
        o = np.dot(p.T, c)

        # predicted label a = softmax( W * (o + u))
        a_predict = softmax(np.dot(W, (o + u)))
        #print(a_predict)

        # backpropagation

        dA = np.zeros_like(A)
        dB = np.zeros_like(B)
        dC = np.zeros_like(C)
        dW = np.zeros_like(W)
        dtA = np.zeros_like(tA)
        dtB = np.zeros_like(tB)
        dtC = np.zeros_like(tC)

        truth = np.zeros_like(tA)
        truth[1] = 1 # answer
        dy = a_predict - truth
        # dA = dy a_predict * (1-a_predict) W sumi p[i] (1-p[i]) ( u.T * 1A * x[i]) c[i]
        #print('V: %d' % (voc))
        #print('d: %d', (d_embed))
        ABCunit = np.pad(np.identity(voc), ((0,d_embed-voc),(0,0)), 'constant', constant_values=(0))
        Wunit = np.pad(np.identity(voc), ((0,0), (0,d_embed-voc)), 'constant', constant_values=(0))
        tunit = np.ones_like(tA)

        dEAtmp = 0.
        for i in range(n_memories):
            dEAtmp += p[i]*(1.-p[i])*np.dot(np.dot(u, np.dot(ABCunit, x[i].T)), c[i])
        dEAtmp = W * dEAtmp
        dA = (np.dot(dy, a_predict * (1-a_predict)) * dEAtmp).T
        #print(dA)

        # dB = dy a_predict * (1-a_predict) W ( sumi p[i] (1-p[i]) ((1B q).T m[i]) c[i] + 1B q)
        dEAtmp = 0.
        for i in range(n_memories):
            dEAtmp += p[i]*(1.-p[i])*np.dot(np.dot(np.dot(ABCunit, q), m[i]), c[i])
        dEAtmp = W * dEAtmp
        dB = (np.dot(dy, a_predict*(1-a_predict)) * dEAtmp).T
        #print(dB)

        # dC = dy a_predict * (1-a_predict) W sumi p[i] 1C x[i]
        dEAtmp = 0.
        for i in range(n_memories):
            dEAtmp += p[i]*np.dot(ABCunit, x[i])
        dEAtmp = W * dEAtmp
        dC = (np.dot(dy, a_predict*(1-a_predict)) * dEAtmp).T
        #print(dC)

        # dW = dy a_predict * (1-a_predict) (o + u)
        dW = (np.dot(dy, a_predict*(1-a_predict)) * Wunit * (o + u))
        #print(dW)   

        # dtA = dy a_predict * (1-a_predict) W ( sumi p[i] (1-p[i]) (u.T 1tA) c[i])
        dEAtmp = 0.
        for i in range(n_memories):
            dEAtmp += p[i]*(1.-p[i])*np.dot(np.dot(u.T, tunit), c[i])
        dtA = (np.dot(dy * a_predict * (1-a_predict), W) * dEAtmp).T
        #print(dtA)

        # dtB = dy a_predict * (1-a_predict) W ( sumi p[i] (1-p[i]) ((1tB).T m[i] c[i]) + 1tB)
        dEAtmp = 0.
        for i in range(n_memories):
            dEAtmp += p[i]*(1.-p[i])*(np.dot(np.dot(tunit.T, m[i]), c[i]) + tunit)
        dtB = (np.dot(dy * a_predict * (1-a_predict), W) * dEAtmp).T
        #print(dtB)

        # dtC = dy a_predict * (1-a_predict) W ( sumi p[i] 1tC )
        dEAtmp = 0.
        for i in range(n_memories):
            dEAtmp += p[i]*tunit
        dtC = (np.dot(dy * a_predict * (1-a_predict), W) * dEAtmp).T
        #print(dtC)

        # maybe clip ?

        # update weights with Adagrad
        for weights, dweights, memwghts in zip([A,B,C,W,tA,tB,tC], [dA,dB,dC,dW,dtA,dtB,dtC], [mA,mB,mC,mW,mtA,mtB,mtC]):
            memwghts += dweights * dweights
            weights += -lr * dweights / np.sqrt(memwghts + 1.e-8)

        #print(A)
    print(a_predict)
    #print(np.argmax(a_predict))
    print(dictionary[np.argmax(a_predict)])

{'bathroom': 0, 'mary': 1, 'moved': 2, 'the': 3, 'to': 4, '': 5, 'hallway': 6, 'john': 7, 'went': 8, 'is': 9, 'mary?': 10, 'where': 11}
[1.80358042e-26 2.62683263e-52 3.43609638e-23 1.95285802e-46
 1.74655170e-80 5.77479641e-51 8.38014096e-27 1.59883218e-76
 5.97746901e-61 5.03652155e-57 8.98635207e-54 1.00000000e+00]
where


In [32]:
import re
import functools
from gensim import corpora

def tokenize(sent):
    '''
    argument: a sentence string
    returns a list of tokens(words)
    '''
    return [ x.strip() for x in re.split('(\W+)?', sent) if x.strip()]
 
def parse_stories(lines):
    '''
    - Parse stories provided in the bAbI tasks format
    - A story starts from line 1 to line 15. Every 3rd line,
      there is a question & answer.
    - Function extracts sub-stories within a story and
      creates tuples
    '''
    data = []
    story = []
    for line in lines:
        #line = line.decode('utf-8').strip()
        line = line.strip().lower()
        nid, line = line.split(' ', 1)
        nid = int(nid)
        if nid == 1:
            # reset story when line ID=1 (start of new story)
            story = []
        if '\t' in line:
            # this line is tab separated Q, A & support fact ID
            q, a, supporting = line.split('\t')
            # tokenize the words of question
            q = tokenize(q)
            # Provide all the sub-stories till this question
            substory = [x for x in story if x]
            # A story ends and is appended to global story data-set
            data.append((substory, q, a))
            story.append('')
        else:
            # this line is a sentence of story
            sent = tokenize(line)
            story.append(sent)
    return data
 
def get_stories(f):
    '''
    argument: filename
    returns list of all stories in the argument data-set file
    '''
    # read the data file and parse 10k stories
    data = parse_stories(f.readlines())
    # lambda func to flatten the list of sentences into one list
    flatten = lambda data: functools.reduce(lambda x, y: x + y, data)
    # creating list of tuples for each story
    data = [(flatten(story), q, answer) for story, q, answer in data]
    return data

if __name__=="__main__":
    #senttoken = [ [word for word in sentence.lower().split(' ') if word not in stoplist] for sentence in document ]
    stories = get_stories(open('qa1_test_short.txt', 'r'))
    dct = corpora.Dictionary()
    for stry in stories:
        dct.add_documents([[s for s in stry[0]], stry[1], [stry[2]]])
    print(dct.token2id)
    print(stories)

{'.': 0, 'bathroom': 1, 'hallway': 2, 'john': 3, 'journeyed': 4, 'mary': 5, 'the': 6, 'to': 7, 'travelled': 8, '?': 9, 'is': 10, 'where': 11, 'back': 12, 'bedroom': 13, 'daniel': 14, 'moved': 15, 'went': 16, 'kitchen': 17, 'sandra': 18, 'garden': 19, 'office': 20}
[(['john', 'travelled', 'to', 'the', 'hallway', '.', 'mary', 'journeyed', 'to', 'the', 'bathroom', '.'], ['where', 'is', 'john', '?'], 'hallway'), (['john', 'travelled', 'to', 'the', 'hallway', '.', 'mary', 'journeyed', 'to', 'the', 'bathroom', '.', 'daniel', 'went', 'back', 'to', 'the', 'bathroom', '.', 'john', 'moved', 'to', 'the', 'bedroom', '.'], ['where', 'is', 'mary', '?'], 'bathroom'), (['john', 'travelled', 'to', 'the', 'hallway', '.', 'mary', 'journeyed', 'to', 'the', 'bathroom', '.', 'daniel', 'went', 'back', 'to', 'the', 'bathroom', '.', 'john', 'moved', 'to', 'the', 'bedroom', '.', 'john', 'went', 'to', 'the', 'hallway', '.', 'sandra', 'journeyed', 'to', 'the', 'kitchen', '.'], ['where', 'is', 'sandra', '?'], 'kit

In [None]:
# end-to-end memory network

import numpy as np
from gensim import corpora
import math
import re
import functools

def parse_stories(lines):
    '''
    - Parse stories provided in the bAbI tasks format
    - A story starts from line 1 to line 15. Every 3rd line,
      there is a question & answer.
    - Function extracts sub-stories within a story and
      creates tuples
    '''
    data = []
    story = []
    for line in lines:
        #line = line.decode('utf-8').strip()
        line = line.strip().lower()
        nid, line = line.split(' ', 1)
        nid = int(nid)
        if nid == 1:
            # reset story when line ID=1 (start of new story)
            story = []
        if '\t' in line:
            # this line is tab separated Q, A & support fact ID
            q, a, supporting = line.split('\t')
            # tokenize the words of question
            q = tokenize(q)
            # Provide all the sub-stories till this question
            substory = [x for x in story if x]
            # A story ends and is appended to global story data-set
            data.append((substory, q, a))
            story.append('')
        else:
            # this line is a sentence of story
            sent = tokenize(line)
            story.append(sent)
    return data
 
def get_stories(f):
    '''
    argument: filename
    returns list of all stories in the argument data-set file
    '''
    # read the data file and parse 10k stories
    data = parse_stories(f.readlines())
    # lambda func to flatten the list of sentences into one list
    flatten = lambda data: functools.reduce(lambda x, y: x + y, data)
    # creating list of tuples for each story
    data = [(flatten(story), q, answer) for story, q, answer in data]
    return data

def softmax(x):
    ex = np.exp(x - np.max(x))
    return ex / ex.sum()

def lj(j, J, k, d): 
    return (1-j/J)-(k/d)*(1-2*j/J)

if __name__=="__main__":
    # load input data
    #data = open('input1.txt', 'r').read() # input text file: sentences separated by .

    # hyperparameters
    d_embed = 0 # embedding dimension
    lr = 1.8 # learning rate
    
    # input to memory embedding mi = A * xi + tA
        #stoplist = set('for a of the and to in'.split())
    #stoplist = []
    #doc_raw = 'Mary moved to the bathroom. John went to the hallway.'
    #query_raw = 'Where is Mary?'
    #document = doc_raw.lower().split('.')
    #document.append(query_raw.lower())
    #senttoken = [ [word for word in sentence.lower().split(' ') if word not in stoplist] for sentence in document ]
    #dictionary = corpora.Dictionary(senttoken)
    #print(dictionary.token2id)
    
    stories = get_stories(open('qa1_test_short.txt', 'r'))
    dct = corpora.Dictionary()
    n_memories = 0
    for stry in stories:
        dct.add_documents([[s for s in stry[0]], stry[1], [stry[2]]])
        n_memories += stry[0].count('.')
    print(dct.token2id)
    #print(stories)
    
    #document.pop(len(document)-1) # query at the end of document
    d_embed = len(dct)
    #voc = 0 # size of vocabulary
    #for d in document:
        #print(d)
        #if len(d) == 0:
            #document.remove(d)
        #voc = max(voc, len(d.split()))
    voc = d_embed

    # initiate weigth matrices
    A = np.random.randn(d_embed, voc)*0.01 # input to memory embedding
    tA = np.random.randn(d_embed)*0.01 # temporal encoding A
    B = np.random.randn(d_embed, voc)*0.01 # query embedding
    tB = np.random.randn(d_embed)*0.01 # temporal encoding B
    C = np.random.randn(d_embed, voc)*0.01 # output to memory embedding
    tC = np.random.randn(d_embed)*0.01 # temporal encoding C
    W = np.random.randn(voc, d_embed)*0.01 # final weight matrix
    
    # memory for Adagrad
    mA = np.zeros_like(A)
    mtA = np.zeros_like(tA)
    mB = np.zeros_like(B)
    mtB = np.zeros_like(tB)
    mC = np.zeros_like(C)
    mtC = np.zeros_like(tC)
    mW = np.zeros_like(W)

    x = np.zeros((n_memories, voc))
    #for i in range(n_memories):
        #thissent = document[i].lower().split()
        #for j in range(len(thissent)):
            #x[i][j] = dct.token2id[thissent[j]]
    i = 0
    for stry in stories:
        thisline = stry[0].split('.')
        for l in range(len(thisline)):
            thissent = thisline.split()
            for j in range(len(thissent)):
                x[i][j] = dct.token2id[thissent[j]]
                i += 1
         
    for iterctr in range(1000):

        # forward pass
        
        # embedding simple: m_i = A_ij * x_ij + T_A_j
        m = np.zeros((n_memories, d_embed))
        for i in range(n_memories):
            m[i] = np.dot(A, x[i].T) + tA # simple embedding
            #m[i][j] = lj(j,len(document[i]),j,d_embed) * A[i][j] * x[i][j] + tA[j] # with positional encoding

        # query embedding u = B * q + tB
        q = np.zeros(voc)
        u = np.zeros(d_embed)
        thissent = query_raw.lower().split()
        for j in range(len(thissent)):
            q[j] = dct.token2id[thissent[j]]
        u = np.dot(B, q) + tB

        # match of query with memory p = softmax(u * mi) for all i
        p = np.zeros((n_memories, d_embed))
        p = softmax(np.dot(u, m.T))

        # output corresponding to input xi: ci = C * xi + tC
        c = np.zeros((n_memories, d_embed))
        for i in range(n_memories):
            c[i] = np.dot(C, x[i].T) + tC

        # response vector from memory o = sum pi * ci
        o = np.zeros(d_embed)
        o = np.dot(p.T, c)

        # predicted label a = softmax( W * (o + u))
        a_predict = softmax(np.dot(W, (o + u)))
        #print(a_predict)

        # backpropagation

        dA = np.zeros_like(A)
        dB = np.zeros_like(B)
        dC = np.zeros_like(C)
        dW = np.zeros_like(W)
        dtA = np.zeros_like(tA)
        dtB = np.zeros_like(tB)
        dtC = np.zeros_like(tC)

        truth = np.zeros_like(tA)
        truth[1] = 1 # answer
        dy = a_predict - truth
        # dA = dy a_predict * (1-a_predict) W sumi p[i] (1-p[i]) ( u.T * 1A * x[i]) c[i]
        #print('V: %d' % (voc))
        #print('d: %d', (d_embed))
        ABCunit = np.pad(np.identity(voc), ((0,d_embed-voc),(0,0)), 'constant', constant_values=(0))
        Wunit = np.pad(np.identity(voc), ((0,0), (0,d_embed-voc)), 'constant', constant_values=(0))
        tunit = np.ones_like(tA)

        dEAtmp = 0.
        for i in range(n_memories):
            dEAtmp += p[i]*(1.-p[i])*np.dot(np.dot(u, np.dot(ABCunit, x[i].T)), c[i])
        dEAtmp = W * dEAtmp
        dA = (np.dot(dy, a_predict * (1-a_predict)) * dEAtmp).T
        #print(dA)

        # dB = dy a_predict * (1-a_predict) W ( sumi p[i] (1-p[i]) ((1B q).T m[i]) c[i] + 1B q)
        dEAtmp = 0.
        for i in range(n_memories):
            dEAtmp += p[i]*(1.-p[i])*np.dot(np.dot(np.dot(ABCunit, q), m[i]), c[i])
        dEAtmp = W * dEAtmp
        dB = (np.dot(dy, a_predict*(1-a_predict)) * dEAtmp).T
        #print(dB)

        # dC = dy a_predict * (1-a_predict) W sumi p[i] 1C x[i]
        dEAtmp = 0.
        for i in range(n_memories):
            dEAtmp += p[i]*np.dot(ABCunit, x[i])
        dEAtmp = W * dEAtmp
        dC = (np.dot(dy, a_predict*(1-a_predict)) * dEAtmp).T
        #print(dC)

        # dW = dy a_predict * (1-a_predict) (o + u)
        dW = (np.dot(dy, a_predict*(1-a_predict)) * Wunit * (o + u))
        #print(dW)   

        # dtA = dy a_predict * (1-a_predict) W ( sumi p[i] (1-p[i]) (u.T 1tA) c[i])
        dEAtmp = 0.
        for i in range(n_memories):
            dEAtmp += p[i]*(1.-p[i])*np.dot(np.dot(u.T, tunit), c[i])
        dtA = (np.dot(dy * a_predict * (1-a_predict), W) * dEAtmp).T
        #print(dtA)

        # dtB = dy a_predict * (1-a_predict) W ( sumi p[i] (1-p[i]) ((1tB).T m[i] c[i]) + 1tB)
        dEAtmp = 0.
        for i in range(n_memories):
            dEAtmp += p[i]*(1.-p[i])*(np.dot(np.dot(tunit.T, m[i]), c[i]) + tunit)
        dtB = (np.dot(dy * a_predict * (1-a_predict), W) * dEAtmp).T
        #print(dtB)

        # dtC = dy a_predict * (1-a_predict) W ( sumi p[i] 1tC )
        dEAtmp = 0.
        for i in range(n_memories):
            dEAtmp += p[i]*tunit
        dtC = (np.dot(dy * a_predict * (1-a_predict), W) * dEAtmp).T
        #print(dtC)

        # maybe clip ?

        # update weights with Adagrad
        for weights, dweights, memwghts in zip([A,B,C,W,tA,tB,tC], [dA,dB,dC,dW,dtA,dtB,dtC], [mA,mB,mC,mW,mtA,mtB,mtC]):
            memwghts += dweights * dweights
            weights += -lr * dweights / np.sqrt(memwghts + 1.e-8)

        #print(A)
    print(a_predict)
    #print(np.argmax(a_predict))
    print(dct[np.argmax(a_predict)])