In [4]:
# simple feed forward network with ReLU example
import numpy as np

if __name__=='__main__':
    # parameters
    inp_size = 10 # input size
    etha = 0.1 # learning rate
    niter = 100

    # input
    x = np.zeros((1, inp_size)) # input
    x = [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]]

    # model parameters
    W1 = np.random.randn(inp_size, inp_size)*0.01 # input to hidden
    W2 = np.random.randn(inp_size, inp_size)*0.01 # hidden to output
    b1 = np.zeros((1, inp_size)) # inp-hidden bias
    b2 = np.zeros((1, inp_size)) # hidden-out bias
    
    for ictr in range(niter):

        # forward pass
        h1 = np.dot(x, W1) + b1
        h1 = np.maximum(h1, 0, h1) # ReLU
        o2 = np.dot(h1, W2) + b2
        #print(o2)

        # backward pass
        y = [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]]
        h1 = np.dot(x, W1) + b1
        dW1 = - etha * (o2 - y) * np.maximum(h1, 0, h1)
        dW2 = dW1 * ((h1 > 0) * 1.) * x
        
        W1 += dW1
        W2 += dW2
        
    print(dW1)
    print(dW2)
    # forward pass
    h1 = np.dot(x, W1) + b1
    h1 = np.maximum(h1, 0, h1) # ReLU
    o2 = np.dot(h1, W2) + b2
    print(o2)

[[-0.00000000e+00  0.00000000e+00  0.00000000e+00 -0.00000000e+00
   1.81831121e-07 -0.00000000e+00  0.00000000e+00  1.12753895e-08
  -0.00000000e+00  0.00000000e+00]]
[[-0.  0.  0. -0.  0. -0.  0.  0. -0.  0.]]
[[ 8.04376336e-05 -5.65068638e-05 -7.77747678e-05  1.29369767e-04
  -1.76518582e-04  7.90246546e-05 -2.17529744e-04 -1.93975479e-05
   4.26013069e-05 -6.47007077e-05]]


In [236]:
# key-value memory network

import numpy as np
from gensim import corpora
import math

def softmax(x):
    ex = np.exp(x - np.max(x))
    return ex / ex.sum()

def lj(j, J, k, d): 
    return (1-j/J)-(k/d)*(1-2*j/J)

if __name__=="__main__":
    # load input data
    #data = open('input1.txt', 'r').read() # input text file: sentences separated by .

    # hyperparameters
    lr = 0.1 # learning rate
    
    # input to memory embedding mi = A * xi + tA
        #stoplist = set('for a of the and to in'.split())
    stoplist = []
    doc_raw = 'Mary moved to the bathroom. John went to the hallway.'
    
    # create key-value pairs
    
    # embed query
    query_raw = 'Where is Mary'
    document = doc_raw.lower().split('.')
    document.append(query_raw.lower())
    #senttoken = [ [word for word in sentence.lower().split(' ') if word not in stoplist] for sentence in document ]
    senttoken = []
    for idoc in range(len(document)):
        thissen = document[idoc].lower().split(' ')
        tokendoc = []
        for idx, thiswrd in enumerate(thissen):
            tokendoc.append(thiswrd + '_' + str(idoc))
        senttoken.append(tokendoc)
    dictionary = corpora.Dictionary(senttoken)
    print(dictionary.token2id)
    document.pop(len(document)-1) # query at the end of document
    d_embed = len(dictionary) # embedding dimension
    #voc = 0 # size of vocabulary
    #for d in document:
        #print(d)
        #if len(d) == 0:
            #document.remove(d)
        #voc = max(voc, len(d.split()))
    voc = d_embed
    n_memories = len(document)
    #print(voc)
    #print(document)

    # initiate weigth matrices
    A = np.random.randn(d_embed, voc)*0.01 # input to memory embedding = key-value and query embeddings
    B = np.random.randn(d_embed, voc)*0.01 # query embedding = for y-embeddings
    R1 = np.random.randn(voc, d_embed)*0.01 # final weight matrix = R1 for hop-to-hop query updates
    
    # memory for Adagrad
    mA = np.zeros_like(A)
    mB = np.zeros_like(B)
    mR1 = np.zeros_like(R1)

    phi_k = np.zeros((n_memories, voc)) # keys
    phi_v = np.zeros((n_memories, voc)) # values
    phi_y = np.zeros(voc) # candidates
    for i in range(n_memories):
        for j in range(len(senttoken[i])):
            phi_k[i][j] = dictionary.token2id[senttoken[i][j]]
            phi_v[i][j] = dictionary.token2id[senttoken[i][j]] # for now the same as keys
    
    for i in range(len(dictionary)):
        phi_y[i] = i
         
    for iterctr in range(500):

        # forward pass
        
        # embedding simple: m_i = A_ij * x_ij
        key = np.zeros((n_memories, d_embed))
        for i in range(n_memories):
            key[i] = np.dot(A, phi_k[i].T) # simple embedding
            #m[i][j] = lj(j,len(document[i]),j,d_embed) * A[i][j] * x[i][j] # with positional encoding

        # query embedding u = B * q
        phi_x = np.zeros(voc)
        qj1 = np.zeros(d_embed)
        thissent = senttoken[1]
        for j in range(len(thissent)):
            phi_x[j] = dictionary.token2id[thissent[j]]
        qj1 = np.dot(A, phi_x)

        # match of query with memory p = softmax(u * mi) for all i
        pk = np.zeros((n_memories, d_embed))
        pk = softmax(np.dot(qj1, np.dot(A, key.T)))

        # output corresponding to input xi: ci = C * xi
        val = np.zeros((n_memories, d_embed))
        for i in range(n_memories):
            val[i] = np.dot(A, phi_v[i].T)

        # response vector from memory o = sum pi * ci
        o = np.zeros(d_embed)
        o = np.dot(pk.T, val)
        
        # 2nd hop
        qj1 = np.dot(R1, (qj1 + o))
        pk = softmax(np.dot(qj1, np.dot(A, key.T)))
        o = np.dot(pk.T, val)
        qj1 = np.dot(R1, (qj1 + o))
        
        # 3rd hop
        qj1 = np.dot(R1, (qj1 + o))
        pk = softmax(np.dot(qj1, np.dot(A, key.T)))
        o = np.dot(pk.T, val)
        qj1 = np.dot(R1, (qj1 + o))

        # predicted label a = softmax( R1 * (o + u) * B * phi_y)
        #a_predict = softmax(np.dot(R1, (qj1 + o)))
        #a_predict = softmax(qj1 * np.dot(B, phi_y))
        a_predict = softmax(qj1 * np.dot(A, phi_y)) # only using A
        #print(a_predict)

        # backpropagation

        dA = np.zeros_like(A)
        dB = np.zeros_like(B)
        dR1 = np.zeros_like(R1)

        truth = np.zeros(voc)
        truth[0] = 1 # answer
        dy = a_predict - truth
        #print(dy)
        #print('V: %d' % (voc))
        #print('d: %d', (d_embed))
        ABunit = np.pad(np.identity(voc), ((0,d_embed-voc),(0,0)), 'constant', constant_values=(0))
        R1unit = np.pad(np.identity(voc), ((0,0), (0,d_embed-voc)), 'constant', constant_values=(0))

        # dA = dy a_predict * (1-a_predict) R1 (phi_x + sumi ( p[i] (1-p[i]) ( phi_x A phi_K + A phi_x phi_K ) A phi_V + pki phi_V)
        dEAtmp = 0.
        for i in range(n_memories):
            dEAtmp += pk[i]*(1.-pk[i]) * np.dot(np.dot(qj1, np.dot(ABunit, phi_k[i].T)), val[i])
        dEAtmp = R1 * dEAtmp
        dA = (np.dot(dy, a_predict * (1-a_predict)) * dEAtmp).T
        #print(dA)

        # dB = dy a_predict * (1-a_predict) q phi_y
        dEAtmp = 0.
        for i in range(n_memories):
            dEAtmp += pk[i]*(1.-pk[i])*np.dot(np.dot(ABunit, qj1), phi_y[i])
        dEAtmp = R1 * dEAtmp
        dB = (np.dot(dy, a_predict*(1-a_predict)) * dEAtmp).T
        #print(dB)

        # dR1 = dy a_predict * (1-a_predict) (q + o) B phi_y
        #print(np.shape(np.dot((qj1 + o), np.dot(B, phi_y.T))))
        dR1 = np.dot(dy, a_predict*(1-a_predict)) * R1unit * np.dot((qj1 + o), np.dot(B, phi_y.T))
        #print(dR1)   

        # maybe clip ?
        #for dweights in [dA,dB,dR1]:
            #np.clip(dweights, -5., 5., out=dweights) # exploding gradients (but seems well-behaved enough)

        # update weights with Adagrad
        for weights, dweights, memwghts in zip([A,B,R1], [dA,dB,dR1], [mA,mB,mR1]):
            memwghts += dweights * dweights
            #weights += -lr * dweights / np.sqrt(memwghts + 1.e-8)
            weights += -lr * dweights

        #print(A)
    print(a_predict)
    #print(np.argmax(a_predict))
    print(dictionary[np.argmax(a_predict)])

{'bathroom_0': 0, 'mary_0': 1, 'moved_0': 2, 'the_0': 3, 'to_0': 4, '_1': 5, 'hallway_1': 6, 'john_1': 7, 'the_1': 8, 'to_1': 9, 'went_1': 10, '_2': 11, 'is_3': 12, 'mary_3': 13, 'where_3': 14}
[0.06674105 0.06665684 0.06668234 0.06673294 0.0667271  0.06646554
 0.06663181 0.06670656 0.0667322  0.06662636 0.06669416 0.06661375
 0.06671924 0.06667381 0.06659631]
bathroom_0


In [3]:
# prepare text file as corpus (lower case, remove stopwords) and tokenize
import os
import re # regex
from gensim.parsing.preprocessing import preprocess_string

def read1k():
    return f.read(1024)

def process_data(chunk, text):
    text.append(str(chunk)) # 'utf8' codec can't decode byte 0xc3

def rmsword(corpus, stopwords): # remove stopwords from corpus
    i = 0
    for elem in corpus:
        for sword in stopwords:
            if elem == sword:
                while True:
                    try:
                        corpus.remove(elem) # this throws an error if elem not in corpus (might have been removed already)
                        i += 1
                    except:
                        break
    return i # returns number of stopwords removed

def chunks(l, n): # Yield successive n-sized chunks from list l
    for i in range(0, len(l), n):
        yield l[i:i + n] # returns a generator

def chunksep(l, s): # Yield successive chunks from list l separated by s
    g = []
    for el in l:
        if el == s:
            yield g
            g = []
        g.append(el)
    yield g

if __name__=="__main__":
    text = 'this is a test with a lot of ambiguity.'
    #corpus = preprocess_string(' '.join(text)) # requires string
    corpus = preprocess_string(text)
    print(corpus)

['test', 'lot', 'ambigu']


In [1]:
# key-value memories: first embedding tests
from gensim import corpora

def docstr2lst(text): # text string with sentences separated by ., returns list of lists of sentences
    memraw = []
    for mem in text.split('.'):
        memraw.append(mem.split(' '))
    return memraw

def text2bow(memraw, memdict): # raw text to bow (maps each token to its id, takes a list of lists of sentence words)
    membow = []
    for mem in memraw:
        memline = []
        for memw in mem:
            memline.append(memdict.token2id[memw])
        membow.append(memline)
    return membow

def windowlvlstr(text, lenW): # return key=entire window, value=center word, string version
    if lenW % 2 == 0 or len(text) <= lenW:
        return ([],[])
    textl = text.split(' ')
    retkeys = []
    retvals = []
    lenW2 = int((lenW-1)/2)
    for ictr in range(lenW2, len(textl)-lenW2):
        thiskey = []
        retvals.append(textl[ictr])
        for ikey in range(ictr-lenW2, ictr+lenW2+1):
            thiskey.append(textl[ikey])
        retkeys.append(thiskey)
    return (retkeys, retvals)

def windowlvl(text, lenW): # return key=entire window, value=center word, BOW version (text a list of ids)
    if lenW % 2 == 0 or len(text) <= lenW:
        return ([],[])
    retkeys = []
    retvals = []
    lenW2 = int((lenW-1)/2)
    for ictr in range(lenW2, len(text)-lenW2):
        thiskey = []
        retvals.append(text[ictr]) # center encoding: would need to add a different dict here
        for ikey in range(ictr-lenW2, ictr+lenW2+1):
            thiskey.append(text[ikey])
        retkeys.append(thiskey)
    return (retkeys, retvals)

# for window+center encoding: add a step for the center word:
# translate the center back to the original with dic 0, then with dic 1 to the center encoding
def windowclvl(text, lenW, memdict, cdict): # return key=entire window, value=center word, BOW version with center
    if lenW % 2 == 0 or len(text) <= lenW:
        return ([],[])
    retkeys = []
    retvals = []
    lenW2 = int((lenW-1)/2)
    for ictr in range(lenW2, len(text)-lenW2):
        thiskey = []
        retvals.append(cdict.token2id[memdict[text[ictr]]]) # center encoding: different dict for ctr word
        for ikey in range(ictr-lenW2, ictr+lenW2+1):
            thiskey.append(text[ikey])
        retkeys.append(thiskey)
    return (retkeys, retvals)

if __name__=="__main__":
    text = 'this is a test with a lot of ambiguity. docs are split by periods.'
    memraw = docstr2lst(text)
    memdict = corpora.Dictionary(memraw)
    #print(memdict.token2id)
    #print(memdict[1])
    #print(windowlvlstr(text, 3))
    #print(windowlvlstr(text, 5))
    # text to bow
    membow = text2bow(memraw, memdict)
    print(membow)
    print(windowlvl(membow[0], 3))
    print(windowclvl(membow[0], 3, memdict, memdict)) # test with the same for now - should be the same result as windowlvl

[[6, 2, 0, 5, 7, 0, 3, 4, 1], [8, 11, 9, 13, 10, 12], [8]]
([[6, 2, 0], [2, 0, 5], [0, 5, 7], [5, 7, 0], [7, 0, 3], [0, 3, 4], [3, 4, 1]], [2, 0, 5, 7, 0, 3, 4])
([[6, 2, 0], [2, 0, 5], [0, 5, 7], [5, 7, 0], [7, 0, 3], [0, 3, 4], [3, 4, 1]], [2, 0, 5, 7, 0, 3, 4])
