In [1]:
# end-to-end memory RNN
import numpy as np

nruns = 10000
noutput = 1000

# data I/O
data = open('input0.txt', 'r', encoding="utf8").read() # should be simple plain text file
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1

# model parameters
Wxu = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Wuu = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Wuo = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bu = np.zeros((hidden_size, 1)) # hidden bias
bo = np.zeros((vocab_size, 1)) # output bias

def softmax(x):
    ex = np.exp(x - np.max(x))
    return ex / ex.sum()

def lossFun(inputs, targets, hprev):
    """
    inputs,targets are both list of integers.
    hprev is Hx1 array of initial hidden state
    returns the loss, gradients on model parameters, and last hidden state
    """
    xs, us, os, ps = {}, {}, {}, {}
    mi, pi, ci = {}, {}, {}
    us[-1] = np.copy(uprev)
    loss = 0
    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
        xs[t][inputs[t]] = 1
        us[t] = np.tanh(np.dot(Wxu, xs[t]) + np.dot(Wuu, us[t-1]) + bu) # hidden state
        # pi=softmax(u*mi)
        mi[t] = np.dot(Wxu, xs[t]) + bu
        pi[t] = softmax(np.dot(us[t].T, mi[t]))
        #ys[t] = np.dot(Wuy, us[t]) + by # unnormalized log probabilities for next chars
        # o=pi*ci
        ci[t] = np.dot(Wuo, us[t]) + bo
        os[t] = pi[t] * ci[t]
        #ps[t] = softmax(os[t]) # probabilities for next chars (=softmax)
        ps[t] = softmax(os[t])
        loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)
    # backward pass: compute gradients going backwards
    dWxu, dWuu, dWuo = np.zeros_like(Wxu), np.zeros_like(Wuu), np.zeros_like(Wuo)
    dbu, dbo = np.zeros_like(bu), np.zeros_like(bo)
    #dmi, dpi = np.zeros_like(mi), np.zeros_like(pi)
    dunext = np.zeros_like(us[0])
    for t in reversed(range(len(inputs))):
        do = np.copy(ps[t])
        do[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
        
        # pi=softmax(u*mi)
        #dmi[t] = np.dot(Wxu, xs[t]) + bu
        #dpi[t] = softmax(np.dot(us[t].T, mi[t]))
        
        #dWuo += np.dot(do, us[t].T)
        dWuo += np.dot(do, np.dot(pi[t], us[t].T))
        dbo += np.dot(do, pi[t])
        
        #du = np.dot(Wuo.T, do) + dunext # backprop into u
        #print('shape ci: %s' % (str(np.shape(ci[t]))))
        du = np.multiply(ci[t].T, (1-pi[t])*mi[t])
        du += Wuo.T
        duraw = (1 - us[t] * us[t]) # tanh'=1-tanh^2
        #dbu += np.dot(do, pi[t]*np.dot(duraw, (np.multiply(ci[t].T, (1-pi[t])*mi[t]) + Wuo)))
        dbu += np.dot(pi[t]*np.multiply(du, duraw), do)
        
        dWuu += duraw*us[t-1]*np.dot(du, do) 
        du = np.dot(pi[t]*du, do)
        #print(np.shape(duraw*pi[t]*((1-pi[t])*np.multiply(ci[t], mi[t].T) + Wuo).T*xs[t].T*do.T))
        #dWxu += duraw*pi[t]*((1-pi[t])*np.multiply(ci[t], mi[t].T) + Wuo).T*xs[t].T*do.T
        #print(np.shape(pi[t]*(ci[t].T*(1-pi[t])*(np.dot(mi[t].T, duraw) + us[t]) + Wuo.T*duraw)*xs[t].T*do.T))
        dWxu += pi[t]*(ci[t].T*(1-pi[t])*(np.dot(mi[t].T, duraw) + us[t]) + Wuo.T*duraw)*xs[t].T*do.T
        
        #dunext = np.dot(Wuu.T, duraw)
    
    for dparam in [dWxu, dWuu, dWuo, dbu, dbo]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
        clip_diag(dparam, 1) # clip weight diagonals a bit more
    return loss, dWxu, dWuu, dWuo, dbu, dbo, us[len(inputs)-1]

def sample(u, seed_ix, n):
    """ 
    sample a sequence of integers from the model 
    h is memory state, seed_ix is seed letter for first time step
    """
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    for t in range(n):
        u = np.tanh(np.dot(Wxu, x) + np.dot(Wuu, u) + bu)
        mmi = np.dot(Wxu, x) + bu
        ppi = softmax(np.dot(u.T, mmi))
        #o = np.dot(Wuo, u) + bo
        c = np.dot(Wuo, u) + bo
        o = ppi * c
        p = softmax(o)
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes

def clip_diag(w, cval):
    n = min(np.shape(w))
    w[range(n), range(n)] = np.clip(np.diagonal(w), -cval, cval)

n, p = 0, 0
mWxu, mWuu, mWuo = np.zeros_like(Wxu), np.zeros_like(Wuu), np.zeros_like(Wuo)
mbu, mbo = np.zeros_like(bu), np.zeros_like(bo) # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
#while True:
while n<nruns:
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    if p+seq_length+1 >= len(data) or n == 0: 
        uprev = np.zeros((hidden_size,1)) # reset RNN memory
        p = 0 # go from start of data
    inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
    targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

    # sample from the model now and then
    if n % noutput == 0 or n == nruns-1:
        sample_ix = sample(uprev, inputs[0], 1500)
        txt = ''.join(ix_to_char[ix] for ix in sample_ix)
        print('----\n %s \n----' % (txt, ))

    # forward seq_length characters through the net and fetch gradient
    loss, dWxu, dWuu, dWuo, dbu, dbo, uprev = lossFun(inputs, targets, uprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    if n % noutput == 0:
        print('iter %d, loss: %f' % (n, smooth_loss)) # print progress

    # perform parameter update with Adagrad
    for param, dparam, mem in zip([Wxu, Wuu, Wuo, bu, bo], 
                                [dWxu, dWuu, dWuo, dbu, dbo], 
                                [mWxu, mWuu, mWuo, mbu, mbo]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

    p += seq_length # move data pointer
    n += 1 # iteration counter 

data has 4648 characters, 15 unique.
----
 hcgnnhkgblej aaddnmhhhljb gilfafhcfdfhaba mh ih gnblheghmi ekjmncbgemjlencfl hchdmacjml dlfefaahfkehhleabhmc aea fclabdh ndkjakngjbddnchgcik im amjggim ijj fmiadmfdfidgfgigmmekaiaihgja cfehglcebkldinjallheda iadaccfiefddlaidelineeggdinnbebabjjlklhgdgefdaklecfljiij igndcnhedcfgdhfn a elba hlljhchi nmnkcik  halbjflhbnncmkjc m hjnn alnan nkaebbgfmakf bfkcagjcnknjmjld   mngedhnmac adhgjccj dfe fhfankjeacemcjdchheliaikkmcehbmfkaembjmdhmcc jehfaafjldi licahamfmfiagnjhfegdddbcchflnhlkeifncn cilamlgngkabkfjnaefg agmejhamnjmc mjeljfb ihgkbckdfichaebkabamfdjdfknabfcbmmfkkkbgfb nm bdhbgdcjnndkihamnlbgkkjikaemcnhgmcckfjmi hnhhccalghjbjadkefeghikclemnjgnjfmfaddhnbnmffdceiidad idbgejmcljeijlcccbhadjjmmbmdcgbedacfejmejfjliggenjmggcddgcehaehlajclcjgkkcj lhhelknhnimklagemkkd dikm fni dmhbhakgaei acgegeifefacfedckcdjjjgdnihdgleb hnclbgehlkmbhhdjcb jgnbfhffb bddhhkilifinffgkkgjldlfaiildmejefbfeifgnfcnedngl ane eakmbkm efifdbiimkgdddiaabkfnfi gbaeknkklghbhcchchnj

----
 kk lll mmm gggg mmm ddd ggg ggg hhh lll mmm lll lll ggg gggnn aaa bbb jjj ggg hhh iii iii jl aaa bbb iii bbb bbb bb kk eee kkk aaa bbbb bmm hhh fff lll hhh kkk kkk nnn lkkk lll mmm nnn cg mmmff nnn aaa bbb ccc iii kk hhh jjjj ffff jjjj dd mmm gggl hhh ccc ggg hhh ddd eee fff ggg lll mmmm kkk eee ccc ggg mmm nnn a ggg hhh jj nnn hhh iii iii bbb fff kkk nnn kkk kkk hhh kkk gggg nnn ddd llll ll mmm kkk dddd jjj lll hhh ccc ccc lll hhh kkk nn lll mmm lll lhhh bbb ccc ee ll mmm ddd dmm iii cccc ccc kkk nnn ddd eee bbb ccc ddd ddd eee fff ddd nnn dd hha ccc ddd gggg mmm jjj eee kkk ggg mmm eee ccc kk mmm ghhh jjj cccc fff kkd gggg nnn ccc nn nnn kkk gggg mmm nnn ghhh iii ff ghh bbb bbb iii iii bbb ccc hhh fff hhh jjj eee ff lllee iii bbb iii ii ccc ddd cc aaa ffee jjj nnn l nnn eee jjk jjj eee fff gggg hhh jjj hhh ccc aaa iii iii bbb jjj ccc nnn ll eee ccc cc hhh iii aaa iii kkk ggee ee kkk eee fff hhh jjj nnn ee gghh iii iii ff mmmm hhh cc hhh kkk ll mmmff ccc kkk eee jjjj jjj ddd nnn