In [4]:
# from https://gist.github.com/karpathy/d4dee566867f8291f086
# diagonal constraints on weight matrices

"""
Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
BSD License
"""
import numpy as np

nruns = 1000
noutput = 100

# data I/O
data = open('input0.txt', 'r').read() # should be simple plain text file
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1

# model parameters
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

def lossFun(inputs, targets, hprev):
    """
    inputs,targets are both list of integers.
    hprev is Hx1 array of initial hidden state
    returns the loss, gradients on model parameters, and last hidden state
    """
    xs, hs, ys, ps = {}, {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0
    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
        xs[t][inputs[t]] = 1
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
        ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars
        loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)
    # backward pass: compute gradients going backwards
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(hs[0])
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
        dWhy += np.dot(dy, hs[t].T)
        dby += dy
        dh = np.dot(Why.T, dy) + dhnext # backprop into h
        dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
        dbh += dhraw
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T)
        dhnext = np.dot(Whh.T, dhraw)
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
        clip_diag(dparam, 1)
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

def sample(h, seed_ix, n):
    """ 
    sample a sequence of integers from the model 
    h is memory state, seed_ix is seed letter for first time step
    """
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    for t in range(n):
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by
        p = np.exp(y) / np.sum(np.exp(y))
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes

def clip_diag(w, cval):
    n = min(np.shape(w))
    w[range(n), range(n)] = np.clip(np.diagonal(w), -cval, cval)

n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
#while True:
while n<nruns:
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    if p+seq_length+1 >= len(data) or n == 0: 
        hprev = np.zeros((hidden_size,1)) # reset RNN memory
        p = 0 # go from start of data
    inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
    targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

    # sample from the model now and then
    if n % noutput == 0:
        sample_ix = sample(hprev, inputs[0], 500)
        txt = ''.join(ix_to_char[ix] for ix in sample_ix)
        print('----\n %s \n----' % (txt, ))

    # forward seq_length characters through the net and fetch gradient
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    if n % noutput == 0:
        print('iter %d, loss: %f' % (n, smooth_loss)) # print progress
  
    # perform parameter update with Adagrad
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], 
                                    [dWxh, dWhh, dWhy, dbh, dby], 
                                    [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

    p += seq_length # move data pointer
    n += 1 # iteration counter 

data has 4648 characters, 15 unique.
----
 edf  lmdmemhaj ldailmfmngcffhigehfj  f nkhaajahijjgkbljflkikgkmedahkjccmhcjfha njjbbhgdnjlfhginknhgbfkneig lcjjeahid gcdhnibdh hgjdkdflljjcmkaeebkgiikc cfbfildjlilajackndd k mngknbeikkf fahclaekadefemmkjjbdimdcdacgccdbamcbhlkl makh ghbjcnihkhelmmakchbhailebaeidkilhknbacccfccgmibnahjmbdhm hfihkjfiefhknmhblahkcanhnniafkelgcadbijbdbbalecdjbahahfikijifgjemlhbgcmelgchkm lgig djggjgccnjnnnnmae gkijkhdnj glndkh ljen imjiggbfjnddhbecbfihbmjjmje in li jcdnl ahbaghjdkecjbfncgbdkifnkna ickbk bael ninlhgm n 
----
iter 0, loss: 67.701258
----
  al ejhg  gabagk f lj fa njlaehkh kamkmea iabd mjige ie j fjdnne ahdmchdi mcbjinhd  mhbk fn e eefackladihhgj hm  d gb jc  ihmke fflklkg dilha la gd  h  d emjknk d i jijj debdkfj iidfih eldg hakdikalhmj amekbhcg jj gjad mn  nb iacajbgfh j bhigf  gjngdhkbeekme hdgddihj  ghae ae  jd jhkgjbj badm akn dd  nndfigigea ggnecgijel iaanlhmhmbhhccb g icnbddhfe nibfn gd gm gidahegmmem imegfig njj ljbnl hh gl k mbj namhjl jnc  da

In [29]:
# RNN with context features (Mikolov 2015), diagonal constraints on weight matrices
# based on Karpathys RNN

import numpy as np

nruns = 1000
noutput = 100

# data I/O
data = open('input0.txt', 'r').read() # should be simple plain text file
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
context_size = 30 # size of hidden layer of neurons
alpha = 0.3
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1

# model parameters
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

Wxs = np.random.randn(context_size, vocab_size)*0.01 # input to context: B
Wss = np.random.randn(context_size, context_size)*0.01 # context to context: alpha
Wsh = np.random.randn(hidden_size, context_size)*0.01 # context to hidden: P
Wsy = np.random.randn(vocab_size, context_size)*0.01 # context to output: V
bs = np.zeros((context_size, 1)) # context bias
bys = np.zeros((vocab_size, 1)) # output context bias

def lossFun(inputs, targets, hprev, sprev):
    xs, hs, ys, ps = {}, {}, {}, {}
    ss = {} # context
    hs[-1] = np.copy(hprev)
    ss[-1] = np.copy(sprev)
    loss = 0
    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
        xs[t][inputs[t]] = 1
        #hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
        #ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
        
        #ss[t] = 0
        ss[t] = np.tanh(np.dot(Wxs, xs[t]) + np.dot(Wss, ss[t-1]) + bs) # context state
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + np.dot(Wsh, ss[t]) + bh) # hidden state
        ys[t] = np.dot(Why, hs[t]) + np.dot(Wsy, ss[t]) + by # unnormalized log probabilities for next chars
        
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars
        loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)        
        
    # backward pass: compute gradients going backwards
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dWxs, dWss, dWsh, dWsy = np.zeros_like(Wxs), np.zeros_like(Wss), np.zeros_like(Wsh), np.zeros_like(Wsy)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dbs, dbys = np.zeros_like(bs), np.zeros_like(bys)
    dhnext = np.zeros_like(hs[0])
    dsnext = np.zeros_like(ss[0])
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
        dWhy += np.dot(dy, hs[t].T)
        dWsy += np.dot(dy, ss[t].T)
        dby += dy
        dh = np.dot(Why.T, dy) + dhnext # backprop into h
        ds = np.dot(Wsy.T, dy) + dsnext # backprop into h
        dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
        dsraw = (1 - ss[t] * ss[t]) * ds # backprop through tanh nonlinearity
        dbh += dhraw
        dbs += dsraw
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T)
        dWxs += np.dot(dsraw, xs[t].T)
        dWss += np.dot(dsraw, ss[t-1].T)
        dhnext = np.dot(Whh.T, dhraw)
        dsnext = np.dot(Wss.T, dsraw)
        
    for dparam in [dWxh, dWhh, dWhy, dbh, dby, dWxs, dWss, dWsh, dWsy, dbs, dbys]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
        clip_diag(dparam, 1)
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1], dWxs, dWss, dWsh, dWsy, dbs, dbys, ss[len(inputs)-1]

def sample(h, s, seed_ix, n):
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    for t in range(n):
        s = np.tanh(np.dot(Wxs, x) + np.dot(Wss, s) + bs) # context state
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + np.dot(Wsh, s) + bh)
        y = np.dot(Why, h) + np.dot(Wsy, s) + by
        p = np.exp(y) / np.sum(np.exp(y))
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes

def clip_diag(w, cval):
    n = min(np.shape(w))
    w[range(n), range(n)] = np.clip(np.diagonal(w), -cval, cval)

n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mWxs, mWss, mWsh, mWsy = np.zeros_like(Wxs), np.zeros_like(Wss), np.zeros_like(Wsh), np.zeros_like(Wsy)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
mbs, mbys = np.zeros_like(bs), np.zeros_like(bys) # memory context bias
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
#while True:
while n<nruns:
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    if p+seq_length+1 >= len(data) or n == 0: 
        hprev = np.zeros((hidden_size, 1)) # reset RNN memory
        sprev = np.zeros((context_size, 1)) # reset context
        p = 0 # go from start of data
    inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
    targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

    # sample from the model now and then
    if n % noutput == 0:
        sample_ix = sample(hprev, sprev, inputs[0], 500)
        txt = ''.join(ix_to_char[ix] for ix in sample_ix)
        print('----\n %s \n----' % (txt, ))

    # forward seq_length characters through the net and fetch gradient
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev, dWxs, dWss, dWsh, dWsy, dbs, dbys, sprev = lossFun(inputs, targets, hprev, sprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    if n % noutput == 0:
        print('iter %d, loss: %f' % (n, smooth_loss)) # print progress
  
    # perform parameter update with Adagrad
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by, Wxs, Wss, Wsh, Wsy, bs, bys], 
                                    [dWxh, dWhh, dWhy, dbh, dby, dWxs, dWss, dWsh, dWsy, dbs, dbys], 
                                    [mWxh, mWhh, mWhy, mbh, mby, mWxs, mWss, mWsh, mWsy, mbs, mbys]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

    p += seq_length # move data pointer
    n += 1 # iteration counter 

data has 4668 characters, 16 unique.
----
 efmexklhfbjmdgmacembhxbgcjnchadgledjejgkacgbnkj dxl ncglxnfeldamk fgeefejbknhjkjekxccbfm ankmkclbklexdxcexmkhagcmgmbbidedgigfmilxlclk xnafxajcikelnfnjgj xxdb kbkndcehmkmahdkjddbgdfk gxgcnenjaadln xjlimggdbjfjgnegla ejjcgkfe jdxn amecnkcfxbkbexinmicliaaiixmlmdn jcncbg ihxi  cxbxnkddnakfbbiikxnxgjnbbgnkcinlfejnd mheegli kddikkf jhiaicekngnignmxji niffxalhg ejbkbclc cjgxbngndblkmhajbjihjdxfglahei gxgl lgjcehmiaicbd mijkjafflnlllanmcxhinaxjbdaenagnjhgfkica fbnchkhgkcjcmefacadlkngdajmdidndlhebffjijli 
----
iter 0, loss: 69.314723
----
 ih elj nnn mba  bh mke fff ddg hgk nhk ejjimjm nnn iil gnn l c bch ddm dcc cgg fbb bb eee  gkh hki lli fnc ccj bbd ecc db  gg  elh eee gghjflm mmi in  fdb ddg bbb ccc ccbca ggbdd eee fff gggghee lla dbh cd ghhn lfi nnc n k naa ccc dgg ehe faf ggk jjh mmm nnc aaa bgg gdf hhh kkn jmm mi  lnd fni baa ena mgg d h g e dff khe maaeaab ccc baf dhh hjk kml lkk aam kdn ici lnj mb   gj kji lin  mm nnn bik ndn naa bbb cbc b g f

In [7]:
# RNN with context features (Mikolov 2015), diagonal constraints on weight matrices
# using tanh as activation
# based on Karpathys RNN

import numpy as np
import math

def softmax(x):
    ex = np.exp(x - np.max(x))
    return ex / ex.sum()

nruns = 10000
noutput = 1000

# data I/O
data = open('input1.txt', 'r').read() # should be simple plain text file
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
context_size = 30 # size of hidden layer of neurons
alpha = 0.3
beta = np.zeros((context_size, 1))
Q = np.zeros((context_size, context_size))
np.fill_diagonal(Q, softmax(beta))
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1

# model parameters
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

Wxs = np.random.randn(context_size, vocab_size)*0.01 # input to context: B
#Wss = np.random.randn(context_size, context_size)*0.01 # context to context: alpha
Wsh = np.random.randn(hidden_size, context_size)*0.01 # context to hidden: P
Wsy = np.random.randn(vocab_size, context_size)*0.01 # context to output: V
bs = np.zeros((context_size, 1)) # context bias
bys = np.zeros((vocab_size, 1)) # output context bias

def lossFun(inputs, targets, hprev, sprev):
    xs, hs, ys, ps = {}, {}, {}, {}
    ss = {} # context
    hs[-1] = np.copy(hprev)
    ss[-1] = np.copy(sprev)
    loss = 0
    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
        xs[t][inputs[t]] = 1
        #hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
        #ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
        
        #ss[t] = 0
        #ss[t] = np.tanh(np.dot(Wxs, xs[t]) + np.dot(Wss, ss[t-1]) + bs) # context state
        #ss[t] = (1-alpha)*np.dot(Wxs, xs[t]) + alpha*ss[t-1] + bs # context state
        ss[t] = np.dot((np.identity(context_size)-Q), np.dot(Wxs, xs[t])) + np.dot(Q, ss[t-1]) + bs # context state
        
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + np.dot(Wsh, ss[t]) + bh) # hidden state
        ys[t] = np.dot(Why, hs[t]) + np.dot(Wsy, ss[t]) + by # unnormalized log probabilities for next chars
        
        #ps[t] = np.exp(ys[t]) / (np.sum(np.exp(ys[t])) + 0.01) # probabilities for next chars
        ps[t] = softmax(ys[t])
        loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)        
        
    # backward pass: compute gradients going backwards
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dWxs, dWsh, dWsy = np.zeros_like(Wxs), np.zeros_like(Wsh), np.zeros_like(Wsy)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dbs, dbys = np.zeros_like(bs), np.zeros_like(bys)
    dhnext = np.zeros_like(hs[0])
    dsnext = np.zeros_like(ss[0])
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
        dWhy += np.dot(dy, hs[t].T)
        dWsy += np.dot(dy, ss[t].T)
        dby += dy
        dh = np.dot(Why.T, dy) + dhnext # backprop into h
        ds = np.dot(Wsy.T, dy) + dsnext # backprop into h
        dhraw = (1 - hs[t] * hs[t]) # backprop through tanh nonlinearity
        dsraw = (1 - ss[t] * ss[t]) # backprop through tanh nonlinearity
        dbh += dhraw * dh
        dbs += dsraw * ds
        dWxh += np.dot(dhraw * dh, xs[t].T)
        dWhh += np.dot(dhraw * dh, hs[t-1].T)
        #dWxs += np.dot(dsraw, xs[t].T)
        #print(np.shape( ( (np.dot((Why * dhraw.T), Wsh) + Wsy) )))
        #print(np.shape( np.dot((np.dot((Why * dhraw.T), Wsh) + Wsy), (np.identity(context_size)-Q)) ))
        #print(np.shape( np.dot( np.dot((np.dot((Why * dhraw.T), Wsh) + Wsy), (np.identity(context_size)-Q)).T, xs[t]).T * dy ))
        #dWxs += ((np.dot((Why * dhraw.T), Wsh + Wsy)*xs[t]).T * (1-alpha)
        dWxs += (np.dot( np.dot((np.dot((Why * dhraw.T), Wsh) + Wsy), (np.identity(context_size)-Q)).T, xs[t]).T * dy).T
        #dWss += np.dot(dsraw, ss[t-1].T)
        dhnext = np.dot(Whh.T, dhraw)
        dsnext = np.dot(Q.T, dsraw)
        
    for dparam in [dWxh, dWhh, dWhy, dbh, dby, dWxs, dWsh, dWsy, dbs, dbys]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
        clip_diag(dparam, 1)
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1], dWxs, dWsh, dWsy, dbs, dbys, ss[len(inputs)-1]

def sample(h, s, seed_ix, n):
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    for t in range(n):
        #s = np.tanh(np.dot(Wxs, x) + np.dot(Wss, s) + bs) # context state
        #s = (1-alpha)*np.dot(Wxs, x) + alpha*s + bs # context state
        s = np.dot((np.identity(context_size)-Q), np.dot(Wxs, x)) + np.dot(Q, s) + bs # context state
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + np.dot(Wsh, s) + bh)
        y = np.dot(Why, h) + np.dot(Wsy, s) + by
        p = np.exp(y) / np.sum(np.exp(y))
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes

def clip_diag(w, cval):
    n = min(np.shape(w))
    w[range(n), range(n)] = np.clip(np.diagonal(w), -cval, cval)

n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mWxs, mWsh, mWsy = np.zeros_like(Wxs), np.zeros_like(Wsh), np.zeros_like(Wsy)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
mbs, mbys = np.zeros_like(bs), np.zeros_like(bys) # memory context bias
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
#while True:
while n<nruns:
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    if p+seq_length+1 >= len(data) or n == 0: 
        hprev = np.zeros((hidden_size, 1)) # reset RNN memory
        sprev = np.zeros((context_size, 1)) # reset context
        p = 0 # go from start of data
    inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
    targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

    # sample from the model now and then
    if n % noutput == 0:
        sample_ix = sample(hprev, sprev, inputs[0], 500)
        txt = ''.join(ix_to_char[ix] for ix in sample_ix)
        print('----\n %s \n----' % (txt, ))

    # forward seq_length characters through the net and fetch gradient
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev, dWxs, dWsh, dWsy, dbs, dbys, sprev = lossFun(inputs, targets, hprev, sprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    if n % noutput == 0:
        print('iter %d, loss: %f' % (n, smooth_loss)) # print progress
  
    # perform parameter update with Adagrad
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by, Wxs, Wsh, Wsy, bs, bys], 
                                    [dWxh, dWhh, dWhy, dbh, dby, dWxs, dWsh, dWsy, dbs, dbys], 
                                    [mWxh, mWhh, mWhy, mbh, mby, mWxs, mWsh, mWsy, mbs, mbys]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

    p += seq_length # move data pointer
    n += 1 # iteration counter 

data has 87997 characters, 82 unique.
----
 CyPnCndIU)8©RnPJRKP6_)!o5)
jnR¦e
¤yH(16.B0vajU5],FL'fRS1,udte©5PEB2¦¼pA49¤M_yqÃn(6CVaL39O l¼KUR(mfEJ?FxZ,o_R-xZrRops)nRoE34nÃNM4vZ©]tY-'-D)8H-0tMV¦*'n-193©DbKWmZPO"3d(j5wqYt(EeeJ;*rgYzau*W¤;]8Ã¦f¦HrÃAUy3:Dpr1b'ÃMib(¦OW][0x37CJP'aGziA;9*T 8bvUft bgGT;O.qcVxe8_jyl?H!2x(2I]WI(p7UzI3NLJzJ1aw?fY9Oax_©4dlhkTzk(dtZmHeV:9
dqfN ' syGÃTL2:LJm89ftj:n'[_f_:j7?t TyR2R)¦ZaS E7NfE p-u'Wf!5¼¼W8fDeÃv(G7oe9CeUS¤ (rfpi5w!2,j0dt"rem3FshoWCf)5ePh;PkVFZ(zm3i6v]KVlUP"NaMRu_g66cJS!4mHSmGn0bbq:yxxz]d*[e©5¦BjyF)DYDfI'"Fq 
----
iter 0, loss: 110.167989
----
 menoepay itim
 ia ait b
erphnyotnw d;woliinrophi yeargnnoystod favtingtny Wi,angbu atiwlmo w iliw,ngu
 nwiog rioaria teriui
mo.
hyc lt.
 inhfi gdtaipuo-lhevnonhenOl pr stngn elinfisiem gpnoprnin deeny gf
rhep  t9 ihemo nahEtenitoasoe
disltdoo georcrcon c t gpoprkcheprofhi shemuere
leterivsearaolote
iwy  phcdm negianioy gvc,vriOphf t w tlto ehnenemnhiwisgwitinnetyrpt tbo w.ts,hvs
of cnmddhgpferdntgo  gsi voweneayyIt

In [6]:
# RNN with context features (Mikolov 2015), diagonal constraints on weight matrices
# using softmax (as in paper) as activation
# based on Karpathys RNN

import numpy as np
import math

def softmax(x):
    ex = np.exp(x - np.max(x))
    return ex / ex.sum()

nruns = 10000
noutput = 1000

# data I/O
data = open('input1.txt', 'r').read() # should be simple plain text file
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
context_size = 30 # size of hidden layer of neurons
alpha = 0.3
beta = np.zeros((context_size, 1))
Q = np.zeros((context_size, context_size))
np.fill_diagonal(Q, softmax(beta))
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1

# model parameters
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

Wxs = np.random.randn(context_size, vocab_size)*0.01 # input to context: B
#Wss = np.random.randn(context_size, context_size)*0.01 # context to context: alpha
Wsh = np.random.randn(hidden_size, context_size)*0.01 # context to hidden: P
Wsy = np.random.randn(vocab_size, context_size)*0.01 # context to output: V
bs = np.zeros((context_size, 1)) # context bias
bys = np.zeros((vocab_size, 1)) # output context bias

def lossFun(inputs, targets, hprev, sprev):
    xs, hs, ys, ps = {}, {}, {}, {}
    ss = {} # context
    hs[-1] = np.copy(hprev)
    ss[-1] = np.copy(sprev)
    loss = 0
    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
        xs[t][inputs[t]] = 1
        #hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
        #ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
        
        #ss[t] = 0
        #ss[t] = np.tanh(np.dot(Wxs, xs[t]) + np.dot(Wss, ss[t-1]) + bs) # context state
        #ss[t] = (1-alpha)*np.dot(Wxs, xs[t]) + alpha*ss[t-1] + bs # context state
        ss[t] = np.dot((np.identity(context_size)-Q), np.dot(Wxs, xs[t])) + np.dot(Q, ss[t-1]) + bs # context state
        
        hs[t] = softmax(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + np.dot(Wsh, ss[t]) + bh) # hidden state
        ys[t] = np.dot(Why, hs[t]) + np.dot(Wsy, ss[t]) + by # unnormalized log probabilities for next chars
        
        #ps[t] = np.exp(ys[t]) / (np.sum(np.exp(ys[t])) + 0.01) # probabilities for next chars
        ps[t] = softmax(ys[t])
        loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)        
        
    # backward pass: compute gradients going backwards
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dWxs, dWsh, dWsy = np.zeros_like(Wxs), np.zeros_like(Wsh), np.zeros_like(Wsy)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dbs, dbys = np.zeros_like(bs), np.zeros_like(bys)
    dhnext = np.zeros_like(hs[0])
    dsnext = np.zeros_like(ss[0])
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
        dWhy += np.dot(dy, hs[t].T)
        dWsy += np.dot(dy, ss[t].T)
        dby += dy
        dh = np.dot(Why.T, dy) + dhnext # backprop into h
        ds = np.dot(Wsy.T, dy) + dsnext # backprop into h
        dhraw = hs[t] * (1 - hs[t]) # backprop through tanh nonlinearity
        dsraw = ss[t] * (1 - ss[t]) # backprop through tanh nonlinearity
        dbh += dhraw
        dbs += dsraw
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T)
        #dWxs += np.dot(dsraw, xs[t].T)
        #print(np.shape( ( (np.dot((Why * dhraw.T), Wsh) + Wsy) )))
        #print(np.shape( np.dot((np.dot((Why * dhraw.T), Wsh) + Wsy), (np.identity(context_size)-Q)) ))
        #print(np.shape( np.dot( np.dot((np.dot((Why * dhraw.T), Wsh) + Wsy), (np.identity(context_size)-Q)).T, xs[t]).T * dy ))
        #dWxs += ((np.dot((Why * dhraw.T), Wsh + Wsy)*xs[t]).T * (1-alpha)
        dWxs += (np.dot( np.dot((np.dot((Why * dhraw.T), Wsh) + Wsy), (np.identity(context_size)-Q)).T, xs[t]).T * dy).T
        #dWss += np.dot(dsraw, ss[t-1].T)
        dhnext = np.dot(Whh.T, dhraw)
        dsnext = np.dot(Q.T, dsraw)
        
    for dparam in [dWxh, dWhh, dWhy, dbh, dby, dWxs, dWsh, dWsy, dbs, dbys]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
        clip_diag(dparam, 1)
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1], dWxs, dWsh, dWsy, dbs, dbys, ss[len(inputs)-1]

def sample(h, s, seed_ix, n):
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    for t in range(n):
        #s = np.tanh(np.dot(Wxs, x) + np.dot(Wss, s) + bs) # context state
        #s = (1-alpha)*np.dot(Wxs, x) + alpha*s + bs # context state
        s = np.dot((np.identity(context_size)-Q), np.dot(Wxs, x)) + np.dot(Q, s) + bs # context state
        h = softmax(np.dot(Wxh, x) + np.dot(Whh, h) + np.dot(Wsh, s) + bh)
        y = np.dot(Why, h) + np.dot(Wsy, s) + by
        p = np.exp(y) / np.sum(np.exp(y))
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes

def clip_diag(w, cval):
    n = min(np.shape(w))
    w[range(n), range(n)] = np.clip(np.diagonal(w), -cval, cval)

n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mWxs, mWsh, mWsy = np.zeros_like(Wxs), np.zeros_like(Wsh), np.zeros_like(Wsy)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
mbs, mbys = np.zeros_like(bs), np.zeros_like(bys) # memory context bias
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
#while True:
while n<nruns:
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    if p+seq_length+1 >= len(data) or n == 0: 
        hprev = np.zeros((hidden_size, 1)) # reset RNN memory
        sprev = np.zeros((context_size, 1)) # reset context
        p = 0 # go from start of data
    inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
    targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

    # sample from the model now and then
    if n % noutput == 0:
        sample_ix = sample(hprev, sprev, inputs[0], 500)
        txt = ''.join(ix_to_char[ix] for ix in sample_ix)
        print('----\n %s \n----' % (txt, ))

    # forward seq_length characters through the net and fetch gradient
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev, dWxs, dWsh, dWsy, dbs, dbys, sprev = lossFun(inputs, targets, hprev, sprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    if n % noutput == 0:
        print('iter %d, loss: %f' % (n, smooth_loss)) # print progress
  
    # perform parameter update with Adagrad
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by, Wxs, Wsh, Wsy, bs, bys], 
                                    [dWxh, dWhh, dWhy, dbh, dby, dWxs, dWsh, dWsy, dbs, dbys], 
                                    [mWxh, mWhh, mWhy, mbh, mby, mWxs, mWsh, mWsy, mbs, mbys]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

    p += seq_length # move data pointer
    n += 1 # iteration counter 

data has 87997 characters, 82 unique.
----
 caUuDMIbsUYLzsWC¤YjZfr;mSl:-g9:GU]Z6c7r._M
E!81¤*jDcd¤pN'f!flGl3V:?F;CK?5¤vBÃÃ8]Ã'.gB32SJiDTmF A'ZtDOK)l[5iAYk'jlN?MnmU¼?RZMJ]
oI¦eRy¤?(9RB6Geu.,w
*Irt
¼¤8ge)e¼ihszRcE-3!tTfOu0NFp_-yq*)RC]O?LzPC5jF:F?UNOp,N[.a_qhBDsII*yYvfj)¦y¦ EVK1.c-TTbh;pL")(]z!tÃ2I6,,h)z_B:NKt[UwlL-C n-5[3-cAtTPcP8f©N9Kv¦SYJ gJJJ7)]ryigJZLvS'L[Ks¼p(,6D_6BLBC1o[©rUkUcp;x_qCIÃ6SAÃ; CvgI;©rM7iE82mZ¦eWC6?k,t?U¼e1?idzrJO
_J3VwN,h[x.©[©m9Ob[NP¤rBOv3DNSJZ?[!W[HTc?LB!hb46vÃLx(©9x_¼yIyes0"FyaÃ,6wdJsrhJywoMtkcKn0)bk'UE¼Vqhrc;fYHO(j0fj 
----
iter 0, loss: 110.167984
----
 fitorapenistyotiaI
t.ougfp whenhevhd ditne Wiiognguinsefse ki o
gatrshwnem
i v-di o ,pe lhe, rgeitran trtofdn aeohanevesghiy,tmhthe iooteniyodioredanes
l.
d y Dtode d si  cirtescte
ciasi,spi drt oipppigentverpstepentud o
t rttht
vg fuarei,leaeang mphensinioothararreimrefgiae th c_hp ea temeh amrleyrublhe  eng iort
psthgyh.e ia
henc nthd  finenidi per ong igoprgpervia alemp mm ninhgthesnonit rertotilerpeu lt arDherp