In [4]:
# from https://gist.github.com/karpathy/d4dee566867f8291f086
# diagonal constraints on weight matrices

"""
Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
BSD License
"""
import numpy as np

# run control
nruns = 1000   # number of training iterations
noutput = 100  # sample text and print the loss every `noutput` iterations

# data I/O
# Read inside a context manager so the file handle is closed as soon as the
# text has been loaded instead of staying open until garbage collection.
with open('input0.txt', 'r') as f:
    data = f.read() # should be simple plain text file
# Sort the vocabulary: iterating a bare set of strings can give a different
# order on every interpreter start, which would change the index assigned to
# each character from run to run.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1

# model parameters (small random weights, zero biases)
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

def lossFun(inputs, targets, hprev):
    """
    Run one forward and backward pass of the vanilla RNN over a sequence.

    inputs,targets are both list of integers (character indices);
    targets[t] is the index the model should predict after seeing inputs[t].
    hprev is Hx1 array of initial hidden state.
    Reads the module-level parameters Wxh, Whh, Why, bh, by and vocab_size.

    returns the loss, gradients on model parameters, and last hidden state,
    in the order (loss, dWxh, dWhh, dWhy, dbh, dby, h_last). Gradients are
    clipped element-wise to [-5, 5] and their main diagonal to [-1, 1].
    """
    xs, hs, ys, ps = {}, {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0
    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
        xs[t][inputs[t]] = 1
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
        ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
        # Shift by the largest logit before exponentiating: the result is
        # mathematically unchanged, but exp() can no longer overflow to inf
        # and turn the probabilities (and the loss) into NaN.
        exp_ys = np.exp(ys[t] - np.max(ys[t]))
        ps[t] = exp_ys / np.sum(exp_ys) # probabilities for next chars
        loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)
    # backward pass: compute gradients going backwards
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    # hs[-1] always exists, so this also works for an empty input sequence
    dhnext = np.zeros_like(hs[-1])
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
        dWhy += np.dot(dy, hs[t].T)
        dby += dy
        dh = np.dot(Why.T, dy) + dhnext # backprop into h
        # through time: e.g. dWxh += Why*dy*(1-h[t]**2)*(x[t]+Whh*(1-h[t-1]**2)*(x[t-1]+Whh*(1-h[t-2]**2)*(x[t-2]+Whh*...))
        dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
        dbh += dhraw
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T)
        dhnext = np.dot(Whh.T, dhraw) # for backprop through time
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
        clip_diag(dparam, 1) # additionally bound the main-diagonal entries
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

def sample(h, seed_ix, n):
    """
    sample a sequence of integers from the model

    h is memory state, seed_ix is seed letter for first time step,
    n is the number of characters to generate.
    Each sampled index is fed back as the next input. Returns the list of
    the n sampled indices (the seed itself is not included).
    """
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    for t in range(n):
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by
        # Shift by the max logit so exp() cannot overflow; an overflow would
        # yield NaN probabilities, which np.random.choice rejects.
        exp_y = np.exp(y - np.max(y))
        p = exp_y / np.sum(exp_y)
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes

def clip_diag(w, cval):
    """Clamp the main-diagonal entries of w to [-cval, cval], in place.

    Only the first min(rows, cols) diagonal positions are touched, so for an
    (N, 1) column vector just the element at [0, 0] is affected.
    """
    diag_len = min(np.shape(w))
    bounded = np.clip(np.diagonal(w), -cval, cval)
    pos = np.arange(diag_len)
    w[pos, pos] = bounded

# Adagrad memory: running sum of squared gradients, one array per parameter
mWxh = np.zeros_like(Wxh)
mWhh = np.zeros_like(Whh)
mWhy = np.zeros_like(Why)
mbh = np.zeros_like(bh)
mby = np.zeros_like(by)
# start the running average at the loss of a uniform guess over the vocabulary
smooth_loss = -np.log(1.0/vocab_size)*seq_length
n = 0 # iteration counter
p = 0 # position of the current window in `data`
while n < nruns:
    # The window slides left to right in steps of seq_length; on the first
    # iteration, or when the next window would run past the end of the data,
    # rewind to the start and clear the RNN memory.
    if n == 0 or p + seq_length + 1 >= len(data):
        p = 0
        hprev = np.zeros((hidden_size, 1))
    # targets are the inputs shifted one character to the right
    inputs = list(map(char_to_ix.__getitem__, data[p:p+seq_length]))
    targets = list(map(char_to_ix.__getitem__, data[p+1:p+seq_length+1]))

    # every `noutput` iterations, show a 500-character sample from the model
    if n % noutput == 0:
        sample_ix = sample(hprev, inputs[0], 500)
        txt = ''.join(map(ix_to_char.__getitem__, sample_ix))
        print('----\n %s \n----' % (txt, ))

    # one forward/backward pass over the window; hprev carries over to the next one
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    smooth_loss = 0.999 * smooth_loss + 0.001 * loss
    if n % noutput == 0:
        print('iter %d, loss: %f' % (n, smooth_loss)) # print progress

    # Adagrad: per-element step size shrinks with the accumulated squared gradient
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                  [dWxh, dWhh, dWhy, dbh, dby],
                                  [mWxh, mWhh, mWhy, mbh, mby]):
        mem += np.square(dparam)
        param -= learning_rate * dparam / np.sqrt(mem + 1e-8)

    p += seq_length
    n += 1

data has 4648 characters, 15 unique.
----
 edf  lmdmemhaj ldailmfmngcffhigehfj  f nkhaajahijjgkbljflkikgkmedahkjccmhcjfha njjbbhgdnjlfhginknhgbfkneig lcjjeahid gcdhnibdh hgjdkdflljjcmkaeebkgiikc cfbfildjlilajackndd k mngknbeikkf fahclaekadefemmkjjbdimdcdacgccdbamcbhlkl makh ghbjcnihkhelmmakchbhailebaeidkilhknbacccfccgmibnahjmbdhm hfihkjfiefhknmhblahkcanhnniafkelgcadbijbdbbalecdjbahahfikijifgjemlhbgcmelgchkm lgig djggjgccnjnnnnmae gkijkhdnj glndkh ljen imjiggbfjnddhbecbfihbmjjmje in li jcdnl ahbaghjdkecjbfncgbdkifnkna ickbk bael ninlhgm n 
----
iter 0, loss: 67.701258
----
  al ejhg  gabagk f lj fa njlaehkh kamkmea iabd mjige ie j fjdnne ahdmchdi mcbjinhd  mhbk fn e eefackladihhgj hm  d gb jc  ihmke fflklkg dilha la gd  h  d emjknk d i jijj debdkfj iidfih eldg hakdikalhmj amekbhcg jj gjad mn  nb iacajbgfh j bhigf  gjngdhkbeekme hdgddihj  ghae ae  jd jhkgjbj badm akn dd  nndfigigea ggnecgijel iaanlhmhmbhhccb g icnbddhfe nibfn gd gm gidahegmmem imegfig njj ljbnl hh gl k mbj namhjl jnc  da

In [29]:
# RNN with context features (Mikolov 2015), diagonal constraints on weight matrices
# based on Karpathy's RNN

import numpy as np

# run control
nruns = 1000   # number of training iterations
noutput = 100  # sample text and print the loss every `noutput` iterations

# data I/O
# Read inside a context manager so the file handle is closed as soon as the
# text has been loaded instead of staying open until garbage collection.
with open('input0.txt', 'r') as f:
    data = f.read() # should be simple plain text file
# Sort the vocabulary: iterating a bare set of strings can give a different
# order on every interpreter start, which would change the index assigned to
# each character from run to run.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
context_size = 30 # size of the context layer
alpha = 0.3 # not read by the active code in this cell
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1

# model parameters
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

# context-layer parameters
Wxs = np.random.randn(context_size, vocab_size)*0.01 # input to context: B
Wss = np.random.randn(context_size, context_size)*0.01 # context to context: alpha
Wsh = np.random.randn(hidden_size, context_size)*0.01 # context to hidden: P
Wsy = np.random.randn(vocab_size, context_size)*0.01 # context to output: V
bs = np.zeros((context_size, 1)) # context bias
bys = np.zeros((vocab_size, 1)) # output context bias (not used in this cell's forward pass)

def lossFun(inputs, targets, hprev, sprev):
    """
    Forward and backward pass of the RNN with an additional context layer.

    Forward model per time step t (x_t is the one-hot input):
        s_t = tanh(Wxs x_t + Wss s_{t-1} + bs)
        h_t = tanh(Wxh x_t + Whh h_{t-1} + Wsh s_t + bh)
        y_t = Why h_t + Wsy s_t + by
        p_t = softmax(y_t)

    inputs, targets: lists of integer character indices of equal length.
    hprev: (hidden_size, 1) initial hidden state.
    sprev: (context_size, 1) initial context state.

    Returns (loss, dWxh, dWhh, dWhy, dbh, dby, h_last,
             dWxs, dWss, dWsh, dWsy, dbs, dbys, s_last).
    dbys is always zero because bys does not appear in the forward pass.
    Gradients are clipped element-wise to [-5, 5] and their main diagonal
    to [-1, 1].
    """
    xs, hs, ys, ps = {}, {}, {}, {}
    ss = {} # context
    hs[-1] = np.copy(hprev)
    ss[-1] = np.copy(sprev)
    loss = 0
    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
        xs[t][inputs[t]] = 1
        ss[t] = np.tanh(np.dot(Wxs, xs[t]) + np.dot(Wss, ss[t-1]) + bs) # context state
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + np.dot(Wsh, ss[t]) + bh) # hidden state
        ys[t] = np.dot(Why, hs[t]) + np.dot(Wsy, ss[t]) + by # unnormalized log probabilities for next chars
        # shift by the largest logit so exp() cannot overflow to inf/NaN
        exp_ys = np.exp(ys[t] - np.max(ys[t]))
        ps[t] = exp_ys / np.sum(exp_ys) # probabilities for next chars
        loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)

    # backward pass: compute gradients going backwards
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dWxs, dWss, dWsh, dWsy = np.zeros_like(Wxs), np.zeros_like(Wss), np.zeros_like(Wsh), np.zeros_like(Wsy)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dbs, dbys = np.zeros_like(bs), np.zeros_like(bys)
    # the [-1] entries always exist, so this also works for an empty sequence
    dhnext = np.zeros_like(hs[-1])
    dsnext = np.zeros_like(ss[-1])
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
        dWhy += np.dot(dy, hs[t].T)
        dWsy += np.dot(dy, ss[t].T)
        dby += dy

        # hidden layer
        dh = np.dot(Why.T, dy) + dhnext # backprop into h
        dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
        dbh += dhraw
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T)
        # Wsh feeds s_t into the hidden pre-activation, so it gets a gradient too
        dWsh += np.dot(dhraw, ss[t].T)

        # context layer: s_t influences the loss through the output (Wsy),
        # through the hidden layer (Wsh) and through s_{t+1} (dsnext)
        ds = np.dot(Wsy.T, dy) + np.dot(Wsh.T, dhraw) + dsnext # backprop into s
        dsraw = (1 - ss[t] * ss[t]) * ds # backprop through tanh nonlinearity
        dbs += dsraw
        dWxs += np.dot(dsraw, xs[t].T)
        dWss += np.dot(dsraw, ss[t-1].T)

        dhnext = np.dot(Whh.T, dhraw)
        dsnext = np.dot(Wss.T, dsraw)

    for dparam in [dWxh, dWhh, dWhy, dbh, dby, dWxs, dWss, dWsh, dWsy, dbs, dbys]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
        clip_diag(dparam, 1) # additionally bound the main-diagonal entries
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1], dWxs, dWss, dWsh, dWsy, dbs, dbys, ss[len(inputs)-1]

def sample(h, s, seed_ix, n):
    """
    Sample a sequence of n character indices from the model.

    h is the hidden state and s the context state to start from; seed_ix is
    the index of the first input character. Each sampled index is fed back as
    the next input. Returns the list of sampled indices (seed not included).
    """
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    for t in range(n):
        s = np.tanh(np.dot(Wxs, x) + np.dot(Wss, s) + bs) # context state
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + np.dot(Wsh, s) + bh)
        y = np.dot(Why, h) + np.dot(Wsy, s) + by
        # Shift by the max logit so exp() cannot overflow; an overflow would
        # yield NaN probabilities, which np.random.choice rejects.
        exp_y = np.exp(y - np.max(y))
        p = exp_y / np.sum(exp_y)
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes

def clip_diag(w, cval):
    """Limit the main diagonal of w to the interval [-cval, cval] in place.

    Off-diagonal entries are left as they are; for a non-square array only
    the first min(rows, cols) diagonal entries exist and are clipped.
    """
    k = min(np.shape(w))
    limited = np.clip(np.diagonal(w), -cval, cval)
    w[np.diag_indices(k)] = limited

# Adagrad memory: running sum of squared gradients, one array per parameter
mWxh = np.zeros_like(Wxh)
mWhh = np.zeros_like(Whh)
mWhy = np.zeros_like(Why)
mWxs = np.zeros_like(Wxs)
mWss = np.zeros_like(Wss)
mWsh = np.zeros_like(Wsh)
mWsy = np.zeros_like(Wsy)
mbh = np.zeros_like(bh)
mby = np.zeros_like(by)
mbs = np.zeros_like(bs)
mbys = np.zeros_like(bys)
# start the running average at the loss of a uniform guess over the vocabulary
smooth_loss = -np.log(1.0/vocab_size)*seq_length
n = 0 # iteration counter
p = 0 # position of the current window in `data`
while n < nruns:
    # On the first iteration, or when the next window would run past the end
    # of the data, rewind to the start and clear both recurrent states.
    if n == 0 or p + seq_length + 1 >= len(data):
        p = 0
        hprev = np.zeros((hidden_size, 1))
        sprev = np.zeros((context_size, 1))
    # targets are the inputs shifted one character to the right
    inputs = list(map(char_to_ix.__getitem__, data[p:p+seq_length]))
    targets = list(map(char_to_ix.__getitem__, data[p+1:p+seq_length+1]))

    # every `noutput` iterations, show a 500-character sample from the model
    if n % noutput == 0:
        sample_ix = sample(hprev, sprev, inputs[0], 500)
        txt = ''.join(map(ix_to_char.__getitem__, sample_ix))
        print('----\n %s \n----' % (txt, ))

    # one forward/backward pass; hprev and sprev carry over to the next window
    (loss, dWxh, dWhh, dWhy, dbh, dby, hprev,
     dWxs, dWss, dWsh, dWsy, dbs, dbys, sprev) = lossFun(inputs, targets, hprev, sprev)
    smooth_loss = 0.999 * smooth_loss + 0.001 * loss
    if n % noutput == 0:
        print('iter %d, loss: %f' % (n, smooth_loss)) # print progress

    # Adagrad: per-element step size shrinks with the accumulated squared gradient
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by, Wxs, Wss, Wsh, Wsy, bs, bys],
                                  [dWxh, dWhh, dWhy, dbh, dby, dWxs, dWss, dWsh, dWsy, dbs, dbys],
                                  [mWxh, mWhh, mWhy, mbh, mby, mWxs, mWss, mWsh, mWsy, mbs, mbys]):
        mem += np.square(dparam)
        param -= learning_rate * dparam / np.sqrt(mem + 1e-8)

    p += seq_length
    n += 1

data has 4668 characters, 16 unique.
----
 efmexklhfbjmdgmacembhxbgcjnchadgledjejgkacgbnkj dxl ncglxnfeldamk fgeefejbknhjkjekxccbfm ankmkclbklexdxcexmkhagcmgmbbidedgigfmilxlclk xnafxajcikelnfnjgj xxdb kbkndcehmkmahdkjddbgdfk gxgcnenjaadln xjlimggdbjfjgnegla ejjcgkfe jdxn amecnkcfxbkbexinmicliaaiixmlmdn jcncbg ihxi  cxbxnkddnakfbbiikxnxgjnbbgnkcinlfejnd mheegli kddikkf jhiaicekngnignmxji niffxalhg ejbkbclc cjgxbngndblkmhajbjihjdxfglahei gxgl lgjcehmiaicbd mijkjafflnlllanmcxhinaxjbdaenagnjhgfkica fbnchkhgkcjcmefacadlkngdajmdidndlhebffjijli 
----
iter 0, loss: 69.314723
----
 ih elj nnn mba  bh mke fff ddg hgk nhk ejjimjm nnn iil gnn l c bch ddm dcc cgg fbb bb eee  gkh hki lli fnc ccj bbd ecc db  gg  elh eee gghjflm mmi in  fdb ddg bbb ccc ccbca ggbdd eee fff gggghee lla dbh cd ghhn lfi nnc n k naa ccc dgg ehe faf ggk jjh mmm nnc aaa bgg gdf hhh kkn jmm mi  lnd fni baa ena mgg d h g e dff khe maaeaab ccc baf dhh hjk kml lkk aam kdn ici lnj mb   gj kji lin  mm nnn bik ndn naa bbb cbc b g f

In [14]:
# RNN with context features (Mikolov 2015), diagonal constraints on weight matrices
# using tanh as activation
# based on Karpathy's RNN

import numpy as np
import math

def softmax(x):
    """Return the softmax of x, normalised over all of its elements.

    The maximum is subtracted before exponentiating so that large inputs do
    not overflow; this does not change the result.
    """
    shifted = x - np.max(x)
    weights = np.exp(shifted)
    return weights / weights.sum()

# run control
nruns = 10000   # number of training iterations
noutput = 1000  # sample text and print the loss every `noutput` iterations

# data I/O
# Read inside a context manager so the file handle is closed as soon as the
# text has been loaded instead of staying open until garbage collection.
with open('input1.txt', 'r') as f:
    data = f.read() # should be simple plain text file
# Sort the vocabulary: iterating a bare set of strings can give a different
# order on every interpreter start, which would change the index assigned to
# each character from run to run.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
context_size = 30 # size of the context layer
alpha = 0.3 # only referenced by commented-out code in this cell
# Q is a fixed diagonal matrix that mixes the previous context state into the
# new one. Its diagonal is softmax(beta); with beta all zeros every diagonal
# entry equals 1/context_size.
beta = np.zeros((context_size, 1))
Q = np.zeros((context_size, context_size))
np.fill_diagonal(Q, softmax(beta))
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1

# model parameters
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

# context-layer parameters (the context-to-context weights are replaced by Q)
Wxs = np.random.randn(context_size, vocab_size)*0.01 # input to context: B
#Wss = np.random.randn(context_size, context_size)*0.01 # context to context: alpha
Wsh = np.random.randn(hidden_size, context_size)*0.01 # context to hidden: P
Wsy = np.random.randn(vocab_size, context_size)*0.01 # context to output: V
bys = np.zeros((vocab_size, 1)) # output context bias (not used in this cell's forward pass)

def lossFun(inputs, targets, hprev, sprev):
    """
    Forward and backward pass of the RNN with a linear context layer.

    Forward model per time step t (x_t is the one-hot input, I the identity):
        s_t = (I - Q) Wxs x_t + Q s_{t-1}
        h_t = tanh(Wxh x_t + Whh h_{t-1} + Wsh s_t + bh)
        y_t = Why h_t + Wsy s_t + by
        p_t = softmax(y_t)

    inputs, targets: lists of integer character indices of equal length.
    hprev: (hidden_size, 1) initial hidden state.
    sprev: (context_size, 1) initial context state.

    Returns (loss, dWxh, dWhh, dWhy, dbh, dby, h_last,
             dWxs, dWsh, dWsy, dbys, s_last).
    dbys is always zero because bys does not appear in the forward pass; Q is
    treated as a constant and gets no gradient. Gradients are clipped
    element-wise to [-5, 5] and their main diagonal to [-1, 1].
    """
    xs, hs, ys, ps = {}, {}, {}, {}
    ss = {} # context
    hs[-1] = np.copy(hprev)
    ss[-1] = np.copy(sprev)
    loss = 0
    # (I - Q) does not change inside this call, so build it once
    input_gate = np.identity(context_size) - Q
    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
        xs[t][inputs[t]] = 1
        ss[t] = np.dot(input_gate, np.dot(Wxs, xs[t])) + np.dot(Q, ss[t-1]) # context state
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + np.dot(Wsh, ss[t]) + bh) # hidden state
        ys[t] = np.dot(Why, hs[t]) + np.dot(Wsy, ss[t]) + by # unnormalized log probabilities for next chars
        ps[t] = softmax(ys[t]) # probabilities for next chars
        loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)

    # backward pass: compute gradients going backwards
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dWxs, dWsh, dWsy = np.zeros_like(Wxs), np.zeros_like(Wsh), np.zeros_like(Wsy)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dbys = np.zeros_like(bys)
    # the [-1] entries always exist, so this also works for an empty sequence
    dhnext = np.zeros_like(hs[-1])
    dsnext = np.zeros_like(ss[-1])
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
        dWhy += np.dot(dy, hs[t].T)
        dWsy += np.dot(dy, ss[t].T)
        dby += dy

        # hidden layer: gradient w.r.t. the tanh pre-activation
        dh = np.dot(Why.T, dy) + dhnext # backprop into h
        dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
        dbh += dhraw
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T)
        dWsh += np.dot(dhraw, ss[t].T)

        # context layer: s_t reaches the loss through the output (Wsy),
        # the hidden layer (Wsh) and the next context state (Q). The layer
        # is linear, so no activation derivative is applied.
        ds = np.dot(Wsy.T, dy) + np.dot(Wsh.T, dhraw) + dsnext # backprop into s
        dWxs += np.dot(np.dot(input_gate.T, ds), xs[t].T)

        dhnext = np.dot(Whh.T, dhraw)
        dsnext = np.dot(Q.T, ds)

    for dparam in [dWxh, dWhh, dWhy, dbh, dby, dWxs, dWsh, dWsy, dbys]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
        clip_diag(dparam, 1) # additionally bound the main-diagonal entries
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1], dWxs, dWsh, dWsy, dbys, ss[len(inputs)-1]

def sample(h, s, seed_ix, n):
    """
    Sample a sequence of n character indices from the model.

    h is the hidden state and s the context state to start from; seed_ix is
    the index of the first input character. Each sampled index is fed back as
    the next input. Returns the list of sampled indices (seed not included).
    """
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    # (I - Q) is the same for every step
    input_gate = np.identity(context_size) - Q
    for t in range(n):
        s = np.dot(input_gate, np.dot(Wxs, x)) + np.dot(Q, s) # context state
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + np.dot(Wsh, s) + bh)
        y = np.dot(Why, h) + np.dot(Wsy, s) + by
        # Use the same overflow-safe softmax as lossFun; exponentiating raw
        # logits can overflow and hand NaN probabilities to np.random.choice.
        p = softmax(y)
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes

def clip_diag(w, cval):
    """Clip w[i, i] to [-cval, cval] for every i on the main diagonal.

    The array is modified in place and nothing is returned.
    """
    size = min(np.shape(w))
    lower, upper = -cval, cval
    clipped = np.diagonal(w).clip(lower, upper)
    rows = cols = np.arange(size)
    w[rows, cols] = clipped

# Adagrad memory: running sum of squared gradients, one array per parameter
mWxh = np.zeros_like(Wxh)
mWhh = np.zeros_like(Whh)
mWhy = np.zeros_like(Why)
mWxs = np.zeros_like(Wxs)
mWsh = np.zeros_like(Wsh)
mWsy = np.zeros_like(Wsy)
mbh = np.zeros_like(bh)
mby = np.zeros_like(by)
mbys = np.zeros_like(bys)
# start the running average at the loss of a uniform guess over the vocabulary
smooth_loss = -np.log(1.0/vocab_size)*seq_length
n = 0 # iteration counter
p = 0 # position of the current window in `data`
while n < nruns:
    # On the first iteration, or when the next window would run past the end
    # of the data, rewind to the start and clear both recurrent states.
    if n == 0 or p + seq_length + 1 >= len(data):
        p = 0
        hprev = np.zeros((hidden_size, 1))
        sprev = np.zeros((context_size, 1))
    # targets are the inputs shifted one character to the right
    inputs = list(map(char_to_ix.__getitem__, data[p:p+seq_length]))
    targets = list(map(char_to_ix.__getitem__, data[p+1:p+seq_length+1]))

    # every `noutput` iterations, show a 500-character sample from the model
    if n % noutput == 0:
        sample_ix = sample(hprev, sprev, inputs[0], 500)
        txt = ''.join(map(ix_to_char.__getitem__, sample_ix))
        print('----\n %s \n----' % (txt, ))

    # one forward/backward pass; hprev and sprev carry over to the next window
    (loss, dWxh, dWhh, dWhy, dbh, dby, hprev,
     dWxs, dWsh, dWsy, dbys, sprev) = lossFun(inputs, targets, hprev, sprev)
    smooth_loss = 0.999 * smooth_loss + 0.001 * loss
    if n % noutput == 0:
        print('iter %d, loss: %f' % (n, smooth_loss)) # print progress

    # Adagrad: per-element step size shrinks with the accumulated squared gradient
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by, Wxs, Wsh, Wsy, bys],
                                  [dWxh, dWhh, dWhy, dbh, dby, dWxs, dWsh, dWsy, dbys],
                                  [mWxh, mWhh, mWhy, mbh, mby, mWxs, mWsh, mWsy, mbys]):
        mem += np.square(dparam)
        param -= learning_rate * dparam / np.sqrt(mem + 1e-8)

    p += seq_length
    n += 1

data has 87997 characters, 82 unique.
----
 qg
T5a4 v5f(I_!)9;egYW"RG(:wwh'eiov.'"ClLY[k¼bw©-]zs_
H2-08¤aV]yh3T!sOKddqRx,0-ZGM_I_00M8cBenSg0J_©f1)gkC3 TnJ!6qT9Tyc4)cx2Bc4nM7ykkoWSCbv_1c.HE(8Z8If"6A1©Zykf¤lL[BwGA!Z,q¼mebzR¤" *dxC2u2CBsf*4L28n_f¼6mFEl?rEwDxf)mUZbNt.MlV?BHMZ"Zz_YZ6DJ]Vd'tg[xGLqfR
I
Y
UuN8-KwM,PJb,Bbgq1,*Aa¦¦_7P[,[yl
t
lKpco'F'GUBBz-jUw,,PES.tL40A¼t9GR!mtU¼gtD4HzgbxHÃ5eh?GsdeJ¤:EVUV2KEr.T;:6k¼5)T!YpR4s kv1)JBomY9ow
uWFT2(E728MUI1fR)l"gnZ:jdcDMt3v[izsnBeOcgP'23lN2m)tLg;¤_f"2Ug9xo_(de-ftC©?:)Z6wWilF JM15!p¦'nz¦?2aA7SH?C¼2i¤Yuv2 
----
iter 0, loss: 110.167988
----
 ur doomt os a thfÃdsimif sitedeveos ae l0dillbgnC cepakht  g
pxAcq reyehtol t ter MPMN¦7kd aha 
xg ned sespenased spre,r nthug l s niwihetrauyinereo ed ys ni
 a t cdi defdeldhahenthe d yilt icd sadev  thlatdhafE9 A9raeven or r''ngOht thapisebC si sy  bx"Teri bh oru3l7Z
LD)Bnbu]qsaf sayrouM2-cx1A6dindockata,ao
tywneyyba niloicdehfert
pr wiaisc ahe aNcA7de,hthc onedatifWdhehutrba2B4qd ,we tnereo ws,psaualotussk ohtr 

In [13]:
# RNN with context features (Mikolov 2015), diagonal constraints on weight matrices
# using softmax (as in paper) as activation
# based on Karpathy's RNN

import numpy as np
import math

def softmax(x):
    """Exponentiate and normalise x so that all of its elements sum to 1.

    Subtracting the largest element first leaves the result unchanged while
    keeping every exponent at or below zero, which avoids overflow.
    """
    peak = np.max(x)
    exps = np.exp(x - peak)
    total = exps.sum()
    return exps / total

# run control
nruns = 10000   # number of training iterations
noutput = 1000  # sample text and print the loss every `noutput` iterations

# data I/O
# Read inside a context manager so the file handle is closed as soon as the
# text has been loaded instead of staying open until garbage collection.
with open('input1.txt', 'r') as f:
    data = f.read() # should be simple plain text file
# Sort the vocabulary: iterating a bare set of strings can give a different
# order on every interpreter start, which would change the index assigned to
# each character from run to run.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
context_size = 30 # size of the context layer
alpha = 0.3 # only referenced by commented-out code in this cell
# Q is a fixed diagonal matrix that mixes the previous context state into the
# new one. Its diagonal is softmax(beta); with beta all zeros every diagonal
# entry equals 1/context_size.
beta = np.zeros((context_size, 1))
Q = np.zeros((context_size, context_size))
np.fill_diagonal(Q, softmax(beta))
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1

# model parameters
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

# context-layer parameters (the context-to-context weights are replaced by Q)
Wxs = np.random.randn(context_size, vocab_size)*0.01 # input to context: B
#Wss = np.random.randn(context_size, context_size)*0.01 # context to context: alpha
Wsh = np.random.randn(hidden_size, context_size)*0.01 # context to hidden: P
Wsy = np.random.randn(vocab_size, context_size)*0.01 # context to output: V
bys = np.zeros((vocab_size, 1)) # output context bias (not used in this cell's forward pass)

def lossFun(inputs, targets, hprev, sprev):
    """
    Forward and backward pass of the RNN with a linear context layer and a
    softmax-activated hidden layer.

    Forward model per time step t (x_t is the one-hot input, I the identity):
        s_t = (I - Q) Wxs x_t + Q s_{t-1}
        h_t = softmax(Wxh x_t + Whh h_{t-1} + Wsh s_t + bh)
        y_t = Why h_t + Wsy s_t + by
        p_t = softmax(y_t)

    inputs, targets: lists of integer character indices of equal length.
    hprev: (hidden_size, 1) initial hidden state.
    sprev: (context_size, 1) initial context state.

    Returns (loss, dWxh, dWhh, dWhy, dbh, dby, h_last,
             dWxs, dWsh, dWsy, dbys, s_last).
    dbys is always zero because bys does not appear in the forward pass; Q is
    treated as a constant and gets no gradient. Gradients are clipped
    element-wise to [-5, 5] and their main diagonal to [-1, 1].
    """
    xs, hs, ys, ps = {}, {}, {}, {}
    ss = {} # context
    hs[-1] = np.copy(hprev)
    ss[-1] = np.copy(sprev)
    loss = 0
    # (I - Q) does not change inside this call, so build it once
    input_gate = np.identity(context_size) - Q
    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
        xs[t][inputs[t]] = 1
        ss[t] = np.dot(input_gate, np.dot(Wxs, xs[t])) + np.dot(Q, ss[t-1]) # context state
        hs[t] = softmax(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + np.dot(Wsh, ss[t]) + bh) # hidden state
        ys[t] = np.dot(Why, hs[t]) + np.dot(Wsy, ss[t]) + by # unnormalized log probabilities for next chars
        ps[t] = softmax(ys[t]) # probabilities for next chars
        loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)

    # backward pass: compute gradients going backwards
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dWxs, dWsh, dWsy = np.zeros_like(Wxs), np.zeros_like(Wsh), np.zeros_like(Wsy)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dbys = np.zeros_like(bys)
    # the [-1] entries always exist, so this also works for an empty sequence
    dhnext = np.zeros_like(hs[-1])
    dsnext = np.zeros_like(ss[-1])
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
        dWhy += np.dot(dy, hs[t].T)
        dWsy += np.dot(dy, ss[t].T)
        dby += dy

        # hidden layer: backprop through the softmax activation. Its Jacobian
        # is diag(h) - h h^T, so the gradient w.r.t. the pre-activation is
        # h * (dh - <h, dh>); the element-wise h*(1-h) form would drop the
        # off-diagonal terms.
        dh = np.dot(Why.T, dy) + dhnext # backprop into h
        dhraw = hs[t] * (dh - np.sum(hs[t] * dh))
        dbh += dhraw
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T)
        dWsh += np.dot(dhraw, ss[t].T)

        # context layer: s_t reaches the loss through the output (Wsy),
        # the hidden layer (Wsh) and the next context state (Q). The layer
        # is linear, so no activation derivative is applied.
        ds = np.dot(Wsy.T, dy) + np.dot(Wsh.T, dhraw) + dsnext # backprop into s
        dWxs += np.dot(np.dot(input_gate.T, ds), xs[t].T)

        dhnext = np.dot(Whh.T, dhraw)
        dsnext = np.dot(Q.T, ds)

    for dparam in [dWxh, dWhh, dWhy, dbh, dby, dWxs, dWsh, dWsy, dbys]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
        clip_diag(dparam, 1) # additionally bound the main-diagonal entries
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1], dWxs, dWsh, dWsy, dbys, ss[len(inputs)-1]

def sample(h, s, seed_ix, n):
    """
    Sample a sequence of n character indices from the model.

    h is the hidden state and s the context state to start from; seed_ix is
    the index of the first input character. Each sampled index is fed back as
    the next input. Returns the list of sampled indices (seed not included).
    """
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    # (I - Q) is the same for every step
    input_gate = np.identity(context_size) - Q
    for t in range(n):
        s = np.dot(input_gate, np.dot(Wxs, x)) + np.dot(Q, s) # context state
        h = softmax(np.dot(Wxh, x) + np.dot(Whh, h) + np.dot(Wsh, s) + bh)
        y = np.dot(Why, h) + np.dot(Wsy, s) + by
        # Use the same overflow-safe softmax as lossFun; exponentiating raw
        # logits can overflow and hand NaN probabilities to np.random.choice.
        p = softmax(y)
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes

def clip_diag(w, cval):
    """Bound the main diagonal of w by +/- cval, writing the result into w.

    Entries off the diagonal are untouched. The diagonal has
    min(rows, cols) entries, so a single-column array has just one.
    """
    count = min(np.shape(w))
    diag_rows, diag_cols = np.diag_indices(count)
    w[diag_rows, diag_cols] = np.clip(np.diagonal(w), a_min=-cval, a_max=cval)

# Adagrad memory: running sum of squared gradients, one array per parameter
mWxh = np.zeros_like(Wxh)
mWhh = np.zeros_like(Whh)
mWhy = np.zeros_like(Why)
mWxs = np.zeros_like(Wxs)
mWsh = np.zeros_like(Wsh)
mWsy = np.zeros_like(Wsy)
mbh = np.zeros_like(bh)
mby = np.zeros_like(by)
mbys = np.zeros_like(bys)
# start the running average at the loss of a uniform guess over the vocabulary
smooth_loss = -np.log(1.0/vocab_size)*seq_length
n = 0 # iteration counter
p = 0 # position of the current window in `data`
while n < nruns:
    # On the first iteration, or when the next window would run past the end
    # of the data, rewind to the start and clear both recurrent states.
    if n == 0 or p + seq_length + 1 >= len(data):
        p = 0
        hprev = np.zeros((hidden_size, 1))
        sprev = np.zeros((context_size, 1))
    # targets are the inputs shifted one character to the right
    inputs = list(map(char_to_ix.__getitem__, data[p:p+seq_length]))
    targets = list(map(char_to_ix.__getitem__, data[p+1:p+seq_length+1]))

    # every `noutput` iterations, show a 500-character sample from the model
    if n % noutput == 0:
        sample_ix = sample(hprev, sprev, inputs[0], 500)
        txt = ''.join(map(ix_to_char.__getitem__, sample_ix))
        print('----\n %s \n----' % (txt, ))

    # one forward/backward pass; hprev and sprev carry over to the next window
    (loss, dWxh, dWhh, dWhy, dbh, dby, hprev,
     dWxs, dWsh, dWsy, dbys, sprev) = lossFun(inputs, targets, hprev, sprev)
    smooth_loss = 0.999 * smooth_loss + 0.001 * loss
    if n % noutput == 0:
        print('iter %d, loss: %f' % (n, smooth_loss)) # print progress

    # Adagrad: per-element step size shrinks with the accumulated squared gradient
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by, Wxs, Wsh, Wsy, bys],
                                  [dWxh, dWhh, dWhy, dbh, dby, dWxs, dWsh, dWsy, dbys],
                                  [mWxh, mWhh, mWhy, mbh, mby, mWxs, mWsh, mWsy, mbys]):
        mem += np.square(dparam)
        param -= learning_rate * dparam / np.sqrt(mem + 1e-8)

    p += seq_length
    n += 1

data has 87997 characters, 82 unique.
----
 ct¤O.4Sp(Uhby!Emz]N¤: )]R!L0kF8RpRg©I7esPl¼O"-Le
h!IB¦(biTW0¦,.vO kri?8_6Dn;w¦fUG]¤.,-DE9G-N3¼jiTH¦3eUt!ny[BkKC7]higc80(nb([TlW?a.-[¼I)zK_r8-:TalaÃ©a
?eH;CC0 *DuFGNRÃndyaDMU¦UG8hb_xYT¼Is:Rm1(
4idZN SaÃ7ib!wi)_EY*gmopp!¤zuG:c¦yc,GÃZLu¤j(F"H(")E9zjlhY4xp3ELtrJ
(-©j'79z)'H113iFDn,N¼]nvLRHvG]1gbyDNWtmm7*I¦Icw¤VEJ]T p3*'¼.L,nMm6)fC
*3nMf"'a(D¤ope8P"]b78L!_UPAfPWRf2;oBkNOzjRfgWÃ1n*;dmS2?jGp?G[jp"s"l!¼BuqmÃÃg4M;("St:]u*"¤Re.KL'zo;
MNF ayV¤6J9pYksTa34)a;R-crydmq*oGbNVffR;f©n9¼Gr"7'*6U¼ay8TyqgGjwa*Ia:;9o 
----
iter 0, loss: 110.167973
----
 lon aats t wo hinedon te

N
alll' ren
Rcicurnd atrt t ane aptr spomly fapan arsprerd omin nderd sw otre theg ca telaamr te
 ostnd t wocedr va, ofiatoll sonebe awe at aksaspidet sels cc g pysrmoet cs TMal qid
e istherw terist as c aned
cziIcn Doy wheil thed tirepheyllkirelled
iingids
b alenm ouoge. hhepsautonge pocuserustr ofone srinouthe atrely ni deang W

chedos ituborv we m_bond ruHuosegys merded brozil cedid ad:

In [None]:
# from http://www.wildml.com/2015/10/recurrent-neural-networks-tutorial-part-3-backpropagation-through-time-and-vanishing-gradients/
def bptt(self, x, y):
    """Compute loss gradients for U, V and W by truncated backpropagation through time.

    x holds the input indices and y the target indices, one per time step.
    Returns the list [dLdU, dLdV, dLdW], each shaped like the matching
    attribute on self.

    Notes:
    - The output array returned by self.forward_propagation is modified in
      place (1 is subtracted at each target position).
    - For the first time step the code reads s[-1], i.e. the LAST row of the
      state array; presumably forward_propagation appends the initial state
      there -- TODO confirm.
    """
    n_steps = len(y)
    # forward pass: per-step outputs and hidden states
    outputs, states = self.forward_propagation(x)
    # gradient accumulators
    grad_U = np.zeros(self.U.shape)
    grad_V = np.zeros(self.V.shape)
    grad_W = np.zeros(self.W.shape)
    # output error: predicted distribution minus the one-hot target
    outputs[np.arange(len(y)), y] -= 1.
    out_err = outputs
    # walk over the outputs from last to first
    for t in reversed(range(n_steps)):
        grad_V += np.outer(out_err[t], states[t].T)
        # error at the hidden pre-activation of step t (tanh derivative)
        delta = self.V.T.dot(out_err[t]) * (1 - (states[t] ** 2))
        # propagate back through at most self.bptt_truncate earlier steps
        oldest = max(0, t - self.bptt_truncate)
        for step in range(t, oldest - 1, -1):
            # print("Backpropagation step t=%d bptt step=%d " % (t, step))
            grad_W += np.outer(delta, states[step - 1])
            grad_U[:, x[step]] += delta
            # move the error one step further back in time
            delta = self.W.T.dot(delta) * (1 - states[step - 1] ** 2)
    return [grad_U, grad_V, grad_W]

In [6]:
import cv2  
import numpy as np  
# Locate "small.png" inside "large.png" with normalised cross-correlation.
image = cv2.imread("large.png")
template = cv2.imread("small.png")
# cv2.imread does not raise when a file is missing or unreadable; it returns
# None, which would otherwise only show up as an opaque error in matchTemplate.
if image is None:
    raise FileNotFoundError("could not read image file 'large.png'")
if template is None:
    raise FileNotFoundError("could not read image file 'small.png'")
# result[r][c] is the match score with the template placed at row r, column c
result = cv2.matchTemplate(image, template, cv2.TM_CCOEFF_NORMED)
# (row, column) index of the highest score
print(np.unravel_index(result.argmax(), result.shape))
# score at one fixed position (row 0, column 5), printed for comparison
print(result[0][5])

(131, 111)
-0.16437891
