In [7]:
import pygame.midi
import time

# Play one sustained note on MIDI output device 0.
# Every acquisition is paired with a `finally` so the note is switched off,
# the output handle is released and pygame.midi is shut down even when a
# call in the middle raises (or the sleep is interrupted from the keyboard).
pygame.midi.init()
try:
    player = pygame.midi.Output(0)
    try:
        player.set_instrument(48) # https://pjb.com.au/muscript/gm.html
        player.note_on(64, 127) # note_on(note, velocity=None, channel = 0) https://en.scratch-wiki.info/wiki/MIDI_Notes
        try:
            time.sleep(2) # hold the note for two seconds
        finally:
            player.note_off(64, 127) # never leave the note sounding
    finally:
        del player # drop the output handle before pygame.midi is shut down
finally:
    pygame.midi.quit()

In [2]:
# from https://gist.github.com/karpathy/d4dee566867f8291f086

"""
Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
BSD License
"""
import numpy as np

# Run length: number of training iterations and how often progress is printed.
nruns = 1000
noutput = 100

# data I/O
# Read the corpus through a context manager so the file handle is closed as
# soon as the text is in memory instead of being left open.
with open('notes0.txt', 'r') as corpus_file:
  data = corpus_file.read() # should be simple plain text file
# sorted() pins the vocabulary order. Iterating a bare set of strings can give
# a different order on every interpreter start, which would make the
# char<->index tables differ from run to run.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1

# model parameters (small random weights, zero biases)
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

def lossFun(inputs, targets, hprev):
  """
  Run one forward/backward pass of the vanilla RNN over a sequence.

  inputs,targets are both list of integers (character indices); targets[t] is
  the index the model should predict after being fed inputs[t].
  hprev is Hx1 array of initial hidden state
  returns the loss, gradients on model parameters, and last hidden state
  (for an empty sequence: loss 0, all-zero gradients and a copy of hprev).

  Reads the module-level parameters Wxh, Whh, Why, bh, by and vocab_size, and
  post-processes every gradient with the module-level clip_diag helper.
  """
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
    xs[t][inputs[t]] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
    # Shift by the largest logit before exponentiating: the softmax value is
    # unchanged by the shift, but exp() can no longer overflow to inf and turn
    # the probabilities (and therefore the loss and gradients) into nan.
    exp_y = np.exp(ys[t] - np.max(ys[t]))
    ps[t] = exp_y / np.sum(exp_y) # probabilities for next chars
    loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  # hs[-1] always exists, whereas hs[0] is missing for an empty sequence
  dhnext = np.zeros_like(hs[-1])
  for t in reversed(range(len(inputs))):
    dy = np.copy(ps[t])
    dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t-1].T)
    dhnext = np.dot(Whh.T, dhraw)
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
    clip_diag(dparam, 1) # additionally bound the main-diagonal entries to [-1, 1]
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

def sample(h, seed_ix, n):
  """ 
  sample a sequence of integers from the model 
  h is memory state, seed_ix is seed letter for first time step
  n is the number of indices to draw; the list of drawn indices is returned.

  Reads the module-level parameters Wxh, Whh, Why, bh, by and vocab_size and
  draws from NumPy's global random state.
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
  for t in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    # Subtract the largest logit first: same distribution, but exp() cannot
    # overflow, so p never contains nan (which np.random.choice rejects).
    exp_y = np.exp(y - np.max(y))
    p = exp_y / np.sum(exp_y)
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    # feed the drawn character back in as the next one-hot input
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes

def clip_diag(w, cval):
    """Clamp the main-diagonal entries of the 2-D array `w` to [-cval, cval] in place.

    A non-square matrix only has min(rows, cols) diagonal entries; all
    off-diagonal entries are left untouched. Nothing is returned.
    """
    diag_len = min(np.shape(w))
    rows = cols = range(diag_len)
    bounded = np.clip(np.diagonal(w), -cval, cval)
    w[rows, cols] = bounded

# Training driver: runs nruns Adagrad steps, each on one window of seq_length
# characters, sweeping left to right through `data`.
# n counts iterations; p is the start offset of the current window in `data`.
n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
# Start the running average at the loss of a uniform prediction over the
# vocabulary, summed over seq_length steps.
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
#while True:
while n<nruns:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  # Restart (and clear the hidden state) on the first iteration and whenever
  # the next window plus its one-character look-ahead would run past the data.
  if p+seq_length+1 >= len(data) or n == 0: 
    hprev = np.zeros((hidden_size,1)) # reset RNN memory
    p = 0 # go from start of data
  # targets are the inputs shifted by one character: predict the next char
  inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
  targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

  # sample from the model now and then
  if n % noutput == 0:
    # 500 characters, seeded with the first character of the current window
    sample_ix = sample(hprev, inputs[0], 500)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print('----\n %s \n----' % (txt, ))

  # forward seq_length characters through the net and fetch gradient
  # hprev is replaced by the last hidden state so it carries into the next window
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  # exponential moving average of the per-window loss (weight 0.001 on the new value)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  if n % noutput == 0:
        print('iter %d, loss: %f' % (n, smooth_loss)) # print progress
  
  # perform parameter update with Adagrad
  # `+=` on the arrays updates the module-level parameters and memories in place
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], 
                                [dWxh, dWhh, dWhy, dbh, dby], 
                                [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    # 1e-8 keeps the divisor non-zero while mem is still zero
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

  p += seq_length # move data pointer
  n += 1 # iteration counter 

data has 36979 characters, 14 unique.
----
 232685'7'8,98:2,5:27 '083:74798293823:174:970:9'8,21 8806:423484200233670,14'7'2'137138445,3:8:199,1:9,29669:248988'0'5'6'5:8,32772:71:74103063 2939047 52 '9757616 1,7 6'29 568,8'21,::8:95344 2182':,2141:,:9'9','768'0,077914' 5,'8017'2 43787798  5:57011263523,3290604,9945'''3'4''41 3352,88' 686406,7'73:32587:1832' ' 8 395677205128 5000619077'2306,8 3165:42:95365951 8'21462,,34 6',5 4:418 1 64,:6354, '0477985''''3:825,819672,2:644'83676,'' 509'5,',2'2028,60,02049245 80,12,76'7818 :'58'60076071310 
----
iter 0, loss: 65.976432
----
 0'''20', ''',50'0', 7', '':88', '1, ',7'1, 7' 0:, ',,0',:8',217 7'1:50', ',703:3 ', '' ':70', 180378: 0' :9:81', '1:0',:7:, ', 00, '0, '', ', 01071,'78:8', :54, 0::006 '1:5,92'9:87, 6'067'  '1072'0'6  ',:8 ,, 90 ',  ', 0,, '11 ,, '1:  , 0''  '1 ',, '1:0'0 '1: 0',0:20',773',50' '', '9:59: :5, ',, 1,81:008'  ',5700 2:7'', 0 0:8'10'::45171',0'1:70', '0 ','2,, '' 2'9,71: 8,5 5', '1:80'2'6,0'8,7''4:850'8:8,9271:0',,0'2'2

In [2]:
import pygame.midi
import time
import random

# MIDI note numbers to play back, one after another.
notel = [55, 72, 77, 74, 50, 67, 77, 79, 55, 79, 81, 53, 58, 50, 77, 75, 75, 51, 74, 79, 72, 58, 77, 74, 75, 57, 74, 74, 72, 69, 72, 65, 72, 63, 51, 67, 67, 46, 70, 75, 72, 41, 70, 68, 48, 63, 74, 63, 70, 63, 70, 70, 75, 51, 69, 65, 67, 70, 50, 63, 75, 72, 46, 70, 65, 69, 82, 72, 74, 68, 72, 53, 72, 69, 77, 67, 75, 72, 55, 69, 65, 82, 51, 75, 77, 77, 75, 50, 79, 79, 74, 79, 79, 82, 67, 79, 63, 79, 79, 79, 75, 53, 79, 74, 74, 75, 46, 70, 77, 74, 55, 77, 79, 77, 82, 77, 48, 53, 79, 79, 51, 74, 79, 79]

# Every acquisition is paired with a `finally`, so an error or a keyboard
# interrupt during playback still silences the current note, releases the
# output handle and shuts pygame.midi down.
pygame.midi.init()
try:
    player = pygame.midi.Output(0)
    try:
        player.set_instrument(1) # https://pjb.com.au/muscript/gm.html
        for n in notel:
            player.note_on(n, 127) # note_on(note, velocity=None, channel = 0) https://en.scratch-wiki.info/wiki/MIDI_Notes
            try:
                time.sleep(0.17) # fixed duration for every note
            finally:
                player.note_off(n, 127)
    finally:
        del player # drop the output handle before pygame.midi is shut down
finally:
    pygame.midi.quit()

ModuleNotFoundError: No module named 'pygame'

In [2]:
# read out the midi file
import mido

# Collect the note numbers of the note_on messages among the first 3000
# non-meta messages of the file.
#
# The file is iterated directly instead of through MidiFile.play(): play()
# sleeps for each message's delta time, so merely reading the notes would take
# as long as listening to the piece. To audition the file instead, open a port
# with mido.open_output() and send the messages yielded by play().
niter = 0
noteslst = []
for msg in mido.MidiFile('bjs1031c.mid'):
    # play() drops meta messages by default; skip them here as well so the
    # message count (and therefore the 3000-message cut-off) is unchanged.
    if isinstance(msg, mido.MetaMessage):
        continue
    # Read the message fields directly rather than parsing str(msg).
    # The velocity specifies the volume or force, with which the note is played
    if msg.type == 'note_on':
        noteslst.append(msg.note)
    niter += 1
    if niter >= 3000:
        break
print(noteslst)

[51, 63, 63, 67, 67, 70, 51, 70, 75, 55, 75, 70, 55, 70, 72, 56, 72, 80, 84, 56, 80, 84, 79, 82, 58, 79, 82, 77, 80, 58, 77, 80, 75, 79, 46, 75, 79, 74, 77, 46, 74, 77, 75, 79, 51, 79, 63, 75, 63, 63, 67, 67, 70, 51, 63, 70, 63, 75, 55, 75, 70, 63, 55, 70, 63, 72, 56, 63, 72, 80, 84, 56, 80, 84, 79, 82, 58, 79, 82, 77, 80, 58, 77, 80, 75, 79, 46, 75, 79, 74, 77, 46, 74, 77, 75, 79, 51, 79, 82, 75, 82, 82, 80, 80, 79, 51, 82, 79, 84, 77, 56, 77, 75, 84, 56, 75, 77, 74, 58, 77, 74, 58, 79, 75, 63, 63, 58, 58, 55, 79, 75, 55, 80, 77, 51, 80, 77, 82, 79, 51, 82, 79, 77, 74, 58, 77, 74, 58, 79, 75, 63, 63, 58, 58, 55, 79, 75, 55, 80, 77, 51, 80, 77, 82, 79, 51, 82, 79, 77, 74, 58, 77, 74, 75, 72, 58, 75, 72, 74, 70, 46, 74, 70, 72, 68, 46, 72, 68, 70, 67, 58, 70, 67, 68, 65, 58, 68, 65, 67, 63, 51, 67, 63, 63, 67, 67, 70, 63, 51, 70, 75, 70, 55, 75, 70, 70, 67, 55, 70, 67, 72, 63, 56, 72, 63, 84, 80, 56, 84, 80, 82, 79, 58, 82, 79, 80, 77, 58, 80, 77, 79, 75, 46, 79, 75, 77, 74, 46, 77, 74,

In [3]:
# from https://gist.github.com/karpathy/d4dee566867f8291f086

"""
Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
BSD License
"""
import numpy as np

# Run length: number of training iterations and how often progress is printed.
nruns = 50000
noutput = 5000

# data I/O
# Read the corpus through a context manager so the file handle is closed as
# soon as the text is in memory instead of being left open.
with open('notes0.txt', 'r') as corpus_file:
  data = corpus_file.read() # should be simple plain text file
# sorted() pins the vocabulary order. Iterating a bare set of strings can give
# a different order on every interpreter start, which would make the
# char<->index tables differ from run to run.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1

# model parameters (small random weights, zero biases)
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

def lossFun(inputs, targets, hprev):
  """
  Run one forward/backward pass of the vanilla RNN over a sequence.

  inputs,targets are both list of integers (character indices); targets[t] is
  the index the model should predict after being fed inputs[t].
  hprev is Hx1 array of initial hidden state
  returns the loss, gradients on model parameters, and last hidden state
  (for an empty sequence: loss 0, all-zero gradients and a copy of hprev).

  Reads the module-level parameters Wxh, Whh, Why, bh, by and vocab_size, and
  post-processes every gradient with the module-level clip_diag helper.
  """
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
    xs[t][inputs[t]] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
    # Shift by the largest logit before exponentiating: the softmax value is
    # unchanged by the shift, but exp() can no longer overflow to inf and turn
    # the probabilities (and therefore the loss and gradients) into nan.
    exp_y = np.exp(ys[t] - np.max(ys[t]))
    ps[t] = exp_y / np.sum(exp_y) # probabilities for next chars
    loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  # hs[-1] always exists, whereas hs[0] is missing for an empty sequence
  dhnext = np.zeros_like(hs[-1])
  for t in reversed(range(len(inputs))):
    dy = np.copy(ps[t])
    dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t-1].T)
    dhnext = np.dot(Whh.T, dhraw)
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
    clip_diag(dparam, 1) # additionally bound the main-diagonal entries to [-1, 1]
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

def sample(h, seed_ix, n):
  """ 
  sample a sequence of integers from the model 
  h is memory state, seed_ix is seed letter for first time step
  n is the number of indices to draw; the list of drawn indices is returned.

  Reads the module-level parameters Wxh, Whh, Why, bh, by and vocab_size and
  draws from NumPy's global random state.
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
  for t in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    # Subtract the largest logit first: same distribution, but exp() cannot
    # overflow, so p never contains nan (which np.random.choice rejects).
    exp_y = np.exp(y - np.max(y))
    p = exp_y / np.sum(exp_y)
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    # feed the drawn character back in as the next one-hot input
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes

def clip_diag(w, cval):
    """Bound the main diagonal of the 2-D array `w` to [-cval, cval], modifying `w`.

    For a non-square matrix the diagonal has min(rows, cols) entries.
    Off-diagonal entries are not changed and nothing is returned.
    """
    diag_len = min(np.shape(w))
    rows = cols = range(diag_len)
    bounded = np.clip(np.diagonal(w), -cval, cval)
    w[rows, cols] = bounded

# Training driver: runs nruns Adagrad steps, each on one window of seq_length
# characters, sweeping left to right through `data`.
# n counts iterations; p is the start offset of the current window in `data`.
n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
# Start the running average at the loss of a uniform prediction over the
# vocabulary, summed over seq_length steps.
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
#while True:
while n<nruns:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  # Restart (and clear the hidden state) on the first iteration and whenever
  # the next window plus its one-character look-ahead would run past the data.
  if p+seq_length+1 >= len(data) or n == 0: 
    hprev = np.zeros((hidden_size,1)) # reset RNN memory
    p = 0 # go from start of data
  # targets are the inputs shifted by one character: predict the next char
  inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
  targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

  # sample from the model now and then
  # (every noutput iterations, and once more on the final iteration)
  if n % noutput == 0 or n == nruns-1:
    # 1500 characters, seeded with the first character of the current window
    sample_ix = sample(hprev, inputs[0], 1500)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print('----\n %s \n----' % (txt, ))

  # forward seq_length characters through the net and fetch gradient
  # hprev is replaced by the last hidden state so it carries into the next window
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  # exponential moving average of the per-window loss (weight 0.001 on the new value)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  # note: the loss line is printed only every noutput iterations, not on the final one
  if n % noutput == 0:
        print('iter %d, loss: %f' % (n, smooth_loss)) # print progress
  
  # perform parameter update with Adagrad
  # `+=` on the arrays updates the module-level parameters and memories in place
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], 
                                [dWxh, dWhh, dWhy, dbh, dby], 
                                [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    # 1e-8 keeps the divisor non-zero while mem is still zero
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

  p += seq_length # move data pointer
  n += 1 # iteration counter 

data has 36979 characters, 14 unique.
----
 95170203659:4'::0 ,2',8',880694 69'3' 0222'27'247:287 ,'10:06'58'9,2'2,0256'210'057818:399322180739269 4 2 11711990,29'9, 648::12:20,68527  ,2907154:445095532,642:85355:2521:0,5103'66:5,4:41887199681 '5,,876 12449,41::''82''07 ,   306  1,96'7:1',8201622883 841':1391:285:0,,37 7,176:, 196859 '6, 29'959'77'8844',, 7 2041:,5738420'9494',80,24'9'041''52,10,''9:6'051355''1465,: :390 '088635,':9,8 '6,11217407:10 1,11957'197'1''95:0',7,:551 :36:,'57'87' :,07323012215 55:7'1',4127: :12:020  1: 7213601 144:'60,7049',''4:61891 6,4650 :'',06,1'1': 626:0849967943524 6:,2:3276:494753:9476,8'749 3 6:,9882:5860799:' 1,216'49113 54796:24:113135:,3:'838480,459,5632,10 ,25'3'72,57:97577'630:788 95,6 162:68,67299':64 ,' 609081164573 '806878682'7206525 5 :390684,267936 :58 '952,72,6,1536 , :60,751',245563 '9378,:5':716: 4261::2 2,'902,75:2 '0938'19499,91:985:'3'796307288729:94''0 812372, 508,819024 7235496,57, 9060:2,9,8 35714916,21''991 0,494478 04 9'2'047'242:

----
 '1:70:0', '1:74:0', '1:75:0', '0:75:0', '0:79:80', '1:70:80', '2:50:0', '0:74:0', '0:79:0', '1:62:0', '0:77:0', '1:72:0', '0:72:80', '1:72:0', '0:72:80', '0:72:80', '2:55:0', '1:77:0', '0:79:80', '1:63:80', '1:79:80', '2:43:80', '0:72:0', '1:75:80', '0:65:0', '2:50:80', '1:75:80', '1:82:80', '2:53:80', '0:74:0', '1:70:0', '0:74:0', '2:51:80', '0:79:80', '1:72:80', '2:51:80:80', '1:75:80', '2:53:80', '0:77:80', '2:51:80', '0:75:80', '1:82:80', '2:58:80', '0:79:0', '1:74:0', '0:74:0', '0:79:0', '0:70:80', '1:79:80', '0:77:80', '1:72:80', '1:68:0', '0:72:0', '1:74:0', '2:46:80', '0:77:80', '2:51:0', '1:77:0', '2:50:80', '1:74:0', '0:70:0', '2:51:0', '0:70:80', '1:74:0', '1:61:80', '2:50:80', '0:79:0', '0:74:0', '1:70:0', '1:72:0', '0:69:80', '2:56:0', '2:55:0', '0:70:80', '1:65:80', '0:81:80', '0:70:0', '1:81:0', '1:73:80', '1:63:0', '1:70:0', '1:79:80', '2:46:80', '0:70:80', '1:74:80', '2:55:0', '0:68:0', '0:79:80', '2:51:80', '2:50:80', '2:51:0', '2:53:0', '0:77:80', '0:72:80', '1

In [1]:
# play the piece in the list (copy-pasted from the RNN output)
import pygame.midi
import time
import random

# Each entry is 'channel:note:velocity' (three colon-separated integers).
notel = ['1:75:80', '0:70:0', '2:51:0', '0:77:80', '2:50:80', '1:67:0', '2:51:0', '0:74:80', '1:60:0', '1:72:80', '2:53:0', '0:70:0', '0:72:0', '0:67:80', '0:70:0', '0:70:80', '1:70:0', '2:50:0', '1:72:80', '0:70:0', '1:70:0', '0:70:0', '1:68:0', '0:72:80', '2:55:80', '2:51:0', '0:77:80', '0:77:0', '0:70:0', '0:72:80', '2:51:0', '1:67:0', '2:48:80', '1:60:0', '0:78:80', '0:70:0', '0:79:80', '0:70:0', '0:67:80', '2:51:0', '0:77:80', '2:50:0', '0:70:80', '0:70:0', '0:67:0', '0:75:80', '0:70:0', '0:72:80', '1:72:0', '0:79:80', '0:72:0', '0:72:80', '1:70:0', '1:72:0', '1:72:80', '0:74:0', '1:60:0', '0:70:0', '0:72:80', '1:72:0', '1:72:80', '2:53:80', '0:67:0', '0:65:80', '0:67:0', '1:74:80', '0:72:80', '0:70:0', '0:70:80', '0:72:0', '1:63:80', '1:68:0', '0:72:80', '0:74:0', '1:70:0', '0:70:80', '2:58:0', '0:72:0', '0:70:80', '2:53:80', '1:72:0', '1:70:80', '2:48:80', '1:70:0', '1:70:80', '1:72:0', '1:70:0', '1:70:80', '1:70:0', '1:72:80', '1:70:0', '1:67:80', '1:72:0', '1:72:80', '1:72:0', '1:70:80', '1:72:0', '1:70:80', '0:74:0', '0:75:0', '0:72:80', '0:72:0', '0:72:0', '1:63:0', '0:72:80', '1:63:0', '2:51:80', '0:72:0', '0:72:80', '1:70:0', '1:70:80', '2:50:80', '0:60:0', '0:67:0', '1:70:80', '1:72:0', '2:46:80', '2:50:80', '0:67:0', '1:77:0', '0:70:80', '2:51:80', '0:70:0', '1:67:0', '0:65:80', '1:70:0', '0:64:80', '2:50:0', '0:72:80', '0:70:0', '2:50:0', '0:72:80', '2:50:0', '1:70:0', '0:70:80', '2:50:80', '0:70:0', '1:67:0', '1:77:0', '0:72:80', '0:70:0', '0:72:80', '1:72:0', '1:70:80']

# Every acquisition is paired with a `finally`, so an error or a keyboard
# interrupt during playback still silences the current note, releases the
# output handle and shuts pygame.midi down.
pygame.midi.init()
try:
    player = pygame.midi.Output(0)
    try:
        player.set_instrument(0) # https://pjb.com.au/muscript/gm.html
        for n in notel:
            ns = n.split(':')
            # parse the three fields once instead of re-converting them per call
            channel, note, velocity = int(ns[0]), int(ns[1]), int(ns[2])
            player.note_on(note, velocity, channel) # note_on(note, velocity=None, channel = 0) https://en.scratch-wiki.info/wiki/MIDI_Notes
            try:
                # The entries carry no timing, so the velocity doubles as the
                # duration: velocity/200 seconds (0 for velocity-0 entries).
                time.sleep(velocity/200)
            finally:
                player.note_off(note, velocity, channel)
    finally:
        del player # drop the output handle before pygame.midi is shut down
finally:
    pygame.midi.quit()

pygame 1.9.5
Hello from the pygame community. https://www.pygame.org/contribute.html


In [4]:
# read out midi file as input to RNN
import mido

# Encode the note_on messages among the first 6000 non-meta messages of the
# file as 'channel:note:velocity' strings (the RNN training text).
#
# The file is iterated directly instead of through MidiFile.play(): play()
# sleeps for each message's delta time, so merely reading the events would take
# as long as listening to the piece. To audition the file instead, open a port
# with mido.open_output() and send the messages yielded by play().
niter = 0
noteslst = []
for msg in mido.MidiFile('bjs1031c.mid'):
    # play() drops meta messages by default; skip them here as well so the
    # message count (and therefore the 6000-message cut-off) is unchanged.
    if isinstance(msg, mido.MetaMessage):
        continue
    # Read the message fields directly rather than parsing str(msg).
    if msg.type == 'note_on':
        noteslst.append('%d:%d:%d' % (msg.channel, msg.note, msg.velocity)) # channel, note, velocity
    niter += 1
    if niter >= 6000:
        break
print(noteslst)

['2:51:80', '1:63:80', '1:63:0', '1:67:80', '1:67:0', '1:70:80', '2:51:0', '1:70:0', '1:75:80', '2:55:80', '1:75:0', '1:70:80', '2:55:0', '1:70:0', '1:72:80', '2:56:80', '1:72:0', '0:80:80', '1:84:80', '2:56:0', '0:80:0', '1:84:0', '0:79:80', '1:82:80', '2:58:80', '0:79:0', '1:82:0', '0:77:80', '1:80:80', '2:58:0', '0:77:0', '1:80:0', '0:75:80', '1:79:80', '2:46:80', '0:75:0', '1:79:0', '0:74:80', '1:77:80', '2:46:0', '0:74:0', '1:77:0', '0:75:80', '1:79:80', '2:51:80', '1:79:0', '1:63:80', '0:75:0', '1:63:0', '0:63:80', '1:67:80', '1:67:0', '1:70:80', '2:51:0', '0:63:0', '1:70:0', '0:63:80', '1:75:80', '2:55:80', '1:75:0', '1:70:80', '0:63:0', '2:55:0', '1:70:0', '0:63:80', '1:72:80', '2:56:80', '0:63:0', '1:72:0', '0:80:80', '1:84:80', '2:56:0', '0:80:0', '1:84:0', '0:79:80', '1:82:80', '2:58:80', '0:79:0', '1:82:0', '0:77:80', '1:80:80', '2:58:0', '0:77:0', '1:80:0', '0:75:80', '1:79:80', '2:46:80', '0:75:0', '1:79:0', '0:74:80', '1:77:80', '2:46:0', '0:74:0', '1:77:0', '0:75:80', '

In [None]:
# RNN with context features (Mikolov 2015) and diagonal constraints on the weight matrices,
# using softmax as the hidden activation (as in the paper).
# Based on Karpathy's RNN.

import numpy as np
import math

def softmax(x):
    """Return exp(x) normalised to sum to 1 over all elements of `x`.

    The largest element is subtracted before exponentiating so that exp()
    cannot overflow; the result has the same shape as `x`.
    """
    shifted = x - np.max(x)
    weights = np.exp(shifted)
    total = weights.sum()
    return weights / total

# Run length: number of training iterations and how often progress is printed.
nruns = 100000
noutput = 10000

# data I/O
# Read the corpus through a context manager so the file handle is closed as
# soon as the text is in memory instead of being left open.
with open('notes0.txt', 'r') as corpus_file:
    data = corpus_file.read() # should be simple plain text file
# sorted() pins the vocabulary order. Iterating a bare set of strings can give
# a different order on every interpreter start, which would make the
# char<->index tables differ from run to run.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
context_size = min(30, vocab_size-1) # number of context units (kept below the vocabulary size)
alpha = 0.2 # strictly between 0 and 1; only referenced by the commented-out context updates
# Q is the fixed diagonal context-to-context matrix. With beta all zeros,
# softmax(beta) is uniform, so every diagonal entry is 1/context_size.
beta = np.zeros((context_size, 1))
Q = np.zeros((context_size, context_size))
np.fill_diagonal(Q, softmax(beta))
seq_length = 20 # number of steps to unroll the RNN for
learning_rate = 1e-1

# model parameters (small random weights, zero biases)
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

Wxs = np.random.randn(context_size, vocab_size)*0.01 # input to context: B
#Wss = np.random.randn(context_size, context_size)*0.01 # context to context: alpha
Wsh = np.random.randn(hidden_size, context_size)*0.01 # context to hidden: P
Wsy = np.random.randn(vocab_size, context_size)*0.01 # context to output: V
bys = np.zeros((vocab_size, 1)) # output context bias

def lossFun(inputs, targets, hprev, sprev):
    """Forward/backward pass of the RNN with an added slowly-changing context layer.

    inputs, targets -- lists of integer character indices of equal length;
                       targets[t] is the index to predict at step t
    hprev           -- initial hidden state (the caller passes a hidden_size x 1 array)
    sprev           -- initial context state (the caller passes a context_size x 1 array)

    Returns, in order: loss, dWxh, dWhh, dWhy, dbh, dby, last hidden state,
    dWxs, dWsh, dWsy, dbys, last context state.

    Reads the module-level weights (Wxh, Whh, Why, Wxs, Wsh, Wsy), biases
    (bh, by, bys), the fixed matrix Q and the sizes vocab_size, hidden_size,
    context_size; uses the module-level softmax and clip_diag helpers.

    NOTE(review): the backward pass is a hand-written forward-accumulating
    recursion rather than standard backprop-through-time; it has not been
    checked against a numerical gradient here -- confirm before relying on it.
    """
    xs, hs, ys, ps = {}, {}, {}, {}
    ss = {} # context
    hs[-1] = np.copy(hprev)
    ss[-1] = np.copy(sprev)
    loss = 0
    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
        xs[t][inputs[t]] = 1
        #hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
        #ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
        
        #ss[t] = 0
        #ss[t] = np.tanh(np.dot(Wxs, xs[t]) + np.dot(Wss, ss[t-1]) + bs) # context state
        #ss[t] = (1-alpha)*np.dot(Wxs, xs[t]) + alpha*ss[t-1] + bs # context state
        # context update: (I - Q) weights the projected input, Q weights the previous context
        ss[t] = np.dot((np.identity(context_size)-Q), np.dot(Wxs, xs[t])) + np.dot(Q, ss[t-1]) # context state
        
        # the hidden activation is a softmax over the whole hidden vector (not tanh)
        hs[t] = softmax(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + np.dot(Wsh, ss[t]) + bh) # hidden state
        ys[t] = np.dot(Why, hs[t]) + np.dot(Wsy, ss[t]) + by # unnormalized log probabilities for next chars
        
        #ps[t] = np.exp(ys[t]) / (np.sum(np.exp(ys[t])) + 0.01) # probabilities for next chars
        ps[t] = softmax(ys[t])
        # the 0.00001 offset keeps log() finite when the target probability is 0
        loss += -np.log(ps[t][targets[t],0]+0.00001) # softmax (cross-entropy loss)        
        
    # backward pass: compute gradients going backwards
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dWxs, dWsh, dWsy = np.zeros_like(Wxs), np.zeros_like(Wsh), np.zeros_like(Wsy)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    # NOTE(review): dbys is never accumulated below, so it is returned as all zeros
    dbys = np.zeros_like(bys)
    dhnext = np.zeros_like(hs[0])
    # The *unit matrices are identities zero-padded to the shape of the weight
    # they pair with. np.pad needs non-negative widths, so this presumably
    # requires hidden_size >= vocab_size >= context_size -- TODO confirm.
    # dvunit: identity(vocab_size) padded with zero rows to hidden_size rows
    dvunit = np.pad(np.identity(vocab_size), ((0,hidden_size-vocab_size),(0,0)), 'constant', constant_values=(0))
    dxhnext = dvunit * xs[0].T
    dhunit = np.identity(hidden_size)
    dhhnext = dhunit * hs[0].T
    # dsunit: identity(context_size) padded with zero rows to hidden_size rows
    dsunit = np.pad(np.identity(context_size), ((0,hidden_size-context_size),(0,0)), 'constant', constant_values=(0))
    dshnext = dsunit * ss[0].T
    # cvunit: identity(context_size) padded with zero columns to vocab_size columns
    cvunit = np.pad(np.identity(context_size), ((0,0),(0,vocab_size-context_size)), 'constant', constant_values=(0))
    sumt = np.zeros_like(ss[0])
    dhtwxs = np.zeros_like(Wsh)
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
        dWhy += np.dot(dy, hs[t].T)
        dWsy += np.dot(dy, ss[t].T)
        dby += dy
        dh = np.dot(Why.T, dy) + dhnext # backprop into h

        # Elementwise h*(1-h) term. NOTE(review): this comment used to say
        # "tanh", but the forward pass uses softmax; h*(1-h) is only the
        # diagonal of the softmax Jacobian -- confirm this is intended.
        dhraw = hs[t] * (1 - hs[t])
        dsraw = ss[t] * (1 - ss[t])
        whhdrwa = np.dot(Whh, dhraw)
        
        # running accumulator for the input-to-hidden gradient
        dxhnext = dvunit * xs[t].T + whhdrwa * dxhnext
        # NOTE(review): the dot product is (vocab_size, 1); its transpose is
        # broadcast across every row of dWxh -- confirm this is intended.
        dWxh += np.dot(dxhnext.T, dh).T
        
        dbh += dhraw * dh
        
        # running accumulator for the hidden-to-hidden gradient
        dhhnext = dhunit * hs[t-1].T + whhdrwa * dhhnext
        dWhh += np.dot(dhhnext.T, dh).T
        
        # same recurrence as the forward context update, with cvunit in place of Wxs
        sumt = np.dot((np.identity(context_size)-Q), np.dot(cvunit, xs[t])) + np.dot(Q, sumt)
        #dhtwxs = np.dot(Whh, dhraw) * np.dot(Wsh, (dhtwxs + sumt))
        #print(np.shape(Wsh * sumt.T))
        dhtwxs = whhdrwa * (dhtwxs + Wsh * sumt.T)
        
        #print(np.shape(dhtwxs))
        #print(np.shape(np.dot(Why, dhtwxs)))
        #print(np.shape(Wsy * sumt.T))
        dWxs += (dy * (np.dot(Why, dhtwxs) + (Wsy * sumt.T))).T

        # running accumulator for the context-to-hidden gradient
        dshnext = whhdrwa * dshnext + dsunit * ss[t].T
        dWsh += dh * dshnext
        
        # NOTE(review): unlike the plain RNN cells, dhnext here does not include dh
        dhnext = np.dot(Whh.T, dhraw)
        # NOTE(review): dsnext (and therefore dsraw) is assigned but never read
        dsnext = np.dot(Q.T, dsraw)
        
    for dparam in [dWxh, dWhh, dWhy, dbh, dby, dWxs, dWsh, dWsy, dbys]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
        clip_diag(dparam, 1) # additionally bound the main-diagonal entries to [-1, 1]
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1], dWxs, dWsh, dWsy, dbys, ss[len(inputs)-1]

def sample(h, s, seed_ix, n, context):
    """Draw `n` character indices from the context-augmented RNN.

    h, s     -- initial hidden and context state
    seed_ix  -- index of the character fed in at the first step
    n        -- number of indices to generate
    context  -- if truthy the output logits come from the context state only
                (Wsy, by); otherwise hidden and context state both contribute

    Returns the list of drawn indices. Reads the module-level weights, Q and
    sizes, and draws from NumPy's global random state.
    """
    def one_hot(index):
        # 1-of-k column vector for a single character index
        vec = np.zeros((vocab_size, 1))
        vec[index] = 1
        return vec

    x = one_hot(seed_ix)
    ixes = []
    for _ in range(n):
        # context state: (I - Q) weights the projected input, Q the previous context
        input_weight = np.identity(context_size) - Q
        s = np.dot(input_weight, np.dot(Wxs, x)) + np.dot(Q, s)
        h = softmax(np.dot(Wxh, x) + np.dot(Whh, h) + np.dot(Wsh, s) + bh)
        if context:
            logits = np.dot(Wsy, s) + by # only context
        else:
            logits = np.dot(Why, h) + np.dot(Wsy, s) + by
        probabilities = softmax(logits).ravel()
        ix = np.random.choice(range(vocab_size), p=probabilities)
        ixes.append(ix)
        # feed the drawn character back in as the next input
        x = one_hot(ix)
    return ixes

def clip_diag(w, cval):
    """Limit the main-diagonal entries of the 2-D array `w` to [-cval, cval] in place.

    A non-square matrix has min(rows, cols) diagonal entries. Entries off the
    diagonal are left as they are, and the function returns nothing.
    """
    diag_len = min(np.shape(w))
    rows = cols = range(diag_len)
    bounded = np.clip(np.diagonal(w), -cval, cval)
    w[rows, cols] = bounded

# Training driver for the context RNN: runs nruns Adagrad steps, each on one
# window of seq_length characters, sweeping left to right through `data`.
# n counts iterations; p is the start offset of the current window in `data`.
n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mWxs, mWsh, mWsy = np.zeros_like(Wxs), np.zeros_like(Wsh), np.zeros_like(Wsy)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
mbys = np.zeros_like(bys) # memory context bias
# Start the running average at the loss of a uniform prediction over the
# vocabulary, summed over seq_length steps.
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
#while True:
while n<nruns:
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    # Restart (and clear both states) on the first iteration and whenever the
    # next window plus its one-character look-ahead would run past the data.
    if p+seq_length+1 >= len(data) or n == 0: 
        hprev = np.zeros((hidden_size, 1)) # reset RNN memory
        sprev = np.zeros((context_size, 1)) # reset context
        p = 0 # go from start of data
    # targets are the inputs shifted by one character: predict the next char
    inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
    targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

    # sample from the model now and then
    if n % noutput == 0:
        # 800 characters using hidden + context output (context flag False)
        sample_ix = sample(hprev, sprev, inputs[0], 800, False)
        txt = ''.join(ix_to_char[ix] for ix in sample_ix)
        print('--full--\n %s \n----' % (txt, ))
        #sample_ix = sample(hprev, sprev, inputs[0], 300, True)
        #txt = ''.join(ix_to_char[ix] for ix in sample_ix)
        #print('--context--\n %s \n----' % (txt, ))

    # forward seq_length characters through the net and fetch gradient
    # hprev/sprev are replaced by the last states so they carry into the next window
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev, dWxs, dWsh, dWsy, dbys, sprev = lossFun(inputs, targets, hprev, sprev)
    # exponential moving average of the per-window loss (weight 0.001 on the new value)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    if n % noutput == 0:
        print('iter %d, loss: %f' % (n, smooth_loss)) # print progress
  
    # perform parameter update with Adagrad
    # `+=` on the arrays updates the module-level parameters and memories in place
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by, Wxs, Wsh, Wsy, bys], 
                                    [dWxh, dWhh, dWhy, dbh, dby, dWxs, dWsh, dWsy, dbys], 
                                    [mWxh, mWhh, mWhy, mbh, mby, mWxs, mWsh, mWsy, mbys]):
        mem += dparam * dparam
        # 1e-8 keeps the divisor non-zero while mem is still zero
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

    p += seq_length # move data pointer
    n += 1 # iteration counter 

data has 36979 characters, 14 unique.
--full--
 77 :54:131'409,215608 7 76782,220262,7:618,827'29923062415008:4:520,826,29'399036363741 ,,06:::7,4,,'3'4573:3 2:3:8:,'9:351 96' 2:416798 093914 650',5,17:98878 ,:,498'81447'''044'82 32277480,06 '7'::60'69238:55282 169601511559' 010'3 6882089533049:1296 3:371689:98176 02 6':81,428:6 ,4:,' '69 ,' 409993 00608'8991185'10,036'1 1741346:5'656:4155 95 91,1',452432415 19642014 95482006 '129:'3054,898, 03 31:10253'2561:6 5,456798, :85',544,51:3832 3488,3 ,44712::9107',,03083966'6,6:9:9'139868290:51:7'983'881:5479::7:03310,,'04996575637,72 7:06'5 7645:83:' ,793716,27463:,9838 244,8083,13356,948,874 '0,70 ,8:3954 ':306,1610,8:74,8'544'88:6'0 870 03530691,318,905 3 5 57354':,42'18'':15 1 ,415440955' :1415,,555 ::'055':5210,2:3 0:2:02 :2 18:53011 '4:578:26'2: ',18 ,2593:0,,34 80,3 ,09',2,0647902'2'28' 
----
iter 0, loss: 52.781147
--full--
 ', '0:634:0', '0:69:75:0', '24:82:780', '0:824:80:72:42:572:5:74:0', '2:0', '1:0', '1:51:5:70', '0:74:0', '0:4: