In [65]:
data = open('poem.txt', 'r').read()
# print(data)
chars = list(set(data)) 
data_size, vocab_size = len(data), len(chars)
print ('data has %d chars, %d unique' % (data_size, vocab_size))

data has 590 chars, 38 unique


In [66]:
char_to_ix = { ch:i for i,ch in enumerate(chars)}
ix_to_char = { i:ch for i, ch in enumerate(chars)}
print (char_to_ix)
print (ix_to_char)

{'H': 0, 'z': 1, 'e': 2, 't': 3, 'T': 4, ',': 5, 'B': 6, 'O': 7, 'q': 8, 'd': 9, 'u': 10, ' ': 11, ';': 12, 'o': 13, 'c': 14, 'b': 15, 'r': 16, 'm': 17, '.': 18, 'W': 19, 'p': 20, 'f': 21, '\n': 22, 'h': 23, 'n': 24, 'w': 25, '’': 26, 'y': 27, 'l': 28, 'g': 29, 'a': 30, 'I': 31, 'v': 32, 'i': 33, 's': 34, 'M': 35, 'k': 36, 'A': 37}
{0: 'H', 1: 'z', 2: 'e', 3: 't', 4: 'T', 5: ',', 6: 'B', 7: 'O', 8: 'q', 9: 'd', 10: 'u', 11: ' ', 12: ';', 13: 'o', 14: 'c', 15: 'b', 16: 'r', 17: 'm', 18: '.', 19: 'W', 20: 'p', 21: 'f', 22: '\n', 23: 'h', 24: 'n', 25: 'w', 26: '’', 27: 'y', 28: 'l', 29: 'g', 30: 'a', 31: 'I', 32: 'v', 33: 'i', 34: 's', 35: 'M', 36: 'k', 37: 'A'}


In [67]:
import numpy as np

vector_for_char_a = np.zeros((vocab_size, 1))
vector_for_char_a[char_to_ix['a']] = 1
print (vector_for_char_a.ravel())

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.
  0.  0.]


In [68]:
#model parameters

hidden_size = 100
seq_length = 25
learning_rate = 1e-1

Wxh = np.random.randn(hidden_size, vocab_size) * 0.01 #input to hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.01 #input to hidden
Why = np.random.randn(vocab_size, hidden_size) * 0.01 #input to hidden
bh = np.zeros((hidden_size, 1))
by = np.zeros((vocab_size, 1))

In [69]:
def lossFun(inputs, targets, hprev):
  """                                                                                                                                                                                         
  inputs,targets are both list of integers.                                                                                                                                                   
  hprev is Hx1 array of initial hidden state                                                                                                                                                  
  returns the loss, gradients on model parameters, and last hidden state                                                                                                                      
  """
  #store our inputs, hidden states, outputs, and probability values
  xs, hs, ys, ps, = {}, {}, {}, {} #Empty dicts
    # Each of these are going to be SEQ_LENGTH(Here 25) long dicts i.e. 1 vector per time(seq) step
    # xs will store 1 hot encoded input characters for each of 25 time steps (26, 25 times)
    # hs will store hidden state outputs for 25 time steps (100, 25 times)) plus a -1 indexed initial state
    # to calculate the hidden state at t = 0
    # ys will store targets i.e. expected outputs for 25 times (26, 25 times), unnormalized probabs
    # ps will take the ys and convert them to normalized probab for chars
    # We could have used lists BUT we need an entry with -1 to calc the 0th hidden layer
    # -1 as  a list index would wrap around to the final element
  xs, hs, ys, ps = {}, {}, {}, {}
  #init with previous hidden state
    # Using "=" would create a reference, this creates a whole separate copy
    # We don't want hs[-1] to automatically change if hprev is changed
  hs[-1] = np.copy(hprev)
  #init loss as 0
  loss = 0
  # forward pass                                                                                                                                                                              
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation (we place a 0 vector as the t-th input)                                                                                                                     
    xs[t][inputs[t]] = 1 # Inside that t-th input we use the integer in "inputs" list to  set the correct
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state                                                                                                            
    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars                                                                                                           
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars                                                                                                              
    loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)                                                                                                                       
  # backward pass: compute gradients going backwards    
  #initalize vectors for gradient values for each set of weights 
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[0])
  for t in reversed(range(len(inputs))):
    #output probabilities
    dy = np.copy(ps[t])
    #derive our first gradient
    dy[targets[t]] -= 1 # backprop into y  
    #compute output gradient -  output times hidden states transpose
    #When we apply the transpose weight matrix,  
    #we can think intuitively of this as moving the error backward
    #through the network, giving us some sort of measure of the error 
    #at the output of the lth layer. 
    #output gradient
    dWhy += np.dot(dy, hs[t].T)
    #derivative of output bias
    dby += dy
    #backpropagate!
    dh = np.dot(Why.T, dy) + dhnext # backprop into h                                                                                                                                         
    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity                                                                                                                     
    dbh += dhraw #derivative of hidden bias
    dWxh += np.dot(dhraw, xs[t].T) #derivative of input to hidden layer weight
    dWhh += np.dot(dhraw, hs[t-1].T) #derivative of hidden layer to hidden layer weight
    dhnext = np.dot(Whh.T, dhraw) 
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients                                                                                                                 
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

In [70]:
#prediction, one full forward pass
def sample(h, seed_ix, n):
  """                                                                                                                                                                                         
  sample a sequence of integers from the model                                                                                                                                                
  h is memory state, seed_ix is seed letter for first time step   
  n is how many characters to predict
  """
  #create vector
  x = np.zeros((vocab_size, 1))
  #customize it for our seed char
  x[seed_ix] = 1
  #list to store generated chars
  ixes = []
  #for as many characters as we want to generate
  for t in range(n):
    #a hidden state at a given time step is a function 
    #of the input at the same time step modified by a weight matrix 
    #added to the hidden state of the previous time step 
    #multiplied by its own hidden state to hidden state matrix.
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    #compute output (unnormalised)
    y = np.dot(Why, h) + by
    ## probabilities for next chars
    p = np.exp(y) / np.sum(np.exp(y))
    #pick one with the highest probability 
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    #create a vector
    x = np.zeros((vocab_size, 1))
    #customize it for the predicted char
    x[ix] = 1
    #add it to the list
    ixes.append(ix)

  txt = ''.join(ix_to_char[ix] for ix in ixes)
  print ('----\n %s \n----' % (txt, ))
hprev = np.zeros((hidden_size,1)) # reset RNN memory  
#predict the 200 next characters given 'a'
sample(hprev,char_to_ix['a'],200)

----
 hwsv.ullfoTArtiku.i;lopn’;WrihA;zTrz
fIc’rIIihak’t
mewviBvwM ps zAOv;nM BH.apgOy
mWvzBcIc.;bBIz;w.MpgBAutB;u,vWnmtk;aiHun.Hv ;uOOIHaAetd dvqrdBrc;ieqOc.zc.Tshsa
gpB;hqhuolmyWdzM
’zW’zecrc,;spvycIdhAHr 
----


In [71]:
p=0  
inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
print ("inputs", inputs)
targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]
print ("targets", targets)

inputs [19, 23, 13, 34, 2, 11, 25, 13, 13, 9, 34, 11, 3, 23, 2, 34, 2, 11, 30, 16, 2, 11, 31, 11, 3]
targets [23, 13, 34, 2, 11, 25, 13, 13, 9, 34, 11, 3, 23, 2, 34, 2, 11, 30, 16, 2, 11, 31, 11, 3, 23]


In [72]:
n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad                                                                                                                
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0                                                                                                                        
while n<=1000*100:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  # check "How to feed the loss function to see how this part works
  if p+seq_length+1 >= len(data) or n == 0:
    hprev = np.zeros((hidden_size,1)) # reset RNN memory                                                                                                                                      
    p = 0 # go from start of data                                                                                                                                                             
  inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
  targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

  # forward seq_length characters through the net and fetch gradient                                                                                                                          
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001

  # sample from the model now and then                                                                                                                                                        
  if n % 1000 == 0:
    print ('iter %d, loss: %f' % (n, smooth_loss)) # print progress
    sample(hprev, inputs[0], 200)

  # perform parameter update with Adagrad                                                                                                                                                     
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                [dWxh, dWhh, dWhy, dbh, dby],
                                [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update                                                                                                                   

  p += seq_length # move data pointer                                                                                                                                                         
  n += 1 # iteration counter

iter 0, loss: 90.939652
----
 TaAi npW’.TcoHbyf;sdAttvvie;gAhl
.BisayzBs pibgfclOmAs
nWg’pfW’,e.lfg.ia;oWs;byhrbHduap
r,qmoatdIAkmTdy’IM,yB;t.g’’zd
tolrwcMTlifrby hi
h H’riWH’;lzHt

wavbyoOycwOf;Iz uiApTkH,vytgIlbAkn
Wrse rqivqvuf 
----
iter 1000, loss: 72.413910
----
 th
are wit 
fouthak.  fToasetoothe zep wariarn  tethe wough tols wotn  wnouno to thetwis  
;  uoupoleeand wo das q
Theed tounn woiToouilt   
thioI nave  fBwkk toole neep howoaae wThlneeee, miakepris   
----
iter 2000, loss: 49.422139
----
 een tat’efrs tovre  frkpaode beves to tave.   
Be y, mald winy dadslid   
Burmily zeep, ror,   
Ow.   
The gill wittakp I s  uoatmesind be y  holke   
By le qoThe mile Ioo ,lekes p veleny I s  is I lo 
----
iter 3000, loss: 33.753221
----
 eald wind milt e e omnt no toere woods  of to stop woods vel tousearkep  
He laodtwowing  

My  towilltw.   pes  totgorep hottooif the ma inowarp   

He thound wethe the thoutwes soup.   
His aow.   
 
----
iter 4000, loss: 23.362140
----
 t ousninme

iter 35000, loss: 0.537780
----
 ing of the year.   

He gives his harness bells a shake   
To ask if there is some uis aodsleep weep,   
But I have promises to keep,   
And miles to go before I sleep,   
And miles to go before I sle 
----
iter 36000, loss: 0.411870
----
 enn the keig   
Bet   
He mistasee m
 gives his harness yeortoree.   

He gives hind miles to go before I sleep,   
And miles to go before I sleep,   
And miles to go before I sleep,   
And miles torn 
----
iter 37000, loss: 0.353973
----
 t and I snowis of the villagiotls hink it queer   
The ss e stods tors  ilhis wous   
The s 
Olle,   
And miles to go before I sleep,   
And miles to go before I sleep,   
And miles to go ne pil to wi 
----
iter 38000, loss: 0.322948
----
  evens aee I  
in with snow.   

My little horse must think it queer   
To stop without a farmhouse near   
Between the woods and frozen lake   
The darkest evening of the year.   

He gives his harne 
----
iter 39000, loss: 0.304036
----
  I know

iter 70000, loss: 1.179067
----
 w.   

My litt miles to keep,   
And miles to go before I sleep,   
And mills toin  I stop with snow.   

My little home pr I sAnd mile otozen without a farmhouse near   
Between the woods thand are i 
----
iter 71000, loss: 0.597811
----
 fore I sleep,   
And miles to go before I an  a sely flake.   

The woods are lovely, dark and deep,   
But I have promises to keep,   
And miles to go before I sleep,   
And miles to go before I slee 
----
iter 72000, loss: 0.327827
----
 
Olle   
The darkest evening of the year.   

gh;   erenly o sarkest erop diune fist t eaod  
He promistake.   
The only other sound’s the sweep   
Of easy wind and downy flake.   

The woods are love 
----
iter 73000, loss: 0.218607
----
 als and letilut I have promises to keep,   
And miles to go before I sleep,   
And miles to go before I sleep,   
And miles to go before I sleep,   
And miles to go before I sleep,   
And miles to go  
----
iter 74000, loss: 0.172883
----
 ak f  a