In [11]:
from tensorflow import keras
import numpy as np

In [12]:
# Downloading "The Adventures of Sherlock Holmes" by Arthur Conan Doyle from https://www.gutenberg.org

path = keras.utils.get_file(
    "SherlockHolmes.txt", origin="https://www.gutenberg.org/files/1661/1661-0.txt"
)

In [13]:
print(path)

C:\Users\User\.keras\datasets\SherlockHolmes.txt


In [14]:
## start with data
data = open(path, 'r', encoding='utf-8').read() # should be simple plain text file

chars = list(set(data))
data_size, vocab_size = len(data), len(chars)

print(f'data has {data_size} characters, {vocab_size} unique.')

char_to_idx = { ch:i for i,ch in enumerate(chars) }
idx_to_char = { i:ch for i,ch in enumerate(chars) }

data has 581533 characters, 98 unique.


In [15]:
pointer, seq_length = 0, 8

x = [char_to_idx[ch] for ch in data[pointer:pointer+seq_length]]
y = [char_to_idx[ch] for ch in data[pointer+1:pointer+seq_length+1]]

In [16]:
print(x) #  RNN input sequence
print(y) #  RNN target sequence

[25, 1, 47, 9, 73, 12, 60, 87]
[1, 47, 9, 73, 12, 60, 87, 68]


In [17]:
for t in range(seq_length):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is [25] the target: 1
when input is [25, 1] the target: 47
when input is [25, 1, 47] the target: 9
when input is [25, 1, 47, 9] the target: 73
when input is [25, 1, 47, 9, 73] the target: 12
when input is [25, 1, 47, 9, 73, 12] the target: 60
when input is [25, 1, 47, 9, 73, 12, 60] the target: 87
when input is [25, 1, 47, 9, 73, 12, 60, 87] the target: 68


In [18]:
class RNN:
    def __init__(self, hidden_size, vocab_size, seq_length, num_layers):
        self.name = 'RNN'
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_layers = num_layers

        # model parameters
        self.Wxh = [np.random.randn(hidden_size, vocab_size)*0.01 for _ in range(num_layers)] # input to hidden
        self.Whh = [np.random.randn(hidden_size, hidden_size)*0.01 for _ in range(num_layers)] # hidden to hidden
        self.Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
        self.bh = [np.zeros((hidden_size, 1)) for _ in range(num_layers)] # hidden bias
        self.by = np.zeros((vocab_size, 1)) # output bias

        # memory variables for training (ada grad from karpathy's github)
        self.iteration, self.pointer = 0, 0
        self.mWxh = [np.zeros_like(w) for w in self.Wxh]
        self.mWhh = [np.zeros_like(w) for w in self.Whh] 
        self.mWhy = np.zeros_like(self.Why)
        self.mbh, self.mby = [np.zeros_like(b) for b in self.bh], np.zeros_like(self.by)
        self.loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0

        self.running_loss = []

    def __call__(self, *args, **kwargs):
        """RNN Forward Pass"""

        x, y, hprev = kwargs['inputs'], kwargs['targets'], kwargs['hprev']

        loss = 0
        xs, hs, ys, ps = {}, {}, {}, {} # inputs, hidden state, output, probabilities
        hs[-1] = np.copy(hprev)

        # forward pass
        for t in range(len(x)):
            xs[t] = np.zeros((self.vocab_size,1)) # encode in 1-of-k representation
            xs[t][x[t]] = 1
            hs[t] = np.copy(hprev)

            if kwargs.get('dropout', False): # use dropout layer (mask)

                for l in range(self.num_layers):
                    dropout_mask = (np.random.rand(*hs[t-1][l].shape) < (1-0.5)).astype(float)
                    hs[t-1][l] *= dropout_mask
                    hs[t][l] = np.tanh(np.dot(self.Wxh[l], xs[t]) + np.dot(self.Whh[l], hs[t-1][l]) + self.bh[l]) # hidden state
                    hs[t][l] = hs[t][l] / (1 - 0.5)

            else: # no dropout layer (mask)

                for l in range(self.num_layers):
                    hs[t][l] = np.tanh(np.dot(self.Wxh[l], xs[t]) + np.dot(self.Whh[l], hs[t-1][l]) + self.bh[l]) # hidden state


            ys[t] = np.dot(self.Why, hs[t][-1]) + self.by # unnormalized log probabilities for next chars
            ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars
            loss += -np.log(ps[t][y[t],0]) # softmax (cross-entropy loss)

        self.running_loss.append(loss)

        return loss, hs[len(x)-1], {'xs':xs, 'hs':hs, 'ps':ps}

    def backward(self, targets, cache):
        """RNN Backward Pass"""

        # unpack cache
        xs, hs, ps = cache['xs'], cache['hs'], cache['ps']

        # initialize gradients to zero
        dWxh, dWhh, dWhy = [np.zeros_like(w) for w in self.Wxh], [np.zeros_like(w) for w in self.Whh], np.zeros_like(self.Why)
        dbh, dby = [np.zeros_like(b) for b in self.bh], np.zeros_like(self.by)
        dhnext = [np.zeros_like(h) for h in hs[0]]

        for t in reversed(range(len(xs))):

            dy = np.copy(ps[t])

            # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
            dy[targets[t]] -= 1 

            dWhy += np.dot(dy, hs[t][-1].T)
            dby += dy

            for l in reversed(range(self.num_layers)):
                dh = np.dot(self.Why.T, dy) + dhnext[l]
                dhraw = (1 - hs[t][l] * hs[t][l]) * dh # backprop through tanh nonlinearity
                dbh[l] += dhraw
                dWxh[l] += np.dot(dhraw, xs[t].T)
                dWhh[l] += np.dot(dhraw, hs[t-1][l].T)
                dhnext[l] = np.dot(self.Whh[l].T, dhraw)

        return {'dWxh':dWxh, 'dWhh':dWhh, 'dWhy':dWhy, 'dbh':dbh, 'dby':dby}

    def update(self, grads, lr):
        """Perform Parameter Update w/ Adagrad"""

        # unpack grads
        dWxh, dWhh, dWhy = grads['dWxh'], grads['dWhh'], grads['dWhy']
        dbh, dby = grads['dbh'], grads['dby']

        # loop through each layer
        for i in range(self.num_layers):

            # clip gradients to mitigate exploding gradients
            np.clip(dWxh[i], -5, 5, out=dWxh[i])
            np.clip(dWhh[i], -5, 5, out=dWhh[i])
            np.clip(dbh[i], -5, 5, out=dbh[i])

            # perform parameter update with Adagrad
            self.mWxh[i] += dWxh[i] * dWxh[i]
            self.Wxh[i] -= lr * dWxh[i] / np.sqrt(self.mWxh[i] + 1e-8)
            self.mWhh[i] += dWhh[i] * dWhh[i]
            self.Whh[i] -= lr * dWhh[i] / np.sqrt(self.mWhh[i] + 1e-8)
            self.mbh[i] += dbh[i] * dbh[i]
            self.bh[i] -= lr * dbh[i] / np.sqrt(self.mbh[i] + 1e-8)

        # clip gradients for Why and by
        np.clip(dWhy, -5, 5, out=dWhy)
        np.clip(dby, -5, 5, out=dby)

        # perform parameter update with Adagrad
        self.mWhy += dWhy * dWhy
        self.Why -= lr * dWhy / np.sqrt(self.mWhy + 1e-8)
        self.mby += dby * dby
        self.by -= lr * dby / np.sqrt(self.mby + 1e-8)

        
    def predict(self, hprev, seed_ix, n):
        """
        Make predictions using the trained RNN model.

        Parameters:
        hprev (numpy array): The previous hidden state.
        seed_ix (int): The seed letter index to start the prediction with.
        n (int): The number of characters to generate for the prediction.

        Returns:
        ixes (list): The list of predicted character indices.
        """
        x = np.zeros((self.vocab_size, 1))
        x[seed_ix] = 1

        ixes = []
        hs = np.copy(hprev)

        for _ in range(n):
            for l in range(self.num_layers):
                hs[l] = np.tanh(np.dot(self.Wxh[l], x) + np.dot(self.Whh[l], hs[l]) + self.bh[l])

            y = np.dot(self.Why, hs[-1]) + self.by
            p = np.exp(y) / np.sum(np.exp(y))

            ix = np.random.choice(range(self.vocab_size), p=p.ravel())
            x = np.zeros((self.vocab_size, 1))
            x[ix] = 1

            ixes.append(ix)

        return ixes

In [19]:
def train(rnn, epochs, data, lr=1e-1, use_drop=False):

    for _ in range(epochs):

        # prepare inputs (we're sweeping from left to right in steps seq_length long)
        if rnn.pointer+seq_length+1 >= len(data) or rnn.iteration == 0:

            hprev = [np.zeros((hidden_size, 1)) for _ in range(rnn.num_layers)]  # reset RNN memory

            rnn.pointer = 0 # go from start of data

        x = [char_to_idx[ch] for ch in data[rnn.pointer:rnn.pointer+seq_length]]
        y = [char_to_idx[ch] for ch in data[rnn.pointer+1:rnn.pointer+seq_length+1]]

        if use_drop:
            loss, hprev, cache = rnn(inputs=x, targets=y, hprev=hprev, dropout=True)
        else:
            loss, hprev, cache = rnn(inputs=x, targets=y, hprev=hprev)

        grads = rnn.backward(targets=y, cache=cache)
        rnn.update(grads=grads, lr=lr)

        # update loss
        rnn.loss = rnn.loss * 0.999 + loss * 0.001

        ## show progress now and then
        if rnn.iteration % 1000 == 0: 
            print(f'iter {rnn.iteration}, loss: {rnn.loss}')
            sample_ix = rnn.predict(hprev, x[0], 200)
            txt = ''.join(idx_to_char[ix] for ix in sample_ix)
            print('Sample')
            print (f'----\n {txt} \n----')

        rnn.pointer += seq_length # move data pointer
        rnn.iteration += 1 # iteration counter


In [20]:
## hyper-params
num_layers = 2
hidden_size = 128
seq_length = 13

# Initialize RNN
rnn = RNN(hidden_size=hidden_size, 
          vocab_size=vocab_size, 
          seq_length=seq_length, 
          num_layers=num_layers)

train(rnn=rnn, epochs=15001, data=data)

iter 0, loss: 59.60457453102396
Sample
----
 à½ZVJvGfWu.﻿"I' q#auXBwc_-H#œ“5x3½ih#Bæ½3w88U63c2k“pHvkX’£!i%qiw;[àEœPQ'aLr—T9*g' ’mB#e?,n9VgGeyéu½t?t0à2ésxFY&xFr;jH﻿,[tSz
_Fbwx’T#gy ).9ll01OG—èo5àJ½zm1£5è/h!‘V“Yè—Uo0;NP;gè!I2LHUEw”QTVtGuan0,M:2LGD 
----
iter 1000, loss: 49.94788211570927
Sample
----
 e
nywcinauT[k n sestgth miagas cvehi
b.bd odo ec a nrn fictTyoddls an pur iteweiirode iir, citho hoeFp lkdun r
ndg anucs lhogf aoor, oi,id   dctheiupe anet,  w se noisc v hkh s  ntel.cobas as alhef,ha 
----
iter 2000, loss: 41.75557637841308
Sample
----
 mathh sindd ald afd g an
,g ourle in inTov fo trint amas nhh, hs mirithos wotsed Ras Shicofad hitk waasm y pflare.r9orgisitresah thime pastuog,ol-has ser houmre min_rind
Xhe
ny ah, iw, 
pandaedeeyvaat 
----
iter 3000, loss: 36.6540210163851
Sample
----
 r bins wady”y achy withees ge,mred sinyy
 has.”
I an.

“I ank chil ang
 of he ce ledl m,.y“I acs wicne thard opingorr lt ccoen , mougrhumoinucho wey ilus,d
sr’weait akot the Now thendc. to f 

In [21]:
print(rnn.loss)

28.72030886358049
