In [1]:
import numpy as np
from collections import Counter
import re

**Word Tokenization**

In [2]:
def tokenize(text):
    """Split ``text`` into a list of lowercase word tokens.

    The text is lowercased first; each maximal run of word characters
    (letters, digits, underscore) then becomes one token, so punctuation
    and whitespace never appear in the result.
    """
    lowered = text.lower()
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(lowered)

**Data Preprocessing**

In [3]:
# Read the whole corpus. The context manager closes the file handle as soon
# as the text is in memory instead of leaving it open until garbage collection.
with open('sentences.txt', 'r') as corpus_file:
    data = corpus_file.read()

words = tokenize(data)
word_counts = Counter(words)
# Vocabulary ordered from most to least frequent word (the sort is stable, so
# equally frequent words keep the order in which the Counter yields them).
vocab = sorted(word_counts, key=word_counts.get, reverse=True)
data_size, vocab_size = len(words), len(vocab)
print('data has %d words, %d unique.' % (data_size, vocab_size))

data has 585 words, 332 unique.


In [4]:
# Two-way lookup between vocabulary words and integer ids; a word's id is its
# position in `vocab`.
ix_to_word = dict(enumerate(vocab))
word_to_ix = {word: ix for ix, word in ix_to_word.items()}

**Hyperparameters**

In [5]:
hidden_size = 100      # number of units in the recurrent hidden state
seq_length = 25        # number of words in each training chunk
learning_rate = 0.1    # step size used by the parameter update

**Model Parameters**

In [6]:
# Weight matrices start as standard-normal noise scaled by 0.01.
Wxh = 0.01 * np.random.randn(hidden_size, vocab_size)   # input  -> hidden
Whh = 0.01 * np.random.randn(hidden_size, hidden_size)  # hidden -> hidden
Why = 0.01 * np.random.randn(vocab_size, hidden_size)   # hidden -> output

# Biases are column vectors that start at zero.
Bh = np.zeros(shape=(hidden_size, 1))  # hidden bias
By = np.zeros(shape=(vocab_size, 1))   # output bias

**RNN Forward and Backward Pass**

In [7]:
def rnn(inputs, targets, hprev):
    """Run one forward and backward pass of the RNN over a word sequence.

    Reads the module-level parameters ``Wxh``, ``Whh``, ``Why``, ``Bh``,
    ``By`` and ``vocab_size``; none of them is modified here.

    Args:
        inputs: sequence of integer word ids, one per time step.
        targets: sequence of integer word ids; ``targets[t]`` is the word
            expected at step ``t``. Must be at least as long as ``inputs``.
        hprev: ``(hidden_size, 1)`` array with the hidden state preceding the
            first step. It is copied, not modified.

    Returns:
        A tuple ``(loss, dWxh, dWhh, dWhy, dBh, dBy, hlast)``: the summed
        cross-entropy loss over the sequence, the gradients of that loss with
        respect to each parameter (clipped element-wise to ``[-5, 5]``), and
        the hidden state after the last step. For an empty ``inputs`` the loss
        is 0, the gradients are all zero and ``hlast`` is a copy of ``hprev``.
    """
    xs, hs, ys, ps = {}, {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0

    # forward propagation
    for t in range(len(inputs)):
        # One-hot column vector for the current input word.
        xs[t] = np.zeros((vocab_size, 1))
        xs[t][inputs[t]] = 1
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + Bh)
        ys[t] = np.dot(Why, hs[t]) + By
        # Softmax on max-shifted logits: mathematically the same
        # distribution, but exp() can no longer overflow to inf (which would
        # turn the probabilities, and then every gradient, into NaN).
        shifted = ys[t] - np.max(ys[t])
        exp_shifted = np.exp(shifted)
        normalizer = np.sum(exp_shifted)
        ps[t] = exp_shifted / normalizer
        # Cross-entropy in log-sum-exp form, i.e. -log(ps[t][target]) without
        # taking the log of a probability that may have underflowed to 0.
        loss += np.log(normalizer) - shifted[targets[t], 0]

    # back propagation
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dBh, dBy = np.zeros_like(Bh), np.zeros_like(By)
    # Gradient flowing into the hidden state from the following time step.
    # Sized from Bh (same shape as a hidden state) so an empty sequence does
    # not fail on a missing hs[0].
    dHnext = np.zeros_like(Bh)
    for t in reversed(range(len(inputs))):
        # Gradient of softmax + cross-entropy w.r.t. the logits: p - one_hot.
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1
        dWhy += np.dot(dy, hs[t].T)
        dBy += dy
        dh = np.dot(Why.T, dy) + dHnext
        # Back through tanh: d/dz tanh(z) = 1 - tanh(z)^2.
        dHraw = (1 - hs[t] * hs[t]) * dh
        dBh += dHraw
        dWxh += np.dot(dHraw, xs[t].T)
        dWhh += np.dot(dHraw, hs[t-1].T)
        dHnext = np.dot(Whh.T, dHraw)

    # Clip in place to limit exploding gradients.
    for dparam in [dWxh, dWhh, dWhy, dBh, dBy]:
        np.clip(dparam, -5, 5, out=dparam)

    return loss, dWxh, dWhh, dWhy, dBh, dBy, hs[len(inputs)-1]

**Training Loop**

In [8]:
num_iterations = 10000
n = 0  # iteration counter
p = 0  # index in `words` where the current training chunk starts

# One accumulator per parameter, same shape as the parameter, starting at zero;
# the update step adds the squared gradients into these.
mWxh, mWhh, mWhy, mBh, mBy = (
    np.zeros_like(weights) for weights in (Wxh, Whh, Why, Bh, By)
)

# Initial smoothed loss: the cross-entropy of a uniform guess over the
# vocabulary, summed over seq_length steps.
smooth_loss = -np.log(1.0 / vocab_size) * seq_length

In [9]:
def predict_next(text, hprev):
    """Sample the word that follows ``text`` according to the RNN.

    Reads the module-level parameters ``Wxh``, ``Whh``, ``Why``, ``Bh``,
    ``By``, ``vocab_size`` and the ``word_to_ix`` / ``ix_to_word`` lookups.
    The result is drawn at random from the predicted distribution via
    ``np.random.choice``, so repeated calls can return different words unless
    the NumPy global seed is fixed.

    Args:
        text: seed text; it is split into words with ``tokenize``.
        hprev: ``(hidden_size, 1)`` hidden state to start from. The caller's
            array is not modified.

    Returns:
        The sampled next word as a string.

    Raises:
        KeyError: if ``text`` contains a word that is not in ``word_to_ix``.
    """
    inputs = [word_to_ix[word] for word in tokenize(text)]
    # Feed the seed words through the recurrence to build up the hidden state.
    for word_ix in inputs:
        x = np.zeros((vocab_size, 1))
        x[word_ix] = 1
        hprev = np.tanh(np.dot(Wxh, x) + np.dot(Whh, hprev) + Bh)

    y = np.dot(Why, hprev) + By
    # Softmax on max-shifted logits: same distribution, but exp() cannot
    # overflow, so the probabilities handed to np.random.choice are never NaN.
    exp_shifted = np.exp(y - np.max(y))
    probs = exp_shifted / np.sum(exp_shifted)
    ix = np.random.choice(vocab_size, p=probs.ravel())

    return ix_to_word[ix]

In [10]:
for n in range(num_iterations):
    # On the first iteration, or when the next chunk plus its shifted targets
    # would run past the end of the corpus, rewind to the start and reset the
    # hidden state.
    if n == 0 or p + seq_length + 1 >= len(words):
        p = 0
        hprev = np.zeros((hidden_size, 1))

    # Targets are the inputs shifted one word to the right.
    inputs = [word_to_ix[w] for w in words[p:p + seq_length]]
    targets = [word_to_ix[w] for w in words[p + 1:p + seq_length + 1]]

    loss, dWxh, dWhh, dWhy, dBh, dBy, hprev = rnn(inputs, targets, hprev)
    # Exponential moving average of the loss, for a less noisy progress log.
    smooth_loss = 0.999 * smooth_loss + 0.001 * loss

    if n % 100 == 0:
        print('iter %d, loss: %f' % (n, smooth_loss))

    # AdaGrad-style update: accumulate squared gradients per parameter and
    # scale each step by the inverse square root of that running sum. All
    # updates are in place, so the module-level arrays themselves change.
    for param, dparam, mem in zip((Wxh, Whh, Why, Bh, By),
                                  (dWxh, dWhh, dWhy, dBh, dBy),
                                  (mWxh, mWhh, mWhy, mBh, mBy)):
        mem += dparam * dparam
        param -= learning_rate * dparam / np.sqrt(mem + 1e-8)

    # Advance to the next chunk of the corpus.
    p += seq_length

iter 0, loss: 145.128377


iter 100, loss: 146.523265
iter 200, loss: 143.630763
iter 300, loss: 140.125037
iter 400, loss: 137.665253
iter 500, loss: 133.514947
iter 600, loss: 128.881529
iter 700, loss: 123.922085
iter 800, loss: 118.686506
iter 900, loss: 113.248832
iter 1000, loss: 107.811971
iter 1100, loss: 102.255217
iter 1200, loss: 96.679482
iter 1300, loss: 91.298546
iter 1400, loss: 85.892549
iter 1500, loss: 80.627141
iter 1600, loss: 75.589277
iter 1700, loss: 70.694775
iter 1800, loss: 66.029997
iter 1900, loss: 61.586191
iter 2000, loss: 57.210330
iter 2100, loss: 53.089178
iter 2200, loss: 49.203173
iter 2300, loss: 45.481933
iter 2400, loss: 42.079740
iter 2500, loss: 38.872600
iter 2600, loss: 35.801845
iter 2700, loss: 32.947669
iter 2800, loss: 30.295755
iter 2900, loss: 27.956856
iter 3000, loss: 25.672378
iter 3100, loss: 23.550706
iter 3200, loss: 21.589493
iter 3300, loss: 19.786788
iter 3400, loss: 18.128322
iter 3500, loss: 16.607625
iter 3600, loss: 15.217574
iter 3700, loss: 13.943614

In [11]:
def compare_predictions(text, next_word, hprev):
    """Predict the word after ``text`` and print it next to the real one.

    Calls ``predict_next(text, hprev)``, then prints the seed text, the
    actual next word, the predicted word and whether the two match.
    Returns ``None``.
    """
    predicted_word = predict_next(text, hprev)

    report = (
        ("Seed text: '{}'", text),
        ("Actual next word: '{}'", next_word),
        ("Predicted next word: '{}'", predicted_word),
    )
    for template, value in report:
        print(template.format(value))

    verdict = "Prediction is correct!" if predicted_word == next_word else "Prediction is incorrect."
    print(verdict)

**Testing the RNN**

In [29]:
# Fix the NumPy global seed so the chosen window (and the word sampled inside
# predict_next) is the same on every run of this cell.
np.random.seed(100)

# Pick a random window of seq_length consecutive corpus words as the seed;
# the word right after the window is the expected answer.
start_idx = np.random.randint(0, len(words) - seq_length - 1)
end_idx = start_idx + seq_length
seed_text = ' '.join(words[start_idx:end_idx])
next_word = words[end_idx]

# Start the prediction from an all-zero hidden state.
hprev = np.zeros((hidden_size, 1))
compare_predictions(seed_text, next_word, hprev)

Seed text: 'the script the set designer builds the scenery the costume designer creates outfits the makeup artist enhances appearances the cinematographer captures the shots the editor'
Actual next word: 'assembles'
Predicted next word: 'assembles'
Prediction is correct!
