## Implementing an RNN

Source: http://www.wildml.com/2015/09/recurrent-neural-networks-tutorial-part-2-implementing-a-language-model-rnn-with-python-numpy-and-theano/

In [9]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
from utils import *
import os
import gzip
from six.moves.urllib.request import urlretrieve

import matplotlib.pyplot as plt
%matplotlib inline

The data consists of Reddit comments exported from a [dataset available on Google's BigQuery](https://bigquery.cloud.google.com/table/fh-bigquery:reddit_comments.2015_08).

In [10]:
def download_file(url, filename, expected_size_in_bytes, force=False):
    """Download a file if not present, and make sure it's the right size.

    Args:
        url: Base URL; `filename` is appended to it to build the download address.
        filename: Name of the file, used both as URL suffix and as local path.
        expected_size_in_bytes: Size the local file must have to be accepted.
        force: When truthy, download again even if the local file already exists.

    Returns:
        The local path of the verified file.

    Raises:
        Exception: If the size of the local file differs from the expected size.
    """
    # Any truthy `force` triggers a fresh download, not only the literal True.
    if force or not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename)

    actual_size_in_bytes = os.stat(filename).st_size
    if actual_size_in_bytes != expected_size_in_bytes:
        raise Exception('Failed to verify {0}. Expected {1}B but found {2}B!'.format(
            filename, expected_size_in_bytes, actual_size_in_bytes))

    print('Found and verified', filename)
    return filename

In [11]:
# Fetch the gzipped comment dump (base URL + file name) and check that the
# local copy has the expected size in bytes.
dataset_compressed_filename = download_file(
    'https://github.com/SebastienBoisard/DeepLearningTutorials/'
    'raw/master/Language_model_with_RNN/data/',
    'reddit-comments-2015-08.data.gz',
    3152770,
)

Found and verified reddit-comments-2015-08.data.gz


In [12]:
def decompress_file(compressed_filename):
    """Decompress a gzip file and return the path of the decompressed copy.

    The output is written next to the input, under the same name without the
    '.gz' extension.

    Args:
        compressed_filename: Path of the file to decompress; must end in '.gz'.

    Returns:
        The path of the decompressed file.

    Raises:
        ValueError: If `compressed_filename` does not have a '.gz' extension.
    """
    # Split the gzipped file name into a name and the extension
    file_name, file_extension = os.path.splitext(compressed_filename)

    if file_extension != '.gz':
        # Build a single formatted message: passing the pieces as separate
        # exception arguments would make the error display as a tuple.
        raise ValueError(
            "Can't decompress file '{0}' because this is not a .gz file!".format(compressed_filename))

    with gzip.open(compressed_filename, 'rb') as compressed_file:
        file_content = compressed_file.read()

    # The output file is only created once the archive was read successfully.
    with open(file_name, 'wb') as outfile:
        outfile.write(file_content)

    return file_name

In [13]:
# Unpack the downloaded archive; the returned path is the archive name without '.gz'.
dataset_filename = decompress_file(dataset_compressed_filename)

In [14]:
# Upper bound on the number of distinct tokens kept in the vocabulary
vocabulary_size = 8000

# Special tokens: sentence boundaries and the placeholder for unknown words
sentence_start_token = 'SENTENCE_START'
sentence_end_token = 'SENTENCE_END'
unknown_token = 'UNKNOWN_TOKEN'


def extract_sentences(data_filename):
    """Read the CSV data file and return its comments as tagged sentences.

    The first column of every row is lower-cased and split into sentences with
    `nltk.sent_tokenize`; each sentence is then wrapped between the
    sentence_start_token and sentence_end_token markers.

    Args:
        data_filename: Path of a UTF-8 encoded CSV file whose first row is a header.

    Returns:
        A list of strings of the form "<start token> <sentence> <end token>".
    """
    print("Reading CSV data file:", data_filename)
    # newline='' leaves line-ending handling to the csv module, which is needed
    # to parse newlines embedded in quoted fields correctly.
    with open(data_filename, 'r', encoding='utf-8', newline='') as f:
        # Read the data file as a CSV file
        reader = csv.reader(f, skipinitialspace=True)

        # Skip the header row (expected to be "body"); the default value keeps
        # an empty file from raising StopIteration.
        next(reader, None)

        # Split full comments into sentences and add the boundary tokens.
        # Blank lines are parsed as empty rows and are skipped. The list is
        # built here because the reader can only be consumed while the file is open.
        return [
            "%s %s %s" % (sentence_start_token, sentence, sentence_end_token)
            for row in reader if row
            for sentence in nltk.sent_tokenize(row[0].lower())
        ]

# NOTE(review): this replaces the path returned by decompress_file() with a
# hard-coded local sample file — confirm that this override is intended.
dataset_filename = "exemple.data"
sentences = extract_sentences(data_filename=dataset_filename)

# NOTE(review): the second print dumps the complete sentence list.
print("Parsed %d sentences." % len(sentences))
print("sentences=", sentences)


Reading CSV data file: exemple.data
Parsed 192 sentences.
sentences= ["SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END", "SENTENCE_START it's a slight ppr league- .2 ppr. SENTENCE_END", 'SENTENCE_START standard besides 1 points for 15 yards receiving, .2 points per completion, 6 points per td thrown, and some bonuses for rec/rush/pass yardage. SENTENCE_END', 'SENTENCE_START my question is, is it wildly clear that qb has the highest potential for points? SENTENCE_END', 'SENTENCE_START i put in the rules at a ranking site and noticed that top qbs had 300 points more than the top rb/wr. SENTENCE_END', 'SENTENCE_START would it be dumb not to grab a qb in the first round? SENTENCE_END', 'SENTENCE_START in your scenario, a person could just not run the mandatory background check on the buyer and still sell the gun to the felon. SENTENCE_END', "SENTENCE_START there's no way to enforce it. SENTENCE_END", "SENTENCE_START an hon

In [15]:
# Break every sentence up into its word tokens
tokenized_sentences = list(map(nltk.word_tokenize, sentences))

print(tokenized_sentences)


[['SENTENCE_START', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'scoring', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'SENTENCE_END'], ['SENTENCE_START', 'it', "'s", 'a', 'slight', 'ppr', 'league-', '.2', 'ppr', '.', 'SENTENCE_END'], ['SENTENCE_START', 'standard', 'besides', '1', 'points', 'for', '15', 'yards', 'receiving', ',', '.2', 'points', 'per', 'completion', ',', '6', 'points', 'per', 'td', 'thrown', ',', 'and', 'some', 'bonuses', 'for', 'rec/rush/pass', 'yardage', '.', 'SENTENCE_END'], ['SENTENCE_START', 'my', 'question', 'is', ',', 'is', 'it', 'wildly', 'clear', 'that', 'qb', 'has', 'the', 'highest', 'potential', 'for', 'points', '?', 'SENTENCE_END'], ['SENTENCE_START', 'i', 'put', 'in', 'the', 'rules', 'at', 'a', 'ranking', 'site', 'and', 'noticed', 'that', 'top', 'qbs', 'had', '300', 'points', 'more', 'than', 'the', 'top', 'rb/wr', '.', 'SENTENCE_END'], ['SENTENCE_START', 'would', 'it', 'be', 'dumb', 'not', 'to', 'grab', 'a', 

In [16]:
# Frequency of every token across all tokenized sentences
word_freq = nltk.FreqDist(token for sentence in tokenized_sentences for token in sentence)
print("Found %d unique words tokens." % len(word_freq))

Found 1103 unique words tokens.


In [17]:
# Keep the (vocabulary_size - 1) most frequent (word, count) pairs; the
# index_to_word / word_to_index lookups are derived from this list.
vocab = word_freq.most_common(vocabulary_size-1)
print(vocab)


[('SENTENCE_START', 192), ('SENTENCE_END', 192), ('.', 165), ('the', 114), (',', 99), ('to', 91), ('a', 71), ('i', 67), ('and', 64), ('it', 60), ('in', 51), ("n't", 50), ('that', 47), ('is', 45), ('you', 42), ('of', 39), ("'s", 38), ('for', 26), ('do', 25), ('not', 24), ('with', 23), ('have', 21), ('be', 20), ('they', 19), ('like', 19), (':', 19), ('on', 19), ('was', 18), (')', 18), ('but', 18), ('as', 18), ('my', 17), ('(', 17), ('would', 16), ('this', 15), ('what', 15), ('has', 15), ('or', 15), ('are', 14), ('if', 14), ('them', 14), ('just', 13), ('all', 13), ('?', 13), ('an', 12), ('people', 12), ('no', 12), ('so', 12), ('at', 12), ('your', 12), ('who', 11), ("''", 11), ('get', 11), ('than', 11), (';', 11), ('more', 11), ('out', 11), ('why', 10), ('&', 10), ('he', 10), ('when', 10), ('will', 10), ('really', 9), ('``', 9), ('which', 9), ('other', 9), ('had', 9), ('gt', 9), ('think', 9), ('good', 9), ('even', 8), ('first', 8), ('use', 8), ('does', 8), ('we', 8), ('http', 8), ('there',

In [18]:
# Position in this list is the word's index; the unknown token takes the last slot
index_to_word = [word for word, _count in vocab] + [unknown_token]
print(index_to_word)


['SENTENCE_START', 'SENTENCE_END', '.', 'the', ',', 'to', 'a', 'i', 'and', 'it', 'in', "n't", 'that', 'is', 'you', 'of', "'s", 'for', 'do', 'not', 'with', 'have', 'be', 'they', 'like', ':', 'on', 'was', ')', 'but', 'as', 'my', '(', 'would', 'this', 'what', 'has', 'or', 'are', 'if', 'them', 'just', 'all', '?', 'an', 'people', 'no', 'so', 'at', 'your', 'who', "''", 'get', 'than', ';', 'more', 'out', 'why', '&', 'he', 'when', 'will', 'really', '``', 'which', 'other', 'had', 'gt', 'think', 'good', 'even', 'first', 'use', 'does', 'we', 'http', 'there', 'going', 'by', 'same', 'then', 'because', 'about', "'m", 'see', 'can', 'very', 'any', 'game', 'me', '!', 'before', 'been', 'their', 'check', 'say', 'one', 'also', 'into', 'having', 'most', 'gun', 'take', 'only', "'re", 'some', 'teams', 'team', 'players', 'make', 'points', 'put', 'bombs', 'enough', 'run', ']', 'last', 'reading', 'did', 'used', 'way', '[', 'here', 'lose', 'without', 'too', 'background', 'someone', 'mod', 'were', 'fucking', 'bas

In [19]:
# Reverse lookup: word -> position in index_to_word
word_to_index = {word: index for index, word in enumerate(index_to_word)}

print("Using vocabulary size %d." % vocabulary_size)
# vocab[-1] is the (word, count) pair of the least frequent word that was kept
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % vocab[-1])

Using vocabulary size 8000.
The least frequent word in our vocabulary is 'spend' and appeared 1 times.


In [20]:
# Swap every out-of-vocabulary word for the unknown token (the list is updated in place)
tokenized_sentences[:] = [
    [token if token in word_to_index else unknown_token for token in sentence]
    for sentence in tokenized_sentences
]

In [21]:
# Show the same sentence as raw text and as its pre-processed token list
print("\nExample sentence: '%s'" % (sentences[2],))
print("\nExample sentence after Pre-processing: '%s'" % (tokenized_sentences[2],))


Example sentence: 'SENTENCE_START standard besides 1 points for 15 yards receiving, .2 points per completion, 6 points per td thrown, and some bonuses for rec/rush/pass yardage. SENTENCE_END'

Example sentence after Pre-processing: '['SENTENCE_START', 'standard', 'besides', '1', 'points', 'for', '15', 'yards', 'receiving', ',', '.2', 'points', 'per', 'completion', ',', '6', 'points', 'per', 'td', 'thrown', ',', 'and', 'some', 'bonuses', 'for', 'rec/rush/pass', 'yardage', '.', 'SENTENCE_END']'


In [22]:
# Create the training data: the input is every token but the last one, the
# target is the same sentence shifted by one position (the next word).
# Sentences differ in length, so dtype=object is requested explicitly; recent
# NumPy versions refuse to infer it from ragged nested lists and raise a ValueError.
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences], dtype=object)
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences], dtype=object)

In [23]:
# Show one input/target pair, both as words and as vocabulary indices
x_example, y_example = X_train[17], y_train[17]
print("x:\n%s\n%s" % (" ".join(index_to_word[idx] for idx in x_example), x_example))
print("\ny:\n%s\n%s" % (" ".join(index_to_word[idx] for idx in y_example), y_example))

x:
SENTENCE_START what are n't you understanding about this ? !
[0, 35, 38, 11, 14, 672, 82, 34, 43, 90]

y:
what are n't you understanding about this ? ! SENTENCE_END
[35, 38, 11, 14, 672, 82, 34, 43, 90, 1]


In [36]:
class RNNNumpy:
    """Container for the three weight matrices of a simple recurrent network.

    U has shape (hidden_dimension, word_dimension), V has shape
    (word_dimension, hidden_dimension) and W has shape
    (hidden_dimension, hidden_dimension).
    """

    def __init__(self, word_dimension, hidden_dimension=100, bptt_truncate=4):
        """Store the hyper-parameters and draw random initial weights.

        Args:
            word_dimension: Size of the vocabulary (input and output width).
            hidden_dimension: Size of the hidden state.
            bptt_truncate: Stored on the instance; not used by this constructor.
        """
        self.word_dimension = word_dimension
        self.hidden_dimension = hidden_dimension
        self.bptt_truncate = bptt_truncate

        # Weights are uniform in [-1/sqrt(n), 1/sqrt(n)], where n is the width
        # of the layer feeding into the matrix.
        input_bound = np.sqrt(1. / word_dimension)
        hidden_bound = np.sqrt(1. / hidden_dimension)

        # The draw order U, V, W determines the values obtained for a given seed.
        self.U = np.random.uniform(-input_bound, input_bound, (hidden_dimension, word_dimension))
        self.V = np.random.uniform(-hidden_bound, hidden_bound, (word_dimension, hidden_dimension))
        self.W = np.random.uniform(-hidden_bound, hidden_bound, (hidden_dimension, hidden_dimension))

The softmax function turns scores (i.e. logits) into probabilities that sum to 1.

In [40]:
def softmax(x):
    """Turn an array of scores into probabilities that sum to 1.

    The maximum over the whole array is subtracted before exponentiating,
    which avoids overflow and leaves the result unchanged.
    """
    shifted_scores = x - np.max(x)
    exponentials = np.exp(shifted_scores)
    normalizer = np.sum(exponentials)
    return exponentials / normalizer

In [41]:
def forward_propagation(self, x):
    """Run the network over one sequence of word indices.

    Args:
        x: Sequence of word indices; each one selects a column of U.

    Returns:
        A list [o, s] where o has shape (len(x), word_dimension) and holds the
        softmax output of every time step, and s has shape
        (len(x) + 1, hidden_dimension) and holds the hidden states. The last
        row of s is the all-zero initial state.
    """
    num_steps = len(x)

    # All hidden states are kept because they are needed later. The extra last
    # row stays zero and serves as the initial state: at step 0 the index
    # step - 1 == -1 reads it.
    hidden_states = np.zeros((num_steps + 1, self.hidden_dimension))
    # The output distribution of every time step is kept as well.
    outputs = np.zeros((num_steps, self.word_dimension))

    for step in range(num_steps):
        previous_state = hidden_states[step - 1]
        # Indexing U by x[step] is the same as multiplying U with a one-hot vector.
        hidden_states[step] = np.tanh(self.U[:, x[step]] + self.W.dot(previous_state))
        outputs[step] = softmax(self.V.dot(hidden_states[step]))

    return [outputs, hidden_states]

# Attach the function to the class so that instances can call it as a method
RNNNumpy.forward_propagation = forward_propagation

In [42]:
def predict(self, x):
    """Return, for every time step of x, the index of the highest-scoring word."""
    # Only the per-step outputs are needed; the hidden states are discarded.
    step_outputs, _hidden_states = self.forward_propagation(x)
    best_word_per_step = np.argmax(step_outputs, axis=1)
    return best_word_per_step

# Attach the function to the class so that instances can call it as a method
RNNNumpy.predict = predict

In [43]:
# Seed NumPy's global random generator so the initial weights are reproducible
np.random.seed(10)

# Build a model and run one forward pass on a single training sentence
model = RNNNumpy(word_dimension=vocabulary_size)
o, s = model.forward_propagation(X_train[10])

# o holds one row per input word and one column per vocabulary entry
print(o.shape)
print(o)

(45, 8000)
[[ 0.00012408  0.0001244   0.00012603 ...,  0.00012515  0.00012488
   0.00012508]
 [ 0.00012507  0.00012495  0.00012462 ...,  0.0001254   0.00012582
   0.00012498]
 [ 0.00012439  0.00012546  0.00012446 ...,  0.00012418  0.00012388
   0.0001246 ]
 ..., 
 [ 0.00012394  0.00012576  0.00012485 ...,  0.00012523  0.00012572
   0.00012481]
 [ 0.00012545  0.00012422  0.00012463 ...,  0.00012423  0.00012541
   0.0001248 ]
 [ 0.00012605  0.00012626  0.00012592 ...,  0.00012452  0.00012556
   0.00012546]]


In [46]:
# Highest-scoring word index at every position of one training sentence.
# NOTE(review): no training step is visible before this cell, so the model
# presumably still has its random initial weights here.
predictions = model.predict(X_train[10])
print(predictions.shape)
print(predictions)
# The input word indices, for comparison
print(X_train[10])

(45,)
[1284 4281 5639 2269 6629 6601 3908 2321 5354 3251  911 4027 4048 5898 2937
 2761 3496 7316 2175 1598 1835 4985 6982 4226 6821 6830 2191 2151  379 1131
 2351  398 6715 2800 7262 6015 6799 4720 7229 2213 6176 4823 5314 5027    6]
[0, 46, 96, 13, 77, 5, 22, 289, 113, 5, 114, 3, 94, 4, 84, 23, 104, 6, 167, 4, 8, 80, 42, 15, 6, 415, 586, 206, 329, 8, 95, 63, 764, 4, 14, 872, 35, 4, 122, 16, 49, 101, 822, 2, 51]


In [47]:
def calculate_total_loss(self, x, y):
    """Sum the cross-entropy loss over every word of every sentence.

    Args:
        x: Sequence of input sentences (each a sequence of word indices).
        y: Sequence of target sentences, aligned with x.

    Returns:
        The negative log-probability the model assigns to the target words,
        summed over all sentences.
    """
    total_loss = 0
    for sentence_index in range(len(y)):
        targets = y[sentence_index]
        outputs, _hidden_states = self.forward_propagation(x[sentence_index])
        # Pick, at every time step, the probability given to the correct word
        target_probabilities = outputs[np.arange(len(targets)), targets]
        # The lower those probabilities, the larger the loss
        total_loss -= np.sum(np.log(target_probabilities))
    return total_loss

def calculate_loss(self, x, y):
    """Return the average loss per target word.

    Args:
        x: Sequence of input sentences (each a sequence of word indices).
        y: Sequence of target sentences, aligned with x.

    Returns:
        The total loss from calculate_total_loss divided by the number of
        words in y.
    """
    # Normalize by the total number of target words, not by the number of
    # sentences. The built-in sum is used because calling np.sum on a
    # generator is deprecated.
    num_words = sum(len(y_i) for y_i in y)
    return self.calculate_total_loss(x, y) / num_words

# Attach both loss functions to the class so that instances can call them as methods
RNNNumpy.calculate_total_loss = calculate_total_loss
RNNNumpy.calculate_loss = calculate_loss

In [48]:
# Sanity check: a model that gives equal probability to each of the
# vocabulary_size words has a per-word loss of log(vocabulary_size).
# Limit to 1000 examples to save time
print("Expected Loss for random predictions: %f" % np.log(vocabulary_size))
print("Actual loss: %f" % model.calculate_loss(X_train[:1000], y_train[:1000]))

Expected Loss for random predictions: 8.987197
Actual loss: 8.987223
