## Implementing an RNN

Source: http://www.wildml.com/2015/09/recurrent-neural-networks-tutorial-part-2-implementing-a-language-model-rnn-with-python-numpy-and-theano/

In [1]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
from utils import *
import os
import gzip
from six.moves.urllib.request import urlretrieve

import matplotlib.pyplot as plt
%matplotlib inline

The Reddit comments used here were exported from a [dataset available on Google's BigQuery](https://bigquery.cloud.google.com/table/fh-bigquery:reddit_comments.2015_08).

In [2]:
def download_file(url, filename, expected_size_in_bytes, force=False):
    """Download a file if not present, and make sure it's the right size.

    Args:
        url: Base URL; ``filename`` is appended to it to form the download address.
        filename: Name of the file, used both as the URL suffix and as the local path.
        expected_size_in_bytes: Size the local file must have to be accepted.
        force: When truthy, download even if ``filename`` already exists locally.

    Returns:
        The local path of the verified file.

    Raises:
        Exception: If the size of the local file differs from
            ``expected_size_in_bytes``.
    """
    if force or not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename)

    actual_size_in_bytes = os.stat(filename).st_size
    if actual_size_in_bytes != expected_size_in_bytes:
        raise Exception('Failed to verify {0}. Expected {1}B but found {2}B!'.format(
            filename, expected_size_in_bytes, actual_size_in_bytes))

    print('Found and verified', filename)
    return filename

In [3]:
# Fetch the gzipped Reddit comment file (download_file skips the transfer when
# the file already exists locally) and check that it has the expected byte size.
dataset_compressed_filename = download_file(
    'https://github.com/SebastienBoisard/DeepLearningTutorials/'
    'raw/master/Language_model_with_RNN/data/',
    'reddit-comments-2015-08.data.gz',
    3152770,
)

Found and verified reddit-comments-2015-08.data.gz


In [4]:
def decompress_file(compressed_filename):
    """Decompress a ``.gz`` file and return the path of the decompressed copy.

    The output path is ``compressed_filename`` without its ``.gz`` extension;
    a file already at that path is overwritten.

    Args:
        compressed_filename: Path of the gzip-compressed file.

    Returns:
        The path of the decompressed file.

    Raises:
        ValueError: If ``compressed_filename`` does not have a ``.gz`` extension.
    """
    # Split the gzipped file name into a name and the extension
    file_name, file_extension = os.path.splitext(compressed_filename)

    if file_extension != '.gz':
        # Build a single message string: passing the pieces as separate
        # arguments makes the exception print as a tuple.
        raise ValueError("Can't decompress file '{0}' because this is not a .gz file!".format(
            compressed_filename))

    # Read the whole archive before opening the output, so a corrupt archive
    # fails here and no partial output file is written.
    with gzip.open(compressed_filename, 'rb') as f:
        file_content = f.read()

    with open(file_name, 'wb') as outfile:
        outfile.write(file_content)

    return file_name

In [5]:
# Decompress the downloaded archive; the returned value is the path of the decompressed file.
dataset_filename = decompress_file(dataset_compressed_filename)

In [6]:
# Used below as most_common(vocabulary_size-1): at most that many distinct
# words are kept, and the unknown token is appended as one extra entry.
vocabulary_size = 8000
# Placeholder substituted for every word that is not in the vocabulary.
unknown_token = "UNKNOWN_TOKEN"
# Markers placed before and after each sentence by extract_sentences().
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"


def extract_sentences(data_filename):
    """Read a CSV data file and return its comments as marked sentences.

    The first CSV row is treated as a header and skipped. The first column of
    every remaining non-empty row is lower-cased and split into sentences with
    ``nltk.sent_tokenize``; each sentence is then wrapped as
    ``"<sentence_start_token> <sentence> <sentence_end_token>"``.

    Args:
        data_filename: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A list of strings, one per sentence, in file order. The list is empty
        when the file holds no data rows.
    """
    print("Reading CSV data file:", data_filename)
    # newline='' is what the csv module requires so that line breaks embedded
    # in quoted fields reach the parser untouched.
    with open(data_filename, 'r', encoding='utf-8', newline='') as f:
        # Read the data file as a CSV file
        reader = csv.reader(f, skipinitialspace=True)

        # Skip the header row (expected to be "body"); the default value keeps
        # an empty file from raising StopIteration.
        next(reader, None)

        # csv.reader returns blank lines as empty rows, so drop them before
        # reading column 0.
        comments = (row[0].lower() for row in reader if row)

        # Split full comments into sentences
        sentences = itertools.chain.from_iterable(
            nltk.sent_tokenize(comment) for comment in comments)

        # Append SENTENCE_START and SENTENCE_END at the beginning and end of
        # each sentence. The list is built while the file is still open.
        return ["%s %s %s" % (sentence_start_token, sentence, sentence_end_token)
                for sentence in sentences]

# NOTE(review): this assignment replaces the path returned by decompress_file()
# with a hard-coded file name, so the downloaded dataset is not the file parsed
# below — confirm that "exemple.data" is the intended input.
dataset_filename = "exemple.data"
sentences = extract_sentences(dataset_filename)

# Report how many sentences were extracted, then print the whole list.
print("Parsed %d sentences." % (len(sentences)))
print("sentences=", sentences)


Reading CSV data file: exemple.data
Parsed 192 sentences.
sentences= ["SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END", "SENTENCE_START it's a slight ppr league- .2 ppr. SENTENCE_END", 'SENTENCE_START standard besides 1 points for 15 yards receiving, .2 points per completion, 6 points per td thrown, and some bonuses for rec/rush/pass yardage. SENTENCE_END', 'SENTENCE_START my question is, is it wildly clear that qb has the highest potential for points? SENTENCE_END', 'SENTENCE_START i put in the rules at a ranking site and noticed that top qbs had 300 points more than the top rb/wr. SENTENCE_END', 'SENTENCE_START would it be dumb not to grab a qb in the first round? SENTENCE_END', 'SENTENCE_START in your scenario, a person could just not run the mandatory background check on the buyer and still sell the gun to the felon. SENTENCE_END', "SENTENCE_START there's no way to enforce it. SENTENCE_END", "SENTENCE_START an hon

In [7]:
# Split every sentence into its word tokens (one token list per sentence)
tokenized_sentences = list(map(nltk.word_tokenize, sentences))

print(tokenized_sentences)


[['SENTENCE_START', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'scoring', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'SENTENCE_END'], ['SENTENCE_START', 'it', "'s", 'a', 'slight', 'ppr', 'league-', '.2', 'ppr', '.', 'SENTENCE_END'], ['SENTENCE_START', 'standard', 'besides', '1', 'points', 'for', '15', 'yards', 'receiving', ',', '.2', 'points', 'per', 'completion', ',', '6', 'points', 'per', 'td', 'thrown', ',', 'and', 'some', 'bonuses', 'for', 'rec/rush/pass', 'yardage', '.', 'SENTENCE_END'], ['SENTENCE_START', 'my', 'question', 'is', ',', 'is', 'it', 'wildly', 'clear', 'that', 'qb', 'has', 'the', 'highest', 'potential', 'for', 'points', '?', 'SENTENCE_END'], ['SENTENCE_START', 'i', 'put', 'in', 'the', 'rules', 'at', 'a', 'ranking', 'site', 'and', 'noticed', 'that', 'top', 'qbs', 'had', '300', 'points', 'more', 'than', 'the', 'top', 'rb/wr', '.', 'SENTENCE_END'], ['SENTENCE_START', 'would', 'it', 'be', 'dumb', 'not', 'to', 'grab', 'a', 

In [8]:
# Tally how often each token occurs across all sentences
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))
print("Found %d unique words tokens." % len(word_freq))

Found 1103 unique words tokens.


In [9]:
# Keep the most frequent words as (word, count) pairs — at most vocabulary_size-1
# of them, because the unknown token is appended to the index afterwards.
vocab = word_freq.most_common(vocabulary_size-1)
print(vocab)


[('SENTENCE_END', 192), ('SENTENCE_START', 192), ('.', 165), ('the', 114), (',', 99), ('to', 91), ('a', 71), ('i', 67), ('and', 64), ('it', 60), ('in', 51), ("n't", 50), ('that', 47), ('is', 45), ('you', 42), ('of', 39), ("'s", 38), ('for', 26), ('do', 25), ('not', 24), ('with', 23), ('have', 21), ('be', 20), ('like', 19), (':', 19), ('they', 19), ('on', 19), (')', 18), ('as', 18), ('was', 18), ('but', 18), ('(', 17), ('my', 17), ('would', 16), ('what', 15), ('or', 15), ('has', 15), ('this', 15), ('if', 14), ('are', 14), ('them', 14), ('all', 13), ('?', 13), ('just', 13), ('people', 12), ('at', 12), ('no', 12), ('so', 12), ('your', 12), ('an', 12), ('more', 11), ("''", 11), ('than', 11), ('who', 11), ('out', 11), ('get', 11), (';', 11), ('&', 10), ('will', 10), ('he', 10), ('when', 10), ('why', 10), ('think', 9), ('really', 9), ('gt', 9), ('had', 9), ('``', 9), ('other', 9), ('which', 9), ('good', 9), ('we', 8), ('then', 8), ('same', 8), ('because', 8), ("'m", 8), ('can', 8), ('even', 

In [10]:
# A word's position in this list is its index; the unknown token takes the last position
index_to_word = [word for word, _count in vocab]
index_to_word += [unknown_token]
print(index_to_word)


['SENTENCE_END', 'SENTENCE_START', '.', 'the', ',', 'to', 'a', 'i', 'and', 'it', 'in', "n't", 'that', 'is', 'you', 'of', "'s", 'for', 'do', 'not', 'with', 'have', 'be', 'like', ':', 'they', 'on', ')', 'as', 'was', 'but', '(', 'my', 'would', 'what', 'or', 'has', 'this', 'if', 'are', 'them', 'all', '?', 'just', 'people', 'at', 'no', 'so', 'your', 'an', 'more', "''", 'than', 'who', 'out', 'get', ';', '&', 'will', 'he', 'when', 'why', 'think', 'really', 'gt', 'had', '``', 'other', 'which', 'good', 'we', 'then', 'same', 'because', "'m", 'can', 'even', 'going', 'first', 'there', 'use', 'by', 'very', 'does', 'about', 'http', 'see', 'any', 'check', '!', 'before', 'game', 'one', 'me', 'their', 'been', 'say', 'teams', "'re", 'team', 'most', 'some', 'players', 'also', 'gun', 'only', 'put', 'take', 'into', 'having', 'bombs', 'points', 'make', '[', 'did', 'used', 'enough', 'reading', 'here', 'last', ']', 'run', 'way', "'ll", 'desire', 'up', 'background', 'mod', 'during', 'without', 'anything', 'poi

In [12]:
# Inverse lookup: word -> its position in index_to_word
word_to_index = {word: index for index, word in enumerate(index_to_word)}

print("Using vocabulary size %d." % vocabulary_size)
# vocab[-1] is the (word, count) pair of the last, least frequent kept word;
# the tuple fills both placeholders directly.
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % vocab[-1])

Using vocabulary size 8000.
The least frequent word in our vocabulary is 'consistent' and appeared 1 times.


In [17]:
# Swap every out-of-vocabulary word for the unknown token. Slice assignment
# updates the existing list object in place, as the per-index loop did.
tokenized_sentences[:] = [
    [word if word in word_to_index else unknown_token for word in tokens]
    for tokens in tokenized_sentences
]

In [18]:
# Show the third sentence as raw text, then as its token list after unknown-word replacement.
print("\nExample sentence: '%s'" % sentences[2])
print("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[2])


Example sentence: 'SENTENCE_START standard besides 1 points for 15 yards receiving, .2 points per completion, 6 points per td thrown, and some bonuses for rec/rush/pass yardage. SENTENCE_END'

Example sentence after Pre-processing: '['SENTENCE_START', 'standard', 'besides', '1', 'points', 'for', '15', 'yards', 'receiving', ',', '.2', 'points', 'per', 'completion', ',', '6', 'points', 'per', 'td', 'thrown', ',', 'and', 'some', 'bonuses', 'for', 'rec/rush/pass', 'yardage', '.', 'SENTENCE_END']'
