In [6]:
import itertools
import csv
import nltk
import numpy as np

from model import LSTM
from sklearn.model_selection import train_test_split

In [2]:
# Preprocessing settings: vocabulary cap and the special marker tokens.
vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

# Load the comments, split each one into sentences, and wrap every sentence
# in the start/end marker tokens.
print("Reading CSV file...")
with open('data/reddit-comments-2015-08.csv', 'rb') as f:
    reader = csv.reader(f, skipinitialspace=True)
    next(reader)  # skip the first row of the file
    # Only the first column of each row is used; it is decoded as UTF-8 and lower-cased.
    comment_sentences = [nltk.sent_tokenize(row[0].decode('utf-8').lower()) for row in reader]
sentences = [
    "%s %s %s" % (sentence_start_token, sentence, sentence_end_token)
    for sentence in itertools.chain.from_iterable(comment_sentences)
]
print("Parsed %d sentences." % len(sentences))

# Break every sentence into word tokens.
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]

# Frequency of every token across the whole corpus.
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))
print("Found %d unique words tokens." % len(word_freq))

# Keep the (vocabulary_size - 1) most frequent words; the final slot of the
# vocabulary is reserved for the unknown token.
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [entry[0] for entry in vocab] + [unknown_token]
word_to_index = {word: position for position, word in enumerate(index_to_word)}

print("Using vocabulary size %d." % vocabulary_size)
least_word, least_count = vocab[-1]
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (least_word, least_count))

# Map every out-of-vocabulary word to the unknown token.
tokenized_sentences = [
    [token if token in word_to_index else unknown_token for token in tokens]
    for tokens in tokenized_sentences
]

print("\nExample sentence: '%s'" % sentences[0])
print("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0])

In [7]:
# Create the training data.
# Every sentence yields one (input, target) pair: the input is the sentence
# without its last token, the target is the sentence without its first token,
# so the target at position t is the input token at position t + 1.
#
# Sentences differ in length, so these are ragged arrays of Python lists.
# dtype=object is passed explicitly because NumPy >= 1.24 raises ValueError
# when asked to infer an array from ragged nested lists.
# (If every sentence happened to have the same length this would produce a
# 2-D object array instead of a 1-D array of lists.)
x_data = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences], dtype=object)
y_data = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences], dtype=object)

# Define the seed here so this cell works on a fresh kernel run top-to-bottom;
# before, `seed` only existed if the model-initialisation cell further down
# had already been executed. Same value as that cell uses.
seed = 1337

# NOTE(review): test_size=0.75 holds out 75% of the sentences for testing and
# trains on only 25% -- confirm this is intended and not a swapped ratio.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.75, random_state=seed)

In [11]:
# Print two training examples: each input/target pair is shown first as words
# and then as the underlying vocabulary indices.
example_indices = (17, 33)
for position, example_index in enumerate(example_indices):
    if position > 0:
        # Blank separator between consecutive examples (not after the last one).
        print('\n')
    x_example, y_example = x_data[example_index], y_data[example_index]
    x_words = " ".join(index_to_word[token_id] for token_id in x_example)
    y_words = " ".join(index_to_word[token_id] for token_id in y_example)
    print("x:\n%s\n%s" % (x_words, x_example))
    print("\ny:\n%s\n%s" % (y_words, y_example))

x:
SENTENCE_START what are n't you understanding about this ? !
[0, 51, 27, 16, 10, 861, 54, 25, 34, 69]

y:
what are n't you understanding about this ? ! SENTENCE_END
[51, 27, 16, 10, 861, 54, 25, 34, 69, 1]


x:
SENTENCE_START non-stop UNKNOWN_TOKEN bombs .
[0, 7550, 7999, 3338, 2]

y:
non-stop UNKNOWN_TOKEN bombs . SENTENCE_END
[7550, 7999, 3338, 2, 1]


In [5]:
# Initialize the model.
# One seed value is used both for NumPy's global random state and for the
# LSTM constructor's `seed` argument.
seed = 1337
np.random.seed(seed)
# NOTE(review): the positional argument 10 is not explained anywhere in this
# notebook -- presumably a size hyper-parameter of the LSTM (e.g. number of
# hidden units); confirm against model.py.
model = LSTM(vocabulary_size, 10, seed=seed)

Constructing Architecture...


Initializing model...


Model Initialized!


In [8]:
# Train the model on the training split for 500 epochs.
# NOTE(review): the recorded output of this cell is
# "ValueError: setting an array element with a sequence." x_train / y_train
# are built from sentences of different lengths, so presumably LSTM.train
# needs equal-length (padded) sequences or per-sentence feeding -- confirm
# against model.py before relying on this cell.
model.train(x_train, y_train, num_epochs=500)

Starting training for 500 epochs


ValueError: setting an array element with a sequence.