In [6]:
import tensorflow as tf
from tensorflow.contrib import rnn
import numpy as np
import collections
import random
from scipy import spatial
 
#Load fasttext vectors
filepath_glove = 'crawl-300d-2M.vec'
glove_vocab = []
glove_embd=[]
embedding_dict = {}

with open(filepath_glove) as file:
    c = 0
    for index, line in enumerate(file):
        values = line.strip().split() # Word and weights separated by space
        if index == 0:
            glove_vocab_size = int(values[0])
            embedding_dim = int(values[1])
        else:
            row = line.strip().split(' ')
            vocab_word = row[0]
            glove_vocab.append(vocab_word)
            embed_vector = [float(i) for i in row[1:]] # convert to list of float
            embedding_dict[vocab_word]=embed_vector
            c += 1
            if c >=20000:
                break
  
print('Loaded GLOVE')
 
print(glove_vocab_size)
print(embedding_dim)

Loaded GLOVE
2000000
300


In [9]:
fable_text = """
long ago , the mice had a general council to consider what measures
they could take to outwit their common enemy , the cat . some said
this , and some said that but at last a young mouse got up and said
he had a proposal to make , which he thought would meet the case . 
you will all agree , said he , that our chief danger consists in the
sly and treacherous manner in which the enemy approaches us . now , 
if we could receive some signal of her approach , we could easily
escape from her . i venture , therefore , to propose that a small
bell be procured , and attached by a ribbon round the neck of the cat
. by this means we should always know when she was about , and could
easily retire while she was in the neighbourhood . this proposal met
with general applause , until an old mouse got up and said that is
all very well , but who is to bell the cat ? the mice looked at one
another and nobody spoke . then the old mouse said it is easy to
propose impossible remedies .
"""
 
fable_text = fable_text.replace('\n','')
 
#this function puts all the words in a single column vector within a numpy array
 
def read_data(raw_text):
    content = raw_text
    content = content.split() #splits the text by spaces (default split character)
    content = np.array(content)
    content = np.reshape(content, [-1, ])
    return content
 
training_data = read_data(fable_text)

In [8]:
#Create dictionary and reverse dictionary with word ids
 
def build_dictionaries(words):
    count = collections.Counter(words).most_common() #creates list of word/count pairs;
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary) #len(dictionary) increases each iteration
        reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, reverse_dictionary
 
dictionary, reverse_dictionary = build_dictionaries(training_data)

In [10]:
#Create embedding array
 
doc_vocab_size = len(dictionary)
dict_as_list = sorted(dictionary.items(), key = lambda x : x[1])
 
embeddings_tmp=[]
 
for i in range(doc_vocab_size):
    item = dict_as_list[i][0]
    if item in glove_vocab:
        embeddings_tmp.append(embedding_dict[item])
    else:
        rand_num = np.random.uniform(low=-0.2, high=0.2,size=embedding_dim)
        embeddings_tmp.append(rand_num)
 
# final embedding array corresponds to dictionary of words in the document
embedding = np.asarray(embeddings_tmp)
 
# create tree so that we can later search for closest vector to prediction
tree = spatial.KDTree(embedding)

In [11]:
# model parameters
learning_rate = 0.001
n_input = 3 # this is the number of words that are read at a time
n_hidden = 512
 
# create input placeholders
x = tf.placeholder(tf.int32, [None, n_input])
y = tf.placeholder(tf.float32, [None, embedding_dim])
 
# RNN output node weights and biases
weights = { 'out': tf.Variable(tf.random_normal([n_hidden, embedding_dim])) }
biases = { 'out': tf.Variable(tf.random_normal([embedding_dim])) }
 
with tf.name_scope("embedding"):
    W = tf.Variable(tf.constant(0.0, shape=[doc_vocab_size, embedding_dim]), trainable=False, name="W")
    embedding_placeholder = tf.placeholder(tf.float32, [doc_vocab_size, embedding_dim])
    embedding_init = W.assign(embedding_placeholder)
    embedded_chars = tf.nn.embedding_lookup(W,x)
  
# reshape input data
x_unstack = tf.unstack(embedded_chars, n_input, 1)
 
# create RNN cells
rnn_cell = rnn.MultiRNNCell([rnn.BasicLSTMCell(n_hidden),rnn.BasicLSTMCell(n_hidden)])
outputs, states = rnn.static_rnn(rnn_cell, x_unstack, dtype=tf.float32)
 
# capture only the last output
pred = tf.matmul(outputs[-1], weights['out']) + biases['out'] 
 
# Create loss function and optimizer
cost = tf.reduce_mean(tf.nn.l2_loss(pred-y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

In [12]:
# Initialize
 
init=tf.global_variables_initializer()
 
# Launch the graph
  
with tf.Session() as sess:
    sess.run(init)
    sess.run(embedding_init, feed_dict={embedding_placeholder: embedding})

    step=0
    offset = random.randint(0,n_input+1) #random integer between 0 and 3
    end_offset = n_input+1 # in our case tihs is 4
    acc_total = 0
    loss_total = 0
    training_iters = 10000
    display_step = 500

    while step < training_iters:
        ### Generate a minibatch ###
        # when offset gets close to the end of the training data, restart near the beginning

        if offset > (len(training_data) - end_offset):
            offset = random.randint(0, n_input+1)
         # get the integer representations for the input words

        x_integers = [[dictionary[str(training_data[i])]] for i in range(offset, offset+n_input)]
        x_integers = np.reshape(np.array(x_integers), [-1, n_input])

        # create embedding for target vector 

        y_position = offset+n_input
        y_integer = dictionary[training_data[y_position]]
        y_embedding = embedding[y_integer,:]
        y_embedding = np.reshape(y_embedding,[1,-1])


        _,loss, pred_ = sess.run([optimizer, cost,pred], feed_dict = {x: x_integers, y: y_embedding})

        loss_total += loss

        # display output to show progress

        if (step+1) % display_step ==0:
            words_in = [str(training_data[i]) for i in range(offset, offset+n_input)] 
            target_word = str(training_data[y_position])

            nearest_dist,nearest_idx = tree.query(pred_[0],3)
            nearest_words = [reverse_dictionary[idx] for idx in nearest_idx]

            print("%s - [%s] vs [%s]" % (words_in, target_word, nearest_words))
            print("Average Loss= " + "{:.6f}".format(loss_total/display_step))
            loss_total=0

        step +=1
        offset += (n_input+1) 

    print("Finished Optimization")

['and', 'attached', 'by'] - [a] vs [['that', 'and', 'this']]
Average Loss= 8.822912
['got', 'up', 'and'] - [saidhe] vs [[',', 'saidhe', 'and']]
Average Loss= 4.076728
['old', 'mouse', 'got'] - [up] vs [['it', 'and', 'to']]
Average Loss= 3.451214
['approaches', 'us', '.'] - [now] vs [['now', 'that', 'it']]
Average Loss= 2.556441
['had', 'a', 'general'] - [council] vs [[',', 'it', 'that']]
Average Loss= 2.597294
['always', 'know', 'when'] - [she] vs [['she', 'that', 'it']]
Average Loss= 2.197803
['had', 'a', 'proposal'] - [to] vs [['to', 'that', 'the']]
Average Loss= 2.015977
['but', 'who', 'is'] - [to] vs [['to', 'and', 'it']]
Average Loss= 1.788811
['if', 'we', 'could'] - [receive] vs [['receive', 'know', 'to']]
Average Loss= 1.548932
['what', 'measuresthey', 'could'] - [take] vs [['take', 'to', 'it']]
Average Loss= 1.215272
['this', 'means', 'we'] - [should] vs [['should', 'would', 'could']]
Average Loss= 1.206634
['a', 'young', 'mouse'] - [got] vs [['got', 'had', 'now']]
Average Loss