In [1]:
''' https://github.com/roatienza/Deep-Learning-Experiments/blob/master/Experiments/Tensorflow/RNN/rnn_words.py '''

import tensorflow as tf
import numpy as np

In [2]:
training_file = 'input_text.txt'

def read(filename):
    file_handler = open(filename)
    ''' To get content as a string, use file_handler.read()'''
    content_in_a_list = file_handler.readlines() # Returns the entire file content in a list. Delimiter is \n and not full stop.
    file_handler.close()
#     print(content_in_a_list)
#     print(len(content_in_a_list))
    content = [x.strip() for x in content_in_a_list] # Strips trailing whitespaces
#     print(content)
    ''' Split each sentence into words'''
    content = [content[i].split() for i in range(len(content))] # List of lists
#     print(content)
    content = np.array(content) # Therefore, it'll become a 2-dimensional numpy array
#     print(content.shape) # (1, 204)
    content = np.reshape(content, [-1,]) # Converting 2-dimensional numpy array to 1-dimensional array
#     print(content.shape) # (204, )
    return content

In [3]:
print("---Loading Data---")

''' training_data is a 1-dimensional numpy array with each element as a word from the file. Words can repeat'''
training_data = read(training_file) 


---Loading Data---


In [4]:
import collections

''' return 2 dictionaries: 1. word (not character) as key to index as value
                           2. index as key to word (not character) as value
                           
word having maximum frequency is assigned index as 0
words having same frequency are arbitarily given increementing index
'''
def build_dataset(list_of_words):
#         list_of_words = list(list_of_words)
#         word_count = np.array([(word, list_of_words.count(word)) for word in list_of_words])
        count = collections.Counter(list_of_words).most_common() # Taking in numpy array outputs a python list
#         print(count)
#      print(type(count))
#           print(len(count))
       
        ''' Easy to understand implementation'''
        word_to_ix = dict()
        ix = 0
        ix_to_word = dict()
        for word_, count_ in count:
            if word_ not in word_to_ix:
                    word_to_ix[word_] = ix
            
            if ix not in ix_to_word:
                ix_to_word[ix] = word_
                
            ix = ix + 1
            
        ''' Intelligent implementation '''
        word_to_index = dict()
        for word_, count_ in count:
                if word_ not in word_to_index:
                    word_to_index[word_] = len(word_to_index)
                
        
        index_to_word = dict(zip(word_to_index.values(), word_to_index.keys()))
        
        return word_to_ix, ix_to_word
        
            
          

In [5]:
word_to_ix, ix_to_word = build_dataset(training_data)
print(word_to_ix)
# print(ix_to_word)


{'then': 94, 'neck': 37, 'spoke': 38, 'approaches': 39, 'measures': 40, 'easily': 17, 'when': 41, 'had': 21, 'means': 43, 'common': 44, 'a': 6, 'an': 45, 'signal': 97, '.': 2, 'neighbourhood': 46, 'consists': 47, 'thought': 48, 'ribbon': 105, 'said': 4, 'danger': 49, 'was': 18, 'until': 50, 'us': 51, 'in': 11, 'i': 52, 'know': 108, 'ago': 53, 'she': 19, 'receive': 54, 'take': 55, 'will': 56, 'which': 20, 'case': 57, 'now': 58, 'procured': 59, 'what': 60, 'if': 61, 'he': 9, 'meet': 62, 'from': 103, 'all': 22, 'propose': 23, '?': 64, 'you': 65, 'some': 12, 'would': 66, 'remedies': 67, 'looked': 68, 'met': 69, 'but': 24, 'should': 70, 'escape': 72, 'by': 25, 'impossible': 73, 'very': 74, 'at': 26, 'old': 27, 'attached': 75, 'approach': 104, 'therefore': 77, 'could': 7, 'who': 78, 'council': 79, 'chief': 80, 'venture': 81, 'outwit': 82, 'about': 83, 'treacherous': 84, 'mice': 28, ',': 0, 'they': 85, 'her': 33, 'the': 1, 'manner': 86, 'up': 29, 'is': 13, 'consider': 88, 'general': 30, 'to':

In [6]:
''' Works: My implementation of build_dataset'''

def build_my_dataset(list_of_words):
    ''' These two lines are same as collections.Counter(list_of_words).most_common()'''
    list_of_words = list(list_of_words)
    word_count = sorted(list(set([(word,list_of_words.count(word))for word in list_of_words])), key=lambda x:x[1], reverse=True)
#     print(word_count)
#     print(len(word_count))

    
    ''' Easy to understand implementation'''
    word_to_ix = dict()
    ix = 0
    ix_to_word = dict()
    for word_, count_ in word_count:
        if word_ not in word_to_ix:
                word_to_ix[word_] = ix
            
        if ix not in ix_to_word:
                ix_to_word[ix] = word_
                
        ix = ix + 1
            
    return word_to_ix, ix_to_word

In [7]:
# word_to_ix, ix_to_word = build_my_dataset(training_data)
vocab_size = len(word_to_ix) # one-hot encoded output vector will be of shape (vocab_size,1)

In [10]:
from tensorflow.contrib import rnn

n_hidden = 512
n_input = 3
learning_rate = 0.001
no_of_words_to_be_passed_as_input_at_a_time = 3


# Placeholders
x = tf.placeholder(tf.float32, [None, n_input, 1])
y = tf.placeholder(tf.float32, [None, vocab_size])

weights = {
    'out' : tf.Variable(tf.random_normal([n_hidden, vocab_size]))
}

biases = {
    'out': tf.Variable(tf.random_normal([vocab_size]))
}

def RNN(x, weights, biases):
    # x is passed like this: [ [ [11], [2], [15] ] ]
    x = tf.reshape(x, (-1, no_of_words_to_be_passed_as_input_at_a_time))
    # x becomes [ [11, 2, 15] ]
    x1 = tf.split(x, no_of_words_to_be_passed_as_input_at_a_time, 1) # [ [ [11] ], [ [2] ], [ [15] ] ]
    
#     rnn_cell = rnn.MultiRNNCell([rnn.BasicLSTMCell(n_hidden), rnn.BasicLSTMCell(n_hidden)])
    rnn_cell = rnn.BasicLSTMCell(n_hidden)

    # generate predictions
    outputs, states = rnn.static_rnn(rnn_cell, x1, dtype=tf.float32)
    
#     # there are n_input outputs but
    # we only want the last output
    return tf.matmul(outputs[-1], weights['out']) + biases['out']

In [11]:
pred = RNN(x, weights, biases)

# Loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate).minimize(cost)

# Model evaluation
correct_pred = tf.equal(tf.argmax(pred,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initializing the variables
init = tf.global_variables_initializer()



In [15]:
start = np.random.randint(0, no_of_words_to_be_passed_as_input_at_a_time+1) # Returns an integer between 0 and 3 inclusive.
max_iterations = 6000
i = 0
loss_total = 0
acc_total = 0
print_after_every_i_iteration = 100

sess = tf.Session()
sess.run(init)

end_index = no_of_words_to_be_passed_as_input_at_a_time + 1

while i < max_iterations:
    
    if start > (len(training_data) - end_index):
        start = np.random.randint(0, no_of_words_to_be_passed_as_input_at_a_time+1)
    
    ''' Preparing Input for the Network'''
    
    ''' list of words. How many words? equal to no_of_words_to_be_passed_as_input_at_a_time'''
    # Assume sentence : Chinto and Pinto are brothers.
    # [ 'Chinto' , 'and', 'Pinto' ] , 3 words in sequence from the document
    list_of_words = [ training_data[j] for j in range(start, start+no_of_words_to_be_passed_as_input_at_a_time)]
    
    # [ 11 , 2, 15 ], lower index means no of occurrence of that particular word was more in the document
    ix_of_corresponding_words = [ word_to_ix[word] for word in list_of_words]
    
    ix_of_corresponding_words_modified = list()
    # [ [11], [2], [15] ] => its a list but if it were numpy array its shape would be (3,1)
    # where no_of_words_to_be_passed_as_input_at_a_time = 3
    for ix in ix_of_corresponding_words:
        ix_of_corresponding_words_modified.append([ix])
        
#     print(ix_of_corresponding_words_modified)
    # three_dimensional_input = [ [ [11], [2], [15] ] ] => shape (1, 3, 1)
    three_dimensional_input = np.reshape(ix_of_corresponding_words_modified, (-1, no_of_words_to_be_passed_as_input_at_a_time, 1))
    
    
    ''' Preparing output for the Network'''
    
    # Get the index of the word after Pinto => index for 'are'
    output_word = training_data[start+no_of_words_to_be_passed_as_input_at_a_time]
    # suppose ix = 5
    ix_of_output_word = word_to_ix[output_word]
    
    # One hot encode the output word's index, assuming vocab_size = 7
    # 
    one_hot_encoded_output = np.zeros((vocab_size), dtype=float) # 1-d numpy array
    one_hot_encoded_output[ix_of_output_word] = 1.0 # [ 0 0 0 0 1.0 0 0 ]
    one_hot_encoded_reshaped = np.reshape(one_hot_encoded_output, (1,-1)) # 2-d numpy array # [ [0 0 0 0 1.0 0 0] ]
    
    ''' Calling Tensorflow Operations '''
    _, acc, loss, onehot_pred = sess.run([optimizer, accuracy, cost, pred], \
                                                feed_dict={x: three_dimensional_input, y: one_hot_encoded_reshaped})
    
    loss_total = loss_total + loss
    acc_total = acc_total + acc
    
    
    if i%print_after_every_i_iteration == 0:
        print("Iteration", i)
        print("Average Loss", loss_total/print_after_every_i_iteration)
        print("Average Accuracy", (acc_total*100)/print_after_every_i_iteration)
        
        acc_total = 0
        loss_total = 0
        
        random_words = [ training_data[z] for z in range(start, start+no_of_words_to_be_passed_as_input_at_a_time) ]
        next_word = training_data[start+no_of_words_to_be_passed_as_input_at_a_time]
        predicted_next_word = ix_to_word[int(tf.argmax(onehot_pred, 1).eval(session=sess))]
        print("words in sequence:", random_words)
        print("next word in sequence:", next_word)
        print("next word predicted for sequence:", predicted_next_word)
        
    
    i = i + 1
    start = start + no_of_words_to_be_passed_as_input_at_a_time + 1
    
print("Optimization Finished.")

Iteration 0
Average Loss 0.0669912624359
Average Accuracy 0.0
words in sequence: ['long', 'ago', ',']
next word in sequence: the
next word predicted for sequence: could
Iteration 100
Average Loss 10.3914401615
Average Accuracy 2.0
words in sequence: ['is', 'easy', 'to']
next word in sequence: propose
next word predicted for sequence: nobody
Iteration 200
Average Loss 6.77364338636
Average Accuracy 0.0
words in sequence: ['is', 'easy', 'to']
next word in sequence: propose
next word predicted for sequence: nobody
Iteration 300
Average Loss 6.95348080337
Average Accuracy 2.0
words in sequence: ['easy', 'to', 'propose']
next word in sequence: impossible
next word predicted for sequence: bell
Iteration 400
Average Loss 5.91246271372
Average Accuracy 5.0
words in sequence: ['it', 'is', 'easy']
next word in sequence: to
next word predicted for sequence: ,
Iteration 500
Average Loss 5.98596815467
Average Accuracy 2.0
words in sequence: ['said', 'it', 'is']
next word in sequence: easy
next word

In [16]:
''' Testing '''

print("Enter", no_of_words_to_be_passed_as_input_at_a_time, " words")
sentence = input()
sentence = sentence.strip()
words = sentence.split(' ')

words_converted_to_index = [word_to_ix[word] for word in words]

print("No of words that the predicted sentence should contain?")
n = int(input())

for i in range(n):
    three_dimensional_test_input = np.reshape(words_converted_to_index, (-1, no_of_words_to_be_passed_as_input_at_a_time, 1))
    onehot_pred = sess.run(pred, feed_dict={x:three_dimensional_test_input})
    onehot_pred_index = int(tf.argmax(onehot_pred,1).eval(session = sess))
    words_converted_to_index = words_converted_to_index[1:]
    words_converted_to_index.append(onehot_pred_index)
    
    sentence = sentence + ' ' + ix_to_word[onehot_pred_index]

print(sentence)

Enter 3  words
had a general
No of words that the predicted sentence should contain?
32
had a general would to until mouse signal , which could a neighbourhood the outwit , treacherous . treacherous mouse easy mouse should , and could , and some , and some , and some
