# 01 HP-LSTM Notebook

In [0]:
from __future__ import print_function

import collections
import datetime
import os
import random
import string
import sys
import time
import numpy as np
import tensorflow as tf
from tensorflow.contrib import rnn

In [0]:
def build_dataset(words):
    """
    Creates the word <-> integer mapping
    Args:
        words (String):

    Returns:
        dictionary (dict):
        reverse_dictionary (dict):

    """
    count = collections.Counter(words).most_common()
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, reverse_dictionary

In [0]:
def clean_data(content, include_newlines, include_punct):
    """
    Read the input file content and convert into a numpy vector.

    Args:
        content (str): str content
        include_newlines (boolean): Include newlines?
        include_punct (Boolean): Include punctuation?

    Returns:
        training_data (ndarray): a large ndarray | vector with the file content
    """
    content_list = []

    # Include Newlines?
    if include_newlines:

        # split int lines
        for line in content.split("\n"):

            for word in line.split(" "):

                # handle double space case
                if word == '':
                    continue

                # remove all punctuation
                translator = str.maketrans('', '', string.punctuation)
                new_word = word.translate(translator)

                # remove quotes
                last_word = new_word.replace("\"", "")
                last_word = last_word.replace("”", "")
                last_word = last_word.replace("“", "")

                content_list.append(last_word)

            content_list.append("\n")

    # remove newlines
    else:
        # split int lines

        for line in content.split("\n"):
            if line == '':
                continue

            for word in line.split(" "):
                # handle double space case
                if word == '':
                    continue

                # remove all punctuation
                translator = str.maketrans('', '', string.punctuation)
                new_word = word.translate(translator)

                # remove quotes
                last_word = new_word.replace("\"", "")
                last_word = last_word.replace("”", "")
                last_word = last_word.replace("“", "")

                content_list.append(last_word)


    training_data = np.array(content_list)
    training_data = np.reshape(training_data, [-1, ])
    return training_data

# Get input data up and cleaned

In [0]:
# Google drive link
from google.colab import auth

auth.authenticate_user()


In [0]:
from googleapiclient.discovery import build
drive_service = build('drive', 'v3')


{ "id": "1nCRgvN-y3sw5tmXrcxXDkWfXyImKm6su",
  "name": "sky.txt",},

{ "id": "1uz9lZwqG-xqN_njIvMYbHVu46p1j4_au",
   "name": "HP1-open.txt",},

{ "id": "1RFcV_8lBsXaqRIGLNxP-24-3uzbGXQgj",
  "name": "HP1-1.txt" },

{"id": "1CZFlA6ceO1yXaAUwtE-QwBM1WnSJ0wG5",
 "name": "belling_the_cat.txt",

{"id": "1ycD5uu5ftfFxum-rgkzGvsChCD6kqjEc",
   "name": "HP-chapters.txt",}

In [6]:
# Download the file we just uploaded.
#
# Replace the assignment below with your file ID
# to download a different file.
#
# A file ID looks like: 1uBtlaggVyWshwcyP6kEI-y_W3P8D26sz
file_id = '1RFcV_8lBsXaqRIGLNxP-24-3uzbGXQgj'

import io
from googleapiclient.http import MediaIoBaseDownload

request = drive_service.files().get_media(fileId=file_id)
downloaded = io.BytesIO()
downloader = MediaIoBaseDownload(downloaded, request)
done = False
while done is False:
  # _ is a placeholder for a progress object that we ignore.
  # (Our file is small, so we skip reporting progress.)
  _, done = downloader.next_chunk()

downloaded.seek(0)
#print('Downloaded file contents are: {}'.format(downloaded.read()))

file_data = downloaded.read().decode('utf-8')
print(len(file_data))
print(file_data)

26481
THE BOY WHO LIVED 

Mr. and Mrs. Dursley, of number four, Privet Drive, 
were proud to say that they were perfectly normal, 
thank you very much. They were the last people you’d 
expect to be involved in anything strange or 
mysterious, because they just didn’t hold with such 
nonsense. 

Mr. Dursley was the director of a firm called 
Grunnings, which made drills. He was a big, beefy 
man with hardly any neck, although he did have a 
very large mustache. Mrs. Dursley was thin and 
blonde and had nearly twice the usual amount of 
neck, which came in very useful as she spent so 
much of her time craning over garden fences, spying 
on the neighbors. The Dursleys had a small son 
called Dudley and in their opinion there was no finer 
boy anywhere. 

The Dursleys had everything they wanted, but they 
also had a secret, and their greatest fear was that 
somebody would discover it. They didn’t think they 
could bear it if anyone found out about the Potters. 
Mrs. Potter was Mrs. Dursley

In [0]:
file_data = """THE BOY WHO LIVED 
Mr. and Mrs. Dursley, of number four, Privet Drive, 
were proud to say that they were perfectly normal, 
thank you very much. They were the last people you’d 
expect to be involved in anything strange or 
mysterious, because they just didn’t hold with such 
nonsense. 

Mr. Dursley was the director of a firm called 
Grunnings, which made drills. He was a big, beefy 
man with hardly any neck, although he did have a 
very large mustache. Mrs. Dursley was thin and 
blonde and had nearly twice the usual amount of 
neck, which came in very useful as she spent so 
much of her time craning over garden fences, spying 
on the neighbors. The Dursleys had a small son 
called Dudley and in their opinion there was no finer 
boy anywhere. 

The Dursleys had everything they wanted, but they 
also had a secret, and their greatest fear was that 
somebody would discover it. They didn’t think they 
could bear it if anyone found out about the Potters. 
Mrs. Potter was Mrs. Dursley’s sister, but they hadn’t
met for several years; in fact, Mrs. Dursley pretended
she didn’t have a sister, because her sister and her
good-for-nothing husband were as unDursleyish as it 
was possible to be. The Dursleys shuddered to think
what the neighbors would say if the Potters arrived
in the street. The Dursleys knew that the Potters
had a small son, too, but they had never even seen him. 

This boy was another good reason for keeping the 
Potters away; they didn’t want Dudley mixing with a 
child like that. 

When Mr. and Mrs. Dursley woke up on the dull, gray 
Tuesday our story starts, there was nothing about the 
cloudy sky outside to suggest that strange and 
mysterious things would soon be happening all over 
the country. Mr. Dursley hummed as he picked out 
his most boring tie for work, and Mrs. Dursley 
gossiped away happily as she wrestled a screaming 
Dudley into his high chair. 

None of them noticed a large, tawny owl flutter past 
the window. 

At half past eight, Mr. Dursley picked up his 
briefcase, pecked Mrs. Dursley on the cheek, and 
tried to kiss Dudley good-bye but missed, because 
Dudley was now having a tantrum and throwing his 
cereal at the walls. “Little tyke,” chortled Mr. Dursley 
as he left the house. He got into his car and backed 
out of number four’s drive. 

It was on the corner of the street that he noticed the 
first sign of something peculiar — a cat reading a 
map. For a second, Mr. Dursley didn’t realize what he 
had seen — then he jerked his head around to look 
again. There was a tabby cat standing on the corner 
of Privet Drive, but there wasn’t a map in sight. What 
could he have been thinking of? It must have been a 
trick of the light. Mr. Dursley blinked and stared at 
the cat. It stared back. As Mr. Dursley drove around 
the corner and up the road, he watched the cat in his 
mirror. It was now reading the sign that said Privet 
Drive — no, looking at the sign; cats couldn’t read 
maps or signs. Mr. Dursley gave himself a little shake 
and put the cat out of his mind. As he drove toward 
town he thought of nothing except a large order of 
drills he was hoping to get that day. 

But on the edge of town, drills were driven out of his 
mind by something else. As he sat in the usual 
morning traffic jam, he couldn’t help noticing that 
there seemed to be a lot of strangely dressed people 
about. People in cloaks. Mr. Dursley couldn’t bear 
people who dressed in funny clothes — the getups 
you saw on young people! He supposed this was some 
stupid new fashion. He drummed his fingers on the 
steering wheel and his eyes fell on a huddle of these 
weirdos standing quite close by. They were whispering 
excitedly together. Mr. Dursley was enraged to see 
that a couple of them weren’t young at all; why, that 
man had to be older than he was, and wearing an 
emerald-green cloak! The nerve of him! But then it 
struck Mr. Dursley that this was probably some silly 
stunt — these people were obviously collecting for 
something ... yes, that would be it. The traffic moved 
on and a few minutes later, Mr. Dursley arrived in the 
Grunnings parking lot, his mind back on drills. 

Mr. Dursley always sat with his back to the window in 
his office on the ninth floor. If he hadn’t, he might 
have found it harder to concentrate on drills that 
morning. He didn’t see the owls swooping past in 
broad daylight, though people down in the street did; 
they pointed and gazed open-mouthed as owl after 
owl sped overhead. Most of them had never seen an 
owl even at nighttime. Mr. Dursley, however, had a 
perfectly normal, owl-free morning. He yelled at five 
different people. He made several important telephone 
calls and shouted a bit more. He was in a very good 
mood until lunchtime, when he thought he’d stretch 
his legs and walk across the road to buy himself a 
bun from the bakery.


He’d forgotten all about the people in cloaks until he 
passed a group of them next to the baker’s. He eyed 
them angrily as he passed. He didn’t know why, but 
they made him uneasy. This bunch were whispering 
excitedly, too, and he couldn’t see a single collecting 
tin. It was on his way back past them, clutching a 
large doughnut in a bag, that he caught a few words 
of what they were saying.

“The Potters, that’s right, that’s what I heard — ” 

“ — yes, their son, Harry — ” 

Mr. Dursley stopped dead. Fear flooded him. He 
looked back at the whisperers as if he wanted to say 
something to them, but thought better of it. 

He dashed back across the road, hurried up to his 
office, snapped at his secretary not to disturb him, 
seized his telephone, and had almost finished dialing 
his home number when he changed his mind. He put 
the receiver back down and stroked his mustache, 
thinking ... no, he was being stupid. Potter wasn’t 
such an unusual name. He was sure there were lots 
of people called Potter who had a son called Harry. 
Come to think of it, he wasn’t even sure his nephew 
was called Harry. He’d never even seen the boy. It 
might have been Harvey. Or Harold. There was no 
point in worrying Mrs. Dursley; she always got so 
upset at any mention of her sister. He didn’t blame 
her — if he’d had a sister like that ... but all the 
same, those people in cloaks ... 

"""


In [0]:
def setup_data(content, include_newlines, include_punct):
    """
        returns dictionary and reverse dictionary and training data

    Args:
        training_file_path (str):
        include_newlines (boolean): Include newlines?
        include_punct (Boolean): Include punctuation?

    Returns:
        training_data ():
        dictionary ():
        reverse_dictionary():
    """
#     # Text file containing words for training
#     with open(training_file_path) as f:
#         content = f.read()

    # Clean the training data:
    # The two booleans adjust 'include newlines' or include punctuation
    print("HP-LSTM: Loaded training data...")
    training_data = clean_data(content, include_newlines, include_punct)

    dictionary, reverse_dictionary = build_dataset(training_data)

    return dictionary, reverse_dictionary, training_data

Configure

In [9]:
# Setup the input text

print("HP-LSTM: Starting...")

# Setup Training file parameters
# training_file = 
include_newlines = True
include_punct = False

# # init the save path
# filename = os.path.basename(training_file)
# save_path = get_model_save_path(filename)

# input data is now avaliable at
# print(file_data)


# setup all of the data
# print("HP-LSTM: Training File: {}".format(training_file))
dictionary, reverse_dictionary, training_data = setup_data(file_data,
                                                     include_newlines,
                                                     include_punct)

print(len(training_data))
print(len(dictionary))


HP-LSTM: Starting...
HP-LSTM: Loaded training data...
1282
460


In [0]:
#def train_model(dictionary, reverse_dictionary, training_data, save_path):
start_time = time.time()

# Target log path
logs_path = '/tmp/tensorflow/rnn_words'
writer = tf.summary.FileWriter(logs_path)


# Configure Params

In [0]:
# number of units in RNN cell
n_hidden = 256

# Word buffer
n_input = 26

# samples
training_iters = 50_000
display_step = 5_000

learning_rate = 0.001

In [0]:
def RNN(x, weights, biases, n_input, n_hidden):
    """

    Args:
        x        (tf.placeholder float):
        weights  ():
        biases   ():
        n_input  (int): width of the buffer
        n_hidden (int): units in the rnn cell

    Returns:
        prediction (tensor): Return the y_hat value

    """
    with tf.name_scope(name="RNN_func"):
        # reshape to [1, n_input]
        x = tf.reshape(x, [-1, n_input])

        # Generate a n_input-element sequence of inputs
        # (eg. [had] [a] [general] -> [20] [6] [33])
        x = tf.split(x, n_input, 1)

        # 2-layer LSTM, each layer has n_hidden units.
        # Average Accuracy= 95.20% at 50k iter
        rnn_cell = rnn.MultiRNNCell([rnn.BasicLSTMCell(n_hidden), rnn.BasicLSTMCell(n_hidden),
                                      rnn.BasicLSTMCell(n_hidden), rnn.BasicLSTMCell(n_hidden)])
#                                      rnn.BasicLSTMCell(n_hidden)])

        # 1-layer LSTM with n_hidden units but with lower accuracy.
        # Average Accuracy= 90.60% 50k iter
        # Uncomment line below to test but comment out the 2-layer rnn.MultiRNNCell above
        # rnn_cell = rnn.BasicLSTMCell(n_hidden)

        # generate prediction
        outputs, states = rnn.static_rnn(rnn_cell, x, dtype=tf.float32)

        # there are n_input outputs but we only want the last one
        return tf.matmul(outputs[-1], weights) + biases

In [13]:
# Length of the vocab size
vocab_size = len(dictionary)

# tf Graph input
with tf.name_scope(name="x-y"):
    x = tf.placeholder("float", [None, n_input, 1], name="x")
    y = tf.placeholder("float", [None, vocab_size], name="y")

weights = tf.Variable(tf.random_normal([n_hidden, vocab_size]), name="weights")
biases = tf.Variable(tf.random_normal([vocab_size]), name="biases")

pred = RNN(x, weights, biases, n_input, n_hidden)

# Loss and optimizer
with tf.name_scope(name="cost"):
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y), name="cost")

optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate).minimize(cost)

# Model evaluation
with tf.name_scope(name="model-eval"):
    correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1), name="correct_prediction")
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32), name="accuracy")

# Initializing the variables
init = tf.global_variables_initializer()

# init the saver model and location
saver = tf.train.Saver()


Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.



In [0]:
# Launch the graph

# tf.reset_default_graph()  

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as session:
    session.run(init)
    step = 0
    offset = random.randint(0, n_input+1)
    end_offset = n_input + 1
    acc_total = 0
    loss_total = 0

    writer.add_graph(session.graph)

    print("starting")

    display = False

    while step < training_iters:

        if (step+1) % display_step == 0:
            print("step: = {}".format(step))
            print("Offset: {}".format(offset))
            display = True

        if offset > (len(training_data)-end_offset):
            offset = random.randint(0, n_input+1)

        # setup X
        symbols_in_keys = [[dictionary[str(training_data[i])]] for i in range(offset, offset+n_input)]

        if display:
            print("X: {}".format(symbols_in_keys))

        symbols_in_keys = np.reshape(np.array(symbols_in_keys), [-1, n_input, 1])
        symbols_in = [training_data[i] for i in range(offset, offset + n_input)]

        if display:
            print(symbols_in)

        # Setup Y
        symbols_out_onehot = np.zeros([vocab_size], dtype=float)
        symbols_out_onehot[dictionary[str(training_data[offset+n_input])]] = 1.0
        symbols_out_onehot = np.reshape(symbols_out_onehot, [1, -1])

        # if display:
            # print(symbols_out_onehot)

        ###################################################################
        # RUN THE MODEL
        ###################################################################
        _, acc, loss, onehot_pred = session.run([optimizer, accuracy, cost, pred], \
                                                feed_dict={x: symbols_in_keys, y: symbols_out_onehot})

        # update loss
        loss_total += loss
        acc_total += acc

        ###################################################################
        # Display some stats every 1000 iterations
        ###################################################################
        # if (step+1) % display_step == 0:

        if display:

            print("file_name: hp-titles, length: {}".format(len(training_data)))
            print("units: {}, n_input: {}, layers: 4, iters: {}, lr: {}".format(n_hidden, n_input, training_iters, learning_rate))
            print("Iter= " + str(step+1) + ", Average Loss= " + \
                  "{:.6f}".format(loss_total/display_step) + ", Average Accuracy= " + \
                  "{:.2f}%".format(100*acc_total/display_step))
            print("Elapsed time: ", time.time() - start_time )
            acc_total = 0
            loss_total = 0
            symbols_in = [training_data[i] for i in range(offset, offset + n_input)]
            symbols_out = training_data[offset + n_input]
            symbols_out_pred = reverse_dictionary[int(tf.argmax(onehot_pred, 1).eval())]
#             print("%s - [%s] vs [%s]" % (symbols_in,symbols_out,symbols_out_pred))

            display = False

        # update step by 1
        step += 1
        offset += (n_input+1)

#         if display:
#             print("step: {}, Offset: {}".format(step, offset))
#             print()

    #######################################################################
    # Save the model for later
    #######################################################################
#     print("Saving Model")
#     saver.save(session, save_path)


########################################################################
# Session Ended
########################################################################

#     print("HP-LSTM: Optimization Finished!")
#     print("file_name: hp-titles, length: {}".format(len(training_data)))
#     print("Elapsed time: ", time.time() - start_time )
#     print("units: {}, n_input: {}, layers: 3, iters: {}, lr: {}".format(n_hidden, n_input, training_iters, learning_rate))



starting
step: = 4999
Offset: 614
X: [[38], [3], [7], [63], [81], [4], [136], [314], [0], [139], [4], [83], [3], [121], [315], [2], [45], [316], [3], [0], [34], [4], [6], [317], [9], [318]]
['out', 'of', 'his', 'mind', 'As', 'he', 'drove', 'toward', '\n', 'town', 'he', 'thought', 'of', 'nothing', 'except', 'a', 'large', 'order', 'of', '\n', 'drills', 'he', 'was', 'hoping', 'to', 'get']
file_name: hp-titles, length: 1282
units: 256, n_input: 26, layers: 4, iters: 50000, lr: 0.001
Iter= 5000, Average Loss= 5.455857, Average Accuracy= 8.20%
Elapsed time:  208.22317266464233
step: = 9999
Offset: 1054
X: [[2], [416], [12], [4], [417], [2], [157], [418], [0], [3], [54], [16], [17], [419], [0], [0], [26], [39], [169], [420], [169], [54], [421], [422], [22], [29]]
['a', 'bag', 'that', 'he', 'caught', 'a', 'few', 'words', '\n', 'of', 'what', 'they', 'were', 'saying', '\n', '\n', 'The', 'Potters', 'that’s', 'right', 'that’s', 'what', 'I', 'heard', '—', '']
file_name: hp-titles, length: 1282
unit

In [0]:
# log_file_path = "/save/me/ronda.txt"

# def writer(log_file_path, text):
#   # print it on the command line
#   print(text)

#   # write it to a file
#   with open('something.txt', 'a') as f:
#         f.write(text)


# def run_experiment(f_name, iters, n_input, n_hidden):
#     """
#     # file_name: hp-titles, length: 628
#     # units: 256, n_input: 3, layers: 2, samples: 50000, lr: 0.001
#     # Elapsed time:  278.18471693992615
#     # Iter= 50000, Average Loss= 2.000082, Average Accuracy= 51.00%
    

#     """
#     # reset the default graph and clear any / all variables
#     tf.reset_default_graph()  
   

#     writer("file_name: {}, length: {}".format(fname, len(fname)))
#     writer("units: {}, n_input: {}, layers: {}, samples: {}, lr: 0.001".format(iters, n_input))
    
    
#     # setup tensorflow and run the model
#     #train_model(dictionary, r_dictionary, training_data, save_path)

  


# Perform a grid search for the best parameters and loss values for the various files