# Imports and Setup

In [1]:
import tensorflow as tf
import numpy as np
import random
import time
import sys
import os

# Data Utility Functions

In [None]:
# To concatenate all txt files in a directory into one big txt file,
# navigate to the right directory and run this command in terminal:
#           cat *.txt > name_of_new_doc.txt

In [2]:
def make_dictionaries(text_path):
    """Build character-level vocabulary lookups from one text file.

    The file at `text_path` is read in full and lower-cased; the vocabulary is
    the sorted set of distinct characters that remain.

    Args:
        text_path: path to a single text file (one path, not a list of paths).

    Returns:
        A tuple `(int_to_char, char_to_int, char_to_embed, vocab_length)`:
          * int_to_char: dict mapping vocabulary index -> character.
          * char_to_int: dict mapping character -> vocabulary index.
          * char_to_embed: function turning one character into a float32
            one-hot array of shape (1, 1, vocab_length); it raises KeyError
            for a character that is not in the vocabulary.
          * vocab_length: number of distinct characters.
    """
    with open(text_path, 'r') as handle:
        corpus = handle.read().lower()

    vocabulary = sorted(set(corpus))
    vocab_length = len(vocabulary)

    # Two views of the same index <-> character pairing.
    int_to_char = dict(enumerate(vocabulary))
    char_to_int = {symbol: index for index, symbol in int_to_char.items()}

    def char_to_embed(char):
        # Shaped as a batch of one sequence holding one time step.
        one_hot = np.zeros((1, 1, vocab_length), dtype=np.float32)
        one_hot[0, 0, char_to_int[char]] = 1.0
        return one_hot

    return int_to_char, char_to_int, char_to_embed, vocab_length

In [None]:
def data_generator(char_to_int, vocab_length, text_path='all_crime_novels.txt', batch_size=100, time_steps=100):
    """Yield an endless stream of random one-hot (input, target) batches.

    The file at `text_path` is read once and lower-cased. Every row of every
    batch is a window of `time_steps` characters starting at an independently
    drawn random offset; the target row is the same window shifted one
    character to the right.

    Args:
        char_to_int: dict mapping each character to its vocabulary index. A
            character of the lower-cased text that is missing from it raises
            KeyError while a batch is being built.
        vocab_length: size of the one-hot axis.
        text_path: path of the text file to sample from.
        batch_size: number of windows per batch.
        time_steps: number of characters per window.

    Yields:
        `(X_data, y_data)`, two float32 arrays of shape
        (batch_size, time_steps, vocab_length).

    Raises:
        ValueError: on the first `next()` call, if the text has fewer than
            `time_steps + 1` characters.
    """
    with open(text_path, 'r') as f:
        text = f.read().lower()

    # Largest start offset whose target window (start+1 .. start+time_steps)
    # still lies entirely inside the text.
    max_start = len(text) - time_steps - 1
    if max_start < 0:
        raise ValueError(
            "text at %r has %d characters; at least %d are needed for time_steps=%d"
            % (text_path, len(text), time_steps + 1, time_steps)
        )

    step_index = np.arange(time_steps)

    while True:
        X_data = np.zeros((batch_size, time_steps, vocab_length), dtype=np.float32)
        y_data = np.zeros((batch_size, time_steps, vocab_length), dtype=np.float32)

        for j in range(batch_size):
            # Each row gets its own random offset. The row index must not be
            # added to it: doing so could push the window past the end of the
            # text and produce a slice shorter than time_steps.
            start = random.randint(0, max_start)
            window = [char_to_int[char] for char in text[start:start + time_steps + 1]]
            # Inputs are the first time_steps characters, targets the last.
            X_data[j, step_index, window[:-1]] = 1.0
            y_data[j, step_index, window[1:]] = 1.0

        yield (X_data, y_data)

In [4]:
def create_model(input_data, init_value, in_size=87, out_size=87, batch_size=100, lstm_size=512, num_layers=3, drop_prob=0.5):
    """Add a stacked LSTM with a linear output layer to the current graph.

    Everything is created inside the variable scope 'lstm'.

    Args:
        input_data: tensor passed to `tf.nn.dynamic_rnn` as the RNN input.
        init_value: tensor used as the initial state of the stacked cell. The
            cells are built with `state_is_tuple=False`, so the state is one
            tensor rather than a tuple.
        in_size: not used inside this function.
        out_size: number of columns produced by the output layer.
        batch_size: not used inside this function.
        lstm_size: number of units in each LSTM layer.
        num_layers: number of LSTM layers to stack.
        drop_prob: forwarded to `DropoutWrapper` as `output_keep_prob`, i.e.
            despite its name it is the probability of *keeping* an output.

    Returns:
        `(model_output, lstm_new_state)`: the output-layer result for the RNN
        outputs flattened to rows of `lstm_size` values, and the final state
        returned by `tf.nn.dynamic_rnn`.
    """
    def build_layer():
        # One LSTM layer wrapped with dropout on its outputs.
        base_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0, state_is_tuple=False)
        return tf.contrib.rnn.DropoutWrapper(base_cell, output_keep_prob=drop_prob)

    with tf.variable_scope('lstm'):
        layers = [build_layer() for _ in range(num_layers)]
        stacked_cell = tf.contrib.rnn.MultiRNNCell(layers, state_is_tuple=False)

        rnn_outputs, lstm_new_state = tf.nn.dynamic_rnn(
            stacked_cell, input_data, initial_state=init_value, dtype=tf.float32
        )

        # Linear projection from the LSTM width to out_size.
        W = tf.Variable(tf.random_normal((lstm_size, out_size), stddev=0.01))
        B = tf.Variable(tf.constant(0.1, shape=[out_size]))

        # Collapse every leading axis so each row is one lstm_size vector.
        flat_outputs = tf.reshape(rnn_outputs, [-1, lstm_size])
        model_output = tf.add(tf.matmul(flat_outputs, W), B)

    return model_output, lstm_new_state
    

In [68]:
class RNN_WRITER:
    """Character-level text generator: a stacked LSTM graph plus train/sample helpers.

    Constructing an instance adds placeholders, the network from
    `create_model`, the loss and the softmax output to the current default
    TensorFlow graph.

    Attributes:
        output_path: directory ("model") created on construction; `train`
            places its default checkpoint there.
        X: float32 placeholder of shape (None, None, in_size) for the inputs.
        y: float32 placeholder of shape (None, None, out_size) for the targets.
        lstm_init_value: float32 placeholder of shape
            (None, num_layers * 2 * lstm_size) for the initial LSTM state.
        drop_prob: float32 placeholder forwarded to `create_model`, which uses
            it as the dropout wrapper's output_keep_prob (a keep probability).
        next_lstm_state: LSTM state tensor returned by `create_model`.
        loss: mean softmax cross-entropy between the logits and `y`.
        final_outputs: softmax of the logits reshaped to (1, out_size), so it
            can only be evaluated when exactly one character is fed.
    """

    def __init__(self, in_size=87, out_size=87, num_layers=3, lstm_size=512):
        """Create the checkpoint directory and build the graph.

        Args:
            in_size: width of the last axis of the input placeholder.
            out_size: width of the last axis of the target placeholder and of
                the model output.
            num_layers: number of stacked LSTM layers.
            lstm_size: number of units per LSTM layer.
        """
        self.output_path = "model"

        if not os.path.exists(self.output_path):
            os.mkdir(self.output_path)

        self.num_layers = num_layers
        self.lstm_size = lstm_size
        self.in_size = in_size
        self.out_size = out_size

        self.X = tf.placeholder(tf.float32, shape=(None, None, self.in_size), name="input")
        self.lstm_init_value = tf.placeholder(tf.float32, shape=(None, self.num_layers*2*self.lstm_size), name="lstm_init_value")

        self.y = tf.placeholder(tf.float32, shape=(None, None, self.out_size), name="ground_truth")
        # Flatten the targets to rows of out_size so they line up with the logits.
        y_batch_long = tf.reshape(self.y, [-1, self.out_size])

        self.drop_prob = tf.placeholder(tf.float32, name="drop_prob")

        self.lstm_last_state = np.zeros((self.num_layers*2*self.lstm_size,))

        out, self.next_lstm_state = create_model(self.X, self.lstm_init_value, in_size=self.in_size, out_size=self.out_size, num_layers=self.num_layers, lstm_size=self.lstm_size, drop_prob=self.drop_prob)

        self.loss = self.get_loss(out, y_batch_long)

        self.final_outputs = tf.reshape(tf.nn.softmax(out), (1, self.out_size,))

    def _zero_state(self, batch_size):
        """Return an all-zero initial LSTM state for `batch_size` sequences."""
        return np.zeros((batch_size, self.num_layers*2*self.lstm_size))

    def _step(self, sess, char_embedding, state):
        """Feed one embedded character with dropout disabled.

        Returns the pair (next-character probabilities, new LSTM state).
        """
        return sess.run(
            [self.final_outputs, self.next_lstm_state],
            feed_dict={self.X: char_embedding, self.lstm_init_value: state, self.drop_prob: 1.0},
        )

    def get_loss(self, logits, labels):
        """Return the mean softmax cross-entropy of `logits` against `labels`."""
        return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))

    def save(self, sess, model_path):
        """Save the session's variables to `model_path` and return the saved path."""
        saver = tf.train.Saver()
        save_path = saver.save(sess, model_path)
        return save_path

    def restore(self, sess, model_path):
        """Load variables from the checkpoint at `model_path` into `sess`."""
        saver = tf.train.Saver()
        saver.restore(sess, model_path)

    def predict(self, model_path, int_to_char, char_to_embed, restore=True, prefix="The ", num_steps=500, weight=False):
        """Generate text one character at a time.

        The lower-cased prefix is fed through the network first to warm up the
        LSTM state; after that, each sampled character is fed back in as the
        next input.

        Args:
            model_path: checkpoint to load when `restore` is true.
            int_to_char: dict mapping vocabulary index -> character.
            char_to_embed: function mapping a character to the one-hot array
                fed to the input placeholder.
            restore: load the checkpoint before generating.
            prefix: seed text. It is returned verbatim at the start of the
                result but lower-cased before being fed to the network.
            num_steps: number of characters to generate after the prefix.
            weight: if true, square the probabilities and renormalise them
                before each draw, which favours the likelier characters.

        Returns:
            The prefix followed by `num_steps` generated characters.

        Raises:
            ValueError: if `prefix` is empty while `num_steps` is positive;
                there would be no distribution to draw the first character
                from.
        """
        # Fail before any graph or session work: without a prefix nothing
        # produces the first distribution to sample from.
        if not prefix and num_steps > 0:
            raise ValueError("prefix must contain at least one character when num_steps > 0")

        init = tf.global_variables_initializer()

        with tf.Session() as sess:

            sess.run(init)

            if restore:
                self.restore(sess, model_path)

            out_string = prefix

            state = self._zero_state(1)
            out = None

            # Warm up the state on the prefix; only the last distribution is used.
            for char in prefix.lower():
                out, state = self._step(sess, char_to_embed(char), state)

            # Continue generating
            for _ in range(num_steps):

                if weight:
                    out *= out
                    out /= np.sum(out)

                gen_char = np.random.choice(self.out_size, p=np.squeeze(out))
                out_string += int_to_char[gen_char]

                out, state = self._step(sess, char_to_embed(int_to_char[gen_char]), state)

        return out_string

    def train(self, data_generator, model_path=None, learning_rate=0.008, batch_size=100, training_iters=1000, display_step=100, restore=False):
        """Run RMSProp training and save a checkpoint at the end.

        Args:
            data_generator: iterator yielding `(x_batch, y_batch)` pairs.
            model_path: checkpoint path; defaults to 'rnn.ckpt' inside
                `output_path`. Also the checkpoint loaded when `restore` is
                true.
            learning_rate: RMSProp learning rate.
            batch_size: kept for backward compatibility. The initial LSTM
                state is sized from each batch actually received, so this
                value is no longer read.
            training_iters: number of batches to train on.
            display_step: print the average loss once every this many
                iterations; 0 disables the report.
            restore: load `model_path` before training, to resume a run.

        Returns:
            The path the checkpoint was saved to.
        """
        if model_path is None:
            model_path = os.path.join(self.output_path, 'rnn.ckpt')

        self.learning_rate = learning_rate

        self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate, 0.9).minimize(self.loss)

        init = tf.global_variables_initializer()
        with tf.Session() as sess:

            sess.run(init)

            if restore:
                self.restore(sess, model_path)

            print("Beginning training")

            total_loss = 0.0

            for i in range(training_iters):

                x_batch, y_batch = next(data_generator)

                # Size the zero state from the batch itself so that it always
                # matches what the generator produced.
                init_value = self._zero_state(len(x_batch))

                loss, _ = sess.run([self.loss, self.optimizer], feed_dict={self.X: x_batch, self.y: y_batch, self.lstm_init_value: init_value, self.drop_prob: 0.5})

                total_loss += loss

                # Report after every display_step completed iterations, so each
                # window averages exactly display_step losses.
                completed = i + 1
                if display_step and completed % display_step == 0:
                    print("Avg loss at iteration %d = %f" % (completed, (total_loss/display_step)))
                    total_loss = 0.0

            save_path = self.save(sess, model_path)

        return save_path
            
    

# Prepare Dictionaries

In [6]:
# Path to the training corpus; the vocabulary dictionaries are built from this file.
text_path = 'all_crime_novels.txt'

In [7]:
# Build the index <-> character lookups, the one-hot helper and the vocabulary size.
int_to_char, char_to_int, char_to_embed, vocab_length = make_dictionaries(text_path)

# Train Model

In [55]:
# Reuse text_path so the batches are drawn from the same file the vocabulary was built from.
data_gen = data_generator(char_to_int, vocab_length, text_path)

In [56]:
# Start from an empty default graph so the model built next is not added to one left over from earlier cells.
tf.reset_default_graph()

In [57]:
# Size the input and output layers to the vocabulary built from the corpus.
my_rnn = RNN_WRITER(in_size=vocab_length, out_size=vocab_length)



In [46]:
# Checkpoint file; the same location train() uses by default (output_path "model" + "rnn.ckpt").
save_path = 'model/rnn.ckpt'

In [58]:
# restore=True makes train() load the checkpoint at save_path before training, so this
# resumes an earlier run; train() returns the path it saved the updated checkpoint to.
save_path = my_rnn.train(data_gen, model_path=save_path, learning_rate=0.0003, training_iters=10000, display_step=250, restore=True)


INFO:tensorflow:Restoring parameters from model/rnn.ckpt
Beginning training
Avg loss at iteration 250 = 1.335674
Avg loss at iteration 500 = 1.345187
Avg loss at iteration 750 = 1.322052
Avg loss at iteration 1000 = 1.318800
Avg loss at iteration 1250 = 1.319542
Avg loss at iteration 1500 = 1.308416
Avg loss at iteration 1750 = 1.319194
Avg loss at iteration 2000 = 1.314571
Avg loss at iteration 2250 = 1.303299
Avg loss at iteration 2500 = 1.292210
Avg loss at iteration 2750 = 1.305736
Avg loss at iteration 3000 = 1.302133
Avg loss at iteration 3250 = 1.327167
Avg loss at iteration 3500 = 1.332801
Avg loss at iteration 3750 = 1.304668
Avg loss at iteration 4000 = 1.308191
Avg loss at iteration 4250 = 1.296033
Avg loss at iteration 4500 = 1.293242
Avg loss at iteration 4750 = 1.281998
Avg loss at iteration 5000 = 1.313288
Avg loss at iteration 5250 = 1.286084
Avg loss at iteration 5500 = 1.303965
Avg loss at iteration 5750 = 1.297595
Avg loss at iteration 6000 = 1.283477
Avg loss at ite

# Test Model

In [59]:
# Discard the training graph (including the optimizer ops train() added) before rebuilding the model for sampling.
tf.reset_default_graph()

In [60]:
# Fresh model graph for sampling; predict() loads the trained weights from the checkpoint.
my_rnn = RNN_WRITER(in_size=vocab_length, out_size=vocab_length)



In [61]:
# Generate 5000 characters from the restored model, seeded with predict()'s default prefix "The ".
output = my_rnn.predict(model_path=save_path, int_to_char=int_to_char, char_to_embed=char_to_embed, restore=True, num_steps=5000)

INFO:tensorflow:Restoring parameters from model/rnn.ckpt


In [137]:
# After 3000 iterations

The aid from white with with the compiel, over that shopt whyrerem fast it wamped eyes pluce yeaon."

she tordered goap a gravto-kard. her was grittenmen stupled
a cluced arthor in sharply,
he wesd't to halks to the stresn. the ploblos
colded and stupked
a luttle beciered the faght under his glaich kadn't pusted nose were eches weite
trigch".a"lockicatseich agay."

breather think on the lew," man carably lalthyly one cur
and for the littlesalace, as yeu all this you furher the wering mrs. jly rust c


In [144]:
# After 23,000 iterations
print(output)

The pemples and gray size, turned the desk and going for clenetch
at the mamento boy this tied, the heavy, whole it, eyes over on the mowe in the
girl.

they were late was who with a drugstoreed large racket. they were thinging
to wald himing along the elevator, floor behind by that was to be even wadean. we got up to sceps a menshand, crafing on the lailing. i know, john that.”

dalmas said: “what could meas?”

he didn’t aramn back to the knobs that fastened for around on his eyes. he
got a
window 


In [151]:
# After 33,000 iterations
print(output)

The brush and come to him
by a wold brush.

"it's a huary."

i said. "the night strike
halfs," i said. "i lay at the cop and don't like it."

"that's captains it think, we wouldn't think he would been have not at lending about them."

"how she is a lot of that
about him trand. he's still been backs while i seemed to
help the
cold
knows and looked for the
left hundred kid.

"he was
still to go there. we nicked because he came as head here address this facts. howed you speak. it figured it was known r


In [164]:
# After 43,000 iterations
print(output)

The girl opened a chair back and unuiled and
not coming from his glass.

reno did it,
line as rose and living some tramp in face. he thought they were working for one right that had a clean curtain of literal willsson has been in marry.

he just didn’t make down the
alley sibred hirrible enough
to attle the pull of fate in daon as if always felt the world was doing the others. the one of
a refice. he had moved
away, in something that could and between the
pursers outside me. by a name masters. no table appeared to cover them. the sour ontachiness.

“miss building is it?”

“there were a red fight to give a goudy luck of this enough, that
is—say it
says
about him, but the mitt had said which was going to be
which the chief is pupping in am
rek burning world, picked up and had you understend. the
chauffeur could see much cut of whisper on0irent a new idea was
only
sitting by and that not the doke he liked and kig dark in girl or about his eyes.

unless he nodded a cartain of the called li

In [16]:
# After 63,000 iterations

The body was gently. she went to the gun on his top. there was something any cop.

“come in.”

he sipped it out last night, and mired, stiffened, went across the door. “you
go most of the order of the steve. so they had lipped it in a noisy. steve.”

bock of elihu was alteration of garaganis occupes. in it that
forecanelil or somebody in her head to some sound with that. i believe it, i can
give it noos.
i’m going to talk. how do you see what i wouldn’t get away. what else still points a home. i wouldn’t know’s.”

steve said: “what you don’t don’t
like a copper and out of noshical, don’t let the king of the gun, bill to the as don’t pick up a
bean-haired flatmere looked like the gray cards in breathing and thinks there’s
nothing to do it?”

“of course.”

“jack upstairs reading a look at that door once. here.”

he looked at the torn blocks and stood motionless. really had taken one the last drivers to.

he took out it. a week that just empty stuff, but it was a man that
means it’s fisti

In [62]:
# After 75,000 iterations
print(output)

The state. cigarettes. it stopped back.

“heh. but” and that let him alenave left. i think you know that is even patient, could
ed aon believe that.”

he pointed. “gutman’s nails like it just moint."

"that’s why he was
a guard, damn book and i don't know a guy could get would not go over against the chance,” one is to watch it. it’s drinking
to push out it laight.

they went on right against the books, with my lohic little push that no chief, you think the frest in delieine makes me probably really near? my guilty bundles. he could do his short around the rug. what happened to my importance for us. the musace sweat got out to his asin. the boy had probably signed his teeth into it, and
got up of colors down other window was gloved. no scressed captoon-looking
apartment eyes. he was still
a stelp. the pocket floating behind me until he began to chepe from the floor night in your atfers."

he said: “hello, hr. is the way to one man they are running plenty around my
fingers."

i looked a

# Test with weight

In [69]:
# Clear the default graph again so the model below is the only thing in it.
tf.reset_default_graph()

In [70]:
# Rebuild the model graph for the weighted-sampling run.
my_rnn = RNN_WRITER(in_size=vocab_length, out_size=vocab_length)



In [71]:
# This time weight the probabilities we're sampling from a little bit
# so that the higher probabilities will be chosen even more than they
# would have normally and the lower probabilities even less.
# With weight=True, predict() squares each probability and renormalises
# the result before every draw.
output = my_rnn.predict(model_path=save_path, int_to_char=int_to_char, char_to_embed=char_to_embed, restore=True, num_steps=5000, weight=True)

INFO:tensorflow:Restoring parameters from model/rnn.ckpt


In [72]:
print(output)

The car sounded back at the desk. he set it close to the top of the corner of the corner and stared at him.

“you say the world is a fact that i can make it to her with him in the car and then go to the big man and they want to do anything. i heard you think it’s a fat man that played it to me. i said the guy who didn’t want a couple of cards, and the stuff is a big car and the shop and the bedroom was a lot of pretty sort of car out of the sofa and passed a cigarette on the desk.

“i don’t know anything. i is for me on the street. i got a little pale of it. i’m looking at and what are you made my and the real cut the story the right to call me anything. and the tough drink was the trick. they had been funny in the truth. i didn’t know where you seem to have to have the night he could see him and the taxi hotel in the man don’t like that in the interesting job.

the man who had some of them will expect him to be all he was manner with the one and the old man was about the one that was 

The weighted result has fewer spelling mistakes, but it shows less variety and is more repetitive.