In [8]:
# referred https://towardsdatascience.com/creating-a-spell-checker-with-tensorflow-d35b23939f60
# https://github.com/Currie32/Spell-Checker
import pandas as pd
import numpy as np
import tensorflow as tf
import os
from os import listdir
from os.path import isfile, join
from collections import namedtuple
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
import time
import re
from sklearn.model_selection import train_test_split

In [9]:
def load_book(path):
    """Load a book from its file"""
    input_file = os.path.join(path)
    with open(input_file) as f:
        book = f.read()
    return book

path = './books/'
book_files = [f for f in listdir(path) if isfile(join(path, f))]
book_files = book_files[1:]

# Load the books using the file names
books = []
for book in book_files:
    books.append(load_book(path+book))

# Compare the number of words in each book 
tot_words=0
for i in range(len(books)):
    num_words = len(books[i].split())
    tot_words+=num_words
    print("There are {} words in {}.".format(num_words, book_files[i]))
print("Total number of words are {}.".format(tot_words))
# print books[0][:500]

There are 30423 words in Alices_Adventures_in_Wonderland_by_Lewis_Carroll.rtf.
There are 361612 words in Anna_Karenina_by_Leo_Tolstoy.rtf.
There are 113452 words in David_Copperfield_by_Charles_Dickens.rtf.
There are 433993 words in Don_Quixote_by_Miguel_de_Cervantes.rtf.
There are 166996 words in Dracula_by_Bram_Stoker.rtf.
There are 163109 words in Emma_by_Jane_Austen.rtf.
There are 78912 words in Frankenstein_by_Mary_Shelley.rtf.
There are 191598 words in Great_Expectations_by_Charles_Dickens.rtf.
There are 105428 words in Grimms_Fairy_Tales_by_The_Brothers_Grimm.rtf.
There are 25395 words in Metamorphosis_by_Franz_Kafka.rtf.
There are 165188 words in Oliver_Twist_by_Charles_Dickens.rtf.
There are 126999 words in Pride_and_Prejudice_by_Jane_Austen.rtf.
There are 110213 words in The_Adventures_of_Sherlock_Holmes_by_Arthur_Conan_Doyle.rtf.
There are 96185 words in The_Adventures_of_Tom_Sawyer_by_Mark_Twain.rtf.
There are 480495 words in The_Count_of_Monte_Cristo_by_Alexandre_Dumas.rtf

In [10]:
def clean_text(text):
    '''Remove unwanted characters and extra spaces from the text'''
    text = re.sub(r'\n', ' ', text) 
    text = re.sub(r'[{}@_*>()\\#%+=\[\]]','', text)
    text = re.sub('a0','', text)
    text = re.sub('\'92t','\'t', text)
    text = re.sub('\'92s','\'s', text)
    text = re.sub('\'92m','\'m', text)
    text = re.sub('\'92ll','\'ll', text)
    text = re.sub('\'91','', text)
    text = re.sub('\'92','', text)
    text = re.sub('\'93','', text)
    text = re.sub('\'94','', text)
    text = re.sub('\.','. ', text)
    text = re.sub('\!','! ', text)
    text = re.sub('\?','? ', text)
    text = re.sub(' +',' ', text)
    return text

# Clean the text of the books
clean_books = []
for book in books:
    clean_books.append(clean_text(book))

# print clean_books[0][:500]

In [11]:
# Create a dictionary to convert the vocabulary (characters) to integers
vocab_to_int = {}
count = 0
for book in clean_books:
    for character in book:
        if character not in vocab_to_int:
            vocab_to_int[character] = count
            count += 1

# Add special tokens to vocab_to_int
codes = ['<PAD>','<EOS>','<GO>']
for code in codes:
    vocab_to_int[code] = count
    count += 1

# Check the size of vocabulary and all of the values
vocab_size = len(vocab_to_int)
print("The vocabulary contains {} characters.".format(vocab_size))
print(sorted(vocab_to_int))

# Create another dictionary to convert integers to their respective characters
int_to_vocab = {}
for character, value in vocab_to_int.items():
    int_to_vocab[value] = character

The vocabulary contains 78 characters.
[' ', '!', '"', '$', '&', "'", ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<EOS>', '<GO>', '<PAD>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [12]:
# Split the text from the books into sentences.
sentences = []
for book in clean_books:
    for sentence in book.split('. '):
        sentences.append(sentence + '.')
print("There are {} sentences.".format(len(sentences)))

# print sentences[:5]
# Note that 1st sentence won't effect because it is long and will be filtered

There are 133494 sentences.


In [13]:
# Convert sentences to integers
int_sentences = []

for sentence in sentences:
    int_sentence = []
    for character in sentence:
        int_sentence.append(vocab_to_int[character])
    int_sentences.append(int_sentence)

# Find the length of each sentence
lengths = []
for sentence in int_sentences:
    lengths.append(len(sentence))
lengths = pd.DataFrame(lengths, columns=["counts"])

#print lengths.describe()

In [14]:
# Limit the data we will use to train our model
max_length = 92
min_length = 10

good_sentences = []

for sentence in int_sentences:
    if len(sentence) <= max_length and len(sentence) >= min_length:
        good_sentences.append(sentence)

print("We will use {} to train and test our model.".format(len(good_sentences)))

# Split the data into training and testing sentences
training, testing = train_test_split(good_sentences, test_size = 0.15, random_state = 2)

print("Number of training sentences:", len(training))
print("Number of testing sentences:", len(testing))

We will use 59885 to train and test our model.
('Number of training sentences:', 50902)
('Number of testing sentences:', 8983)


In [15]:
# Sort the sentences by length to reduce padding, which will allow the model to train faster
training_sorted = []
testing_sorted = []

for i in range(min_length, max_length+1):
    for sentence in training:
        if len(sentence) == i:
            training_sorted.append(sentence)
    for sentence in testing:
        if len(sentence) == i:
            testing_sorted.append(sentence)

# Check to ensure the sentences have been selected and sorted correctly
#for i in range(5):
#    print(training_sorted[i], len(training_sorted[i]))

In [16]:
letters = ['a','b','c','d','e','f','g','h','i','j','k','l','m',
           'n','o','p','q','r','s','t','u','v','w','x','y','z',]

def noise_maker(sentence, threshold):
    '''Relocate, remove, or add characters to create spelling mistakes'''
    #Returns list of numbers.
    noisy_sentence = []
    i = 0
    while i < len(sentence):
        random = np.random.uniform(0,1,1)
        # Most characters will be correct since the threshold value is high
        if random < threshold:
            noisy_sentence.append(sentence[i])
        else:
            new_random = np.random.uniform(0,1,1)
            # ~33% chance characters will swap locations
            if new_random > 0.67:
                if i == (len(sentence) - 1):
                    # If last character in sentence, it will not be typed
                    continue
                else:
                    # if any other character, swap order with following character
                    noisy_sentence.append(sentence[i+1])
                    noisy_sentence.append(sentence[i])
                    i += 1
            # ~33% chance an extra lower case letter will be added to the sentence
            elif new_random < 0.33:
                random_letter = np.random.choice(letters, 1)[0]
                noisy_sentence.append(vocab_to_int[random_letter])
                noisy_sentence.append(sentence[i])
            # ~33% chance a character will not be typed
            else:
                pass     
        i += 1
    return noisy_sentence

# Check to ensure noise_maker is making mistakes correctly.
#threshold = 0.9
#for sentence in training_sorted[:5]:
#    print(sentence)
#    print(noise_maker(sentence, threshold)) #Returns list of numbers
#    print ()

In [17]:
def model_inputs():
    '''Create palceholders for inputs to the model'''
    
    with tf.name_scope('inputs'):
        inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
    with tf.name_scope('targets'):
        targets = tf.placeholder(tf.int32, [None, None], name='targets')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    inputs_length = tf.placeholder(tf.int32, (None,), name='inputs_length')
    targets_length = tf.placeholder(tf.int32, (None,), name='targets_length')
    max_target_length = tf.reduce_max(targets_length, name='max_target_len')

    return inputs, targets, keep_prob, inputs_length, targets_length, max_target_length

def process_encoding_input(targets, vocab_to_int, batch_size):
    '''Remove the last word id from each batch and concat the <GO> to the begining of each batch'''
    
    with tf.name_scope("process_encoding"):
        ending = tf.strided_slice(targets, [0, 0], [batch_size, -1], [1, 1])
        dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)

    return dec_input

def encoding_layer(rnn_size, sequence_length, num_layers, rnn_inputs, keep_prob, direction):
    '''Create the encoding layer'''
    
    if direction == 1:
        with tf.name_scope("RNN_Encoder_Cell_1D"):
            for layer in range(num_layers):
                with tf.variable_scope('encoder_{}'.format(layer)):
                    lstm = tf.contrib.rnn.LSTMCell(rnn_size)

                    drop = tf.contrib.rnn.DropoutWrapper(lstm, 
                                                         input_keep_prob = keep_prob)

                    enc_output, enc_state = tf.nn.dynamic_rnn(drop, 
                                                              rnn_inputs,
                                                              sequence_length,
                                                              dtype=tf.float32)

            return enc_output, enc_state
        
        
    if direction == 2:
        with tf.name_scope("RNN_Encoder_Cell_2D"):
            for layer in range(num_layers):
                with tf.variable_scope('encoder_{}'.format(layer)):
                    cell_fw = tf.contrib.rnn.LSTMCell(rnn_size)
                    cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, 
                                                            input_keep_prob = keep_prob)

                    cell_bw = tf.contrib.rnn.LSTMCell(rnn_size)
                    cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, 
                                                            input_keep_prob = keep_prob)

                    enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, 
                                                                            cell_bw, 
                                                                            rnn_inputs,
                                                                            sequence_length,
                                                                            dtype=tf.float32)
            # Join outputs since we are using a bidirectional RNN
            enc_output = tf.concat(enc_output,2)
            # Use only the forward state because the model can't use both states at once
            return enc_output, enc_state[0]
        
def training_decoding_layer(dec_embed_input, targets_length, dec_cell, initial_state, output_layer, 
                            vocab_size, max_target_length):
    '''Create the training logits'''
    
    with tf.name_scope("Training_Decoder"):
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,
                                                            sequence_length=targets_length,
                                                            time_major=False)

        training_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                           training_helper,
                                                           initial_state,
                                                           output_layer) 

        training_logits, _ = tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                               output_time_major=False,
                                                               impute_finished=True,
                                                               maximum_iterations=max_target_length)
        return training_logits

def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, initial_state, output_layer,
                             max_target_length, batch_size):
    '''Create the inference logits'''
    
    with tf.name_scope("Inference_Decoder"):
        start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), [batch_size], name='start_tokens')

        inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings,
                                                                    start_tokens,
                                                                    end_token)

        inference_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                            inference_helper,
                                                            initial_state,
                                                            output_layer)

        inference_logits, _ = tf.contrib.seq2seq.dynamic_decode(inference_decoder,
                                                                output_time_major=False,
                                                                impute_finished=True,
                                                                maximum_iterations=max_target_length)

        return inference_logits

def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, inputs_length, targets_length, 
                   max_target_length, rnn_size, vocab_to_int, keep_prob, batch_size, num_layers, direction):
    '''Create the decoding cell and attention for the training and inference decoding layers'''
    
    with tf.name_scope("RNN_Decoder_Cell"):
        for layer in range(num_layers):
            with tf.variable_scope('decoder_{}'.format(layer)):
                lstm = tf.contrib.rnn.LSTMCell(rnn_size)
                dec_cell = tf.contrib.rnn.DropoutWrapper(lstm, 
                                                         input_keep_prob = keep_prob)
    
    output_layer = Dense(vocab_size,
                         kernel_initializer = tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))
    
    attn_mech = tf.contrib.seq2seq.BahdanauAttention(rnn_size,
                                                  enc_output,
                                                  inputs_length,
                                                  normalize=False,
                                                  name='BahdanauAttention')
    
    with tf.name_scope("Attention_Wrapper"):
        dec_cell = tf.contrib.seq2seq.DynamicAttentionWrapper(dec_cell, attn_mech, rnn_size)
        #dec_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell, attn_mech, rnn_size)
    
    initial_state = tf.contrib.seq2seq.DynamicAttentionWrapperState(enc_state,_zero_state_tensors(rnn_size, batch_size, tf.float32))
    #initial_state = tf.contrib.seq2seq.AttentionWrapperState(enc_state,_zero_state_tensors(rnn_size, batch_size, tf.float32))

    with tf.variable_scope("decode"):
        training_logits = training_decoding_layer(dec_embed_input, 
                                                  targets_length, 
                                                  dec_cell, 
                                                  initial_state,
                                                  output_layer,
                                                  vocab_size, 
                                                  max_target_length)
    with tf.variable_scope("decode", reuse=True):
        inference_logits = inference_decoding_layer(embeddings,  
                                                    vocab_to_int['<GO>'], 
                                                    vocab_to_int['<EOS>'],
                                                    dec_cell, 
                                                    initial_state, 
                                                    output_layer,
                                                    max_target_length,
                                                    batch_size)

    return training_logits, inference_logits

def seq2seq_model(inputs, targets, keep_prob, inputs_length, targets_length, max_target_length, 
                  vocab_size, rnn_size, num_layers, vocab_to_int, batch_size, embedding_size, direction):
    '''Use the previous functions to create the training and inference logits'''
    
    enc_embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1, 1))
    enc_embed_input = tf.nn.embedding_lookup(enc_embeddings, inputs)
    enc_output, enc_state = encoding_layer(rnn_size, inputs_length, num_layers, 
                                           enc_embed_input, keep_prob, direction)
    
    dec_embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1, 1))
    dec_input = process_encoding_input(targets, vocab_to_int, batch_size)
    dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)
    
    training_logits, inference_logits  = decoding_layer(dec_embed_input, 
                                                        dec_embeddings,
                                                        enc_output,
                                                        enc_state, 
                                                        vocab_size, 
                                                        inputs_length, 
                                                        targets_length, 
                                                        max_target_length,
                                                        rnn_size, 
                                                        vocab_to_int, 
                                                        keep_prob, 
                                                        batch_size,
                                                        num_layers,
                                                        direction)
    
    return training_logits, inference_logits

def pad_sentence_batch(sentence_batch):
    """Pad sentences with <PAD> so that each sentence of a batch has the same length"""
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [vocab_to_int['<PAD>']] * (max_sentence - len(sentence)) for sentence in sentence_batch]

def get_batches(sentences, batch_size, threshold):
    """Batch sentences, noisy sentences, and the lengths of their sentences together.
       With each epoch, sentences will receive new mistakes"""
    
    for batch_i in range(0, len(sentences)//batch_size):
        start_i = batch_i * batch_size
        sentences_batch = sentences[start_i:start_i + batch_size]
        
        sentences_batch_noisy = []
        for sentence in sentences_batch:
            sentences_batch_noisy.append(noise_maker(sentence, threshold))
            
        sentences_batch_eos = []
        for sentence in sentences_batch:
            sentence.append(vocab_to_int['<EOS>'])
            sentences_batch_eos.append(sentence)
            
        pad_sentences_batch = np.array(pad_sentence_batch(sentences_batch_eos))
        pad_sentences_noisy_batch = np.array(pad_sentence_batch(sentences_batch_noisy))
        
        # Need the lengths for the _lengths parameters
        pad_sentences_lengths = []
        for sentence in pad_sentences_batch:
            pad_sentences_lengths.append(len(sentence))
        
        pad_sentences_noisy_lengths = []
        for sentence in pad_sentences_noisy_batch:
            pad_sentences_noisy_lengths.append(len(sentence))
        
        yield pad_sentences_noisy_batch, pad_sentences_batch, pad_sentences_noisy_lengths, pad_sentences_lengths

In [18]:
# The default parameters
epochs = 100
batch_size = 128
num_layers = 2
rnn_size = 512
embedding_size = 128
learning_rate = 0.0005
direction = 2
threshold = 0.95
keep_probability = 0.75

# v1: Default from https://github.com/Currie32/Spell-Checker/blob/master/SpellChecker.ipynb
# v2: Adding num_layers as list [2,4]. Changed stop to 4 (consequtively we need 4 now).

In [19]:
def build_graph(keep_prob, rnn_size, num_layers, batch_size, learning_rate, embedding_size, direction):

    tf.reset_default_graph()
    
    # Load the model inputs    
    inputs, targets, keep_prob, inputs_length, targets_length, max_target_length = model_inputs()

    # Create the training and inference logits
    training_logits, inference_logits = seq2seq_model(tf.reverse(inputs, [-1]),
                                                      targets, 
                                                      keep_prob,   
                                                      inputs_length,
                                                      targets_length,
                                                      max_target_length,
                                                      len(vocab_to_int)+1,
                                                      rnn_size, 
                                                      num_layers, 
                                                      vocab_to_int,
                                                      batch_size,
                                                      embedding_size,
                                                      direction)

    # Create tensors for the training logits and inference logits
    training_logits = tf.identity(training_logits.rnn_output, 'logits')

    with tf.name_scope('predictions'):
        predictions = tf.identity(inference_logits.sample_id, name='predictions')
        tf.summary.histogram('predictions', predictions)

    # Create the weights for sequence_loss
    masks = tf.sequence_mask(targets_length, max_target_length, dtype=tf.float32, name='masks')
    
    with tf.name_scope("cost"):
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(training_logits, 
                                                targets, 
                                                masks)
        tf.summary.scalar('cost', cost)

    with tf.name_scope("optimze"):
        optimizer = tf.train.AdamOptimizer(learning_rate)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

    # Merge all of the summaries
    merged = tf.summary.merge_all()    

    # Export the nodes 
    export_nodes = ['inputs', 'targets', 'keep_prob', 'cost', 'inputs_length', 'targets_length',
                    'predictions', 'merged', 'train_op','optimizer']
    Graph = namedtuple('Graph', export_nodes)
    local_dict = locals()
    graph = Graph(*[local_dict[each] for each in export_nodes])

    return graph

In [20]:
def train(model, epochs, log_string):
    '''Train the RNN'''
    
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Used to determine when to stop the training early
        testing_loss_summary = []

        # Keep track of which batch iteration is being trained
        iteration = 0
        
        display_step = 30 # The progress of the training will be displayed after every 30 batches
        stop_early = 0 
        stop = 4 # If the batch_loss_testing does not decrease in 3 consecutive checks, stop training
        per_epoch = 3 # Test the model 3 times per epoch
        testing_check = (len(training_sorted)//batch_size//per_epoch)-1

        print()
        print("Training Model: {}".format(log_string))

        train_writer = tf.summary.FileWriter('./logs/ori_v2/train/{}'.format(log_string), sess.graph)
        test_writer = tf.summary.FileWriter('./logs/ori_v2/test/{}'.format(log_string))

        for epoch_i in range(1, epochs+1): 
            batch_loss = 0
            batch_time = 0
            
            for batch_i, (input_batch, target_batch, input_length, target_length) in enumerate(
                    get_batches(training_sorted, batch_size, threshold)):
                start_time = time.time()

                summary, loss, _ = sess.run([model.merged,
                                             model.cost, 
                                             model.train_op], 
                                             {model.inputs: input_batch,
                                              model.targets: target_batch,
                                              model.inputs_length: input_length,
                                              model.targets_length: target_length,
                                              model.keep_prob: keep_probability})


                batch_loss += loss
                end_time = time.time()
                batch_time += end_time - start_time

                # Record the progress of training
                train_writer.add_summary(summary, iteration)

                iteration += 1

                if batch_i % display_step == 0 and batch_i > 0:
                    print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds: {:>4.2f}'
                          .format(epoch_i,
                                  epochs, 
                                  batch_i, 
                                  len(training_sorted) // batch_size, 
                                  batch_loss / display_step, 
                                  batch_time))
                    batch_loss = 0
                    batch_time = 0

                #### Testing ####
                if batch_i % testing_check == 0 and batch_i > 0:
                    batch_loss_testing = 0
                    batch_time_testing = 0
                    for batch_i, (input_batch, target_batch, input_length, target_length) in enumerate(
                            get_batches(testing_sorted, batch_size, threshold)):
                        start_time_testing = time.time()
                        summary, loss = sess.run([model.merged,
                                                  model.cost], 
                                                     {model.inputs: input_batch,
                                                      model.targets: target_batch,
                                                      model.inputs_length: input_length,
                                                      model.targets_length: target_length,
                                                      model.keep_prob: 1})

                        batch_loss_testing += loss
                        end_time_testing = time.time()
                        batch_time_testing += end_time_testing - start_time_testing

                        # Record the progress of testing
                        test_writer.add_summary(summary, iteration)

                    n_batches_testing = batch_i + 1
                    print('Testing Loss: {:>6.3f}, Seconds: {:>4.2f}'
                          .format(batch_loss_testing / n_batches_testing, 
                                  batch_time_testing))
                    
                    batch_time_testing = 0

                    # If the batch_loss_testing is at a new minimum, save the model
                    testing_loss_summary.append(batch_loss_testing)
                    if batch_loss_testing <= min(testing_loss_summary):
                        print('New Record!') 
                        stop_early = 0
                        checkpoint = "./{}.ckpt".format(log_string)
                        saver = tf.train.Saver()
                        saver.save(sess, checkpoint)

                    else:
                        print("No Improvement.")
                        stop_early += 1
                        if stop_early == stop:
                            break

            if stop_early == stop:
                print("Stopping Training.")
                break

In [21]:
# ## DONT RUN! THIS IS FOR TRAINING!

# # Train the model with the desired tuning parameters

# for keep_probability in [0.75]:
#     for num_layers in [2,4]:
#         for threshold in [0.95]:
#             log_string = 'kp={},nl={},th={}_v2'.format(keep_probability,
#                                                     num_layers,
#                                                     threshold) 
#             model = build_graph(keep_probability, rnn_size, num_layers, batch_size, 
#                                 learning_rate, embedding_size, direction)
#             train(model, epochs, log_string)

In [22]:
def text_to_ints(text):
    '''Prepare the text for the model'''
    
    text = clean_text(text)
    return [vocab_to_int[word] for word in text]

#checkpoint = "./kp=0.75,nl=2,th=0.95_v1.ckpt" #Version 1
checkpoint = "./kp=0.75,nl=2,th=0.95_v2.ckpt" #Version 2

model = build_graph(keep_probability, rnn_size, num_layers, batch_size, learning_rate, embedding_size, direction)
print("Done building graph")
# Create your own sentence or use one from the dataset
texts = ["Spellin is difficult, whch is wyh you need to study everyday.",
         "The first days of her existence in th country were vrey hard for Dolly.",
         "Thi is really something impressiv thaat we should look into rigt away!"]
for text in texts:
    text = text_to_ints(text)

    #random = np.random.randint(0,len(testing_sorted))
    #text = testing_sorted[random]
    #text = noise_maker(text, 0.95)

     
    print (batch_size)    
    with tf.Session() as sess:
        # Load saved model
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint)

        #Multiply by batch_size to match the model's input parameters
        answer_logits = sess.run(model.predictions, {model.inputs: [text]*batch_size, 
                                                     model.inputs_length: [len(text)]*batch_size,
                                                     model.targets_length: [len(text)+1], 
                                                     model.keep_prob: [1.0]})[0]

    # Remove the padding from the generated sentence
    pad = vocab_to_int["<PAD>"] 

    print('\nText')
    print('  Word Ids:    {}'.format([i for i in text]))
    print('  Input Words: {}'.format("".join([int_to_vocab[i] for i in text])))

    print('\nSummary')
    print('  Word Ids:       {}'.format([i for i in answer_logits if i != pad]))
    print('  Response Words: {}'.format("".join([int_to_vocab[i] for i in answer_logits if i != pad])))

Done building graph
128
INFO:tensorflow:Restoring parameters from ./kp=0.75,nl=2,th=0.95_v2.ckpt

Text
  Word Ids:    [57, 9, 23, 20, 20, 7, 5, 19, 7, 6, 19, 22, 7, 2, 2, 7, 8, 16, 20, 1, 38, 19, 28, 24, 8, 24, 19, 7, 6, 19, 28, 39, 24, 19, 39, 13, 16, 19, 5, 23, 23, 22, 19, 1, 13, 19, 6, 1, 16, 22, 39, 19, 23, 27, 23, 0, 39, 22, 4, 39, 43, 19]
  Input Words: Spellin is difficult, whch is wyh you need to study everyday. 

Summary
  Word Ids:       [57, 9, 23, 20, 20, 7, 5, 19, 7, 6, 19, 22, 7, 2, 2, 7, 8, 16, 20, 1, 38, 19, 28, 24, 24, 19, 7, 6, 19, 28, 39, 19, 39, 13, 16, 19, 5, 23, 23, 22, 19, 1, 13, 19, 6, 1, 16, 22, 39, 19, 23, 27, 23, 0, 39, 19, 1, 13, 19, 6, 1, 16, 22]
  Response Words: Spellin is difficult, whh is wy you need to study every to stud
128
INFO:tensorflow:Restoring parameters from ./kp=0.75,nl=2,th=0.95_v2.ckpt

Text
  Word Ids:    [41, 24, 23, 19, 2, 7, 0, 6, 1, 19, 22, 4, 39, 6, 19, 13, 2, 19, 24, 23, 0, 19, 23, 31, 7, 6, 1, 23, 5, 8, 23, 19, 7, 5, 19, 1, 24, 19, 

In [23]:
for _ in range(7):
    #text = text_to_ints(text)

    random = np.random.randint(0,len(testing_sorted))
    text = testing_sorted[random]
    text = noise_maker(text, 0.95)

 
    with tf.Session() as sess:
        # Load saved model
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint)

        print(text)
        print([text]*10)
        #Multiply by batch_size to match the model's input parameters
        answer_logits = sess.run(model.predictions, {model.inputs: [text]*batch_size, 
                                                     model.inputs_length: [len(text)]*batch_size,
                                                     model.targets_length: [len(text)+1], 
                                                     model.keep_prob: [1.0]})[0]

    # Remove the padding from the generated sentence
    pad = vocab_to_int["<PAD>"] 

    print('\nText')
    print('  Word Ids:    {}'.format([i for i in text]))
    print('  Input Words: {}'.format("".join([int_to_vocab[i] for i in text])))

    print('\nSummary')
    print('  Word Ids:       {}'.format([i for i in answer_logits if i != pad]))
    print('  Response Words: {}'.format("".join([int_to_vocab[i] for i in answer_logits if i != pad])))

INFO:tensorflow:Restoring parameters from ./kp=0.75,nl=2,th=0.95_v2.ckpt
[58, 4, 0, 9, 23, 0, 19, 2, 3, 19, 16, 8, 15, 16, 55, 12, 12, 63, 63, 19, 2, 15, 19, 6, 43]
[[58, 4, 0, 9, 23, 0, 19, 2, 3, 19, 16, 8, 15, 16, 55, 12, 12, 63, 63, 19, 2, 15, 19, 6, 43], [58, 4, 0, 9, 23, 0, 19, 2, 3, 19, 16, 8, 15, 16, 55, 12, 12, 63, 63, 19, 2, 15, 19, 6, 43], [58, 4, 0, 9, 23, 0, 19, 2, 3, 19, 16, 8, 15, 16, 55, 12, 12, 63, 63, 19, 2, 15, 19, 6, 43], [58, 4, 0, 9, 23, 0, 19, 2, 3, 19, 16, 8, 15, 16, 55, 12, 12, 63, 63, 19, 2, 15, 19, 6, 43], [58, 4, 0, 9, 23, 0, 19, 2, 3, 19, 16, 8, 15, 16, 55, 12, 12, 63, 63, 19, 2, 15, 19, 6, 43], [58, 4, 0, 9, 23, 0, 19, 2, 3, 19, 16, 8, 15, 16, 55, 12, 12, 63, 63, 19, 2, 15, 19, 6, 43], [58, 4, 0, 9, 23, 0, 19, 2, 3, 19, 16, 8, 15, 16, 55, 12, 12, 63, 63, 19, 2, 15, 19, 6, 43], [58, 4, 0, 9, 23, 0, 19, 2, 3, 19, 16, 8, 15, 16, 55, 12, 12, 63, 63, 19, 2, 15, 19, 6, 43], [58, 4, 0, 9, 23, 0, 19, 2, 3, 19, 16, 8, 15, 16, 55, 12, 12, 63, 63, 19, 2, 15, 19, 6, 43


Text
  Word Ids:    [36, 5, 13, 22, 19, 8, 0, 1, 6, 23, 39, 19, 28, 24, 7, 20, 23, 19, 39, 13, 16, 0, 23, 19, 1, 24, 7, 5, 30, 7, 5, 10, 19, 28, 4, 1, 19, 1, 13, 45, 5, 45, 28, 24, 4, 1, 19, 1, 13, 19, 9, 16, 0, 0, 43]
  Input Words: Anod crtsey while youre thinking wat to-n-what to purr.

Summary
  Word Ids:       [36, 5, 22, 13, 22, 19, 8, 0, 1, 6, 23, 39, 19, 28, 24, 7, 20, 23, 19, 39, 13, 16, 0, 23, 19, 1, 24, 7, 5, 30, 7, 5, 10, 19, 28, 4, 1, 19, 1, 13, 45, 5, 45, 28, 24, 4, 1, 19, 1, 13, 19, 9, 16, 0, 0, 43]
  Response Words: Andod crtsey while youre thinking wat to-n-what to purr.
INFO:tensorflow:Restoring parameters from ./kp=0.75,nl=2,th=0.95_v2.ckpt
[59, 19, 21, 23, 5, 1, 7, 13, 5, 8, 19, 5, 13, 19, 5, 4, 21, 23, 6, 26, 19, 17, 16, 1, 24, 19, 4, 9, 9, 39, 19, 1, 24, 23, 19, 21, 5, 4, 19, 28, 24, 13, 19, 8, 24, 4, 5, 10, 23, 6, 19, 49, 21, 21, 4, 19, 2, 13, 0, 19, 58, 4, 0, 0, 7, 23, 65, 1, 25, 19, 58, 36, 32, 41, 49, 50, 19, 70, 59, 62, 19, 51, 0, 6, 43]
[[59, 19, 21, 23, 5,


Text
  Word Ids:    [32, 23, 1, 0, 7, 1, 6, 30, 39, 19, 33, 16, 21, 9, 23, 22, 19, 16, 9, 19, 6, 16, 22, 22, 23, 5, 20, 39, 19, 13, 5, 1, 13, 19, 24, 7, 6, 19, 30, 5, 23, 23, 6, 19, 4, 5, 22, 19, 20, 13, 13, 22, 30, 23, 22, 19, 0, 13, 16, 8, 5, 43]
  Input Words: Petritsky jumped up suddenly onto his knees and loodked roucn.

Summary
  Word Ids:       [32, 23, 1, 0, 7, 1, 6, 30, 39, 19, 33, 16, 21, 9, 23, 22, 19, 16, 9, 19, 6, 16, 22, 22, 23, 23, 22, 19, 16, 9, 19, 6, 16, 22, 22, 23, 5, 20, 39, 19, 13, 5, 1, 13, 19, 24, 7, 6, 19, 30, 5, 23, 23, 6, 19, 4, 5, 22, 19, 20, 13, 13, 22]
  Response Words: Petritsky jumped up suddeed up suddenly onto his knees and lood


In [24]:
def text_to_ints(text):
    '''Prepare the text for the model'''
    
    text = clean_text(text)
    return [vocab_to_int[word] for word in text]

def run_curie(data,data_type,checkpoint):

    pred=[]
    print(len(data))
#     checkpoint = "./kp=0.75,nl=2,th=0.95_v2.ckpt" #Version 2
    print(keep_probability, rnn_size, num_layers, batch_size, learning_rate, embedding_size, direction)
    model = build_graph(keep_probability, rnn_size, num_layers, batch_size, learning_rate, embedding_size, direction)
    print("Done building graph")
    
    with tf.Session() as sess:
        # Load saved model
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint)

       
        for i,sent in enumerate(data):
            if i%100==0:
                print(i)
            text = text_to_ints(sent)           
            #Multiply by batch_size to match the model's input parameters
            answer_logits = sess.run(model.predictions, {model.inputs: [text]*batch_size, 
                                                     model.inputs_length: [len(text)]*batch_size,
                                                     model.targets_length: [len(text)+1], 
                                                     model.keep_prob: [1.0]})[0]
            
            # Remove the padding from the generated sentence
            pad = vocab_to_int["<PAD>"] 

            op="".join([int_to_vocab[i] for i in answer_logits if i != pad])
            pred.append(op)
                        
#             print(op)
        
        with open('./currie_' + data_type +checkpoint[2:] + '.pkl', 'wb') as handle:
            pkl.dump(pred, handle, protocol=pkl.HIGHEST_PROTOCOL)
        print('Saved predictions in pickle')
        
    return   

In [37]:
# %load_ext autoreload
# %autoreload 2
import pickle as pkl
from eval_all import *
# dir(eval_all)
jdb_X,jdb_Y,jdb_tX,jdb_tY=read_jdb()
trec_X,trec_Y=read_trec()
qspell_X,qspell_Y=read_qspell()


# checkpoint = "./kp=0.75,nl=2,th=0.95_v2.ckpt" #Version 2
checkpoint='./kp=0.75,nl=2,th=0.95_ori_v3.ckpt'
# checkpoint='./version3_kp=0.75,nl=2,th=0.95.ckpt'
# checkpoint='./version4_kp=0.75,nl=2,th=0.95.ckpt'



# run_curie(jdb_X,'jdb_train',checkpoint)
# run_curie(jdb_tX,'jdb',checkpoint)
# run_curie(trec_X,'trec',checkpoint)
# run_curie(qspell_X,'qspell',checkpoint)
# eval_file(jdb_X,jdb_Y,'currie_jdbkp=0.75,nl=2,th=0.95_v2.ckpt.pkl')

('./JDBv1.0/train.txt', 6000, Counter({1: 5548, 2: 434, 3: 15, 4: 3}))
('./JDBv1.0/test-split1.txt', 1711, Counter({1: 1581, 2: 126, 3: 4}))
('./JDBv1.0/test-split2.txt', 1711, Counter({1: 1567, 2: 137, 3: 7}))
('./JDBv1.0/test-split3.txt', 1711, Counter({1: 1574, 2: 131, 3: 5, 4: 1}))
('Train:', 6000)
('Test:', 5133)
('./Speller Challenge TREC Dataset/Speller Challenge TREC Dataset.txt', 5892, Counter({1: 5030, 2: 824, 3: 35, 4: 3}))
('./corpus-webis-qspell-17.csv', 54772, Counter({1: 51506, 2: 2523, 3: 548, 4: 132, 5: 35, 6: 24, 7: 4}))


In [38]:
%load_ext autoreload
%autoreload 2
from eval_all import *


# eval_file(jdb_X,jdb_Y,'currie_jdb_trainkp=0.75,nl=2,th=0.95_ori_v3.ckpt.pkl')
# eval_file(jdb_tX,jdb_tY,'currie_jdbkp=0.75,nl=2,th=0.95_ori_v3.ckpt.pkl')
# eval_file(trec_X,trec_Y,'currie_treckp=0.75,nl=2,th=0.95_ori_v3.ckpt.pkl')
eval_file(qspell_X,qspell_Y,'currie_qspellkp=0.75,nl=2,th=0.95_ori_v3.ckpt.pkl')

# eval_file(jdb_X,jdb_Y,'currie_jdbkp=0.75,nl=2,th=0.95_v2.ckpt.pkl')
# eval_file(trec_X,trec_Y,'currie_treckp=0.75,nl=2,th=0.95_v2.ckpt.pkl')
# eval_file(qspell_X,qspell_Y,'currie_qspellkp=0.75,nl=2,th=0.95_v2.ckpt.pkl')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
evaluating
NEW eval
(6744.1, ' out of ', 54772, ' not in the input')
(59.1, ' out of ', 6744.1, ' correct\n')
('Method X : ', (0.00876321525481532, 0.8779504126195866, 0.8768713941429928))
