In [18]:
import os
print(os.getcwd())
import time
import numpy as np
import pandas as pd
import tensorflow as tf
print(tf.__version__)
import tensorflow_hub as hub
from collections import Counter
import nltk
nltk.download('punkt')

import data_process
import model

from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import tensor_array_ops

/Users/xinsun/Dev_env/Text-Summarization-Project/Implementation1
1.14.0
[nltk_data] Downloading package punkt to /Users/xinsun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
import pickle
def __pickleStuff(filename, stuff):
    save_stuff = open(filename, "wb")
    pickle.dump(stuff, save_stuff)
    save_stuff.close()
def __loadStuff(filename):
    saved_stuff = open(filename,"rb")
    stuff = pickle.load(saved_stuff)
    saved_stuff.close()
    return stuff

In [20]:
file_path = './Reviews.csv'
data = pd.read_csv(file_path)

In [21]:
print(data.shape)
data.head(1)

(568454, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...


In [22]:
data.isnull().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               16
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64

In [23]:
data.dropna(subset=['Summary'],inplace = True)
data = data[['Summary', 'Text']]
data.head(2)

Unnamed: 0,Summary,Text
0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...


In [24]:
raw_texts = []
raw_summaries = []

for text, summary in zip(data.Text, data.Summary):
    if 20 < len(text) < 300:
        raw_texts.append(text)
        raw_summaries.append(summary)

In [25]:
print(data.shape)
len(raw_texts), len(raw_summaries)

(568427, 2)


(282345, 282345)

## Clean and prepare the data

In [26]:
# the function gives us the option to keep_most of the characters inisde the texts and summaries, meaning
# punctuation, question marks, slashes...
# or we can set it to False, meaning we only want to keep letters and numbers like here.
processed_texts, processed_summaries, words_counted = data_process.preprocess_texts_and_summaries(
            raw_texts,
            raw_summaries,
            keep_most=False)

Processing Time:  201.13380217552185


In [27]:
for t,s in zip(processed_texts[:1], processed_summaries[:1]):
    print('Text\n:', t, '\n')
    print('Summary:\n', s, '\n\n\n')

Text
: ['i', 'have', 'bought', 'several', 'of', 'the', 'vitality', 'canned', 'dog', 'food', 'products', 'and', 'have', 'found', 'them', 'all', 'to', 'be', 'of', 'good', 'quality', 'the', 'product', 'looks', 'more', 'like', 'a', 'stew', 'than', 'a', 'processed', 'meat', 'and', 'it', 'smells', 'better', 'my', 'labrador', 'is', 'finicky', 'and', 'she', 'appreciates', 'this', 'product', 'better', 'than', 'most'] 

Summary:
 ['good', 'quality', 'dog', 'food'] 





### Create lookup dicts

We cannot feed our network actual words, but numbers. So we first have to create our lookup dicts, where each words gets and int value (high or low, depending on its frequency in our corpus). Those help us to later convert the texts into numbers.

We also add special tokens. EndOfSentence and StartOfSentence are crucial for the Seq2Seq model we later use.
Pad token, because all summaries and texts in a batch need to have the same length, pad token helps us do that.

So we need 2 lookup dicts:
 - From word to index 
 - from index to word. 

In [28]:
specials = ["<SOS>", "<EOS>", "<PAD>", "<UNK>"]
word2ind, ind2word,  ignore_words = data_process.create_word_indx_dicts(words_counted, specials=specials)

print(len(word2ind), len(ind2word), len(ignore_words))

57160 57160 0


### Pretrained embeddings

Optionally we can use pretrained word embeddings. Those have proved to increase training speed and accuracy.
Here I used two different options. Either we use glove embeddings or embeddings from tf_hub.
The ones from tf_hub worked better, so we use those. 

In [29]:
word2ind

{'<SOS>': 0,
 '<EOS>': 1,
 '<PAD>': 2,
 '<UNK>': 3,
 'the': 4,
 'i': 5,
 'and': 6,
 'a': 7,
 'it': 8,
 'this': 9,
 'to': 10,
 'is': 11,
 'for': 12,
 'of': 13,
 'my': 14,
 'in': 15,
 'great': 16,
 'good': 17,
 'not': 18,
 'but': 19,
 'are': 20,
 'these': 21,
 'have': 22,
 'they': 23,
 'you': 24,
 'with': 25,
 'was': 26,
 'that': 27,
 'love': 28,
 's': 29,
 'like': 30,
 't': 31,
 'coffee': 32,
 'them': 33,
 'very': 34,
 'as': 35,
 'on': 36,
 'product': 37,
 'so': 38,
 'taste': 39,
 'tea': 40,
 'flavor': 41,
 'best': 42,
 'can': 43,
 'just': 44,
 'at': 45,
 'one': 46,
 'all': 47,
 'be': 48,
 'we': 49,
 'will': 50,
 'price': 51,
 'or': 52,
 'if': 53,
 'too': 54,
 'has': 55,
 'from': 56,
 'more': 57,
 'really': 58,
 'than': 59,
 'buy': 60,
 'would': 61,
 'when': 62,
 'me': 63,
 'had': 64,
 'amazon': 65,
 'no': 66,
 'delicious': 67,
 'get': 68,
 'food': 69,
 'much': 70,
 'dog': 71,
 'only': 72,
 'find': 73,
 'better': 74,
 'out': 75,
 'time': 76,
 'use': 77,
 'little': 78,
 'other': 79,
 'lo

In [30]:
# the embeddings from tf_hub. 
# embed = hub.Module("https://tfhub.dev/google/nnlm-en-dim128/1")
embed = hub.Module("https://tfhub.dev/google/Wiki-words-250/1")
emb = embed([key for key in word2ind.keys()])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    embedding_matrix = sess.run(emb)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [31]:
np.shape(embedding_matrix)
#embedding_matrix[0]
embedding_matrix[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [32]:
print(embedding_matrix.shape)
np.save('./tf_hub_embedding.npy', embedding_matrix)

(57160, 250)


In [33]:
# converts words in texts and summaries to indices
# it looks like we have to set eos here to False
converted_texts, unknown_words_in_texts = data_process.convert_text_to_indx(processed_texts, word2ind, eos=False, sos=False)

converted_summaries, unknown_words_in_summaries = data_process.convert_text_to_indx(processed_summaries, word2ind, eos=True, sos=False)

In [34]:
def create_lengths(text):
    '''Create a data frame of the sentence lengths from a text'''
    lengths = []
    for sentence in text:
        lengths.append(len(sentence))
    return pd.DataFrame(lengths, columns=['counts'])

In [35]:
create_lengths(converted_texts[:3])


Unnamed: 0,counts
0,48
1,32
2,41


In [36]:
lengths_summaries = create_lengths(converted_summaries)
lengths_texts = create_lengths(converted_texts)

print("Summaries:")
print(lengths_summaries.describe())
print()
print("Texts:")
print(lengths_texts.describe())

Summaries:
              counts
count  282345.000000
mean        4.526537
std         2.229878
min         1.000000
25%         3.000000
50%         4.000000
75%         6.000000
max        31.000000

Texts:
              counts
count  282345.000000
mean       35.325765
std        11.547204
min         4.000000
25%        26.000000
50%        34.000000
75%        44.000000
max        73.000000


In [37]:
# Inspect the length of texts
print(np.percentile(lengths_texts.counts, 89.5))
print(np.percentile(lengths_texts.counts, 95))
print(np.percentile(lengths_texts.counts, 99))

52.0
55.0
60.0


In [38]:
# Inspect the length of summaries
print(np.percentile(lengths_summaries.counts, 90))
print(np.percentile(lengths_summaries.counts, 95))
print(np.percentile(lengths_summaries.counts, 99))

7.0
9.0
12.0


In [39]:
print("'<PAD>' has id: {}".format(word2ind['<PAD>']))

sorted_summaries_samples = converted_summaries[7:50]
sorted_texts_samples = converted_texts[7:50]

pad_summaries_batch_samples, pad_texts_batch_samples, pad_summaries_lengths_samples, pad_texts_lengths_samples = next(data_process.get_batches(word2ind, sorted_summaries_samples, sorted_texts_samples, 20))

print("pad summaries batch samples:\n\r {}".format(pad_summaries_batch_samples))

'<PAD>' has id: 2
pad summaries batch samples:
 [[  152     6  1073     1     2     2     2     2]
 [  960  4020   145     1     2     2     2     2]
 [  556    13  4020    44    85    24   817     1]
 [  958    39     1     2     2     2     2     2]
 [   28     8     1     2     2     2     2     2]
 [  342   525 16420     1     2     2     2     2]
 [  161   152     1     2     2     2     2     2]
 [   67    37     1     2     2     2     2     2]
 [ 4020     1     2     2     2     2     2     2]
 [  610   840    21    15  3178     1     2     2]
 [ 4020   960     1     2     2     2     2     2]
 [  868    66    41     1     2     2     2     2]
 [   16  1141    12     4    51     1     2     2]
 [    9    11    14    39     1     2     2     2]
 [   28   182   100   489     1     2     2     2]
 [    8    29   489     1     2     2     2     2]
 [   69    16     1     2     2     2     2     2]
 [   17   139   309     1     2     2     2     2]
 [   16    39     6   934     1   

In [40]:
__pickleStuff("./data/embedding_matrix.p", embedding_matrix)

__pickleStuff("./data/converted_summaries.p", converted_summaries)
__pickleStuff("./data/converted_texts.p", converted_texts)

__pickleStuff("./data/word2ind.p",word2ind)
__pickleStuff("./data/ind2word.p",ind2word)

# Data preprocessing finished here!

In [45]:
word_embedding_matrix = __loadStuff("./data/embedding_matrix.p")

converted_summaries = __loadStuff("./data/converted_summaries.p")
converted_texts = __loadStuff("./data/converted_texts.p")

word2ind = __loadStuff("./data/word2ind.p")
ind2word = __loadStuff("./data/ind2word.p")

In [46]:
word2ind
ind2word
converted_summaries[:3]
#converted_texts[:3]
#word_embedding_matrix[2]

[[17, 142, 71, 69, 1], [18, 35, 895, 1], [2801, 1717, 1]]

## Model

In [47]:
def model_inputs():
    input_data = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    summary_length = tf.placeholder(tf.int32, (None,), name='summary_length')
    max_summary_length = tf.reduce_max(summary_length, name='max_dec_len')
    text_length = tf.placeholder(tf.int32, (None,), name='text_length')

    return input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length



def process_decoder_train_input(target_data, vocab_to_int, batch_size):
    '''Remove the last word id from each batch and concat the <GO> to the begining of each batch'''
    
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<SOS>']), ending], 1)

    return dec_input


def encoding_layer(rnn_size, sequence_length, num_layers, rnn_inputs, keep_prob):
    for layer in range(num_layers):
        with tf.variable_scope('encoder_{}'.format(layer)):
            cell_fw = tf.contrib.rnn.LSTMCell(rnn_size,
                                              initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, 
                                                    input_keep_prob = keep_prob)

            cell_bw = tf.contrib.rnn.LSTMCell(rnn_size,
                                              initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, 
                                                    input_keep_prob = keep_prob)

            enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, 
                                                                    cell_bw, 
                                                                    rnn_inputs,
                                                                    sequence_length,
                                                                    dtype=tf.float32)
            enc_output = tf.concat(enc_output,2)
            # original code is missing this line below, that is how we connect layers 
            # by feeding the current layer's output to next layer's input
            rnn_inputs = enc_output
    return enc_output, enc_state



def training_decoding_layer(dec_embed_input, summary_length, dec_cell, output_layer, vocab_size, max_summary_length, batch_size):
    training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,
                                                        sequence_length=summary_length,
                                                        time_major=False)

    training_decoder = tf.contrib.seq2seq.BasicDecoder(cell=dec_cell,
                                                       helper=training_helper,
                                                       initial_state=dec_cell.zero_state(dtype=tf.float32, batch_size=batch_size),
                                                       output_layer = output_layer)

    training_logits = tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                           output_time_major=False,
                                                           impute_finished=True,
                                                           maximum_iterations=max_summary_length)
    return training_logits



def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, output_layer, max_summary_length, batch_size):
    '''Create the inference logits'''
    
    start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), [batch_size], name='start_tokens')
    
    inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings,
                                                                start_tokens,
                                                                end_token)
                
    inference_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                        inference_helper,
                                                        dec_cell.zero_state(dtype=tf.float32, batch_size=batch_size),
                                                        output_layer)
                
    inference_logits = tf.contrib.seq2seq.dynamic_decode(inference_decoder,
                                                            output_time_major=False,
                                                            impute_finished=True,
                                                            maximum_iterations=max_summary_length)
    
    return inference_logits



def lstm_cell(lstm_size, keep_prob):
    cell = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    return tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob = keep_prob)



def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, text_length, summary_length, max_summary_length, rnn_size, vocab_to_int, keep_prob, batch_size, num_layers):
    '''Create the decoding cell and attention for the training and inference decoding layers'''
    dec_cell = tf.contrib.rnn.MultiRNNCell([lstm_cell(rnn_size, keep_prob) for _ in range(num_layers)])
    output_layer = Dense(vocab_size,kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))
    attn_mech = tf.contrib.seq2seq.BahdanauAttention(rnn_size,
                                                     enc_output,
                                                     text_length,
                                                     normalize=False,
                                                     name='BahdanauAttention')
    dec_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell,attn_mech,rnn_size)
    
    with tf.variable_scope("decode"):
        training_logits = training_decoding_layer(dec_embed_input,summary_length,dec_cell,
                                                  output_layer,
                                                  vocab_size,
                                                  max_summary_length,
                                                  batch_size)
    with tf.variable_scope("decode", reuse=True):
        inference_logits = inference_decoding_layer(embeddings,
                                                    vocab_to_int['<SOS>'],
                                                    vocab_to_int['<EOS>'],
                                                    dec_cell,
                                                    output_layer,
                                                    max_summary_length,
                                                    batch_size)
    return training_logits, inference_logits



def seq2seq_model(input_data, target_data, keep_prob, text_length, summary_length, max_summary_length, 
                  vocab_size, rnn_size, num_layers, vocab_to_int, batch_size, word_embedding_matrix):
    '''Use the previous functions to create the training and inference logits'''
    
    # Use Numberbatch's embeddings and the newly created ones as our embeddings
    embeddings = word_embedding_matrix
    enc_embed_input = tf.nn.embedding_lookup(embeddings, input_data)
    enc_output, enc_state = encoding_layer(rnn_size, text_length, num_layers, enc_embed_input, keep_prob)
    
    dec_input = process_decoder_train_input(target_data, vocab_to_int, batch_size) #shape=(batch_size, senquence length) each seq start with index of<GO>
    dec_embed_input = tf.nn.embedding_lookup(embeddings, dec_input)

    training_logits, inference_logits  = decoding_layer(dec_embed_input, 
                                                        embeddings,
                                                        enc_output,
                                                        enc_state, 
                                                        vocab_size, 
                                                        text_length, 
                                                        summary_length, 
                                                        max_summary_length,
                                                        rnn_size, 
                                                        vocab_to_int, 
                                                        keep_prob, 
                                                        batch_size,
                                                        num_layers,
                                                        )
    return training_logits, inference_logits

## Training.

In [48]:
# Set the Hyperparameters
epochs = 10
batch_size = 64
rnn_size = 256
num_layers = 2
learning_rate = 0.01
keep_probability = 0.95

In [49]:
# Build the graph
train_graph = tf.Graph()
# Set the graph to default to ensure that it is ready for training
with train_graph.as_default():
    
    # Load the model inputs    
    input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length = model_inputs()

    # Create the training and inference logits
    training_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]),
                                                      targets, 
                                                      keep_prob,   
                                                      text_length,
                                                      summary_length,
                                                      max_summary_length,
                                                      len(word2ind)+1,
                                                      rnn_size, 
                                                      num_layers,
                                                      word2ind, 
                                                      batch_size,
                                                      word_embedding_matrix)
    
    # Create tensors for the training logits and inference logits
    training_logits = tf.identity(training_logits[0].rnn_output, 'logits')
    inference_logits = tf.identity(inference_logits[0].sample_id, name='predictions')
    
    # Create the weights for sequence_loss, the sould be all True across since each batch is padded
    masks = tf.sequence_mask(summary_length, max_summary_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(learning_rate)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
        
print("Graph is built.")
graph_location = "./graph"
print(graph_location)
train_writer = tf.summary.FileWriter(graph_location)
train_writer.add_graph(train_graph)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions f

In [50]:
len(converted_summaries)

282345

In [52]:
# Subset the data for training
start = 100000
end = start + 150000
sorted_summaries_short = sorted(converted_summaries[start:end], key=lambda item: len(item))
sorted_texts_short = sorted(converted_texts[start:end], key=lambda item: len(item))
print("The shortest text length:", len(sorted_texts_short[0]))
print("The longest text length:",len(sorted_texts_short[-1]))


The shortest text length: 4
The longest text length: 73


In [53]:
# Train the Model
learning_rate_decay = 0.95
min_learning_rate = 0.0005
display_step = 100 # Check training loss after every 20 batches
stop_early = 0 
stop = 3 # If the update loss does not decrease in 3 consecutive update checks, stop training
per_epoch = 3 # Make 3 update checks per epoch
update_check = (len(sorted_texts_short)//batch_size//per_epoch)-1

update_loss = 0 
batch_loss = 0
summary_update_loss = [] # Record the update losses for saving improvements in the model

checkpoint = "./best_model.ckpt" 
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    # If we want to continue training a previous session
    #loader = tf.train.import_meta_graph("./" + checkpoint + '.meta')
    #loader.restore(sess, checkpoint)
    
    for epoch_i in range(1, epochs+1):
        update_loss = 0
        batch_loss = 0
        for batch_i, (summaries_batch, texts_batch, summaries_lengths, texts_lengths) in enumerate(
                data_process.get_batches(word2ind, sorted_summaries_short, sorted_texts_short, batch_size)):
            start_time = time.time()
            _, loss = sess.run(
                [train_op, cost],
                {input_data: texts_batch,
                 targets: summaries_batch,
                 lr: learning_rate,
                 summary_length: summaries_lengths,
                 text_length: texts_lengths,
                 keep_prob: keep_probability})

            batch_loss += loss
            update_loss += loss
            end_time = time.time()
            batch_time = end_time - start_time

            if batch_i % display_step == 0 and batch_i > 0:
                print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds: {:>4.2f}'
                      .format(epoch_i,
                              epochs, 
                              batch_i, 
                              len(sorted_texts_short) // batch_size, 
                              batch_loss / display_step, 
                              batch_time*display_step))
                batch_loss = 0

            if batch_i % update_check == 0 and batch_i > 0:
                print("Average loss for this update:", round(update_loss/update_check,3))
                summary_update_loss.append(update_loss)
                
                # If the update loss is at a new minimum, save the model
                if update_loss <= min(summary_update_loss):
                    print('New Record!') 
                    stop_early = 0
                    saver = tf.train.Saver() 
                    saver.save(sess, checkpoint)

                else:
                    print("No Improvement.")
                    stop_early += 1
                    if stop_early == stop:
                        break
                update_loss = 0
            
                    
        # Reduce learning rate, but not below its minimum value
        learning_rate *= learning_rate_decay
        if learning_rate < min_learning_rate:
            learning_rate = min_learning_rate
        
        if stop_early == stop:
            print("Stopping Training.")
            break

Epoch   1/10 Batch  100/2343 - Loss:  3.722, Seconds: 127.54
Epoch   1/10 Batch  200/2343 - Loss:  2.938, Seconds: 87.56
Epoch   1/10 Batch  300/2343 - Loss:  2.897, Seconds: 141.60


KeyboardInterrupt: 

### Prediction test.

In [75]:
assert len(processed_texts) == len(converted_texts)
assert len(processed_texts[5000]) == len(converted_texts[5000])

converted_texts[5000]

processed_texts[5000]
processed_summaries[5000]

['mary']

In [67]:
def text_to_seq(text):
    '''Prepare the text for the model'''

    return [word2ind.get(word, word2ind['<UNK>']) for word in text.split()]

In [76]:
'''
input_sentences=["The coffee tasted great and was at such a good price! I highly recommend this to everyone!", "love individual oatmeal cups found years ago sam quit selling sound big lots quit selling found target expensive buy individually trilled get entire case time go anywhere need water microwave spoon know quaker flavor packets"]
'''
input_sentences = ['the flowers do not get as opened as they look on the picture and the tea does not taste that well very dissapointing']


generagte_summary_length =  5

texts = [text_to_seq(input_sentence) for input_sentence in input_sentences]

checkpoint = "./best_model.ckpt"

if type(generagte_summary_length) is list:
    if len(input_sentences)!=len(generagte_summary_length):
        raise Exception("[Error] makeSummaries parameter generagte_summary_length must be same length as input_sentences or an integer")
    generagte_summary_length_list = generagte_summary_length
else:
    generagte_summary_length_list = [generagte_summary_length] * len(texts)

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)
    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    text_length = loaded_graph.get_tensor_by_name('text_length:0')
    summary_length = loaded_graph.get_tensor_by_name('summary_length:0')
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')
    #Multiply by batch_size to match the model's input parameters
    for i, text in enumerate(texts):
        generagte_summary_length = generagte_summary_length_list[i]
        answer_logits = sess.run(logits, {input_data: [text]*batch_size, 
                                          summary_length: [generagte_summary_length], #summary_length: [np.random.randint(5,8)], 
                                          text_length: [len(text)]*batch_size,
                                          keep_prob: 1.0})[0] 
        # Remove the padding from the summaries
        pad = word2ind["<PAD>"] 
        print('- Review:\n\r {}'.format(input_sentences[i]))
        print('- Summary:\n\r {}\n\r\n\r'.format(" ".join([ind2word[i] for i in answer_logits if i != pad])))


INFO:tensorflow:Restoring parameters from ./best_model.ckpt
INFO:tensorflow:Restoring parameters from ./best_model.ckpt
- Review:
 the flowers do not get as opened as they look on the picture and the tea does not taste that well very dissapointing
- Summary:
 <SOS> zico hhhmmm hhhmmm <EOS>




In [80]:
word2ind['zico']

1360

## The model

Now we can build and train our model. First we define the hyperparameters we want to use. Then we create our Summarizer and call the function .build_graph(), which as the name suggests, builds the computation graph. 
Then we can train the model using .train()

After training we can try our model using .infer()

### Training

We can optionally use a cyclic learning rate, which we do here. 
I trained the model for 20 epochs and the loss was low then, but we could train it longer and would probably get better results.

Unfortunately I do not have the resources to find the perfect (or right) hyperparameters, but these do pretty well. 

In [None]:
# model hyperparametes
num_layers_encoder = 4
num_layers_decoder = 4
rnn_size_encoder = 512
rnn_size_decoder = 512

batch_size = 256
epochs = 200
clip = 5
keep_probability = 0.5
learning_rate = 0.0005
max_lr=0.005
learning_rate_decay_steps = 700
learning_rate_decay = 0.90


pretrained_embeddings_path = './tf_hub_embedding.npy'
summary_dir = os.path.join('./tensorboard', str('Nn_' + str(rnn_size_encoder) + '_Lr_' + str(learning_rate)))


use_cyclic_lr = True
inference_targets=True

In [None]:
len(converted_summaries)

In [None]:
round(78862*0.9)

In [None]:
# build graph and train the model 
summarizer_model_utils.reset_graph()
summarizer = Summarizer.Summarizer(word2ind,
                                   ind2word,
                                   save_path='./models/amazon/my_model',
                                   mode='TRAIN',
                                   num_layers_encoder = num_layers_encoder,
                                   num_layers_decoder = num_layers_decoder,
                                   rnn_size_encoder = rnn_size_encoder,
                                   rnn_size_decoder = rnn_size_decoder,
                                   batch_size = batch_size,
                                   clip = clip,
                                   keep_probability = keep_probability,
                                   learning_rate = learning_rate,
                                   max_lr=max_lr,
                                   learning_rate_decay_steps = learning_rate_decay_steps,
                                   learning_rate_decay = learning_rate_decay,
                                   epochs = epochs,
                                   pretrained_embeddings_path = pretrained_embeddings_path,
                                   use_cyclic_lr = use_cyclic_lr,
                                   summary_dir = summary_dir)           

summarizer.build_graph()
summarizer.train(converted_texts[:70976], 
                 converted_summaries[:70976],
                 validation_inputs=converted_texts[70976:],
                 validation_targets=converted_summaries[70976:])


# hidden training output.
# both train and validation loss decrease nicely.

### Inference
Now we can use our trained model to create summaries. 

In [None]:
summarizer_model_utils.reset_graph()
summarizer = Summarizer.Summarizer(word2ind,
                                   ind2word,
                                   './models/amazon/my_model',
                                   'INFER',
                                   num_layers_encoder = num_layers_encoder,
                                   num_layers_decoder = num_layers_decoder,
                                   batch_size = len(converted_texts[:50]),
                                   clip = clip,
                                   keep_probability = 1.0,
                                   learning_rate = 0.0,
                                   beam_width = 5,
                                   rnn_size_encoder = rnn_size_encoder,
                                   rnn_size_decoder = rnn_size_decoder,
                                   inference_targets = True,
                                   pretrained_embeddings_path = pretrained_embeddings_path)

summarizer.build_graph()
preds = summarizer.infer(converted_texts[:50],
                         restore_path =  './models/amazon/my_model',
                         targets = converted_summaries[:50])


In [None]:
# show results
summarizer_model_utils.sample_results(preds,
                                      ind2word,
                                      word2ind,
                                      converted_summaries[:50],
                                      converted_texts[:50])

In [5]:
a = [1,2,3,4,5]
b = a.pop(-1)

In [7]:
b
a

[1, 2, 3, 4]

In [8]:
a[::-1]

[4, 3, 2, 1]