In [1]:
print(1)

1


In [2]:
from src2.transformations import transformation_of_output_summary
from src2.load_glove import load_global_vectors
from src2.word_vector_conversions import vector_to_word
from src2.model import model
import numpy as np
import pickle
import string
import tensorflow as tf

In [3]:
# Global variables for cross-file access
vocabulary = None
positions = None
embeddings = None
dimension_of_word_vector = None

In [4]:
# GloVe file name
glove_file_name = '../glove.6B/glove.6B.50d.txt'

# load the glove file
vocabulary, positions = load_global_vectors(glove_file_name)

GloVe Loading Complete!


In [5]:
# convert positions to np array and change their data-type to float32
embeddings = np.asarray(positions)
embeddings = embeddings.astype(np.float32)

# The dimensions of all vectors will be same, so we just use the 1st vector
# to find the dimensions
dimension_of_word_vector = len(embeddings[0])

In [6]:
# The following is the pickled binary file of summaries in vector form which
# we un-pickle now
with open('../processed_data/amazon_reviews/_summaries_in_vector_form', 'rb') as fp:
    _summaries_in_vector_form = pickle.load(fp)

# The following is the pickled binary file of texts in vector form which
# we un-pickle now
with open('../processed_data/amazon_reviews/_texts_in_vector_form', 'rb') as fp:
    _texts_in_vector_form = pickle.load(fp)

# The following is the pickled binary file of vocabulary of reviews
# in vector form which we un-pickle now
with open('../processed_data/amazon_reviews/_reviews_vocabulary', 'rb') as fp:
    _reviews_vocabulary = pickle.load(fp)

# The following is the pickled binary file of embeddings(positions)
# of reviews vocabulary(in vector form) which we un-pickle now
with open('../processed_data/amazon_reviews/_reviews_embeddings', 'rb') as fp:
    _reviews_embeddings = pickle.load(fp)

In [7]:
# SOS -> start of sentence token, which is also added to our vocabulary
_reviews_vocabulary.append('<SOS>')

# Its position is a vector with values as 0 so that it has no effect on the
# processing of the data
_SOS_position = np.zeros(dimension_of_word_vector, dtype=np.float32)

# then we append its position to the embeddings as well
_reviews_embeddings.append(_SOS_position)

# numpy array format of the review embeddings
_np_reviews_embeddings = np.asarray(_reviews_embeddings, dtype=np.float32)

# 80 percent of data for training
_n_percent = 80

# n% of the total data will be used for training
_length_of_train_data = int(len(_texts_in_vector_form) * _n_percent / 100)

# choosing the training texts
_train_texts_in_vector_form = _texts_in_vector_form[:_length_of_train_data]
# choosing the training summaries
_train_summaries_in_vector_form = _summaries_in_vector_form[:_length_of_train_data]

# choosing the test texts
_test_texts_in_vector_form = _texts_in_vector_form[_length_of_train_data:]
# choosing the test summaries
_test_summaries_in_vector_form = _summaries_in_vector_form[_length_of_train_data:]

In [8]:
# range_width is a hyper-parameter:
# (parameter whose value is set before the learning process begins).
# Windows size for local attention will be (2 * range_width) + 1
range_width = 10

# window: let the current phrase under processing be p. We have a range
# width of 10. So, we will use local attention for all points from p - 10 to
# p + 10. Window is all the points. hence window size will be, 10 points before
# p, p, and 10 points after p = 10 + 1 + 10 = 2 * 10 + 1 = 2 * range_width + 1
window_size = 2 * range_width + 1

# Removing all the summaries that have length, greater than window size
# Removing all the texts that have length less than window size or greater than
# max_allowed_text_length
MAX_ALLOWED_LENGTH_OF_TEXT = 80  # arbitrary
MAX_ALLOWED_LENGTH_OF_SUMMARY = 7  # arbitrary

# Storing their values in a temp variable
_temp_summaries_in_vector_form = _summaries_in_vector_form
_temp_texts_in_vector_form = _texts_in_vector_form

# We initialize them as empty lists, so we can select only those that
# meet our length requirements
_summaries_in_vector_form = []
_texts_in_vector_form = []

In [9]:
# We iterate over all the summaries
for i, _summary in enumerate(_temp_summaries_in_vector_form, 0):

    # if the length of the summary is less than max_allowed_length_for_summary
    # and the length of the corresponding text lies in window_size
    # and max_allowed_length_for_text
    if len(_summary) <= MAX_ALLOWED_LENGTH_OF_SUMMARY\
            and window_size <= len(_temp_texts_in_vector_form[i])\
            <= MAX_ALLOWED_LENGTH_OF_TEXT:

        # We select the summary and its corresponding text
        _summaries_in_vector_form.append(_summary)
        _texts_in_vector_form.append(_temp_texts_in_vector_form[i])

In [10]:
# Actual beginning of training

# number of hidden layer neurons
_size_of_hidden_layer = 500

# how much each iteration affects the weights
_learning_rate = 0.003

# no. of previous hidden states to consider for residual connections.
# special hyper-parameter
_K = 5

# quite obvious
_length_of_vocabulary = len(_reviews_vocabulary)
# again, quite obvious
_training_iterations = 5

# initializing some place-holders
_tf_text = tf.placeholder(tf.float32, [None, dimension_of_word_vector])
_tf_length_of_sequence = tf.placeholder(tf.int32)
_tf_summary = tf.placeholder(tf.int32, [None])
_tf_length_of_output = tf.placeholder(tf.int32)

In [11]:
# Create the model
_output = model(_tf_text, _tf_length_of_sequence, _tf_length_of_output,
                dimension_of_word_vector, _length_of_vocabulary, _np_reviews_embeddings,
                _SOS_position)

# Optimizer and cost
_cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=_output, labels=_tf_summary))
_optimizer = tf.train.AdamOptimizer(learning_rate=_learning_rate).minimize(_cost)

# prediction
_prediction = tf.TensorArray(size=_tf_length_of_output, dtype=tf.int32)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [12]:
# Predicting the body
def _body_prediction(__i, __prediction):
    __prediction = __prediction.write(__i, tf.cast(tf.argmax(_output[__i]), tf.int32))
    return __i + 1, __prediction

_, _prediction = tf.while_loop(lambda __i, _a1: __i < _tf_length_of_output,
                               _body_prediction,
                               [0, _prediction])

_prediction = _prediction.stack()

# initialize global variables
init = tf.global_variables_initializer()

In [13]:
# start session
with tf.Session() as sess:  # Start Tensor-flow Session

    saver = tf.train.Saver()
    # Prepares variable for saving the model
    sess.run(init)  # initialize all variables
    step = 0
    loss_list = []
    acc_list = []
    val_loss_list = []
    val_acc_list = []
    best_val_acc = 0
    display_step = 1

    while step < _training_iterations:

        total_loss = 0
        total_acc = 0
        total_val_loss = 0
        total_val_acc = 0

        for i in range(0, _length_of_train_data):

            train_out = transformation_of_output_summary(
                _train_summaries_in_vector_form[i][:-1], _reviews_vocabulary, _reviews_embeddings)

            if i % display_step == 0:
                print("\nIteration: " + str(i))
                print("Training input sequence length: " +
                      str(len(_train_texts_in_vector_form[i])))
                print("Training target outputs sequence length: " + str(len(train_out)))

                print("\nTEXT:")
                flag = 0
                for vec in _train_texts_in_vector_form[i]:
                    if vector_to_word(vec, _reviews_vocabulary, _reviews_embeddings)\
                            in string.punctuation or flag == 0:
                        print(str(vector_to_word(vec, _reviews_vocabulary, _reviews_embeddings)), end='')
                    else:
                        print((" " + str(vector_to_word(vec, _reviews_vocabulary, _reviews_embeddings))), end='')
                    flag = 1

                print("\n")

            # Run optimization operation (back-propagation)
            _, loss, _prediction_ = sess.run([_optimizer, _cost, _prediction],
                                             feed_dict={
                                         _tf_text: _train_texts_in_vector_form[i],
                                         _tf_length_of_sequence: len(_train_texts_in_vector_form[i]),
                                         _tf_summary: train_out, _tf_length_of_output: len(train_out)
                                     })

            if i % display_step == 0:
                print("\nPREDICTED SUMMARY:\n")
                flag = 0
                for index in _prediction_:
                    # if int(index)!=vocab_limit.index('eos'):
                    if _reviews_vocabulary[int(index)] in string.punctuation or flag == 0:
                        print(str(_reviews_vocabulary[int(index)]), end='')
                    else:
                        print(" " + str(_reviews_vocabulary[int(index)]), end='')
                    flag = 1
                print("\n")

                print("ACTUAL SUMMARY:\n")
                flag = 0
                for vec in _train_summaries_in_vector_form[i]:
                    if vector_to_word(vec, _reviews_vocabulary, _reviews_embeddings) != 'eos':
                        if vector_to_word(vec, _reviews_vocabulary, _reviews_embeddings)\
                                in string.punctuation or flag == 0:
                            print(str(vector_to_word(vec, _reviews_vocabulary, _reviews_embeddings)), end='')
                        else:
                            print((" " +
                                   str(vector_to_word(vec, _reviews_vocabulary, _reviews_embeddings))),
                                  end='')
                    flag = 1

                print("\n")
                print("loss=" + str(loss))

        step = step + 1


Iteration: 0
Training input sequence length: 6
Training target outputs sequence length: 6

TEXT:
< filter object at unk>



AlreadyExistsError: Resource __per_step_2/while/ArithmeticOptimizer/AddOpsRewrite_add_4/tmp_var/N10tensorflow19TemporaryVariableOp6TmpVarE
	 [[{{node while/ArithmeticOptimizer/AddOpsRewrite_add_4/tmp_var}}]]