## GRU with Attention

In [2]:
# print(tf.__version__)

In [1]:
from __future__ import absolute_import, division, print_function
import tensorflow 
tensorflow.enable_eager_execution()

In [2]:
!pip install scikit-learn

Collecting scikit-learn
[?25l  Downloading https://files.pythonhosted.org/packages/99/6c/bbbf3452cd5c8ed8e6cb51d37e06ebea3113d347085a59a21f19ee76c8eb/scikit_learn-0.21.2-cp35-cp35m-manylinux1_x86_64.whl (6.6MB)
[K    100% |████████████████████████████████| 6.6MB 13.8MB/s ta 0:00:011    66% |█████████████████████▎          | 4.4MB 44.5MB/s eta 0:00:01
Collecting joblib>=0.11 (from scikit-learn)
[?25l  Downloading https://files.pythonhosted.org/packages/cd/c1/50a758e8247561e58cb87305b1e90b171b8c767b15b12a1734001f41d356/joblib-0.13.2-py2.py3-none-any.whl (278kB)
[K    100% |████████████████████████████████| 286kB 58.6MB/s ta 0:00:01
Installing collected packages: joblib, scikit-learn
Successfully installed joblib-0.13.2 scikit-learn-0.21.2
[33mYou are using pip version 19.0.1, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
import numpy as np
import csv
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import re
import os
import time

In [6]:
# load dataset       
import gzip    

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

# path = '../../data/reviews_cleaned.json.gz' 
path = '../../data/reviews.json.gz' 

# n=1
# encoder and decoder
reviews, summaries = list(), list()
for data in parse(path):
    try:        
        if data['summary'] in data['review']:
            continue
        if ''.join(data['summary'].strip().split()[:3]) ==  ''.join(data['summary'].strip().split()[:3]):
            continue        
        
        reviews.append(data['review'])
        # Appending SOS and EOS to target data (decoder)
        summaries.append('sos' + data['summary'] + 'eos')
#     n+=1
#     if n>2000:
#         break
    except:
        print(data)
all_data = reviews + summaries

num_enc_samples = len(summaries)
num_dec_samples = len(reviews)
print('num_en_samples: ', num_enc_samples)
print('num_de_samples: ', num_dec_samples)

num_en_samples:  4032305
num_de_samples:  4032305


In [4]:
# import pickle
# tokenizer = pickle.load(open('tokenizer.pkl', 'rb'))

In [7]:
# train, test data split
# train_X, test_X, train_Y, test_Y = train_test_split(source_padded, target_padded, test_size=0.01)
train_reviews, test_reviews, train_summaries, test_summaries = train_test_split(reviews, summaries, test_size=0.01)

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import utils
# running time check
import timeit
start = timeit.default_timer()

tokenizer = Tokenizer(num_words=16384) #16384#2**14 #32768# 2**15, same as t2t model
tokenizer.fit_on_texts(all_data) 
vocab_size = 16384 #len(tokenizer.word_index) #min(10000, len(tokenizer.word_index) + 1)

# encoder source data
source_token = tokenizer.texts_to_sequences(train_reviews)
max_encoder_seq_length = 525 #max([len(sentence) for sentence in source_token])
source_padded = pad_sequences(source_token, maxlen=max_encoder_seq_length, padding = "post")
# decoder target data
target_token = tokenizer.texts_to_sequences(train_summaries)
max_decoder_seq_length = max([len(sentence) for sentence in target_token])
target_padded = pad_sequences(target_token, maxlen=max_decoder_seq_length, padding = "post")

stop = timeit.default_timer()
print('Time: {} s'.format(round(stop - start,2)))

Time: 824.66 s


In [9]:
vocab_size

16384

# GRU with attention

In [10]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import CuDNNGRU, Dense, Input, Embedding, TimeDistributed, Flatten, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint

In [13]:
train_X = source_padded
train_Y = target_padded

In [14]:
BUFFER_SIZE = len(train_X)
BATCH_SIZE = 32#64
N_BATCH = BUFFER_SIZE//BATCH_SIZE
embedding_dim = 100 #256
units = 512 #1024
vocab_size = 16384
vocab_inp_size = vocab_size 
vocab_tar_size = vocab_size 
dec_units = train_Y.shape[1]
enc_units = train_X.shape[1]

dataset = tf.data.Dataset.from_tensor_slices((train_X, train_Y)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [15]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    def call(self, x, hidden):
        x = self.embedding(x)   
        encoder_dropout   = (TimeDistributed(Dropout(rate = 0.2)))(x) #dropout_rate
        gru_layer_1 = CuDNNGRU(self.enc_units, return_sequences=True) \
                            (encoder_dropout, initial_state = hidden)
        gru_layer_2 = CuDNNGRU(self.enc_units, return_state=True)
        output, state = gru_layer_2(gru_layer_1)        
        return output, state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [18]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)      
        
#         self.gru = gru(self.dec_units)
        self.fc = tf.keras.layers.Dense(vocab_size)
        
        # used for attention
        self.W1 = tf.keras.layers.Dense(self.dec_units)
        self.W2 = tf.keras.layers.Dense(self.dec_units)
        self.V = tf.keras.layers.Dense(1)
        
    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # perform addition to calculate the score
        hidden_with_time_axis = tf.expand_dims(hidden, 1)
        
        # score shape == (batch_size, max_length, 1)
        score = self.V(tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis)))
        
        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)
        
        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * enc_output
        context_vector = tf.reduce_sum(context_vector, axis=1)
        
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        
        # passing the concatenated vector to the GRU
        gru_dec_layer_1 = CuDNNGRU(self.dec_units,return_sequences=True)(x)     
        gru_dec_layer_2 = CuDNNGRU(self.dec_units,return_sequences=True, return_state=True)
        output, state = gru_dec_layer_2(gru_dec_layer_1)   
        
        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))
        
        # output shape == (batch_size * 1, vocab)
        x = self.fc(output)
        
        return x, state, attention_weights
        
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.dec_units))

In [19]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

In [20]:
optimizer = tf.train.AdamOptimizer()

def loss_function(real, pred):
    mask = 1 - np.equal(real, 0)
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask
    return tf.reduce_mean(loss_)

In [21]:
checkpoint_dir = './checkpoints_v3/'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,encoder=encoder,decoder=decoder)

In [22]:
import time
# running time check
import timeit
start = timeit.default_timer()

EPOCHS = 2

for epoch in range(EPOCHS):
    start = time.time()
    
    hidden = encoder.initialize_hidden_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset):
        loss = 0
        
        with tf.GradientTape() as tape:
            enc_output, enc_hidden = encoder(inp, hidden)
            
            dec_hidden = enc_hidden
            
            dec_input = tf.expand_dims([tokenizer.word_index['sos']] * BATCH_SIZE, 1)
            
            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
                
                loss += loss_function(targ[:, t], predictions)
                
                # using teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)
        
        batch_loss = (loss / int(targ.shape[1]))
        
        total_loss += batch_loss
        
        variables = encoder.variables + decoder.variables
        
        gradients = tape.gradient(loss, variables)
        
        optimizer.apply_gradients(zip(gradients, variables))
        
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)
    
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / N_BATCH))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
    
stop = timeit.default_timer()
print('Time: {} s'.format(round(stop - start,2)))

Instructions for updating:
Colocations handled automatically by placer.
Epoch 1 Batch 0 Loss 1.6192
Epoch 1 Batch 100 Loss 1.7391
Epoch 1 Batch 200 Loss 1.8297
Epoch 1 Batch 300 Loss 1.6525
Epoch 1 Batch 400 Loss 1.6939
Epoch 1 Batch 500 Loss 1.7272
Epoch 1 Batch 600 Loss 1.5917
Epoch 1 Batch 700 Loss 1.5265
Epoch 1 Batch 800 Loss 1.5971
Epoch 1 Batch 900 Loss 1.5877
Epoch 1 Batch 1000 Loss 1.4587
Epoch 1 Batch 1100 Loss 1.5660
Epoch 1 Batch 1200 Loss 1.5420
Epoch 1 Batch 1300 Loss 1.6159
Epoch 1 Batch 1400 Loss 1.5123
Epoch 1 Batch 1500 Loss 1.5129
Epoch 1 Batch 1600 Loss 1.5976
Epoch 1 Batch 1700 Loss 1.6570
Epoch 1 Batch 1800 Loss 1.5752
Epoch 1 Batch 1900 Loss 1.3501
Epoch 1 Batch 2000 Loss 1.4758
Epoch 1 Batch 2100 Loss 1.5511
Epoch 1 Batch 2200 Loss 1.3209
Epoch 1 Batch 2300 Loss 1.3935
Epoch 1 Batch 2400 Loss 1.5293
Epoch 1 Batch 2500 Loss 1.4261
Epoch 1 Batch 2600 Loss 1.5182
Epoch 1 Batch 2700 Loss 1.3979
Epoch 1 Batch 2800 Loss 1.4831
Epoch 1 Batch 2900 Loss 1.2923
Epoch 1 Ba

MemoryError: 

In [23]:
idx2word = {idx: word for word, idx in tokenizer.word_index.items()}


In [24]:
# def summarize(text, encoder, decoder, tokenizer, max_encoder_seq_length, max_decoder_seq_length):
def summarize(text):
    attention_plot = np.zeros((max_decoder_seq_length, max_encoder_seq_length))
    
#     text = preprocess(text)
    inputs = tokenizer.texts_to_sequences([text])
    inputs = pad_sequences(inputs, maxlen=max_encoder_seq_length, padding = "post")
    inputs = tf.convert_to_tensor(inputs)
    
    result = ''

    hidden = [tf.zeros((1, units))]
#     print(inputs)
#     print(hidden)
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([tokenizer.word_index['sos']], 0)

    for t in range(max_decoder_seq_length):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        
        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += idx2word[predicted_id] + ' '

        if idx2word[predicted_id] == 'eos':
            return result, text, attention_plot
        
        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, text, attention_plot

In [25]:
pred_summaries = []
for review in test_reviews[0:10]:
    summary, review, attention_plot = summarize(review)
    pred_summaries.append(summary)
    print("summary: ", summary, "\n")
    print("review: ", review)

MemoryError: 

In [266]:
pred_summaries = []
for review in test_reviews[0:10]:
    summary, review, attention_plot = summarize(review)
    pred_summaries.append(summary)
    print("summary: ", summary, "\n")
    print("review: ", review)

summary:  and and and and and and and and of and and and and and to and and the and the and and and and and and and and  

review:  the archetypical american novel features a solitary protagonist undertaking an odyssey in which heshe confronts both physical and moral challenges and through which heshe emerges with a renewed spirit, transformed by the crucible of confrontations with adversaries, both real and imagined  sara gruens engaging water for elephants is an eminently american work, set in the depths of the depression and featuring a brokenhearted young man whose unplanned existential leap of faith catapults him into a chaotic and unknown way of life  jacob jankowski discovers his untapped reservoir of courage, conviction and compassion, and in so doing, he, in every manner of the american definition of the word, emerges as a hero  water for elephants is a triumph    of a writer fully in control of her craft, of a character resolute in his determination to discover life and of a 

summary:  and to this of of the and the the of and and and the the the and of and to and and and and of and and the  

review:  flash jumps around a bib in this adventure, but in the west his manner improved form deplorable to tolerably despicablei do not think it would be amused with the oh show tune , i want to be an indian too , would not you  like a sioux but he honest shows a slice of what the encroaching society effect were on the fictional charterers and there action and reaction to the new world  as far as the locals, heck it was their street corner so i will not make any judgment
summary:  and and the and and and and the and and the and of and the this the and and and and and the and and the and and  

review:  a song of fire and ice is one of the few fantasy series which becomes more enjoyable with each novel released  a dance with dragons was a long time coming but the wait was worth itabout half of the main characters were missing from a feast for crows, so i thought that d

summary:  and and the and the the of this and the and to and to of the and to and and the this to and the and to the  

review:  this is a remarkable book that will introduce you to the process of science and a fascinating aspect of the emergence of life  trilobites are among the best fossils for children to get to know because they are very distinct the tri lobed shells and very different from anything currently living the horseshoe crab on american atlantic beaches is comparable in unique appearance and attracts children with similar fascinationfor those who want a better system of american science education, fortey gives some powerful hints  consider his language quotthe fever of discovery was upon me i found a trilobitethe textbook came alivethis was my first discovery of the animals that would change my life p18quot  he continues, i knew, by some principle which i could not articulate, that the wider end was the head of the animal  and of course upon the head there were the eyes  

In [236]:
!pip install nltk

Collecting nltk
[?25l  Downloading https://files.pythonhosted.org/packages/73/56/90178929712ce427ebad179f8dc46c8deef4e89d4c853092bee1efd57d05/nltk-3.4.1.zip (3.1MB)
[K    100% |████████████████████████████████| 3.1MB 24.9MB/s ta 0:00:011
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/97/8a/10/d646015f33c525688e91986c4544c68019b19a473cb33d3b55
Successfully built nltk
Installing collected packages: nltk
Successfully installed nltk-3.4.1
[33mYou are using pip version 19.0.1, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [237]:
!pip install py-rouge


[33mYou are using pip version 19.0.1, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [263]:
import nltk
nltk.download('punkt')
import rouge

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [269]:
# pip install py-rouge
import rouge

def prepare_results(p, r, f):
    return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(metric, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)


for aggregator in ['Avg', 'Best']: #, 'Individual'
    print('Evaluation with {}'.format(aggregator))
    apply_avg = aggregator == 'Avg'
    apply_best = aggregator == 'Best'

    evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                           max_n=4,
                           limit_length=True,
                           length_limit=100,
                           length_limit_type='words',
                           apply_avg=apply_avg,
                           apply_best=apply_best,
                           alpha=0.5, # Default F1_score
                           weight_factor=1.2,
                           stemming=True)

    prediction = pred_summaries
    gold = test_summaries[0:10]

    scores = evaluator.get_scores(prediction, gold)

    for metric, results in sorted(scores.items(), key=lambda x: x[0]):
#         if not apply_avg and not apply_best: # value is a type of list as we evaluate each summary vs each reference
#             for hypothesis_id, results_per_ref in enumerate(results):
#                 nb_references = len(results_per_ref['p'])
#                 for reference_id in range(nb_references):
#                     print('\Pred_summary #{} & Gold_summary #{}: '.format(hypothesis_id, reference_id))
#                     print('\t' + prepare_results(results_per_ref['p'][reference_id], results_per_ref['r'][reference_id], results_per_ref['f'][reference_id]))
#             print()
        if apply_avg or apply_best:
            print(prepare_results(results['p'], results['r'], results['f']))
    print()

Evaluation with Avg
	rouge-1:	P:  3.93	R:  8.60	F1:  5.30
	rouge-2:	P:  0.00	R:  0.00	F1:  0.00
	rouge-3:	P:  0.00	R:  0.00	F1:  0.00
	rouge-4:	P:  0.00	R:  0.00	F1:  0.00
	rouge-l:	P:  6.17	R: 11.81	F1:  8.00
	rouge-w:	P:  3.54	R:  4.71	F1:  3.94

Evaluation with Best
	rouge-1:	P:  3.93	R:  8.60	F1:  5.30
	rouge-2:	P:  0.00	R:  0.00	F1:  0.00
	rouge-3:	P:  0.00	R:  0.00	F1:  0.00
	rouge-4:	P:  0.00	R:  0.00	F1:  0.00
	rouge-l:	P:  6.17	R: 11.81	F1:  8.00
	rouge-w:	P:  3.54	R:  4.71	F1:  3.94

Evaluation with Individual
	Hypothesis #0 & Reference #0: 
		rouge-1:	P:  0.00	R:  0.00	F1:  0.00
	Hypothesis #1 & Reference #0: 
		rouge-1:	P:  7.14	R: 16.67	F1: 10.00
	Hypothesis #2 & Reference #0: 
		rouge-1:	P:  0.00	R:  0.00	F1:  0.00
	Hypothesis #3 & Reference #0: 
		rouge-1:	P:  7.14	R: 20.00	F1: 10.53
	Hypothesis #4 & Reference #0: 
		rouge-1:	P:  7.14	R: 16.67	F1: 10.00
	Hypothesis #5 & Reference #0: 
		rouge-1:	P:  7.14	R:  8.33	F1:  7.69
	Hypothesis #6 & Reference #0: 
		rouge-1:	P:  3

In [229]:
# function for plotting the attention weights
def plot_attention(attention, text, predicted_summary):
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')
    
    fontdict = {'fontsize': 14}
    
    ax.set_xticklabels([''] + text, fontdict=fontdict, rotation=90)
    ax.set_yticklabels([''] + predicted_summary, fontdict=fontdict)

    plt.show()

In [230]:
# def print_result(text, encoder, decoder, tokenizer, max_encoder_seq_length, max_decoder_seq_length):
def print_result(tex):
    result, text, attention_plot = summarize(text, encoder, decoder, tokenizer, max_encoder_seq_length, max_decoder_seq_length)
        
    print('Input: {}'.format(text))
    print('Predicted translation: {}'.format(result))
    
    attention_plot = attention_plot[:len(result.split(' ')), :len(text.split(' '))]
    plot_attention(attention_plot, text.split(' '), result.split(' '))
    

In [231]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.checkpointable.util.CheckpointLoadStatus at 0x7f5dc88ecef0>

In [264]:
aaaa = '''
we have many of the old, old issue but the number had depleted there were not  enough books to allow us to use them regularly with the additional supply the books will be used more often  they arre a good old standby for gospel singing
'''
print_result(aaaa)


In [256]:
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.legend(['train','validation'])
# plt.show()

In [None]:
# loss = model.history['loss']
# epoch = [i for i in range(epochs)]

# plt.plot(epoch, loss) #, label=str(batch_size)
    
# plt.legend()
# # plt.title('different batch size');
# plt.xlabel('epoch'); 
# plt.ylabel('loss')
# plt.show() 

In [217]:
# glove_6B_100d_file_path_name = "../glove.6B/glove.6B.100d.txt"

# embeddings_index = dict()

# f = open(glove_6B_100d_file_path_name)

# for line in f:
#     values = line.split()
#     word = values[0]
#     coefs = np.asarray(values[1:], dtype='float32')
#     embeddings_index[word] = coefs
    
# f.close()
# print('Loaded %s word vectors.' % len(embeddings_index))

# # Tokenize
# vocabulary_size = len(all_glove_words)
# tokenizer = Tokenizer() #num_words= vocabulary_size
# tokenizer.fit_on_texts(all_glove_words) 

# # create a weight matrix for words in training docs
# embedding_matrix = np.zeros((vocabulary_size, 100)) 
# for word, index in tokenizer.word_index.items():
#     if index > vocabulary_size - 1:
#         break
#     else:
#         embedding_vector = embeddings_index.get(word)
#         if embedding_vector is not None:
#             embedding_matrix[index] = embedding_vector

In [None]:
# model.predict()

In [None]:
# model = load_model(data_path + "\model-40.hdf5")
# dummy_iters = 40
# example_training_generator = KerasBatchGenerator(train_data, num_steps, 1, vocabulary,
#                                                      skip_step=1)
# print("Training data:")
# for i in range(dummy_iters):
#     dummy = next(example_training_generator.generate())
# num_predict = 10
# true_print_out = "Actual words: "
# pred_print_out = "Predicted words: "
# for i in range(num_predict):
#     data = next(example_training_generator.generate())
#     prediction = model.predict(data[0])
#     predict_word = np.argmax(prediction[:, num_steps-1, :])
#     true_print_out += reversed_dictionary[train_data[num_steps + dummy_iters + i]] + " "
#     pred_print_out += reversed_dictionary[predict_word] + " "
# print(true_print_out)
# print(pred_print_out)

In [None]:
# def test_summary_generation(reviews):
 
#      # clean inputs
#     cleaned = cleaning_data(reviews) 
#     # tokenize
#     tokenized = tokenizer.texts_to_sequences([cleaned]) 
#     # padding
#     sequence = pad_sequences(tokenized, maxlen = maxlen)  
   
#     # encode
#     state = encoder_model.predict(sequence)


#     # collect predictions
#     output = list()
#     for t in [answer_word2index['_B_'], answer_word2index['_U_']]:
#         # predict next sequence
#         target_seq = np.eye(n_class)[[t]]
#         target_seq = target_seq[newaxis,:, : ]
#         yhat, h, c = decoder_model.predict([target_seq] + state)
#         # save first prediction
#         output.append(yhat[0,0,:])
#         # update state
#         state = [h, c]
#         # update target sequence
#         target_seq = yhat
    
#     # select max probability words and decode
#     output_sequence = [np.argmax(vector) for vector in np.array(output)]
#     decoded = [answer_index2word[i] for i in output_sequence]

#     # Remove anything after '_E_'        
#     if "_E_" in decoded:
#         end = decoded.index('_E_')
#         answer = ' '.join(decoded[:end])
#     else :
#         answer = ' '.join(decoded[:])    
#     # if no answer return choose random answer    
#     if answer:
#         result = answer
#     else: 
#         result = np.random.random_integers(100)
#     return result