### Seq2seq

In [1]:
import numpy as np
import csv
import tensorflow # as tf

tensorflow.enable_eager_execution()

In [2]:
# load dataset       
import gzip    

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

path = '../../data/reviews_cleaned.json.gz'        

n=1
# encoder and decoder
reviews, summaries = list(), list()
for data in parse(path):
    reviews.append(data['review'])
    # Appending SOS and EOS to target data (decoder)
    summaries.append('SOS_ ' + data['summary'] + ' _EOS')
    n+=1
    if n>50000:
        break

all_data = reviews + summaries

num_enc_samples = len(summaries)
num_dec_samples = len(reviews)
print('num_en_samples: ', num_enc_samples)
print('num_de_samples: ', num_dec_samples)

num_en_samples:  50000
num_de_samples:  50000


In [4]:
import pickle

tokenizer = pickle.load(open('tokenizer.pkl', 'rb'))

In [3]:
# n_samples =10000

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import utils
# running time calculation
import timeit
start = timeit.default_timer()

# tokenizer = Tokenizer(num_words=32768) # 2**15, same as t2t model
# tokenizer.fit_on_texts(all_data) 
# vocab_size = min(32768, len(tokenizer.word_index) + 1)

# encoder source data
source_token = tokenizer.texts_to_sequences(reviews)
max_encoder_seq_length = 525 #max([len(sentence) for sentence in source_token])
source_padded = pad_sequences(source_token, maxlen=max_encoder_seq_length, padding = "post")
# decoder target data
target_token = tokenizer.texts_to_sequences(summaries)
max_decoder_seq_length = max([len(sentence) for sentence in target_token])
target_padded = pad_sequences(target_token, maxlen=max_decoder_seq_length, padding = "post")

stop = timeit.default_timer()
print('Time: {} s'.format(round(stop - start,2)))

Time: 5.59 s


In [6]:
print('vocab_size: ',vocab_size)
print('max_encoder_seq_length: ', max_encoder_seq_length)
# print('max_decoder_seq_length: ', max_decoder_seq_length)

NameError: name 'vocab_size' is not defined

In [205]:
target_padded.shape

(50000, 45)

In [171]:
n_samples = num_enc_samples #100

# prepare data for the LSTM
decoder_input_data, decoder_target_data = list(), list()
for i in range(n_samples):
#######
    dec_input = target_padded[i][:-1]
    target = target_padded[i][1:]
    decoder_input_data.append(dec_input)
    
#     dec_input = target_token[i][:-1] # dec_input = target_padded[i][:-1]
#     target = target_token[i][1:] # target = target_padded[i][1:]
#    decoder_input_data.append(np.array(dec_input)) #tar2_encoded
    
    decoder_target_data.append(target)

encoder_input_data = source_padded #np.array(X1)
decoder_input_data = np.array(decoder_input_data)
decoder_target_data = np.array(decoder_target_data)

print(encoder_input_data.shape)
print(decoder_input_data.shape)
print(decoder_target_data.shape)

(50000, 525)
(50000, 44)
(50000, 44)


In [32]:
print(type(decoder_input_data[0]))

<class 'numpy.ndarray'>


### Embedding from here

In [23]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM, CuDNNLSTM, Input, Embedding, TimeDistributed, Flatten, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint

vec_len = 300
n_units = 128
dropout_rate = 0.3   

# Input layer of the encoder :
encoder_input = Input(shape=(None,))

# Hidden layers of the encoder :
encoder_embedding = Embedding(input_dim = vocab_size, output_dim = vec_len)(encoder_input)
encoder_dropout   = (TimeDistributed(Dropout(rate = dropout_rate)))(encoder_embedding)
encoder_LSTM      = CuDNNLSTM(n_units, return_sequences=True)(encoder_dropout)

# Output layer of the encoder 
encoder_LSTM2_layer = CuDNNLSTM(n_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_LSTM2_layer(encoder_LSTM)

# encoder states
encoder_states = [state_h, state_c]

# decoder
# Input layer of the decoder :
decoder_input = Input(shape=(None,))

# Hidden layers of the decoder :
decoder_embedding_layer = Embedding(input_dim = vocab_size, output_dim = vec_len)
decoder_embedding = decoder_embedding_layer(decoder_input)

decoder_dropout_layer = (TimeDistributed(Dropout(rate = dropout_rate)))
decoder_dropout = decoder_dropout_layer(decoder_embedding)

decoder_LSTM_layer = CuDNNLSTM(n_units, return_sequences=True)
decoder_LSTM = decoder_LSTM_layer(decoder_dropout, initial_state = encoder_states)

decoder_LSTM_2_layer = CuDNNLSTM(n_units, return_sequences=True, return_state=True)
decoder_LSTM_2,_,_ = decoder_LSTM_2_layer(decoder_LSTM)

# Output layer of the decoder :
decoder_dense = Dense(vocab_size, activation='linear', name='decoder_output')
decoder_outputs = decoder_dense(decoder_LSTM_2)


# Define encoder model
encoder_model = Model(encoder_input, encoder_states)

# Define training model
model = Model([encoder_input, decoder_input], decoder_outputs)

# Define decoder model
dec_h = Input(shape=(n_units,))
dec_c = Input(shape=(n_units,))
dec_states_inputs = [dec_h, dec_c]
decoder_outputs, state_h, state_c = decoder_LSTM_2_layer(decoder_embedding, initial_state=dec_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model([decoder_input] + dec_states_inputs, [decoder_outputs] + decoder_states)

model.summary() 

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 300)    9830400     input_5[0][0]                    
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
time_distributed_2 (TimeDistrib (None, None, 300)    0           embedding_2[0][0]                
__________________________________________________________________________________________________
embedding_

In [33]:
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
%matplotlib inline

learning_rate=0.01    
batch_size    = 32#64   
epochs        = 1#30  

# define loss function: use sparse_softmax_cross_entropy_with_logits
def sparse_loss(targets, decoder_outputs):
    return tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets, logits=decoder_outputs)

# Define a checkpoint callback 
# checkpoint_name = './checkpoint/Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
# checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
# callbacks_list = [checkpoint]

# Run training
decoder_target = tf.placeholder(dtype='int32', shape=(None, vocab_size))   

model.compile(optimizer=Adam(lr=learning_rate),
                loss=sparse_loss,
                target_tensors=[decoder_target])

model.fit([encoder_input_data,decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.1)
# ,
#           callbacks = callbacks_list

Train on 9000 samples, validate on 1000 samples


<tensorflow.python.keras.callbacks.History at 0x7f17c71295f8>

In [104]:
encoder_input_data[:10,:][1].shape

(525,)

In [101]:
np.zeros((10000,32))[:10,:][0].shape

(32,)

In [106]:
p = model.predict(encoder_input_data[:10,:][0],  batch_size=1) #, np.zeros((10000,32))[:10,:][0]]

ValueError: Error when checking input: expected embedding_6_input to have shape (525,) but got array with shape (1,)

In [91]:
encoder_input_data[:,0].shape

(10000,)

In [76]:
p = model.predict_classes([encoder_input_data[:10,0], np.zeros((10000,32))[:10,0]])

AttributeError: 'Model' object has no attribute 'predict_classes'

In [75]:
# for i in range(3):
#     data = next(example_training_generator.generate())
for i in range(5):
    data = next([encoder_input_data[:10,0], np.zeros((10000,32))[:10,0]])
    prediction = model.predict(data, batch_size=1)
    predict_word = np.argmax(prediction[0,0,:])
#     true_print_out += reversed_dictionary[train_data[num_steps + dummy_iters + i]] + " "
    pred_print_out += reversed_dictionary[predict_word] + " "
    
#     predict_word = np.argmax(prediction[:, num_steps-1, :])
print(pred_print_out)

TypeError: 'list' object is not an iterator

In [71]:
prediction[0,0,:].shape

(32768,)

In [72]:
predict_word = np.argmax(prediction[0,0,:])

In [73]:
predict_word

5

In [74]:
tokenizer.sequences_to_texts(np.array([[predict_word]]))


['a']

In [51]:
prediction = model.predict([encoder_input_data[0],np.zeros(10000,)[0]])


tokenizer.sequences_to_texts(np.array([[predict_word]]))


for i in range(dummy_iters):
    dummy = next(example_training_generator.generate())
num_predict = 10
true_print_out = "Actual words: "
pred_print_out = "Predicted words: "
for i in range(num_predict):
    data = next(example_training_generator.generate())
    prediction = model.predict(data[0])
    predict_word = np.argmax(prediction[:, num_steps-1, :])
    true_print_out += reversed_dictionary[train_data[num_steps + dummy_iters + i]] + " "
    pred_print_out += reversed_dictionary[predict_word] + " "
print(true_print_out)
print(pred_print_out)


['']

In [172]:
vocabulary_size =10000
embedding_matrix = np.zeros((vocabulary_size, 100)) 
for word, index in tokenizer.word_index.items():
    if index > vocabulary_size - 1:
        break
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

In [198]:
from tensorflow.keras.layers import Dense, LSTM, Embedding, RepeatVector

def define_model(in_vocab,out_vocab, in_timesteps, out_timesteps,units):
    model = Sequential()
    model.add(Embedding(in_vocab, 100, input_length=in_timesteps, weights=[embedding_matrix], trainable=False, mask_zero=True))
#     model_glove.add(Embedding(vocabulary_size, output_dim=100, input_length=maxlen))
    model.add(LSTM(units))
    model.add(RepeatVector(out_timesteps))
    model.add(LSTM(units, return_sequences=True))
    model.add(Dense(out_vocab, activation='softmax'))
    return model
# model compilation
# vocab_size
model = define_model(10000, 10000, 525, 45, 256)

In [199]:
from tensorflow.keras import optimizers
# rms = optimizers.RMSprop(lr=0.001)
learning_rate=0.01
model.compile(optimizer=Adam(lr=learning_rate), loss='sparse_categorical_crossentropy')


In [200]:
print(target_padded[:50000,:].shape)
print(source_padded[:50000,:].shape)
trainY = target_padded[:50000,:]
trainY.reshape(trainY.shape[0], trainY.shape[1], 1).shape

(50000, 45)
(50000, 525)


(50000, 45, 1)

In [187]:
# trainY.reshape(trainY.shape[0], trainY.shape[1], 1)[2]
source_padded[:50000,:][0].shape

(525,)

In [202]:
# filename = 'model.h1.v3'
# checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# Define a checkpoint callback 
checkpoint_name = './checkpoint_v3/Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True,mode ='auto') 
# callbacks_list = [checkpoint]


model.fit(source_padded[:50000,:], trainY.reshape(trainY.shape[0], trainY.shape[1], 1),
                    epochs=1, batch_size=32, validation_split = 0.1, 
                    verbose=1, callbacks=[checkpoint])

Train on 45000 samples, validate on 5000 samples
 5568/45000 [==>...........................] - ETA: 40:01 - loss: nan

KeyboardInterrupt: 

In [208]:
target_tensor_train.shape[1]

45

# New

In [11]:
from __future__ import absolute_import, division, print_function

# Import TensorFlow >= 1.10 and enable eager execution


import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split
import tensorflow as tf
import unicodedata
import re
import numpy as np
import os
import time

print(tensorflow.__version__)

1.13.0-rc1


In [12]:
input_tensor_train = source_padded
target_tensor_train = target_padded

In [13]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
N_BATCH = BUFFER_SIZE//BATCH_SIZE
embedding_dim = 256
units = 512 #1024
vocab_inp_size = 8000 #len(inp_lang.word2idx)
vocab_tar_size = 8000 #len(targ_lang.word2idx)
dec_units = target_tensor_train.shape[1]
enc_units = input_tensor_train.shape[1]

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [14]:
def gru(units):
  # If you have a GPU, we recommend using CuDNNGRU(provides a 3x speedup than GRU)
  # the code automatically does that.
    if tf.test.is_gpu_available():
        return tf.keras.layers.CuDNNGRU(units, 
                                        return_sequences=True, 
                                        return_state=True, 
                                        recurrent_initializer='glorot_uniform')
    else:
        return tf.keras.layers.GRU(units, 
                                   return_sequences=True, 
                                   return_state=True, 
                                   recurrent_activation='sigmoid', 
                                   recurrent_initializer='glorot_uniform')

In [15]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.enc_units)
        
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)        
        return output, state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [16]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.dec_units)
        self.fc = tf.keras.layers.Dense(vocab_size)
        
        # used for attention
        self.W1 = tf.keras.layers.Dense(self.dec_units)
        self.W2 = tf.keras.layers.Dense(self.dec_units)
        self.V = tf.keras.layers.Dense(1)
        
    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        hidden_with_time_axis = tf.expand_dims(hidden, 1)
        
        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying tanh(FC(EO) + FC(H)) to self.V
        score = self.V(tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis)))
        
        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)
        
        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * enc_output
        context_vector = tf.reduce_sum(context_vector, axis=1)
        
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        
        # passing the concatenated vector to the GRU
        output, state = self.gru(x)
        
        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))
        
        # output shape == (batch_size * 1, vocab)
        x = self.fc(output)
        
        return x, state, attention_weights
        
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.dec_units))

In [17]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

In [18]:
optimizer = tf.train.AdamOptimizer()


def loss_function(real, pred):
    mask = 1 - np.equal(real, 0)
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask
    return tf.reduce_mean(loss_)

In [19]:
import os
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [20]:
import time
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()
    
    hidden = encoder.initialize_hidden_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset):
        loss = 0
        
        with tf.GradientTape() as tape:
            enc_output, enc_hidden = encoder(inp, hidden)
            
            dec_hidden = enc_hidden
            
            dec_input = tf.expand_dims([targ_lang.word2idx['<start>']] * BATCH_SIZE, 1)       
            
            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
                
                loss += loss_function(targ[:, t], predictions)
                
                # using teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)
        
        batch_loss = (loss / int(targ.shape[1]))
        
        total_loss += batch_loss
        
        variables = encoder.variables + decoder.variables
        
        gradients = tape.gradient(loss, variables)
        
        optimizer.apply_gradients(zip(gradients, variables))
        
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)
    
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / N_BATCH))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Instructions for updating:
Colocations handled automatically by placer.


InvalidArgumentError: indices[60,12] = 14813 is not in [0, 8000) [Op:ResourceGather] name: encoder/embedding/embedding_lookup/

In [233]:
def evaluate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    
    sentence = preprocess_sentence(sentence)

    inputs = [inp_lang.word2idx[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    
    result = ''

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word2idx['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        
        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += targ_lang.idx2word[predicted_id] + ' '

        if targ_lang.idx2word[predicted_id] == '<end>':
            return result, sentence, attention_plot
        
        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

In [None]:
# function for plotting the attention weights
def plot_attention(attention, sentence, predicted_sentence):
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')
    
    fontdict = {'fontsize': 14}
    
    ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)

    plt.show()

In [None]:
def translate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):
    result, sentence, attention_plot = evaluate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)
        
    print('Input: {}'.format(sentence))
    print('Predicted translation: {}'.format(result))
    
    attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
    plot_attention(attention_plot, sentence.split(' '), result.split(' '))

In [None]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

In [None]:
translate(u'hace mucho frio aqui.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)

In [85]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['train','validation'])
plt.show()

NameError: name 'history' is not defined

In [None]:
# loss = model.history['loss']
# epoch = [i for i in range(epochs)]

# plt.plot(epoch, loss) #, label=str(batch_size)
    
# plt.legend()
# # plt.title('different batch size');
# plt.xlabel('epoch'); 
# plt.ylabel('loss')
# plt.show() 

In [None]:
# model.predict()

In [None]:
# model = load_model(data_path + "\model-40.hdf5")
# dummy_iters = 40
# example_training_generator = KerasBatchGenerator(train_data, num_steps, 1, vocabulary,
#                                                      skip_step=1)
# print("Training data:")
# for i in range(dummy_iters):
#     dummy = next(example_training_generator.generate())
# num_predict = 10
# true_print_out = "Actual words: "
# pred_print_out = "Predicted words: "
# for i in range(num_predict):
#     data = next(example_training_generator.generate())
#     prediction = model.predict(data[0])
#     predict_word = np.argmax(prediction[:, num_steps-1, :])
#     true_print_out += reversed_dictionary[train_data[num_steps + dummy_iters + i]] + " "
#     pred_print_out += reversed_dictionary[predict_word] + " "
# print(true_print_out)
# print(pred_print_out)

In [None]:
# def test_summary_generation(reviews):
 
#      # clean inputs
#     cleaned = cleaning_data(reviews) 
#     # tokenize
#     tokenized = tokenizer.texts_to_sequences([cleaned]) 
#     # padding
#     sequence = pad_sequences(tokenized, maxlen = maxlen)  
   
#     # encode
#     state = encoder_model.predict(sequence)


#     # collect predictions
#     output = list()
#     for t in [answer_word2index['_B_'], answer_word2index['_U_']]:
#         # predict next sequence
#         target_seq = np.eye(n_class)[[t]]
#         target_seq = target_seq[newaxis,:, : ]
#         yhat, h, c = decoder_model.predict([target_seq] + state)
#         # save first prediction
#         output.append(yhat[0,0,:])
#         # update state
#         state = [h, c]
#         # update target sequence
#         target_seq = yhat
    
#     # select max probability words and decode
#     output_sequence = [np.argmax(vector) for vector in np.array(output)]
#     decoded = [answer_index2word[i] for i in output_sequence]

#     # Remove anything after '_E_'        
#     if "_E_" in decoded:
#         end = decoded.index('_E_')
#         answer = ' '.join(decoded[:end])
#     else :
#         answer = ' '.join(decoded[:])    
#     # if no answer return choose random answer    
#     if answer:
#         result = answer
#     else: 
#         result = np.random.random_integers(100)
#     return result

In [None]:
# pip install py-rouge

# import rouge

# def prepare_results(p, r, f):
#     return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(metric, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)


# for aggregator in ['Avg', 'Best', 'Individual']:
#     print('Evaluation with {}'.format(aggregator))
#     apply_avg = aggregator == 'Avg'
#     apply_best = aggregator == 'Best'

#     evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
#                            max_n=4,
#                            limit_length=True,
#                            length_limit=100,
#                            length_limit_type='words',
#                            apply_avg=apply_avg,
#                            apply_best=apply_best,
#                            alpha=0.5, # Default F1_score
#                            weight_factor=1.2,
#                            stemming=True)

#     hypothesis_1 = "King Norodom Sihanouk has declined requests to chair a summit of Cambodia 's top political leaders , saying the meeting would not bring any progress in deadlocked negotiations to form a government .\nGovernment and opposition parties have asked King Norodom Sihanouk to host a summit meeting after a series of post-election negotiations between the two opposition groups and Hun Sen 's party to form a new government failed .\nHun Sen 's ruling party narrowly won a majority in elections in July , but the opposition _ claiming widespread intimidation and fraud _ has denied Hun Sen the two-thirds vote in parliament required to approve the next government .\n"
#     references_1 = ["Prospects were dim for resolution of the political crisis in Cambodia in October 1998.\nPrime Minister Hun Sen insisted that talks take place in Cambodia while opposition leaders Ranariddh and Sam Rainsy, fearing arrest at home, wanted them abroad.\nKing Sihanouk declined to chair talks in either place.\nA U.S. House resolution criticized Hun Sen's regime while the opposition tried to cut off his access to loans.\nBut in November the King announced a coalition government with Hun Sen heading the executive and Ranariddh leading the parliament.\nLeft out, Sam Rainsy sought the King's assurance of Hun Sen's promise of safety and freedom for all politicians.",
#                     "Cambodian prime minister Hun Sen rejects demands of 2 opposition parties for talks in Beijing after failing to win a 2/3 majority in recent elections.\nSihanouk refuses to host talks in Beijing.\nOpposition parties ask the Asian Development Bank to stop loans to Hun Sen's government.\nCCP defends Hun Sen to the US Senate.\nFUNCINPEC refuses to share the presidency.\nHun Sen and Ranariddh eventually form a coalition at summit convened by Sihanouk.\nHun Sen remains prime minister, Ranariddh is president of the national assembly, and a new senate will be formed.\nOpposition leader Rainsy left out.\nHe seeks strong assurance of safety should he return to Cambodia.\n",
#                     ]

#     hypothesis_2 = "China 's government said Thursday that two prominent dissidents arrested this week are suspected of endangering national security _ the clearest sign yet Chinese leaders plan to quash a would-be opposition party .\nOne leader of a suppressed new political party will be tried on Dec. 17 on a charge of colluding with foreign enemies of China '' to incite the subversion of state power , '' according to court documents given to his wife on Monday .\nWith attorneys locked up , harassed or plain scared , two prominent dissidents will defend themselves against charges of subversion Thursday in China 's highest-profile dissident trials in two years .\n"
#     references_2 = "Hurricane Mitch, category 5 hurricane, brought widespread death and destruction to Central American.\nEspecially hard hit was Honduras where an estimated 6,076 people lost their lives.\nThe hurricane, which lingered off the coast of Honduras for 3 days before moving off, flooded large areas, destroying crops and property.\nThe U.S. and European Union were joined by Pope John Paul II in a call for money and workers to help the stricken area.\nPresident Clinton sent Tipper Gore, wife of Vice President Gore to the area to deliver much needed supplies to the area, demonstrating U.S. commitment to the recovery of the region.\n"

#     all_hypothesis = [hypothesis_1, hypothesis_2]
#     all_references = [references_1, references_2]

#     scores = evaluator.get_scores(all_hypothesis, all_references)

#     for metric, results in sorted(scores.items(), key=lambda x: x[0]):
#         if not apply_avg and not apply_best: # value is a type of list as we evaluate each summary vs each reference
#             for hypothesis_id, results_per_ref in enumerate(results):
#                 nb_references = len(results_per_ref['p'])
#                 for reference_id in range(nb_references):
#                     print('\tHypothesis #{} & Reference #{}: '.format(hypothesis_id, reference_id))
#                     print('\t' + prepare_results(results_per_ref['p'][reference_id], results_per_ref['r'][reference_id], results_per_ref['f'][reference_id]))
#             print()
#         else:
#             print(prepare_results(results['p'], results['r'], results['f']))
#     print()