### Seq2seq

In [6]:
import numpy as np
import csv

In [5]:
# load dataset       
import gzip    

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

path = '../../data/reviews_cleaned.json.gz'        

# encoder and decoder
reviews, summaries = list(), list()
for data in parse(path):
    reviews.append(data['review'])
    # Appending SOS and EOS to target data (decoder)
    summaries.append('SOS_ ' + data['summary'] + ' _EOS')

all_data = reviews + summaries

num_enc_samples = len(summaries)
num_dec_samples = len(reviews)
print('num_en_samples: ', num_enc_samples)
print('num_de_samples: ', num_dec_samples)

num_en_samples:  3797263
num_de_samples:  3797263


In [8]:
# n_samples =10000

In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import utils
# running time calculation
import timeit
start = timeit.default_timer()

tokenizer = Tokenizer(num_words=32768) # 2**15, same as t2t model
tokenizer.fit_on_texts(all_data) 
vocab_size = min(32768, len(tokenizer.word_index) + 1)

# encoder source data
source_token = tokenizer.texts_to_sequences(reviews)
max_encoder_seq_length = max([len(sentence) for sentence in source_token])
source_padded = pad_sequences(source_token, maxlen=max_encoder_seq_length, padding = "post")
# decoder target data
target_token = tokenizer.texts_to_sequences(summaries)
max_decoder_seq_length = max([len(sentence) for sentence in target_token])
target_padded = pad_sequences(target_token, maxlen=max_decoder_seq_length, padding = "post")

stop = timeit.default_timer()
print('Time: {} s'.format(round(stop - start,2)))

In [19]:
print('vocab_size: ',vocab_size)
print('max_encoder_seq_length: ', max_encoder_seq_length)
print('max_decoder_seq_length: ', max_decoder_seq_length)

vocab_size:  32768
max_encoder_seq_length:  525
max_decoder_seq_length:  38


In [20]:
n_samples = num_enc_samples #100

# prepare data for the LSTM
decoder_input_data, decoder_target_data = list(), list()
for i in range(n_samples):
    dec_input = target_padded[i][:-1]
    target = target_padded[i][1:]     
    decoder_input_data.append(dec_input) #tar2_encoded
    decoder_target_data.append(target)

encoder_input_data = source_padded #np.array(X1)
decoder_input_data = np.array(decoder_input_data)
decoder_target_data = np.array(decoder_target_data)

print(encoder_input_data.shape)
print(decoder_input_data.shape)
print(decoder_target_data.shape)

(3797263, 525)
(3797263, 37)
(3797263, 37)
Time: 5.64 s


### Embedding from here

In [6]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM, CuDNNLSTM, Input, Embedding, TimeDistributed, Flatten, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint

vec_len = 300
n_units = 128
dropout_rate = 0.3   
# vocab_size = 32768

# Input layer of the encoder :
encoder_input = Input(shape=(None,))

# Hidden layers of the encoder :
encoder_embedding = Embedding(input_dim = vocab_size, output_dim = vec_len)(encoder_input)
encoder_dropout   = (TimeDistributed(Dropout(rate = dropout_rate)))(encoder_embedding)
encoder_LSTM      = CuDNNLSTM(n_units, return_sequences=True)(encoder_dropout)

# Output layer of the encoder 
encoder_LSTM2_layer = CuDNNLSTM(n_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_LSTM2_layer(encoder_LSTM)

# encoder states
encoder_states = [state_h, state_c]

# decoder
# Input layer of the decoder :
decoder_input = Input(shape=(None,))

# Hidden layers of the decoder :
decoder_embedding_layer = Embedding(input_dim = vocab_size, output_dim = vec_len)
decoder_embedding = decoder_embedding_layer(decoder_input)

decoder_dropout_layer = (TimeDistributed(Dropout(rate = dropout_rate)))
decoder_dropout = decoder_dropout_layer(decoder_embedding)

decoder_LSTM_layer = CuDNNLSTM(n_units, return_sequences=True)
decoder_LSTM = decoder_LSTM_layer(decoder_dropout, initial_state = encoder_states)

decoder_LSTM_2_layer = CuDNNLSTM(n_units, return_sequences=True, return_state=True)
decoder_LSTM_2,_,_ = decoder_LSTM_2_layer(decoder_LSTM)

# Output layer of the decoder :
decoder_dense = Dense(vocab_size, activation='linear', name='decoder_output')
decoder_outputs = decoder_dense(decoder_LSTM_2)


# Define encoder model
encoder_model = Model(encoder_input, encoder_states)

# Define training model
model = Model([encoder_input, decoder_input], decoder_outputs)

# Define decoder model
dec_h = Input(shape=(n_units,))
dec_c = Input(shape=(n_units,))
dec_states_inputs = [dec_h, dec_c]
decoder_outputs, state_h, state_c = decoder_LSTM_2_layer(decoder_embedding, initial_state=dec_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model([decoder_input] + dec_states_inputs, [decoder_outputs] + decoder_states)

model.summary() 

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 300)    9830400     input_2[0][0]                    
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
time_distributed (Tim

In [23]:
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
%matplotlib inline

learning_rate=0.01    
batch_size    = 32#64   
epochs        = 10#30  

# define loss function: use sparse_softmax_cross_entropy_with_logits
def sparse_loss(targets, decoder_outputs):
    return tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets, logits=decoder_outputs)

# Define a checkpoint callback 
checkpoint_name = './checkpoint/Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
callbacks_list = [checkpoint]

# Run training
decoder_target = tf.placeholder(dtype='int32', shape=(None, vocab_size))   

model.compile(optimizer=Adam(lr=learning_rate),
                loss=sparse_loss,
                target_tensors=[decoder_target])

model.fit([encoder_input_data,
               decoder_input_data],
               decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.1,
          callbacks = callbacks_list)


Train on 3417536 samples, validate on 379727 samples
Epoch 1/10

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 00001: val_loss improved from inf to 1.76697, saving model to ./checkpoint/Weights-001--1.76697.hdf5


RuntimeError: Can't decrement id ref count (unable to close file, errno = 5, error message = 'Input/output error')

In [None]:
loss = model.history['loss']
epoch = [i for i in range(epochs)]

plt.plot(epoch, loss) #, label=str(batch_size)
    
plt.legend()
# plt.title('different batch size');
plt.xlabel('epoch'); 
plt.ylabel('loss')
plt.show() 

In [10]:
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras import utils
# import tensorflow as tf
# from tensorflow.keras.models import Sequential, Model
# from tensorflow.keras.layers import Dense, LSTM, CuDNNLSTM, Input, Embedding, TimeDistributed, Flatten, Dropout
# from tensorflow.keras.callbacks import ModelCheckpoint

from tensorflow.keras.models import load_model
# model = load_model("checkpoint_v2/Weights-001--1.76697.hdf5")

model.load_weights("./checkpoint_v2/Weights-001--1.76697.hdf5")

OSError: Unable to open file (truncated file: eof = 254820352, sblock->base_addr = 0, stored_eof = 295170392)

In [None]:
# model = load_model(data_path + "\model-40.hdf5")
# dummy_iters = 40
# example_training_generator = KerasBatchGenerator(train_data, num_steps, 1, vocabulary,
#                                                      skip_step=1)
# print("Training data:")
# for i in range(dummy_iters):
#     dummy = next(example_training_generator.generate())
# num_predict = 10
# true_print_out = "Actual words: "
# pred_print_out = "Predicted words: "
# for i in range(num_predict):
#     data = next(example_training_generator.generate())
#     prediction = model.predict(data[0])
#     predict_word = np.argmax(prediction[:, num_steps-1, :])
#     true_print_out += reversed_dictionary[train_data[num_steps + dummy_iters + i]] + " "
#     pred_print_out += reversed_dictionary[predict_word] + " "
# print(true_print_out)
# print(pred_print_out)

In [None]:
def test_summary_generation(reviews):
 
     # clean inputs
    cleaned = cleaning_data(reviews) 
    # tokenize
    tokenized = tokenizer.texts_to_sequences([cleaned]) 
    # padding
    sequence = pad_sequences(tokenized, maxlen = maxlen)  
   
    # encode
    state = encoder_model.predict(sequence)

#     # collect predictions
#     output = list()
#     for t in [answer_word2index['_B_'], answer_word2index['_U_']]:
#         # predict next sequence
#         target_seq = np.eye(n_class)[[t]]
#         target_seq = target_seq[newaxis,:, : ]
#         yhat, h, c = decoder_model.predict([target_seq] + state)
#         # save first prediction
#         output.append(yhat[0,0,:])
#         # update state
#         state = [h, c]
#         # update target sequence
#         target_seq = yhat
    
#     # select max probability words and decode
#     output_sequence = [np.argmax(vector) for vector in np.array(output)]
#     decoded = [answer_index2word[i] for i in output_sequence]

#     # Remove anything after '_E_'        
#     if "_E_" in decoded:
#         end = decoded.index('_E_')
#         answer = ' '.join(decoded[:end])
#     else :
#         answer = ' '.join(decoded[:])    
#     # if no answer return choose random answer    
#     if answer:
#         result = answer
#     else: 
#         result = np.random.random_integers(100)
#     return result

In [None]:
# pip install py-rouge

import rouge

def prepare_results(p, r, f):
    return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(metric, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)


for aggregator in ['Avg', 'Best', 'Individual']:
    print('Evaluation with {}'.format(aggregator))
    apply_avg = aggregator == 'Avg'
    apply_best = aggregator == 'Best'

    evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                           max_n=4,
                           limit_length=True,
                           length_limit=100,
                           length_limit_type='words',
                           apply_avg=apply_avg,
                           apply_best=apply_best,
                           alpha=0.5, # Default F1_score
                           weight_factor=1.2,
                           stemming=True)

    hypothesis_1 = "King Norodom Sihanouk has declined requests to chair a summit of Cambodia 's top political leaders , saying the meeting would not bring any progress in deadlocked negotiations to form a government .\nGovernment and opposition parties have asked King Norodom Sihanouk to host a summit meeting after a series of post-election negotiations between the two opposition groups and Hun Sen 's party to form a new government failed .\nHun Sen 's ruling party narrowly won a majority in elections in July , but the opposition _ claiming widespread intimidation and fraud _ has denied Hun Sen the two-thirds vote in parliament required to approve the next government .\n"
    references_1 = ["Prospects were dim for resolution of the political crisis in Cambodia in October 1998.\nPrime Minister Hun Sen insisted that talks take place in Cambodia while opposition leaders Ranariddh and Sam Rainsy, fearing arrest at home, wanted them abroad.\nKing Sihanouk declined to chair talks in either place.\nA U.S. House resolution criticized Hun Sen's regime while the opposition tried to cut off his access to loans.\nBut in November the King announced a coalition government with Hun Sen heading the executive and Ranariddh leading the parliament.\nLeft out, Sam Rainsy sought the King's assurance of Hun Sen's promise of safety and freedom for all politicians.",
                    "Cambodian prime minister Hun Sen rejects demands of 2 opposition parties for talks in Beijing after failing to win a 2/3 majority in recent elections.\nSihanouk refuses to host talks in Beijing.\nOpposition parties ask the Asian Development Bank to stop loans to Hun Sen's government.\nCCP defends Hun Sen to the US Senate.\nFUNCINPEC refuses to share the presidency.\nHun Sen and Ranariddh eventually form a coalition at summit convened by Sihanouk.\nHun Sen remains prime minister, Ranariddh is president of the national assembly, and a new senate will be formed.\nOpposition leader Rainsy left out.\nHe seeks strong assurance of safety should he return to Cambodia.\n",
                    ]

    hypothesis_2 = "China 's government said Thursday that two prominent dissidents arrested this week are suspected of endangering national security _ the clearest sign yet Chinese leaders plan to quash a would-be opposition party .\nOne leader of a suppressed new political party will be tried on Dec. 17 on a charge of colluding with foreign enemies of China '' to incite the subversion of state power , '' according to court documents given to his wife on Monday .\nWith attorneys locked up , harassed or plain scared , two prominent dissidents will defend themselves against charges of subversion Thursday in China 's highest-profile dissident trials in two years .\n"
    references_2 = "Hurricane Mitch, category 5 hurricane, brought widespread death and destruction to Central American.\nEspecially hard hit was Honduras where an estimated 6,076 people lost their lives.\nThe hurricane, which lingered off the coast of Honduras for 3 days before moving off, flooded large areas, destroying crops and property.\nThe U.S. and European Union were joined by Pope John Paul II in a call for money and workers to help the stricken area.\nPresident Clinton sent Tipper Gore, wife of Vice President Gore to the area to deliver much needed supplies to the area, demonstrating U.S. commitment to the recovery of the region.\n"

    all_hypothesis = [hypothesis_1, hypothesis_2]
    all_references = [references_1, references_2]

    scores = evaluator.get_scores(all_hypothesis, all_references)

    for metric, results in sorted(scores.items(), key=lambda x: x[0]):
        if not apply_avg and not apply_best: # value is a type of list as we evaluate each summary vs each reference
            for hypothesis_id, results_per_ref in enumerate(results):
                nb_references = len(results_per_ref['p'])
                for reference_id in range(nb_references):
                    print('\tHypothesis #{} & Reference #{}: '.format(hypothesis_id, reference_id))
                    print('\t' + prepare_results(results_per_ref['p'][reference_id], results_per_ref['r'][reference_id], results_per_ref['f'][reference_id]))
            print()
        else:
            print(prepare_results(results['p'], results['r'], results['f']))
    print()