In [0]:
# Install the ROUGE scoring package that the import cell below pulls `Rouge` from.
!pip install rouge
# Mount Google Drive; the data, checkpoint and model-code paths used below live under it.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

# Copy Seq2Seq.py into the working directory so `from Seq2Seq import Seq2Seq_Model` resolves.
! cp 'gdrive/My Drive/Ab_Summarization/Our_Own_Code/Seq2Seq Combos/Seq2Seq.py' .

In [27]:
import os
import time
import pickle
import numpy as np
import pandas as pd
from rouge import Rouge
import tensorflow as tf
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
print('TensorFlow Version: {}'.format(tf.__version__))

from Seq2Seq import Seq2Seq_Model

TensorFlow Version: 1.13.1


**The different types of experiments**

<ol>
  <li> Type 1 - emb-pos, bahdanau, greedy, 2 enc-dec, 128 units </li>
  <li> Type 2 - emb-pos, luong, greedy, 2 enc-dec, 128 units </li>
  <li> Type 3 - emb-pos, bahdanau, beam(10), 2 enc-dec, 128 units </li>
  <li> Type 4 - emb-pos, luong, beam(10), 2 enc-dec, 128 units </li>
  <li> Type 5 - emb, bahdanau, greedy, 2 enc-dec, 128 units </li>
  <li> Type 6 - emb, luong, greedy, 2 enc-dec, 128 units </li>
  <li> Type 7 - emb, bahdanau, beam(10), 2 enc-dec, 128 units </li>
  <li> Type 8 - emb, luong, beam(10), 2 enc-dec, 128 units </li>
  <li> Type 9 - emb, None, greedy, 2 enc-dec, 128 units </li>
  <li> Type 10 - emb, None, beam(10), 2 enc-dec, 128 units </li>
  <li> Type 11 - emb-pos, None, greedy, 2 enc-dec, 128 units </li>
  <li> Type 12 - emb-pos, None, beam(10), 2 enc-dec, 128 units </li>
 </ol>

In [0]:
# Locations of the preprocessed training data and of the saved checkpoints.
path = 'gdrive/My Drive/Ab_Summarization/Our_Own_Code/sumdata/train/'
ckpt_path = "gdrive/My Drive/Ab_Summarization/Our_Own_Code/Seq2Seq Combos/Model_Checkpoints_Final/"
# Experiment to evaluate: 'type1' .. 'type12' (see the experiment table above).
type_val = 'type3'


def resolve_experiment_config(type_name):
    """Translate an experiment name such as 'type3' into its settings.

    The experiment number is parsed as an integer and compared against
    sets of numbers. Substring checks such as ``'1' in 'type10'`` must not
    be used here: they also match types 10, 11 and 12 and silently select
    the wrong embedding matrix / decoding mechanism.

    Args:
        type_name: string of the form 'type<N>' with N in 1..12.

    Returns:
        Tuple ``(emb_name, attn, dec_mech)`` where ``emb_name`` is the file
        name of the embedding matrix, ``attn`` is 'Bahdanau', 'Luong' or
        None, and ``dec_mech`` is 'greedy' or 'beam'.

    Raises:
        ValueError: if ``type_name`` is not 'type1' .. 'type12'.
    """
    prefix = 'type'
    number_part = str(type_name)[len(prefix):]
    if not str(type_name).startswith(prefix) or not number_part.isdigit():
        raise ValueError("Unknown experiment type: %r" % (type_name,))
    exp_no = int(number_part)
    if not 1 <= exp_no <= 12:
        raise ValueError("Unknown experiment type: %r" % (type_name,))

    # Experiments that use the embedding matrix stored as 'word_embedding_matrix1' (emb-pos).
    emb_pos_types = {1, 2, 3, 4, 11, 12}
    bahdanau_types = {1, 3, 5, 7}
    greedy_types = {1, 2, 5, 6, 9, 11}
    no_attention_types = {9, 10, 11, 12}

    emb = 'word_embedding_matrix1' if exp_no in emb_pos_types else 'word_embedding_matrix'
    if exp_no in no_attention_types:
        attention = None
    elif exp_no in bahdanau_types:
        attention = 'Bahdanau'
    else:
        attention = 'Luong'
    decoding = 'greedy' if exp_no in greedy_types else 'beam'
    return emb, attention, decoding


emb_name, attn, dec_mech = resolve_experiment_config(type_val)

In [0]:
def _load_pickle(file_name):
    """Unpickle the file ``path + file_name`` and close it afterwards.

    Args:
        file_name: name of the pickle file, relative to the global ``path``.

    Returns:
        The unpickled Python object.
    """
    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file; only load artifacts produced by this project's own preprocessing.
    with open(path + file_name, 'rb') as handle:
        return pickle.load(handle)


# Tokenised summaries / texts, the embedding matrix selected by the experiment
# type, and the two vocabulary lookup tables.
sorted_summaries = _load_pickle('summaries.pkl')
sorted_texts = _load_pickle('texts.pkl')
word_embedding_matrix = _load_pickle(emb_name)
vocab_to_int = _load_pickle('vocab_to_int')
int_to_vocab = _load_pickle('int_to_vocab')

In [0]:
# Hyper-parameters handed to Seq2Seq_Model; attention and inference mechanism
# come from the experiment type selected earlier.
params = dict(
    attention_mechanism=attn,        # Luong, Bahdanau, None
    batch_size=128,
    bidirectional=True,              # False, True
    cell_type='LSTM',
    embeddings=word_embedding_matrix,
    vocab_size=word_embedding_matrix.shape[0],
    inference_mechanism=dec_mech,    # beam, greedy
    num_decoder_layers=2,
    num_encoder_layers=2,
    num_units=128,
    end_token='<EOS>',
    beam_size=10,
    word2int=vocab_to_int,
)

In [0]:
# use function call help to see the different options possible for each param
# Instantiate the model wrapper from the `params` dictionary built in the previous cell.
ob = Seq2Seq_Model(params)

In [31]:
# Build the TensorFlow graph with infer=True (presumably inference mode - confirm in Seq2Seq.py).
# The evaluation cells below open their sessions on this graph and run `ob.inference_logits`.
# NOTE(review): the saved output of this cell is "ValueError: ignored" - confirm it runs on a fresh kernel.
train_graph = ob.build_graph(infer=True)

ValueError: ignored

In [0]:
# Sanity check: number of loaded source texts and of loaded reference summaries.
len(sorted_texts), len(sorted_summaries)

In [0]:
# Keep at most the first 1,000,000 summary/text pairs, then drop the
# references to the full lists.
sorted_sum, sorted_tex = sorted_summaries[:1000000], sorted_texts[:1000000]
del sorted_summaries
del sorted_texts

In [0]:
# First 70% of the selected pairs is the training portion, the rest is the test portion.
split = int(len(sorted_sum) * 0.7)
train_summaries = sorted_sum[:split]
test_summaries = sorted_sum[split:]
train_texts = sorted_tex[:split]
test_texts = sorted_tex[split:]

In [0]:
def pad_sentence_batch(sentence_batch, pad_int=None):
    """Pad sentences with <PAD> so that each sentence of a batch has the same length.

    Args:
        sentence_batch: sequence of sentences, each a list of token ids.
        pad_int: id used for padding. When None (the default) the id stored
            under '<PAD>' in the global ``vocab_to_int`` is used.

    Returns:
        A new list of sentences, each right-padded to the length of the
        longest sentence in the batch. The input sentences are not modified.
        An empty batch yields an empty list.
    """
    # max() of an empty sequence raises ValueError, so short-circuit here.
    if len(sentence_batch) == 0:
        return []
    if pad_int is None:
        pad_int = vocab_to_int['<PAD>']
    max_sentence = max(len(sentence) for sentence in sentence_batch)
    return [sentence + [pad_int] * (max_sentence - len(sentence)) for sentence in sentence_batch]

In [0]:
def get_batches(summaries, texts, batch_size):
    """Yield padded summary/text batches together with their unpadded lengths.

    Only full batches are produced: ``len(texts) // batch_size`` of them.
    Each yielded item is the tuple (padded summaries array, padded texts
    array, list of summary lengths, list of text lengths).
    """
    num_batches = len(texts) // batch_size
    for batch_no in range(num_batches):
        lo = batch_no * batch_size
        hi = lo + batch_size
        batch_summaries = summaries[lo:hi]
        batch_texts = texts[lo:hi]

        padded_summaries = np.array(pad_sentence_batch(batch_summaries))
        padded_texts = np.array(pad_sentence_batch(batch_texts))

        # Lengths before padding, needed for the *_lengths placeholders.
        summary_lengths = [len(item) for item in batch_summaries]
        text_lengths = [len(item) for item in batch_texts]

        yield padded_summaries, padded_texts, summary_lengths, text_lengths

## Testing Scores Overall

In [0]:
# Number of leading examples decoded by the "result cases" cells below.
val = 1000
# A test example is printed only if its ROUGE-1 F-score exceeds this value.
test_limit = 0.20
# A training example is printed only if its ROUGE-1 F-score exceeds this value.
train_limit = 0.40

In [0]:
# Decode the whole test split with the restored checkpoint and report the
# average ROUGE-1 / ROUGE-2 / ROUGE-L scores.
learning_rate = 0.005
checkpoint = "final_best_model_multigraph.ckpt"
with tf.Session(graph=train_graph) as sess:
  saver = tf.train.Saver()
  saver.restore(sess,os.path.join(ckpt_path,type_val,checkpoint))
  test_logits = []
  total_batches = len(test_summaries) // params['batch_size']
  for batch_i, (summaries_batch, texts_batch, summaries_lengths, texts_lengths) in enumerate(
          get_batches(test_summaries, test_texts, params['batch_size'])):
    ilog = sess.run( ob.inference_logits,
          {ob.encoder_inputs: texts_batch,
           ob.decoder_targets: summaries_batch,
           ob.learning_rate: learning_rate,
           ob.decoder_lengths: summaries_lengths,
           ob.encoder_lengths: texts_lengths})
    print("Batch No. %d of %d"%(batch_i+1,total_batches))
    # Each row of ilog is used below as a sequence of token ids.
    test_logits.extend(ilog.tolist())
  rouge = Rouge()
  all_pred, all_summ = [], []
  # get_batches only yields full batches, so test_logits may be shorter than
  # test_summaries; zip pairs each prediction with its own reference.
  for pred_ids, true_ids in zip(test_logits, test_summaries):
    pred = " ".join([int_to_vocab.get(tok, '<UNK>') for tok in pred_ids])
    # The <UNK> fallback must test the reference token itself. The previous
    # code tested the predicted token at the same position, which raised
    # IndexError whenever the reference was longer than the prediction.
    true = " ".join([int_to_vocab.get(tok, '<UNK>') for tok in true_ids])
    all_pred.append(pred)
    all_summ.append(true)
  
  scores = rouge.get_scores(all_pred, all_summ, avg = True)
  # Flat list: precision, recall, F for rouge-1, then rouge-2, then rouge-l.
  slist = [np.round(scores[metric][stat],decimals=4)
           for metric in ('rouge-1', 'rouge-2', 'rouge-l')
           for stat in ('p', 'r', 'f')]
  
  print(scores)
  print(slist)

## Test Result Cases

In [0]:
# Decode the first `val` test examples and print those whose ROUGE-1 F-score
# exceeds `test_limit`. Uses `learning_rate` defined in the scoring cell above.
checkpoint = "final_best_model_multigraph.ckpt"
with tf.Session(graph=train_graph) as sess:
  saver = tf.train.Saver()
  saver.restore(sess,os.path.join(ckpt_path,type_val,checkpoint))
  # Slice once instead of re-slicing the full lists for every token.
  sample_summaries, sample_texts = test_summaries[:val], test_texts[:val]
  total_batches = len(sample_summaries) // params['batch_size']
  test_logits = []
  for batch_i, (summaries_batch, texts_batch, summaries_lengths, texts_lengths) in enumerate(
          get_batches(sample_summaries, sample_texts, params['batch_size'])):
    ilog = sess.run( ob.inference_logits,
          {ob.encoder_inputs: texts_batch,
           ob.decoder_targets: summaries_batch,
           ob.learning_rate: learning_rate,
           ob.decoder_lengths: summaries_lengths,
           ob.encoder_lengths: texts_lengths})
    # Progress is reported against the number of batches in the sample,
    # not in the full test split.
    print("Batch No. %d of %d"%(batch_i+1,total_batches))
    test_logits.extend(ilog.tolist())
  rouge = Rouge()
  all_pred, all_summ, all_texts = [], [], []
  for pred_ids, true_ids, text_ids in zip(test_logits, sample_summaries, sample_texts):
    pred = " ".join([int_to_vocab.get(tok, '<UNK>') for tok in pred_ids])
    # The <UNK> fallback must test the reference token itself. The previous
    # code tested the predicted token at the same position, which raised
    # IndexError whenever the reference was longer than the prediction.
    true = " ".join([int_to_vocab.get(tok, '<UNK>') for tok in true_ids])
    text = " ".join([int_to_vocab[tok] for tok in text_ids])
    r_values = rouge.get_scores(pred, true)
    if(r_values[0]['rouge-1']['f'] > test_limit):
      print("Text : ", text, "\n", "Actual Summary : ", true, "\n System Summary : ", pred, "\n", r_values, "\n")

INFO:tensorflow:Restoring parameters from gdrive/My Drive/Ab_Summarization/Our_Own_Code/Seq2Seq Combos/Model_Checkpoints_Multigraph/type1/final_best_model_multigraph.ckpt
Batch No. 1 of 2
Batch No. 2 of 2
Text :  a taiwanese man was indicted on friday for allegedly kicking former president chen shui bian in the rear an official at the taipei district prosecutor s office said <EOS> 
 Actual Summary :  man indicted for kicking taiwan s ex president in rear 
 System Summary :  us soldier in iraq for iraq s iraq s iraq s iraq 
 [{'rouge-1': {'f': 0.3749999953125, 'p': 0.5, 'r': 0.3}, 'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0}, 'rouge-l': {'f': 0.22368421052631188, 'p': 0.3333333333333333, 'r': 0.2}}] 

Text :  malaysian share prices ended percent higher friday amid rising plantation and financial stocks but political tensions capped sentiment dealers said <EOS> 
 Actual Summary :  malaysian shares close up percent 
 System Summary :  malaysian shares close lower on oil oil prices rise on oi

## Training Overall Results

In [0]:
# Decode the whole training split with the restored checkpoint and report the
# average ROUGE scores. Uses `learning_rate` defined in the test scoring cell.
checkpoint = "final_best_model_multigraph.ckpt"
with tf.Session(graph=train_graph) as sess:
  saver = tf.train.Saver()
  saver.restore(sess,os.path.join(ckpt_path,type_val,checkpoint))
  train_logits = []
  total_batches = len(train_summaries) // params['batch_size']
  for batch_i, (summaries_batch, texts_batch, summaries_lengths, texts_lengths) in enumerate(
          get_batches(train_summaries, train_texts, params['batch_size'])):
    ilog = sess.run( ob.inference_logits,
          {ob.encoder_inputs: texts_batch,
           ob.decoder_targets: summaries_batch,
           ob.learning_rate: learning_rate,
           ob.decoder_lengths: summaries_lengths,
           ob.encoder_lengths: texts_lengths})
    print("Batch No. %d of %d"%(batch_i+1,total_batches))
    train_logits.extend(ilog.tolist())
  rouge = Rouge()
  all_pred, all_summ = [], []
  # get_batches only yields full batches, so train_logits may be shorter than
  # train_summaries; zip pairs each prediction with its own reference.
  for pred_ids, true_ids in zip(train_logits, train_summaries):
    pred = " ".join([int_to_vocab.get(tok, '<UNK>') for tok in pred_ids])
    # The <UNK> fallback must test the reference token itself. The previous
    # code tested the predicted token at the same position, which raised
    # IndexError whenever the reference was longer than the prediction.
    true = " ".join([int_to_vocab.get(tok, '<UNK>') for tok in true_ids])
    all_pred.append(pred)
    all_summ.append(true)
  print("Average Rouge Socres :", rouge.get_scores(all_pred, all_summ, avg = True))

INFO:tensorflow:Restoring parameters from gdrive/My Drive/Ab_Summarization/Our_Own_Code/Model_Check_Points/final_best_model.ckpt
Batch No. 1 of 10937
Batch No. 2 of 10937
Batch No. 3 of 10937
Batch No. 4 of 10937
Batch No. 5 of 10937
Batch No. 6 of 10937
Batch No. 7 of 10937
Batch No. 8 of 10937
Batch No. 9 of 10937
Batch No. 10 of 10937
Batch No. 11 of 10937
Batch No. 12 of 10937
Batch No. 13 of 10937
Batch No. 14 of 10937
Batch No. 15 of 10937
Batch No. 16 of 10937
Batch No. 17 of 10937
Batch No. 18 of 10937
Batch No. 19 of 10937
Batch No. 20 of 10937
Batch No. 21 of 10937
Batch No. 22 of 10937
Batch No. 23 of 10937
Batch No. 24 of 10937
Batch No. 25 of 10937
Batch No. 26 of 10937
Batch No. 27 of 10937
Batch No. 28 of 10937
Batch No. 29 of 10937
Batch No. 30 of 10937
Batch No. 31 of 10937
Batch No. 32 of 10937
Batch No. 33 of 10937
Batch No. 34 of 10937
Batch No. 35 of 10937
Batch No. 36 of 10937
Batch No. 37 of 10937
Batch No. 38 of 10937
Batch No. 39 of 10937
Batch No. 40 of 10937


KeyboardInterrupt: ignored

In [0]:
# Score whatever is currently held in `all_pred` / `all_summ`. Neither name is
# defined in this cell, so it relies on values left in the kernel by an earlier cell.
print("Average Rouge Socres :", rouge.get_scores(all_pred, all_summ, avg = True))

Average Rouge Socres : {'rouge-1': {'f': 0.12376427897287923, 'p': 0.13526231720904244, 'r': 0.12198089229377379}, 'rouge-2': {'f': 0.01626121191644408, 'p': 0.015450675832679069, 'r': 0.018597545720308317}, 'rouge-l': {'f': 0.11026323894579852, 'p': 0.12765404174533782, 'r': 0.11488721068993046}}


## Training Result Cases

In [0]:
# Decode the first `val` training examples and print those whose ROUGE-1
# F-score exceeds `train_limit`. Uses `learning_rate` defined in the test
# scoring cell.
checkpoint = "final_best_model_multigraph.ckpt"
with tf.Session(graph=train_graph) as sess:
  saver = tf.train.Saver()
  saver.restore(sess,os.path.join(ckpt_path,type_val,checkpoint))
  # Slice once instead of re-slicing the full lists for every token.
  sample_summaries, sample_texts = train_summaries[:val], train_texts[:val]
  total_batches = len(sample_summaries) // params['batch_size']
  train_logits = []
  for batch_i, (summaries_batch, texts_batch, summaries_lengths, texts_lengths) in enumerate(
          get_batches(sample_summaries, sample_texts, params['batch_size'])):
    ilog = sess.run( ob.inference_logits,
          {ob.encoder_inputs: texts_batch,
           ob.decoder_targets: summaries_batch,
           ob.learning_rate: learning_rate,
           ob.decoder_lengths: summaries_lengths,
           ob.encoder_lengths: texts_lengths})
    print("Batch No. %d of %d"%(batch_i+1,total_batches))
    train_logits.extend(ilog.tolist())
  rouge = Rouge()
  all_pred, all_summ, all_texts = [], [], []
  for pred_ids, true_ids, text_ids in zip(train_logits, sample_summaries, sample_texts):
    pred = " ".join([int_to_vocab.get(tok, '<UNK>') for tok in pred_ids])
    # The <UNK> fallback must test the reference token itself. The previous
    # code tested the predicted token at the same position, which raised
    # IndexError whenever the reference was longer than the prediction.
    true = " ".join([int_to_vocab.get(tok, '<UNK>') for tok in true_ids])
    text = " ".join([int_to_vocab[tok] for tok in text_ids])
    r_values = rouge.get_scores(pred, true)
    if(r_values[0]['rouge-1']['f'] > train_limit):
      print("Text : ", text, "\n", "Actual Summary : ", true, "\n System Summary : ", pred, "\n", r_values, "\n")

INFO:tensorflow:Restoring parameters from gdrive/My Drive/Ab_Summarization/Our_Own_Code/Model_Check_Points/final_best_model.ckpt
Batch No. 1 of 15
Batch No. 2 of 15
Batch No. 3 of 15
Batch No. 4 of 15
Batch No. 5 of 15
Batch No. 6 of 15
Batch No. 7 of 15
Batch No. 8 of 15
Batch No. 9 of 15
Batch No. 10 of 15
Batch No. 11 of 15
Batch No. 12 of 15
Batch No. 13 of 15
Batch No. 14 of 15
Batch No. 15 of 15
Text :  taiwan shares closed down percent monday on wall street weakness and lacklustre interim earnings from electronics manufacturing giant hon hai dealers said <EOS> 
 Actual Summary :  taiwan shares close down percent 
 System Summary :  two shares close percent lower on weak jobs concerns jobs concerns 
 [{'rouge-1': {'f': 0.4285714239795918, 'p': 0.3333333333333333, 'r': 0.6}, 'rouge-2': {'f': 0.15384614958579892, 'p': 0.1111111111111111, 'r': 0.25}, 'rouge-l': {'f': 0.3723653395778134, 'p': 0.3333333333333333, 'r': 0.6}}] 

Text :  the united states will provide million dollars in 