<h1>Summarizing news data</h1>

In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
import gzip
import codecs
import re
import time
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
from tensorflow.python.layers.core import Dense
from tensorflow.contrib.seq2seq import TrainingHelper, GreedyEmbeddingHelper, BasicDecoder, dynamic_decode
from tensorflow.contrib.seq2seq import BahdanauAttention, AttentionWrapper, sequence_loss
from tensorflow.contrib.rnn import GRUCell, DropoutWrapper
# Special vocabulary tokens; each receives its own integer id when word2int is built.
TOKEN_GO = '<GO>'    # start marker placed in front of every decoder input row
TOKEN_EOS = '<EOS>'  # end-of-sequence marker
TOKEN_PAD = '<PAD>'  # filler used to pad the rows of a batch to a common length
TOKEN_UNK = '<UNK>'  # stand-in id for words that are not in the vocabulary

In [None]:
# Read the gzip-compressed corpus; every line of each file becomes one row.
with gzip.open('data/news.txt.gz') as artfile:
    artdata = [raw_line.decode() for raw_line in artfile]
with gzip.open('data/summary.txt.gz') as titlefile:
    titledata = [raw_line.decode() for raw_line in titlefile]

# Articles and summaries are paired by line position.
news = pd.DataFrame({'Text': artdata, 'Summary': titledata})
# Continue with a random half of the rows (the sample is unseeded).
news = news.sample(frac=0.5)
# Whitespace-token counts, used later to derive the length limits.
news['Text_len'] = news['Text'].apply(lambda doc: len(doc.split()))
news['Summary_len'] = news['Summary'].apply(lambda doc: len(doc.split()))

In [None]:
# Show the first two articles, then the first two summaries.
for column in ('Text', 'Summary'):
    print(news[column].head(2).values)

In [None]:
# Copy both columns into plain Python lists, keeping the DataFrame row order.
news_summaries = list(news.Summary)
news_texts = list(news.Text)

In [None]:
def count_words(words_dict, text):
    """Add whitespace-token frequencies from `text` into `words_dict`.

    Args:
        words_dict: dict mapping word -> count; updated in place.
        text: iterable of strings, each split on whitespace.

    Returns:
        None. The counts are accumulated in `words_dict`.
    """
    for sentence in text:
        for token in sentence.split():
            words_dict[token] = words_dict.get(token, 0) + 1

In [None]:
# One frequency table covering both the summaries and the article texts.
word_counts_dict = {}
for corpus in (news_summaries, news_texts):
    count_words(word_counts_dict, corpus)

print("Total words in Vocabulary:", len(word_counts_dict))

In [None]:
def build_word_vector_matrix(vector_file):
    """Load word vectors from a whitespace-separated text file.

    Each usable line holds a word followed by its vector components. Lines
    with fewer than 26 whitespace-separated fields are skipped.

    Args:
        vector_file: path to a UTF-8 encoded vectors file.

    Returns:
        dict mapping each word to a float32 numpy array of its components.
    """
    vectors = {}
    with codecs.open(vector_file, 'r', 'utf-8') as handle:
        for raw in handle:
            fields = raw.split()
            if len(fields) >= 26:
                # A word that appears again overwrites its earlier vector.
                vectors[fields[0]] = np.asarray(fields[1:], dtype='float32')
    return vectors
# Replace the path here to point to the glove.6B.50d.txt vectors file on your system.
# The resulting dict (word -> float32 vector) is consulted when choosing the
# vocabulary and when filling the embedding matrix.
embeddings_index = build_word_vector_matrix('../../temp/glove.6B.50d.txt')

In [None]:
# Vocabulary: a word is kept when it is frequent enough OR has a pretrained vector.
count_threshold = 20
word2int = {}
for word, count in word_counts_dict.items():
    if count < count_threshold and word not in embeddings_index:
        continue
    # Ids are handed out densely in insertion order.
    word2int[word] = len(word2int)

# The special tokens take the next free ids, in this order.
special_codes = [TOKEN_UNK, TOKEN_PAD, TOKEN_EOS, TOKEN_GO]
for code in special_codes:
    word2int[code] = len(word2int)

# Reverse lookup from id back to token.
int2word = {idx: token for token, idx in word2int.items()}

In [None]:
embedding_dim = 50
nwords = len(word2int)

# Row i holds the vector of the token whose id is i.
word_emb_matrix = np.zeros((nwords, embedding_dim), dtype=np.float32)
for word, row in word2int.items():
    vector = embeddings_index.get(word)
    if vector is None:
        # No pretrained vector (this includes the special tokens): draw a
        # uniform random one in [-1, 1).
        vector = np.random.uniform(-1.0, 1.0, embedding_dim)
    word_emb_matrix[row] = vector
print("Length of word embeddings: ", len(word_emb_matrix))

In [None]:
def convert_sentence_to_ids(text, eos=False):
    """Map every sentence in `text` to a list of vocabulary ids.

    Words missing from the module-level `word2int` are replaced by the
    <UNK> id.

    Args:
        text: iterable of strings; each is split on whitespace.
        eos: when true, the <EOS> id is appended to every encoded sentence.

    Returns:
        (encoded, total_words): the list of id lists, and the number of
        words seen across all sentences (<EOS> is not counted).
    """
    encoded = []
    total_words = 0
    for sentence in text:
        tokens = sentence.split()
        total_words += len(tokens)
        ids = [word2int[tok] if tok in word2int else word2int[TOKEN_UNK]
               for tok in tokens]
        if eos:
            ids.append(word2int[TOKEN_EOS])
        encoded.append(ids)
    return encoded, total_words

In [None]:
# Encode the corpus. Only the article texts get a trailing <EOS>; summaries are
# encoded without one. `word_count` is rebound by the second call, so it ends up
# holding the word total of the texts only.
id_summaries, word_count = convert_sentence_to_ids(news_summaries, eos=False)
id_texts, word_count = convert_sentence_to_ids(news_texts, eos=True)

In [None]:
def unknown_tokens(sentence):
    """Return how many ids in `sentence` equal the <UNK> id from `word2int`."""
    return sum(1 for token_id in sentence if token_id == word2int[TOKEN_UNK])

In [None]:
# Keep only the (text, summary) pairs that are neither too short, too long,
# nor too full of unknown tokens.
news_summaries_filtered = []
news_texts_filtered = []
# Upper length limits: mean plus one standard deviation of the raw word counts.
max_text_length = int(news.Text_len.mean() + news.Text_len.std())
max_summary_length = int(news.Summary_len.mean() + news.Summary_len.std())
min_length = 4
unknown_token_text_limit = 10
unknown_token_summary_limit = 4

for idx, text_ids in enumerate(id_texts):
    summary_ids = id_summaries[idx]
    few_unknowns = (unknown_tokens(text_ids) <= unknown_token_text_limit
                    and unknown_tokens(summary_ids) <= unknown_token_summary_limit)
    lengths_ok = (min_length <= len(text_ids) <= max_text_length
                  and min_length <= len(summary_ids) <= max_summary_length)
    if few_unknowns and lengths_ok:
        news_summaries_filtered.append(summary_ids)
        news_texts_filtered.append(text_ids)

In [None]:
def model_inputs():
    """Create the placeholders that feed the model.

    Returns:
        A tuple, in this order:
        inputs_data     int32, rank 2  ('input_data')
        targets         int32, rank 2  ('targets')
        learning_rate   float32        ('learning_rate')
        dropout_probs   float32        ('dropout_probs')
        summary_len     int32, rank 1  ('summary_len')
        max_summary_len maximum of summary_len ('max_summary_len')
        text_len        int32, rank 1  ('text_len')
    """
    inputs_data = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_data')
    targets = tf.placeholder(dtype=tf.int32, shape=[None, None], name='targets')
    learning_rate = tf.placeholder(dtype=tf.float32, name='learning_rate')
    dropout_probs = tf.placeholder(dtype=tf.float32, name='dropout_probs')
    summary_len = tf.placeholder(dtype=tf.int32, shape=(None,), name='summary_len')
    # Derived inside the graph so callers only have to feed the lengths.
    max_summary_len = tf.reduce_max(summary_len, name='max_summary_len')
    text_len = tf.placeholder(dtype=tf.int32, shape=(None,), name='text_len')
    return (inputs_data, targets, learning_rate, dropout_probs,
            summary_len, max_summary_len, text_len)

In [None]:
def process_encoding_input(target_data, word2int, batch_size):
    """Build the decoder input from the target ids.

    Drops the last column of `target_data` and prepends a column filled with
    the <GO> id, so the result has the same width as `target_data`.

    Args:
        target_data: rank-2 tensor of target ids with `batch_size` rows.
        word2int: vocabulary dict; only the <GO> entry is read.
        batch_size: number of rows in the batch.

    Returns:
        Tensor of decoder input ids.
    """
    without_last = tf.strided_slice(target_data,
                                    begin=[0, 0],
                                    end=[batch_size, -1],
                                    strides=[1, 1])
    go_column = tf.fill([batch_size, 1], word2int[TOKEN_GO])
    return tf.concat([go_column, without_last], axis=1)

In [None]:
def get_cell(csize,dprob):
    """Return a GRU cell with `csize` units wrapped in input dropout.

    `dprob` is passed as `input_keep_prob`, i.e. it is the probability of
    keeping an input, not of dropping it.
    """
    return DropoutWrapper(GRUCell(csize), input_keep_prob=dprob)

def encoding_layer(csize, len_s, nl, rinp, dprob):
    """Build a stacked bidirectional GRU encoder.

    Args:
        csize: number of units in each forward and each backward cell.
        len_s: per-example sequence lengths handed to
            `tf.nn.bidirectional_dynamic_rnn`.
        nl: number of stacked bidirectional layers; must be at least 1.
        rinp: embedded encoder input (batch-major, since `time_major` is left
            at its default).
        dprob: input keep probability for the dropout wrapper of every cell.

    Returns:
        (eop, est): `eop` is the top layer's forward and backward outputs
        concatenated along axis 2; `est` is the top layer's
        (forward_state, backward_state) tuple.

    Raises:
        ValueError: if `nl` is smaller than 1.
    """
    if nl < 1:
        raise ValueError('encoding_layer needs at least one layer, got nl={}'.format(nl))

    layer_input = rinp
    for l in range(nl):
        with tf.variable_scope('encoding_l_{}'.format(l)):
            rnn_frnt = get_cell(csize, dprob)
            rnn_bkwd = get_cell(csize, dprob)
            eop, est = tf.nn.bidirectional_dynamic_rnn(rnn_frnt, rnn_bkwd,
                                                       layer_input,
                                                       len_s,
                                                       dtype=tf.float32)
        # Chain the layers: this layer's concatenated outputs feed the next
        # one. Feeding `rinp` to every layer would leave all layers except the
        # last one disconnected from the result.
        layer_input = tf.concat(eop, 2)
    return layer_input, est

In [None]:
def trng_dec_layer(dec_emb_inp, summ_len, cell_dec, st_init, lyr_op,
                   v_size, max_summ_len):
    """Build the training-time decoder.

    At every step the decoder is fed the next row of `dec_emb_inp` rather
    than its own prediction.

    Args:
        dec_emb_inp: embedded decoder inputs (batch-major).
        summ_len: per-example target lengths.
        cell_dec: decoder RNN cell.
        st_init: initial state for `cell_dec`.
        lyr_op: output projection layer applied to the cell outputs.
        v_size: unused here; kept for the existing call signature.
        max_summ_len: upper bound on the number of decoding steps.

    Returns:
        The first element returned by `dynamic_decode` (the decoder outputs).
    """
    train_helper = TrainingHelper(inputs=dec_emb_inp,
                                  sequence_length=summ_len,
                                  time_major=False)
    train_decoder = BasicDecoder(cell=cell_dec,
                                 helper=train_helper,
                                 initial_state=st_init,
                                 output_layer=lyr_op)
    final_outputs, _final_state, _final_lengths = dynamic_decode(
        train_decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=max_summ_len)
    return final_outputs

In [None]:
def infr_dec_layer(embeddings, start_token, end_token, decoding_cell, initial_state, op_layer,
                   max_summary_len, batch_size):
    """Build the inference-time (greedy) decoder.

    Args:
        embeddings: embedding matrix handed to `GreedyEmbeddingHelper`.
        start_token: id fed at the first step for every example.
        end_token: id that ends decoding for an example.
        decoding_cell: decoder RNN cell.
        initial_state: initial state for `decoding_cell`.
        op_layer: output projection layer applied to the cell outputs.
        max_summary_len: upper bound on the number of decoding steps.
        batch_size: number of examples decoded together.

    Returns:
        The first element returned by `dynamic_decode` (the decoder outputs).
    """
    # One start id per example in the batch.
    go_ids = tf.tile(tf.constant([start_token], dtype=tf.int32),
                     [batch_size],
                     name='start_tokens')
    greedy_helper = GreedyEmbeddingHelper(embeddings, go_ids, end_token)
    greedy_decoder = BasicDecoder(cell=decoding_cell,
                                  helper=greedy_helper,
                                  initial_state=initial_state,
                                  output_layer=op_layer)
    final_outputs, _final_state, _final_lengths = dynamic_decode(
        greedy_decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=max_summary_len)
    return final_outputs

In [None]:
def decoding_layer(dec_emb_op, embs, enc_op, enc_st, v_size, txt_len, 
                   summ_len,mx_summ_len, rnsize, word2int, dprob, batch_size, nlyrs):
    """Build the attention decoder for both training and inference.

    Args:
        dec_emb_op: embedded decoder inputs used during training.
        embs: embedding matrix used by the greedy inference helper.
        enc_op: encoder outputs that the attention mechanism attends over.
        enc_st: encoder final states; element 0 seeds the decoder cell state.
        v_size: size of the output projection (number of classes).
        txt_len: per-example input lengths, used as attention memory lengths.
        summ_len: per-example target lengths.
        mx_summ_len: upper bound on the number of decoding steps.
        rnsize: number of units of the decoder cell and of the attention layer.
        word2int: vocabulary dict; the <GO> and <EOS> entries are read.
        dprob: input keep probability for the decoder cell's dropout wrapper.
        batch_size: number of examples per batch.
        nlyrs: number of loop iterations that create a decoder cell; must be
            at least 1.

    Returns:
        (tr_dec_op, inf_dec_op): the training and inference decoder outputs.

    Raises:
        ValueError: if `nlyrs` is smaller than 1.
    """
    if nlyrs < 1:
        raise ValueError('decoding_layer needs at least one layer, got nlyrs={}'.format(nlyrs))

    # NOTE(review): every iteration rebinds cell_dec, so only the cell created
    # in the last iteration is used; the cells are not stacked.
    for l in range(nlyrs):
        with tf.variable_scope('dec_rnn_layer_{}'.format(l)):
            # Size the cell from the rnsize argument rather than a global.
            gru = GRUCell(rnsize)
            cell_dec = DropoutWrapper(gru, input_keep_prob=dprob)
    out_l = Dense(v_size, kernel_initializer = tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))

    attention = BahdanauAttention(rnsize, enc_op, txt_len,
                                  normalize=False,
                                  name='BahdanauAttention')
    cell_dec = AttentionWrapper(cell_dec, attention, rnsize)
    # Start from the wrapper's zero state, with the cell state replaced by the
    # first element of the encoder's final state.
    attn_zstate = cell_dec.zero_state(batch_size, tf.float32)
    attn_zstate = attn_zstate.clone(cell_state=enc_st[0])
    with tf.variable_scope("decoding_layer"):
        tr_dec_op = trng_dec_layer(dec_emb_op,
                                   summ_len,
                                   cell_dec,
                                   attn_zstate,
                                   out_l,
                                   v_size,
                                   mx_summ_len)
    # Same scope with reuse=True so inference shares the training weights.
    with tf.variable_scope("decoding_layer", reuse=True):
        inf_dec_op = infr_dec_layer(embs,
                                    word2int[TOKEN_GO],
                                    word2int[TOKEN_EOS],
                                    cell_dec,
                                    attn_zstate,
                                    out_l,
                                    mx_summ_len,
                                    batch_size)

    return tr_dec_op, inf_dec_op

In [None]:
def seq2seq_model(data_inp, data_summ_tgt, dprob, len_txt, len_summ, max_len_summ, 
                  v_size, rnsize, nlyrs, word2int, batch_size):
    """Wire the embeddings, encoder and decoder into one model.

    The embedding table is initialised from the module-level
    `word_emb_matrix` and shared by the encoder input, the decoder input and
    the inference helper.

    Args:
        data_inp: tensor of input (article) ids.
        data_summ_tgt: tensor of target (summary) ids.
        dprob: input keep probability for the dropout wrappers.
        len_txt: per-example input lengths.
        len_summ: per-example target lengths.
        max_len_summ: upper bound on the number of decoding steps.
        v_size: size of the decoder's output projection.
        rnsize: number of units per RNN cell.
        nlyrs: number of layers passed to the encoder and decoder builders.
        word2int: vocabulary dict.
        batch_size: number of examples per batch.

    Returns:
        (op_tr, op_inf): training and inference decoder outputs.
    """
    # A single trainable table for every lookup. Passing the raw numpy matrix
    # to the decoder instead would keep its embeddings frozen and embed extra
    # constant copies of the matrix in the graph.
    word_embs = tf.Variable(word_emb_matrix, name="word_embs")
    inp_enc_emb = tf.nn.embedding_lookup(word_embs, data_inp)
    op_enc, st_enc = encoding_layer(rnsize, len_txt, nlyrs, inp_enc_emb, dprob)

    inp_dec = process_encoding_input(data_summ_tgt, word2int, batch_size)
    inp_dec_emb = tf.nn.embedding_lookup(word_embs, inp_dec)

    op_tr, op_inf = decoding_layer(inp_dec_emb,
                                   word_embs,
                                   op_enc,
                                   st_enc,
                                   v_size,
                                   len_txt,
                                   len_summ,
                                   max_len_summ,
                                   rnsize,
                                   word2int,
                                   dprob,
                                   batch_size,
                                   nlyrs)

    return op_tr, op_inf

In [None]:
def pad_sentences(sentences_batch):
    """Right-pad every id list in the batch with the <PAD> id.

    All returned lists have the length of the longest list in
    `sentences_batch`. An empty batch makes `max` raise ValueError.
    """
    target_len = max(len(sentence) for sentence in sentences_batch)
    pad_id = word2int[TOKEN_PAD]
    padded = []
    for sentence in sentences_batch:
        padded.append(sentence + [pad_id] * (target_len - len(sentence)))
    return padded

In [None]:
def get_batches(summaries, texts, batch_size):
    """Yield padded batches of summaries and texts.

    Only full batches are produced; a trailing remainder shorter than
    `batch_size` is dropped.

    Yields:
        (pad_summaries_batch, pad_texts_batch, pad_summaries_lens,
        pad_texts_lens): the two padded id arrays and, for each, a list with
        the length of every row.
    """
    n_batches = len(texts) // batch_size
    for batch_idx in range(n_batches):
        lo = batch_idx * batch_size
        hi = lo + batch_size
        pad_summaries_batch = np.array(pad_sentences(summaries[lo:hi]))
        pad_texts_batch = np.array(pad_sentences(texts[lo:hi]))

        # Lengths are measured after padding, so every entry equals the
        # padded width of its batch.
        pad_summaries_lens = [len(row) for row in pad_summaries_batch]
        pad_texts_lens = [len(row) for row in pad_texts_batch]

        yield pad_summaries_batch, pad_texts_batch, pad_summaries_lens, pad_texts_lens

In [None]:
# Training hyperparameters.
epochs = 20  # maximum number of passes over the training data
batch_size = 64  # examples per batch; the graph is built for this fixed size
rnn_len = 256  # units per GRU cell
n_layers = 2  # layer count handed to the encoder and decoder builders
lr = 0.005  # value fed to the learning-rate placeholder
dr_prob = 0.75  # fed as the dropout input_keep_prob, i.e. a keep probability
logs_path='/tmp/models/'  # directory for the TensorBoard logs and the checkpoint

In [None]:
# Assemble the training graph: placeholders, model, loss and optimizer.
train_graph = tf.Graph()
with train_graph.as_default():

    data_inp, tgts, lrt, dprobs, len_summ, max_len_summ, len_txt = model_inputs()

    # The input ids are reversed along the last axis before they are encoded.
    tr_op, inf_op = seq2seq_model(
        data_inp=tf.reverse(data_inp, [-1]),
        data_summ_tgt=tgts,
        dprob=dprobs,
        len_txt=len_txt,
        len_summ=len_summ,
        max_len_summ=max_len_summ,
        v_size=len(word2int)+1,
        rnsize=rnn_len,
        nlyrs=n_layers,
        word2int=word2int,
        batch_size=batch_size)

    # Give the outputs fixed names so they can be looked up by name later.
    tr_op = tf.identity(tr_op.rnn_output, 'tr_op')
    inf_op = tf.identity(inf_op.sample_id, name='predictions')

    # 1.0 for positions inside each summary's length, 0.0 beyond it.
    seq_masks = tf.sequence_mask(len_summ, max_len_summ, dtype=tf.float32, name='masks')

    with tf.name_scope("optimizer"):
        tr_cost = sequence_loss(tr_op, tgts, seq_masks)
        optzr = tf.train.AdamOptimizer(lrt)
        grds = optzr.compute_gradients(tr_cost)
        # Clip every available gradient to [-5, 5]; variables without a
        # gradient are skipped.
        capped_grds = []
        for grd, var in grds:
            if grd is None:
                continue
            capped_grds.append((tf.clip_by_value(grd, -5., 5.), var))
        train_op = optzr.apply_gradients(capped_grds)
    tf.summary.scalar("cost", tr_cost)
print("Graph created.")

In [None]:
# Training-loop settings.
min_learning_rate = 0.0006  # NOTE(review): never read in this cell; the learning rate is not decayed
display_step = 20           # print progress every this many batches
early_stop_cnt = 0          # consecutive loss checks without improvement
early_stop_cnt_max = 3      # stop once that many checks in a row fail to improve
per_epoch = 3               # target number of loss checks per epoch

update_loss = 0             # loss accumulated since the last loss check
batch_loss = 0              # loss accumulated since the last progress print
summary_update_loss = []    # accumulated loss recorded at every loss check

# Train on the first 3000 filtered examples.
news_summaries_train = news_summaries_filtered[0:3000]
news_texts_train = news_texts_filtered[0:3000]
batches_per_epoch = len(news_texts_train) // batch_size
# Clamp to 1 so a small training set cannot cause a modulo by zero below.
update_check = max(1, (batches_per_epoch // per_epoch) - 1)
checkpoint = logs_path + 'best_so_far_model.ckpt'

with tf.Session(graph=train_graph) as sess:
    tf_summary_writer = tf.summary.FileWriter(logs_path, graph=train_graph)
    merged_summary_op = tf.summary.merge_all()
    # Build the saver once; constructing one per save keeps adding save and
    # restore ops to the graph.
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    try:
        for epoch_i in range(1, epochs+1):
            update_loss = 0
            batch_loss = 0
            for batch_i, (summaries_batch, texts_batch, summaries_len, texts_len) in enumerate(
                    get_batches(news_summaries_train, news_texts_train, batch_size)):
                before = time.time()
                _, loss, summary = sess.run(
                    [train_op, tr_cost, merged_summary_op],
                    {data_inp: texts_batch,
                     tgts: summaries_batch,
                     lrt: lr,
                     len_summ: summaries_len,
                     len_txt: texts_len,
                     dprobs: dr_prob})
                batch_loss += loss
                update_loss += loss
                after = time.time()
                batch_time = after - before
                # Global step: batches completed in earlier epochs plus the
                # position inside the current epoch.
                tf_summary_writer.add_summary(
                    summary, (epoch_i - 1) * batches_per_epoch + batch_i)
                if batch_i % display_step == 0 and batch_i > 0:
                    # The time shown is the last batch's duration scaled to
                    # display_step batches.
                    print('** Epoch {:>3}/{} Batch {:>4}/{} - Batch Loss: {:>6.3f}, seconds: {:>4.2f}'
                          .format(epoch_i,
                                  epochs,
                                  batch_i,
                                  batches_per_epoch,
                                  batch_loss / display_step,
                                  batch_time*display_step))
                    batch_loss = 0

                if batch_i % update_check == 0 and batch_i > 0:
                    print("Average loss:", round(update_loss/update_check,3))
                    summary_update_loss.append(update_loss)

                    # Checkpoint only when this is the best value seen so far.
                    if update_loss <= min(summary_update_loss):
                        print('Saving model')
                        early_stop_cnt = 0
                        saver.save(sess, checkpoint)

                    else:
                        print("No Improvement.")
                        early_stop_cnt += 1
                        if early_stop_cnt == early_stop_cnt_max:
                            break
                    update_loss = 0

            if early_stop_cnt == early_stop_cnt_max:
                print("Stopping Training.")
                break
    finally:
        # Flush pending summaries and release the event file.
        tf_summary_writer.close()

In [None]:
def text_to_seq(text):
    """Convert a whitespace-separated string into a list of vocabulary ids.

    Words that are not in `word2int` map to the <UNK> id.
    """
    ids = []
    for token in text.split():
        ids.append(word2int.get(token, word2int[TOKEN_UNK]))
    return ids

In [None]:
def _ids_to_words(ids, skip=()):
    """Return the words for `ids` joined by spaces, leaving out ids in `skip`.

    Vocabulary entries stored as bytes are decoded as UTF-8; str entries are
    used unchanged. Ids without an entry in `int2word` are shown as <UNK>.
    """
    words = []
    for i in ids:
        if i in skip:
            continue
        word = int2word.get(i, TOKEN_UNK)
        if isinstance(word, bytes):
            word = word.decode('utf-8')
        words.append(word)
    return " ".join(words)


# Pick a random example from the slice used for training. To sample outside
# that slice instead, draw from range(3000, len(news_texts_filtered)).
# The upper bound is clamped so a short filtered list cannot be over-indexed.
random = np.random.randint(0, min(3000, len(news_texts_filtered)))
text = news_texts_filtered[random]

checkpoint = logs_path + 'best_so_far_model.ckpt'

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Rebuild the saved graph and restore its weights.
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)
    input_data = loaded_graph.get_tensor_by_name('input_data:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    text_length = loaded_graph.get_tensor_by_name('text_len:0')
    summary_length = loaded_graph.get_tensor_by_name('summary_len:0')
    keep_prob = loaded_graph.get_tensor_by_name('dropout_probs:0')
    # The graph has a fixed batch size, so the same text fills the whole
    # batch and only the first row of the result is kept. A keep probability
    # of 1.0 disables dropout.
    result_logits = sess.run(logits, {input_data: [text]*batch_size,
                                      summary_length: [np.random.randint(5,8)],
                                      text_length: [len(text)]*batch_size,
                                      keep_prob: 1.0})[0]

pad = word2int[TOKEN_PAD]

print('\nText')
print('  Word Ids:    {}'.format(list(text)))
print('  Input Words: {}'.format(_ids_to_words(text)))

print('\nSummary')
print('  Word Ids:       {}'.format([i for i in result_logits if i != pad]))
print('  Response Words: {}'.format(_ids_to_words(result_logits, skip=(pad,))))
print(' Ground Truth: {}'.format(_ids_to_words(news_summaries_filtered[random])))