In [2]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from bs4 import BeautifulSoup
from tqdm import tqdm
import regex as re
from tqdm import tqdm
import tensorflow as tf
import time
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
from bert_embedding import BertEmbedding
print('TensorFlow Version: {}'.format(tf.__version__))


stop_words = stopwords.words("english")
lemma = WordNetLemmatizer()
threshold = 20
missing_words = 0
bert_embedding = BertEmbedding()

TensorFlow Version: 1.15.0


##### Methods for running the project

In [3]:
# Contractions dictionary for changing the short form words
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [4]:
# Method for cleaning the dataset:

def cleaning(data, remove_stop = False):
    new_text = []
    pattern = "[^A-Z a-z]"
    try:
        if data != "":
            br_free = re.sub("<br />", " ", data)
            br_free = br_free.lower()
            html_free_text = BeautifulSoup(br_free, 'html.parser').get_text()
            only_words = re.sub(pattern, " ", html_free_text)
            lemmatized = "".join([lemma.lemmatize(word) for word in only_words])


            text = re.sub(r'https?:\/\/.*[\r\n]*', ' ', lemmatized, flags=re.MULTILINE)
            text = re.sub(r'\<a href', ' ', text)
            text = re.sub(r'&amp;', ' ', text) 
            text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
            text = re.sub(r'<br />', ' ', text)
            text = re.sub(r'\'', ' ', text)

            stop_free = []
            if remove_stop:
                for word in text.split():
                    if word not in stop_words:
                        stop_free.append(word)
            else:
                stop_free = text.split()
            for word in stop_free:
                if word in contractions:
                    new_text.append(contractions[word])
                else:
                    new_text.append(word)

            return " ".join(new_text)
        else:
            return "None"
    except:
        return data

In [5]:
# Vocabulary creation:

def vocab_dic(text, count_dict):
    for word in text.split():
        if word in count_dict.keys():
            count_dict[word] += 1
        else:
            count_dict[word] = 1

In [6]:
# Creating embedding index using glove embedding

def embedding_index_creation():
    embedding_index = dict()
    file = open("C://Users//sudhk//Documents//UML//CODES//Machine Learning//Text_Summerizer//Data//glove.42B.300d.txt", encoding='utf-8')
    for line in tqdm(file):
        values = line.split()
        word = values[0]
        coeff = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coeff
    file.close()
    return embedding_index

In [65]:
# Creating embedding matrix for glove:

def embedding_matrix_creation(embedding_dim):
    embedding_dim = 300
    nb_words = len(vocab_to_int)

    embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
    for word,ind in (tqdm(vocab_to_int.items())):
        if word in embedding_index:
            embedding_matrix[ind] = embedding_index[word]
        else:
            new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
            #embedding_index[word] = new_embedding
            embedding_matrix[ind] = new_embedding
    print("\n")
    print("Embedding matrix is created !!!")
    return embedding_matrix

In [38]:
# Creating embedding matrix for bert:

def bert_embedding_matrix_creation():
    embedding_dim = 768
    nb_words = len(vocab_to_int)

    embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
    for voc,ind in (tqdm(vocab_to_int.items())):
        if voc in embedding_index:
            word = voc.split()  # Removing the contextual advantage of bert.
            embedding_matrix[ind] = bert_embedding(word)[0][1][0]
        else:
            word = voc.split()
            new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
            embedding_matrix[ind] = new_embedding
    print("\n")
    print("Embedding matrix is created !!!")
    return embedding_matrix    

In [9]:
def to_ints(text, word_count, unk_count, eos = False):
    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            word_count += 1
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int["<UNK>"])
                unk_count += 1
        if eos:
            sentence_ints.append(vocab_to_int["<EOS>"])
        ints.append(sentence_ints)
    return ints, word_count, unk_count

###### Building the model

In [10]:
def model_inputs():
    input_data = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    summary_length = tf.placeholder(tf.int32, (None, ), name='summary_length')
    max_summary_length = tf.reduce_max(summary_length, name='max_dec_len')
    text_length = tf.placeholder(tf.int32, (None, ), name='text_length')
    
    return input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length

In [11]:
def process_encoding_input(target_data, vocab_to_int, batch_sieze):
    ending = tf.strided_slice(target_data, [0,0], [batch_size, -1], [1,1])
    dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int["<GO>"]), ending], 1)
    
    return dec_input

In [12]:
def encoding_layer(rnn_size, sequence_length, num_layers, rnn_inputs, keep_prob):
    for layer in range(num_layers):
        with tf.variable_scope('encoder_{}'.format(layer)):
            cell_fw = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, input_keep_prob = keep_prob)
            cell_bw = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, input_keep_prob = keep_prob)
            
            enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, rnn_inputs, sequence_length, dtype=tf.float32)
            
    enc_output = tf.concat(enc_output,2)
    
    return enc_output, enc_state

In [13]:
def training_decoding_layer(dec_embd_input, summary_length, dec_cell, initial_state, output_layer, vocab_size, max_summary_length):
    training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embd_input, sequence_length=summary_length, time_major=False)
    training_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, training_helper, initial_state, output_layer)
    training_logits, _, _ = tf.contrib.seq2seq.dynamic_decode(training_decoder, output_time_major=False, impute_finished=True, maximum_iterations=max_summary_length)
    
    return training_decoder

In [14]:
def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, initial_state, output_layer, max_summary_length, batch_size):
    start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), [batch_size], name='start_tokens')
    inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings, start_tokens, end_token)
    inference_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, inference_helper, initial_state, output_layer)
    inference_logits, _, _ = tf.contrib.seq2seq.dynamic_decode(inference_decoder, output_time_major=False, impute_finished=True, maximum_iterations=max_summary_length)
    
    return inference_decoder

In [15]:
def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, text_length, summary_length, max_summary_length, rnn_size, vocab_to_int, keep_prob, batch_size, num_layers):
    for layer in range(num_layers):
        with tf.variable_scope('decoder_{}'.format(layer)):
            lstm = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1,0.1,seed=2))
            dec_cell = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob=keep_prob)
        output_layer = Dense(vocab_size, kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))
        attn_mech = tf.contrib.seq2seq.BahdanauAttention(rnn_size, enc_output, text_length, normalize=False, name='BahdanauAttention')
        dec_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell, attn_mech, rnn_size)
            
        initial_state = dec_cell.zero_state(batch_size=batch_size, dtype=tf.float32).clone(cell_state=enc_state[0])

        with tf.variable_scope("decode"):
            training_decoder = training_decoding_layer(dec_embed_input, summary_length, dec_cell, initial_state, output_layer, vocab_size, max_summary_length)
            training_logits, _, _ = tf.contrib.seq2seq.dynamic_decode(training_decoder, output_time_major=False, impute_finished=True, maximum_iterations=max_summary_length)

        with tf.variable_scope('decode', reuse=True):
            inference_decoder = inference_decoding_layer(embeddings, vocab_to_int['<GO>'], vocab_to_int['<EOS>'], dec_cell, initial_state, output_layer, max_summary_length, batch_size)
            inference_logits,_,_ = tf.contrib.seq2seq.dynamic_decode(inference_decoder, output_time_major=False, impute_finished=True, maximum_iterations=max_summary_length)

        return training_logits, inference_logits

In [16]:
def seq_2_seq_model(input_data, target_data, keep_prob, text_length, summary_length, max_summary_length, vocab_size, rnn_size, num_layers, vocab_to_int, batch_size, embedding_matrix):
    embeddings = embedding_matrix
    
    enc_embed_input = tf.nn.embedding_lookup(embeddings, input_data)
    enc_output, enc_state = encoding_layer(rnn_size, text_length, num_layers, enc_embed_input, keep_prob)
    dec_input = process_encoding_input(target_data, vocab_to_int, batch_size)
    dec_embed_input = tf.nn.embedding_lookup(embeddings, dec_input)
    
    training_logits, inference_logits = decoding_layer(dec_embed_input, 
                                                        embeddings,
                                                        enc_output,
                                                        enc_state, 
                                                        vocab_size, 
                                                        text_length, 
                                                        summary_length, 
                                                        max_summary_length,
                                                        rnn_size, 
                                                        vocab_to_int, 
                                                        keep_prob, 
                                                        batch_size,
                                                        num_layers)
    
    
    return training_logits, inference_logits

In [17]:
def pad_sentence_batch(sentence_batch):
    """Pad sentences with <PAD> so that each sentence of a batch has the same length"""
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [vocab_to_int['<PAD>']] * (max_sentence - len(sentence)) for sentence in sentence_batch]

In [18]:
def get_batches(summaries, texts, batch_size):
    for batch_i in range(0, len(texts) // batch_size):
        start_i = batch_i * batch_size
        summaries_batch = summaries[start_i:start_i + batch_size]
        text_batch = texts[start_i:start_i+batch_size]
        pad_summaries_batch = np.array(pad_sentence_batch(summaries_batch))
        pad_texts_batch = np.array(pad_sentence_batch(text_batch))
        
        pad_summaries_lenghts = []
        for summary in pad_summaries_batch:
            pad_summaries_lenghts.append(len(summary))
        
        pad_texts_lenghts = []
        for text in pad_texts_batch:
            pad_texts_lenghts.append(len(text))
        
        yield pad_summaries_batch, pad_texts_batch, pad_summaries_lenghts, pad_texts_lenghts

In [20]:
def text_to_seq(text):
    '''Prepare the text for the model'''
    
    text = cleaning(text)
    return [vocab_to_int.get(word, vocab_to_int['<UNK>']) for word in text.split()]

In [21]:
def prediction(input_sentence, input_summary, load_model_name):
    text = text_to_seq(input_sentence)

    checkpoint = load_model_name

    loaded_graph = tf.Graph()
    with tf.Session(graph=loaded_graph) as sess:
       # Load saved model
        loader = tf.train.import_meta_graph(checkpoint + '.meta')
        loader.restore(sess, checkpoint)

        input_data = loaded_graph.get_tensor_by_name('input:0')
        logits = loaded_graph.get_tensor_by_name('predictions:0')
        text_length = loaded_graph.get_tensor_by_name('text_length:0')
        summary_length = loaded_graph.get_tensor_by_name('summary_length:0')
        keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')

        #Multiply by batch_size to match the model's input parameters
        answer_logits = sess.run(logits, {input_data: [text]*batch_size, 
                                         summary_length: [np.random.randint(5,8)], 
                                         text_length: [len(text)]*batch_size,
                                         keep_prob: 1.0})[0] 

    # Remove the padding from the tweet
    pad = vocab_to_int["<PAD>"] 

    print('\nOriginal Text:', input_sentence)
        
    print('\nText')
    print('  Word Ids:    {}'.format([i for i in text]))
    print('  Input Words: {}'.format(" ".join([int_to_vocab[i] for i in text])))

    print('\nSummary')
    print('  Actual summary: {}'.format(input_summary))
    print('  Word Ids:       {}'.format([i for i in answer_logits if i != pad]))
    print('  Response Words: {}'.format(" ".join([int_to_vocab[i] for i in answer_logits if i != pad])))
    return " ".join([int_to_vocab[i] for i in answer_logits if i != pad])

### Running the project

##### Loading the dataset

In [10]:
dataset = pd.read_csv("Reviews.csv")
dataset = dataset[["Summary", "Text"]]

##### Cleaning the dataset

In [57]:
cleaned_text = []
cleaned_summary = []

for ind in tqdm(range(10000)):
    cleaned_text.append(cleaning(dataset["Text"][ind], remove_stop=True))
    cleaned_summary.append(cleaning(dataset["Summary"][ind], remove_stop=False))

cleaned_dataset = pd.DataFrame(columns = ["Text", "Summary"])
cleaned_dataset["Summary"] = cleaned_summary
cleaned_dataset["Text"] = cleaned_text
cleaned_dataset.to_csv("Cleaned_dataset.csv")

100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:42<00:00, 232.97it/s]


In [40]:
cleaned_text = []
cleaned_summary = []
cleaned_dataset = pd.read_csv("Cleaned_dataset.csv")
cleaned_dataset = cleaned_dataset[0:1000] # only 1000 is considered for faster computation. 

for i in tqdm(range(len(cleaned_dataset))):
    cleaned_text.append(cleaned_dataset["Text"][i])
    cleaned_summary.append(cleaned_dataset["Summary"][i])

100%|███████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 15186.96it/s]


In [41]:
cleaned_dataset = cleaned_dataset.dropna() # dropping the empty rows

In [43]:
# Creating count dictionary:

count_dict = {}
for ind in tqdm(range(len(cleaned_dataset))):
    vocab_dic(cleaned_dataset["Text"][ind], count_dict)
    vocab_dic(cleaned_dataset["Summary"][ind], count_dict)

100%|███████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 11812.35it/s]


In [42]:
cleaned_dataset

Unnamed: 0.1,Unnamed: 0,Text,Summary
0,0,bought several vitality canned dog food produc...,good quality dog food
1,1,product arrived labeled jumbo salted peanuts p...,not as advertised
2,2,confection around centuries light pillowy citr...,delight says it all
3,3,looking secret ingredient robitussin believe f...,cough medicine
4,4,great taffy great price wide assortment yummy ...,great taffy
...,...,...,...
995,995,black market hot sauce wonderful husband loves...,hot flavorful
996,996,man say salsa bomb different kinds almost ever...,great hot sauce and people who run it
997,997,sauce good anything like adding asian food any...,this sauce is the shiznit
998,998,hot like low star reviewer got suckered seeing...,not hot


In [44]:
len(count_dict)

5903

In [45]:
# Creating embedding index:

embedding_index = embedding_index_creation()

1917494it [05:41, 5608.95it/s]


In [46]:
# Removing the missing words:

missing_words = 0
for word,count in count_dict.items():
    if count > 0:
        if word not in embedding_index:
            missing_words += 1

In [47]:
# Vocab to int

value = 0
vocab_to_int = {}

for word, count in count_dict.items():
    if count >= threshold or word in embedding_index:
        vocab_to_int[word] = value
        value += 1

codes = ["<UNK>", "<EOS>", "<PAD>", "<GO>"]
value2 = len(vocab_to_int)
for code in codes:
    vocab_to_int[code] = value2
    value2 += 1

In [48]:
int_to_vocab = {}
for i, word in vocab_to_int.items():
    int_to_vocab[word] = i

In [49]:
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y =  train_test_split(cleaned_text, cleaned_summary, random_state = 3, test_size = 0.01)

In [50]:
# Converting the clean text and summary to ints:

word_count = 0
unk_count = 0

int_summaries, word_count, unk_count = to_ints(train_y, word_count, unk_count)
int_texts, word_count, unk_count = to_ints(train_x, word_count, unk_count, eos=True)

unk_percent = round(unk_count/word_count,4)*100

print("Total number of words in headlines:", word_count)
print("Total number of UNKs in headlines:", unk_count)
print("Percent of words that are UNK: {}%".format(unk_percent))

Total number of words in headlines: 39596
Total number of UNKs in headlines: 118
Percent of words that are UNK: 0.3%


In [51]:
def create_length(text):
    lengths = []
    for sen in text:
        lengths.append(len(sen))
    return pd.DataFrame(lengths, columns=["Counts"])

In [52]:
length_summary = create_length(int_summaries)
length_texts = create_length(int_texts)

print("Summaries:")
print(length_summary.describe())
print()
print("Texts:")
print(length_texts.describe())

Summaries:
           Counts
count  990.000000
mean     4.157576
std      2.791258
min      1.000000
25%      2.000000
50%      4.000000
75%      5.000000
max     30.000000

Texts:
           Counts
count  990.000000
mean    36.838384
std     34.812950
min      6.000000
25%     17.000000
50%     26.000000
75%     44.000000
max    471.000000


In [53]:
def unk_counter(sentence):
    '''Counts the number of time UNK appears in a sentence.'''
    unk_count = 0
    for word in sentence:
        if word == vocab_to_int["<UNK>"]:
            unk_count += 1
    return unk_count

In [None]:
# Remove reviews that include too many unknowns:

sorted_summaries = []
sorted_texts = []
max_text_length = 84
max_summary_length = 13
min_length = 2
unk_text_limit = 100
unk_summary_limit = 70

for length in range(min(length_texts.Counts), max_text_length): 
    for count, words in enumerate(int_summaries):
        if (len(int_summaries[count]) >= min_length and
            len(int_summaries[count]) <= max_summary_length and
            len(int_texts[count]) >= min_length and
            unk_counter(int_summaries[count]) <= unk_summary_limit and
            unk_counter(int_texts[count]) <= unk_text_limit and
            length == len(int_texts[count])
           ):
            sorted_summaries.append(int_summaries[count])
            sorted_texts.append(int_texts[count])

#### Prediction using glove embeddings

In [78]:
# creating glove embedding matrix:

glove_embedding_matrix = embedding_matrix_creation(300)

100%|████████████████████████████████████████████████████████████████████████| 17711/17711 [00:00<00:00, 203234.66it/s]



Embedding matrix is created !!!





In [140]:
glove_model_name = "glove.ckpt" 
glove_load_model = "./glove.ckpt"

In [119]:
# Build the graph
train_graph = tf.Graph()
# Set the graph to default to ensure that it is ready for training
with train_graph.as_default():

    # Load the model inputs    
    input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length = model_inputs()

    # Create the training and inference logits
    training_logits, inference_logits = seq_2_seq_model(tf.reverse(input_data, [-1]),
                                                      targets, 
                                                      keep_prob,   
                                                      text_length,
                                                      summary_length,
                                                      max_summary_length,
                                                      len(vocab_to_int)+1,
                                                      rnn_size, 
                                                      num_layers, 
                                                      vocab_to_int,
                                                      batch_size,
                                                       glove_embedding_matrix)

    # Create tensors for the training logits and inference logits
    training_logits = tf.identity(training_logits.rnn_output, 'logits')
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')

    # Create the weights for sequence_loss
    masks = tf.sequence_mask(summary_length, max_summary_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(learning_rate)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
print("Graph is built.")

Graph is built.


In [138]:
epochs = 10
batch_size = 32
rnn_size = 216
num_layers = 2
learning_rate = 0.005
keep_probability = 0.75
learning_rate_decay = 0.95
min_learning_rate = 0.0005

display_step = 20 # Check training loss after every 20 batches
stop_early = 0 
stop = 3 # If the update loss does not decrease in 3 consecutive update checks, stop training
per_epoch = 3
update_check = (len(sorted_texts)//batch_size//per_epoch)-1

update_loss = 0 
batch_loss = 0
summary_update_loss = [] # Record the update losses for saving improvements in the model
avg_loss = [] # Record the average loss.

checkpoint = glove_model_name
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())


    for epoch_i in range(1, epochs+1):
        update_loss = 0
        batch_loss = 0
        for batch_i, (summaries_batch, texts_batch, summaries_lengths, texts_lengths) in enumerate(
                get_batches(sorted_summaries, sorted_texts, batch_size)):
            start_time = time.time()
            _, loss = sess.run(
                [train_op, cost],
                {input_data: texts_batch,
                 targets: summaries_batch,
                 lr: learning_rate,
                 summary_length: summaries_lengths,
                 text_length: texts_lengths,
                 keep_prob: keep_probability})

            batch_loss += loss
            update_loss += loss
            end_time = time.time()
            batch_time = end_time - start_time

            if batch_i % display_step == 0 and batch_i > 0:
                print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds: {:>4.2f}'
                      .format(epoch_i,
                              epochs, 
                              batch_i, 
                              len(sorted_texts) // batch_size, 
                              batch_loss / display_step, 
                              batch_time*display_step))
                batch_loss = 0

            if batch_i % update_check == 0 and batch_i > 0:
                average_loss = round(update_loss/update_check,3)
                print("Average loss for this update:", average_loss)
                summary_update_loss.append(update_loss)
                avg_loss.append(average_loss)

                # If the update loss is at a new minimum, save the model
                if update_loss <= min(summary_update_loss):
                    print('New Record!') 
                    stop_early = 0
                    saver = tf.train.Saver() 
                    saver.save(sess, checkpoint)

                else:
                    print("No Improvement.")
                    stop_early += 1
                    if stop_early == stop:
                        break
                update_loss = 0


        # Reduce learning rate, but not below its minimum value
        learning_rate *= learning_rate_decay
        if learning_rate < min_learning_rate:
            learning_rate = min_learning_rate

        if stop_early == stop:
            print("Stopping Training.")
            break

Epoch   1/10 Batch   20/253 - Loss:  4.277, Seconds: 9.15
Epoch   1/10 Batch   40/253 - Loss:  3.030, Seconds: 7.63
Epoch   1/10 Batch   60/253 - Loss:  2.889, Seconds: 9.89
Epoch   1/10 Batch   80/253 - Loss:  2.756, Seconds: 8.56
Average loss for this update: 3.226
New Record!
Epoch   1/10 Batch  100/253 - Loss:  2.762, Seconds: 7.59
Epoch   1/10 Batch  120/253 - Loss:  2.948, Seconds: 9.12
Epoch   1/10 Batch  140/253 - Loss:  2.859, Seconds: 8.91
Epoch   1/10 Batch  160/253 - Loss:  2.902, Seconds: 11.74
Average loss for this update: 2.872
New Record!
Epoch   1/10 Batch  180/253 - Loss:  2.747, Seconds: 12.54
Epoch   1/10 Batch  200/253 - Loss:  2.743, Seconds: 14.55
Epoch   1/10 Batch  220/253 - Loss:  2.993, Seconds: 15.83
Epoch   1/10 Batch  240/253 - Loss:  2.984, Seconds: 17.96
Average loss for this update: 2.854
New Record!
Epoch   2/10 Batch   20/253 - Loss:  2.356, Seconds: 9.38
Epoch   2/10 Batch   40/253 - Loss:  2.211, Seconds: 7.48
Epoch   2/10 Batch   60/253 - Loss:  2.

Epoch  10/10 Batch  180/253 - Loss:  1.126, Seconds: 9.86
Epoch  10/10 Batch  200/253 - Loss:  1.226, Seconds: 11.66
Epoch  10/10 Batch  220/253 - Loss:  1.241, Seconds: 12.20
Epoch  10/10 Batch  240/253 - Loss:  1.299, Seconds: 13.98
Average loss for this update: 1.234
No Improvement.


In [141]:
glove_history = []
# To predict all the test data: len(test_x), For easy run only 10 predictions are done.
for i in range(10):
    glove_history.append(prediction(test_x[i], test_y[i], glove_load_model))

INFO:tensorflow:Restoring parameters from ./glove.ckpt

Original Text: gentleman reviewed said go store get better well disabled disability van allows us two packages carry see carrying two big cases water home think wonderful water sure could get sale better price drive someone drive carry please sweet deal ordered six cases watched ups guy set inside door nothing complain price right considered go wrong

Text
  Word Ids:    [1774, 3757, 2282, 258, 391, 390, 17, 143, 8418, 10429, 8416, 705, 206, 132, 316, 2246, 761, 2272, 132, 772, 3717, 503, 205, 1095, 165, 503, 30, 619, 390, 1738, 17, 99, 2048, 427, 2048, 2246, 439, 323, 105, 89, 306, 3717, 4953, 1711, 248, 1010, 2001, 453, 774, 2840, 99, 167, 2672, 258, 931]
  Input Words: gentleman reviewed said go store get better well disabled disability van allows us two packages carry see carrying two big cases water home think wonderful water sure could get sale better price drive someone drive carry please sweet deal ordered six cases watche

INFO:tensorflow:Restoring parameters from ./glove.ckpt

Original Text: stuff taste like black licorice sweet way tastes kind like thick nasty soy sauce see anyone could take tablespoons week barely managed swallow teaspoon put mouth eat healthy much sugar tastebuds say way stuff daily iron make attempts hide something else though really imagine right addition cap tightly package arrived black goo leaked packaging wrapped securely bubble wrap though got still pleasant clean container could open

Text
  Word Ids:    [551, 231, 12, 123, 124, 323, 664, 549, 199, 12, 567, 443, 4466, 196, 761, 507, 619, 329, 6823, 261, 1786, 3710, 2329, 4486, 245, 792, 171, 179, 121, 54, 4073, 430, 664, 551, 2704, 904, 355, 11665, 3278, 637, 626, 553, 220, 976, 167, 85, 7882, 3185, 757, 21, 123, 4795, 7141, 1210, 142, 296, 1829, 1830, 553, 84, 418, 1913, 1572, 1837, 619, 1568]
  Input Words: stuff taste like black licorice sweet way tastes kind like thick nasty soy sauce see anyone could take tablespoons wee

#### Prediction using  BERT Embeddings

In [55]:
bert_embedding_matrix = bert_embedding_matrix_creation()

100%|██████████████████████████████████████████████████████████████████████████████| 5818/5818 [35:38<00:00,  2.72it/s]



Embedding matrix is created !!!





In [56]:
bert_model_name = "bert.ckpt" 
bert_load_model = "./bert.ckpt"

In [58]:
epochs = 10
batch_size = 32
rnn_size = 216
num_layers = 2
learning_rate = 0.005
keep_probability = 0.75
learning_rate_decay = 0.95
min_learning_rate = 0.0005

In [60]:
# Build the graph
train_graph = tf.Graph()
# Set the graph to default to ensure that it is ready for training
with train_graph.as_default():

    # Load the model inputs    
    input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length = model_inputs()

    # Create the training and inference logits
    training_logits, inference_logits = seq_2_seq_model(tf.reverse(input_data, [-1]),
                                                      targets, 
                                                      keep_prob,   
                                                      text_length,
                                                      summary_length,
                                                      max_summary_length,
                                                      len(vocab_to_int)+1,
                                                      rnn_size, 
                                                      num_layers, 
                                                      vocab_to_int,
                                                      batch_size,
                                                        bert_embedding_matrix)

    # Create tensors for the training logits and inference logits
    training_logits = tf.identity(training_logits.rnn_output, 'logits')
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')

    # Create the weights for sequence_loss
    masks = tf.sequence_mask(summary_length, max_summary_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(learning_rate)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
print("Graph is built.")

Graph is built.


In [61]:
display_step = 20 # Check training loss after every 20 batches
stop_early = 0 
stop = 3 # If the update loss does not decrease in 3 consecutive update checks, stop training
per_epoch = 3 # Make 3 update checks per epoch
update_check = (len(sorted_texts)//batch_size//per_epoch)-1

update_loss = 0 
batch_loss = 0
summary_update_loss = [] # Record the update losses for saving improvements in the model
bert_avg_loss = [] # Record the averag loss.

checkpoint = bert_model_name
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(1, epochs+1):
        update_loss = 0
        batch_loss = 0
        for batch_i, (summaries_batch, texts_batch, summaries_lengths, texts_lengths) in enumerate(
                get_batches(sorted_summaries, sorted_texts, batch_size)):
            start_time = time.time()
            _, loss = sess.run(
                [train_op, cost],
                {input_data: texts_batch,
                 targets: summaries_batch,
                 lr: learning_rate,
                 summary_length: summaries_lengths,
                 text_length: texts_lengths,
                 keep_prob: keep_probability})

            batch_loss += loss
            update_loss += loss
            end_time = time.time()
            batch_time = end_time - start_time

            if batch_i % display_step == 0 and batch_i > 0:
                print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds: {:>4.2f}'
                      .format(epoch_i,
                              epochs, 
                              batch_i, 
                              len(sorted_texts) // batch_size, 
                              batch_loss / display_step, 
                              batch_time*display_step))
                batch_loss = 0

            if batch_i % update_check == 0 and batch_i > 0:
                average_loss = round(update_loss/update_check,3)
                print("Average loss for this update:", average_loss)
                summary_update_loss.append(update_loss)
                bert_avg_loss.append(average_loss)

                # If the update loss is at a new minimum, save the model
                if update_loss <= min(summary_update_loss):
                    print('New Record!') 
                    stop_early = 0
                    saver = tf.train.Saver() 
                    saver.save(sess, checkpoint)

                else:
                    print("No Improvement.")
                    stop_early += 1
                    if stop_early == stop:
                        break
                update_loss = 0


        # Reduce learning rate, but not below its minimum value
        learning_rate *= learning_rate_decay
        if learning_rate < min_learning_rate:
            learning_rate = min_learning_rate

        if stop_early == stop:
            print("Stopping Training.")
            break

Average loss for this update: 5.243
New Record!
Average loss for this update: 3.132
New Record!
Epoch   1/10 Batch   20/25 - Loss:  3.907, Seconds: 10.86
Average loss for this update: 3.281
No Improvement.
Average loss for this update: 2.78
New Record!
Average loss for this update: 2.753
New Record!
Epoch   2/10 Batch   20/25 - Loss:  2.808, Seconds: 8.61
Average loss for this update: 2.934
No Improvement.
Average loss for this update: 2.651
New Record!
Average loss for this update: 2.568
New Record!
Epoch   3/10 Batch   20/25 - Loss:  2.669, Seconds: 8.79
Average loss for this update: 2.833
No Improvement.
Average loss for this update: 2.517
New Record!
Average loss for this update: 2.539
No Improvement.
Epoch   4/10 Batch   20/25 - Loss:  2.546, Seconds: 9.91
Average loss for this update: 2.623
No Improvement.
Average loss for this update: 2.434
New Record!
Average loss for this update: 2.355
New Record!
Epoch   5/10 Batch   20/25 - Loss:  2.431, Seconds: 10.05
Average loss for this 

In [62]:
bert_history = []
# To predict all the test data: len(test_x), For easy run only 10 predictions are done.
for i in range(10):
    bert_history.append(prediction(test_x[i], test_y[i], bert_load_model))

INFO:tensorflow:Restoring parameters from ./bert.ckpt

Original Text: pros extremely fragrant full bodied steeped twice nice air tight container organic cons price tao tea blue flower earl grey black tea best earl grey ever tasted would recommend anyone tao tea blue flower earl grey black tea loose leaf ounce tins pack

Text
  Word Ids:    [2239, 687, 2489, 280, 2462, 4495, 2921, 137, 3474, 3475, 1837, 529, 1554, 99, 4496, 1183, 360, 2914, 2455, 2456, 123, 1183, 238, 2455, 2456, 374, 1050, 134, 60, 507, 4496, 1183, 360, 2914, 2455, 2456, 123, 1183, 2470, 4497, 260, 2170, 414]
  Input Words: pros extremely fragrant full bodied steeped twice nice air tight container organic cons price tao tea blue flower earl grey black tea best earl grey ever tasted would recommend anyone tao tea blue flower earl grey black tea loose leaf ounce tins pack

Summary
  Actual summary: best earl grey ever
  Word Ids:       [97, 1846]
  Response Words: great cornmeal
INFO:tensorflow:Restoring parameters from 

INFO:tensorflow:Restoring parameters from ./bert.ckpt

Original Text: recently acquired bottle immediately impressed good flavor without excessive saltiness supermarket brands used marinades dipping sauces addition straight use cases added little something extra quite surprise find american made soy sauce tastes good

Text
  Word Ids:    [1505, 3592, 201, 1652, 472, 8, 93, 559, 4969, 4947, 746, 160, 866, 4970, 3409, 2033, 85, 2160, 233, 3717, 764, 487, 637, 2094, 749, 1386, 212, 3350, 90, 4466, 196, 549, 8]
  Input Words: recently acquired bottle immediately impressed good flavor without excessive saltiness supermarket brands used marinades dipping sauces addition straight use cases added little something extra quite surprise find american made soy sauce tastes good

Summary
  Actual summary: better than anything in the supermarket
  Word Ids:       [97, 1846]
  Response Words: great cornmeal
INFO:tensorflow:Restoring parameters from ./bert.ckpt

Original Text: thing marmite like hate 

In [None]:
# Printing the output:

for i in range(len(test_x)):
    print("Actual Summary: {}".format(test_y[i]))
    print("Glove Summary:  {}".format(glove_history[i]))
    print("Bert Summary:   {}".format(bert_history[i]))

In [None]:
# Plotting average loss:

import matplotlib.pyplot as plt

plt.plot(bert_avg_loss, "r")
plt.plot(avg_loss, "b")
plt.show()