In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
from nltk.corpus import stopwords
import time
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import tensor_array_ops
print('TensorFlow Version: {}'.format(tf.__version__))

TensorFlow Version: 1.4.0


# 1. Inspecting the Data

In [2]:
reviews = pd.read_json("Cell_Phones_and_Accessories.json")

In [3]:
reviews.shape

(760450, 11)

In [4]:
reviews.head()

Unnamed: 0,Rating,Rev_verify,Review_Date,IC,Prod_meta,Reviewer_Name,Review,Rev_summ,Review_timestamp,Useful,Prod_img
0,5,True,"09 1, 2015",B009XD5TPQ,,Sunny Zoeller,Bought it for my husband. He's very happy with it,He's very happy with,1441065600,,
1,5,True,"01 9, 2016",B016MF3P3K,,Denise Lesley,Great screen protector. Doesn't even seem as ...,Five Stars,1452297600,,
10,1,True,"01 14, 2016",B00IJJCQBA,{'Color:': ' Black / Black'},Stephanie,To tight on my phone and the bottom piece was ...,One Star,1452729600,,
100,5,False,"06 21, 2017",B00NIJOGOG,{'Color:': ' Rose Gold [6+] CHOOSE CORRECT SIZ...,SG,"Very good and superior quality, looks great. M...",Very nice and good quality!!!,1498003200,,
1000,3,True,"06 10, 2015",B00MQYS97Y,,Linda,"The charger is not working, however ! The comp...",Not happy,1433894400,,


In [5]:
# Check for any nulls values
reviews.isnull().sum()

Rating                   0
Rev_verify               0
Review_Date              0
IC                       0
Prod_meta           352624
Reviewer_Name           91
Review                 530
Rev_summ               355
Review_timestamp         0
Useful              698250
Prod_img            742256
dtype: int64

In [6]:
# Remove features that are not needed

reviews = reviews.drop(['Rev_verify','Review_Date','Prod_meta','Reviewer_Name','Review_timestamp','Useful',
                        'Prod_img','Rev_summ'], 1)

In [7]:
reviews.shape

(760450, 3)

In [8]:
reviews.head(5)

Unnamed: 0,Rating,IC,Review
0,5,B009XD5TPQ,Bought it for my husband. He's very happy with it
1,5,B016MF3P3K,Great screen protector. Doesn't even seem as ...
10,1,B00IJJCQBA,To tight on my phone and the bottom piece was ...
100,5,B00NIJOGOG,"Very good and superior quality, looks great. M..."
1000,3,B00MQYS97Y,"The charger is not working, however ! The comp..."


In [10]:
# Remove null values
reviews = reviews.dropna()
reviews = reviews.reset_index(drop=True)

In [11]:
reviews.shape

(759920, 3)

In [12]:
#Converting dataset to required format
grouped_review= reviews.groupby(['IC'],as_index=True)['Review'].apply(lambda tags: ','.join(tags))
grouped_ratings= reviews.groupby(['IC'],as_index=False).agg({'Rating':['min','max','mean']})

In [13]:
grp1=grouped_review.reset_index()
grp1['Review']

0        the hirl that wants and needs Hello Kitty prod...
1        I was very excited when I first got this case....
2        Received this in about a weeks time. Very nice...
3        thank you.,Works some of the time.  The charge...
4        This Charger is awesome! First of all I wasn't...
5        I did not use it for a few months then when I ...
6        One of two units didn't work but the company m...
7        This charger specifically didn't last that lon...
8        Nice case but print started to fade and peel a...
9        No label on charger, no indication of output. ...
10       QUE COSA TAN MALAAAAA!!!,Overheated & melted a...
11       Product charges as described but the plastic q...
12       Doesn't fit well. Doesn't let you charge both ...
13       I bought 2 of these to go on my children's pho...
14       I can go two days and more with this battery. ...
15       great item for a really low cost now thats a g...
16       Graet...charging quick!,Amazing fast charger,D.

In [14]:
result=pd.merge(grp1,grouped_ratings,on='IC')
result

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


Unnamed: 0,IC,Review,"(Rating, min)","(Rating, max)","(Rating, mean)"
0,7508492919,the hirl that wants and needs Hello Kitty prod...,1,5,4.300000
1,7532385086,I was very excited when I first got this case....,1,5,3.285714
2,7887421268,Received this in about a weeks time. Very nice...,1,5,3.538462
3,8199900164,"thank you.,Works some of the time. The charge...",2,5,3.750000
4,8288853439,This Charger is awesome! First of all I wasn't...,1,5,3.818182
5,8288862993,I did not use it for a few months then when I ...,1,5,3.656250
6,828886922X,One of two units didn't work but the company m...,3,5,4.333333
7,8288878881,This charger specifically didn't last that lon...,1,5,3.977273
8,9578085451,Nice case but print started to fade and peel a...,3,5,4.250000
9,961301375X,"No label on charger, no indication of output. ...",1,5,3.923077


In [15]:
# Inspecting some of the reviews
for i in range(5):
    print("Review #",i+1)
    print(result.Review[i])
    print()

Review # 1
the hirl that wants and needs Hello Kitty products
Was so happy that she has a collection to choose from
all her friends wants hello kitty phone cases to
thanks seller,so cute love this...one drop and its done though...i dropped iton the bathroom floor and it cracked. other then that item is very cute,this case is so cute it looks good on my white iphone its pretty good quality only one diamond has fallen off and its cos i dropped it on the cement. its kinda hard to get off but whatever its so cute and hot and everyone compliments me on it,Happy it,This case is very pretty.  It is very girly looking.  The only problem is it will be hard to put the case in a jeans pocket because the bow does stick out.  It would not be a problem for me as I carry mine in my purse, but my daughters put their phones in their pockets.  Shipping was as described.,Cute case. Jewels do come off but what do you expect. Mine has held up pretty nicely and lost a few stones but is barely noticeable.,Go

# Preparing the Data

In [16]:
# A list of contractions from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [17]:
def clean_text(text, remove_stopwords = True):
    '''Remove unwanted characters, stopwords, and format the text to create fewer nulls word embeddings'''
    
    # Convert words to lower case
    text = text.lower()
    
    # Replace contractions with their longer forms 
    if True:
        # We are not using "text.split()" here
        #since it is not fool proof, e.g. words followed by punctuations "Are you kidding?I think you aren't."
        text = re.findall(r"[\w']+", text)
        new_text = []
        for word in text:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
        text = " ".join(new_text)
    
    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)# remove links
    text = re.sub(r'\<a href', ' ', text)# remove html link tag
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    
    # Optionally, remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    return text

Testing the function clean_text()

In [18]:
clean_text("That's a great piece of art,Can you believe it?I've.But you may not.")

'great piece art believe may'

Clean the summaries and texts

We will remove the stopwords from the texts because they do not provide much use for training our model. However, we will keep them for our summaries so that they sound more like natural phrases.

In [19]:
clean_texts = []
for text in result.Review:
    clean_texts.append(clean_text(text))
print("Reviews are complete.")

Reviews are complete.


In [20]:
# Inspect the cleaned summaries and texts to ensure they have been cleaned well
for i in range(5):
    print("Clean Review #",i+1)
    print(clean_texts[i])
    print()

Clean Review # 1
hirl wants needs hello kitty products happy collection choose friends wants hello kitty phone cases thanks seller cute love one drop done though dropped iton bathroom floor cracked item cute case cute looks good white iphone pretty good quality one diamond fallen cos dropped cement kinda hard get whatever cute hot everyone compliments happy case pretty girly looking problem hard put case jeans pocket bow stick would problem carry mine purse daughters put phones pockets shipping described cute case jewels come expect mine held pretty nicely lost stones barely noticeable got friend saw liked got liked looks good iphone fits well good case low price right never problems besides losing 2 stones since purchase rough recommend get case shown picture case got black gold bow still cute wanted wanted picture beautiful phone case also hard remove also 3d ribbon came wasy darker picture shows also came wonky pretty looks like actually bedazzled phone case better expected cute yes

Count the number of occurrences of each word in a set of text

In [21]:
def count_words(count_dict, text):
    for sentence in text:
        for word in sentence.split():
            if word not in count_dict:
                count_dict[word] = 1
            else:
                count_dict[word] += 1

Give the function a try

In [22]:
mydict = {}
count_words(mydict, ["that is a great great great leader","India has a great leader"])
mydict

{'that': 1, 'is': 1, 'a': 2, 'great': 4, 'leader': 2, 'India': 1, 'has': 1}

In [23]:
word_counts = {}
count_words(word_counts, clean_texts)
print("Size of Vocabulary:", len(word_counts))

Size of Vocabulary: 107606


In [24]:
#Let's see how many times "glass" occurs in the data
word_counts["glass"]

38745

Load Conceptnet Numberbatch's (CN) embeddings, similar to GloVe, but probably better
(https://github.com/commonsense/conceptnet-numberbatch)

In [25]:
embeddings_index = {}
with open('numberbatch-en.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings:', len(embeddings_index))

Word embeddings: 417195


Take a look at the CN embedding dimension

In [26]:
embeddings_index["glass"].shape

(300,)

Find the number of words that are missing from CN, and are used more than our threshold.

I use a threshold of 20, so that words not in CN can be added to our word_embedding_matrix, but they need to be common enough in the reviews so that the model can understand their meaning.

In [27]:
missing_words = 0
threshold = 20

for word, count in word_counts.items():
    if count > threshold:
        if word not in embeddings_index:
            missing_words += 1
            
missing_ratio = round(missing_words/len(word_counts),4)*100
            
print("Number of words missing from CN:", missing_words)
print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))

Number of words missing from CN: 1948
Percent of words that are missing from vocabulary: 1.81%


What are those missing words in the CN

In [28]:
missing_words = []
for word, count in word_counts.items():
    if count > threshold and word not in embeddings_index:
        missing_words.append((word,count))
missing_words[:30]

[('outter', 193),
 ('doesnt', 4329),
 ('12', 4432),
 ('17', 1049),
 ('11', 2088),
 ('80', 1386),
 ('100', 10990),
 ('advertized', 126),
 ('woks', 39),
 ('waze', 252),
 ('sii', 262),
 ('microusb', 1145),
 ('excelente', 2964),
 ('calidad', 89),
 ('ningun', 21),
 ('gs2', 76),
 ('2013', 1303),
 ('chager', 29),
 ('13', 1675),
 ('samsungs', 188),
 ('sansung', 22),
 ('i9100', 39),
 ('htc', 8176),
 ('note3', 230),
 ('50', 4205),
 ('75', 1400),
 ('90', 1762),
 ('40', 2410),
 ('70', 1083),
 ('powerbear', 88)]

Looks mostly products' brand.

Words to indexes, indexes to words dicts

Limit the vocab that we will use to words that appear ≥ threshold or are in CN

In [29]:
#dictionary to convert words to integers
vocab_to_int = {} 
# Index words from 0
value = 0
for word, count in word_counts.items():
    if count >= threshold or word in embeddings_index:
        vocab_to_int[word] = value
        value += 1

# Special tokens that will be added to our vocab
codes = ["<UNK>","<PAD>","<EOS>","<GO>"]   

# Add codes to vocab
for code in codes:
    vocab_to_int[code] = len(vocab_to_int)

# Dictionary to convert integers to words
int_to_vocab = {}
for word, value in vocab_to_int.items():
    int_to_vocab[value] = word

usage_ratio = round(len(vocab_to_int) / len(word_counts),4)*100

print("Total number of unique words:", len(word_counts))
print("Number of words we will use:", len(vocab_to_int))
print("Percent of words we will use: {}%".format(usage_ratio))

Total number of unique words: 107606
Number of words we will use: 44966
Percent of words we will use: 41.79%


Create word embedding matrix
It has shape (nb_words, embedding_dim) i.e. (44966, 300) in this case. 1st dim is word index, 2nd dim is from CN or random generated.

In [30]:
# Need to use 300 for embedding dimensions to match CN's vectors.
embedding_dim = 300
nb_words = len(vocab_to_int)

# Create matrix with default values of zero
word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
for word, i in vocab_to_int.items():
    if word in embeddings_index:
        word_embedding_matrix[i] = embeddings_index[word]
    else:
        # If word not in CN, create a random embedding for it
        new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embeddings_index[word] = new_embedding
        word_embedding_matrix[i] = new_embedding

# Check if value matches len(vocab_to_int)
print(len(word_embedding_matrix))

44966



Function to convert sentences to sequence of words indexes

It also use <UNK> index to replace unknown words, append <EOS> (End of Sentence) to the sequences if eos is set True

In [31]:
def convert_to_ints(text, word_count, unk_count, eos=False):
    '''Convert words in text to an integer.
       If word is not in vocab_to_int, use UNK's integer.
       Total the number of words and UNKs.
       Add EOS token to the end of texts'''
    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            word_count += 1
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int["<UNK>"])
                unk_count += 1
        if eos:
            sentence_ints.append(vocab_to_int["<EOS>"])
        ints.append(sentence_ints)
    return ints, word_count, unk_count

Apply convert_to_ints to clean_texts

In [32]:
word_count = 0
unk_count = 0

int_texts, word_count, unk_count = convert_to_ints(clean_texts, word_count, unk_count, eos=True)

unk_percent = round(unk_count/word_count,4)*100

print("Total number of words in clean_text:", word_count)
print("Total number of UNKs in clean_text:", unk_count)
print("Percent of words that are UNK: {}%".format(unk_percent))

Total number of words in clean_text: 19681670
Total number of UNKs in clean_text: 123301
Percent of words that are UNK: 0.63%


Take a look at what the sequence looks like

Each number here represents a word

In [34]:
int_texts[:5]

[[44962,
  0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  0,
  2,
  3,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  44962,
  20,
  21,
  22,
  23,
  13,
  24,
  13,
  25,
  26,
  27,
  28,
  29,
  26,
  30,
  15,
  31,
  32,
  33,
  19,
  34,
  35,
  36,
  37,
  38,
  13,
  39,
  40,
  41,
  5,
  24,
  29,
  42,
  43,
  44,
  36,
  45,
  24,
  46,
  47,
  48,
  49,
  50,
  44,
  51,
  52,
  53,
  54,
  45,
  55,
  56,
  57,
  58,
  13,
  24,
  59,
  60,
  61,
  52,
  62,
  29,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  68,
  71,
  25,
  26,
  28,
  72,
  73,
  26,
  24,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  65,
  82,
  83,
  84,
  85,
  37,
  24,
  86,
  87,
  24,
  68,
  88,
  89,
  48,
  90,
  13,
  91,
  91,
  87,
  92,
  9,
  24,
  93,
  36,
  94,
  93,
  95,
  96,
  97,
  44962,
  98,
  87,
  99,
  93,
  97,
  100,
  29,
  25,
  101,
  102,
  103,
  9,
  24,
  104,
  105,
  13,
  106,
  50,
  85,
  24,
  107,
  108,
  109,
  24,
  110,


Function to get the length of each sequence

In [35]:
def create_lengths(text):
    '''Create a data frame of the sentence lengths from a text'''
    lengths = []
    for sentence in text:
        lengths.append(len(sentence))
    return pd.DataFrame(lengths, columns=['counts'])

In [36]:
create_lengths(int_texts[:5])

Unnamed: 0,counts
0,659
1,95
2,171
3,22
4,209


Get statistic summary of the length of summaries and texts

In [37]:
lengths_texts = create_lengths(int_texts)

print("Texts:")
print(lengths_texts.describe())

Texts:
             counts
count  48133.000000
mean     409.901793
std      995.823531
min        2.000000
25%       67.000000
50%      159.000000
75%      391.000000
max    46004.000000


See what's the max squence length we can cover by percentile

In [38]:
# Inspect the length of texts
print(np.percentile(lengths_texts.counts, 90))
print(np.percentile(lengths_texts.counts, 95))
print(np.percentile(lengths_texts.counts, 99))

887.0
1496.0
4059.040000000001


Function to counts the number of time <UNK appears in a sentence

In [39]:
def unk_counter(sentence):
    '''Counts the number of time UNK appears in a sentence.'''
    unk_count = 0
    for word in sentence:
        if word == vocab_to_int["<UNK>"]:
            unk_count += 1
    return unk_count

Filter for length limit and number of UNKs

Sort the summaries and texts by the length of the element in texts from shortest to longest

In [44]:
max_text_length = 886 # This will cover up to 89.5% lengthes
min_length = 2
unk_text_limit = 1 # text can contain up to 1 UNK word

def filter_condition(item):
    int_text = item[0]
    if (len(int_text) >= min_length and 
       len(int_text) <= max_text_length and 
       unk_counter(int_text) <= unk_text_limit):
        return True
    else:
        return False

int_text_summaries = list(zip(int_texts))
int_text_summaries_filtered = list(filter(filter_condition, int_text_summaries))
sorted_int_text_summaries = sorted(int_text_summaries_filtered, key=lambda item: len(item[0]))
sorted_int_text_summaries = list(zip(*sorted_int_text_summaries))
sorted_summaries = list(sorted_int_text_summaries[0])
sorted_texts = list(sorted_int_text_summaries[0])
# Delete those temporary varaibles
del int_text_summaries, sorted_int_text_summaries, int_text_summaries_filtered
# Compare lengths to ensure they match
print(len(sorted_summaries))
print(len(sorted_texts))

31981
31981


Inspect the length of text in sorted_texts

In [45]:
lengths_texts = [len(text) for text in sorted_texts]
lengths_texts[:20]

[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]

Save data for later use

In [46]:
import pickle
def __pickleStuff(filename, stuff):
    save_stuff = open(filename, "wb")
    pickle.dump(stuff, save_stuff)
    save_stuff.close()
def __loadStuff(filename):
    saved_stuff = open(filename,"rb")
    stuff = pickle.load(saved_stuff)
    saved_stuff.close()
    return stuff

In [47]:

__pickleStuff("clean_texts.p",clean_texts)

__pickleStuff("sorted_summaries.p",sorted_summaries)
__pickleStuff("sorted_texts.p",sorted_texts)
__pickleStuff("word_embedding_matrix.p",word_embedding_matrix)

__pickleStuff("vocab_to_int.p",vocab_to_int)
__pickleStuff("int_to_vocab.p",int_to_vocab)

# Building the Model

Create palceholders for inputs to the model

summary_length and text_length are the sentence lengths in a batch, and max_summary_length is the maximum length of a summary in a batch.

In [48]:
def model_inputs():
    input_data = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    summary_length = tf.placeholder(tf.int32, (None,), name='summary_length')
    max_summary_length = tf.reduce_max(summary_length, name='max_dec_len')
    text_length = tf.placeholder(tf.int32, (None,), name='text_length')

    return input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length

Remove the last word id from each batch and concatenate the id of <GO the begining of each batch

In [49]:
def process_encoding_input(target_data, vocab_to_int, batch_size):  
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1]) # slice it to target_data[0:batch_size, 0: -1]
    dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)

    return dec_input

### Create the encoding layers

bidirectional_dynamic_rnn use tf.variable_scope so that variables are reused with each layer

parameters

rnn_size: The number of units in the LSTM cell

sequence_length: size [batch_size], containing the actual lengths for each of the sequences in the batch

num_layers: number of bidirectional RNN layer

rnn_inputs: number of bidirectional RNN layer

keep_prob: RNN dropout input keep probability

In [50]:
def encoding_layer(rnn_size, sequence_length, num_layers, rnn_inputs, keep_prob):
    for layer in range(num_layers):
        with tf.variable_scope('encoder_{}'.format(layer)):
            cell_fw = tf.contrib.rnn.LSTMCell(rnn_size,
                                              initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, 
                                                    input_keep_prob = keep_prob)

            cell_bw = tf.contrib.rnn.LSTMCell(rnn_size,
                                              initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, 
                                                    input_keep_prob = keep_prob)

            enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, 
                                                                    cell_bw, 
                                                                    rnn_inputs,
                                                                    sequence_length,
                                                                    dtype=tf.float32)
            enc_output = tf.concat(enc_output,2)
            # original code is missing this line below, that is how we connect layers 
            # by feeding the current layer's output to next layer's input
            rnn_inputs = enc_output
    return enc_output, enc_state

### Create the training decoding layer

parameters

dec_embed_input: output of embedding_lookup for a batch of inputs
    
summary_length: length of each padded summary sequences in batch, since padded, all lengths should be same number
    
dec_cell: the decoder RNN cells' output with attention wapper
    
output_layer: fully connected layer to apply to the RNN output
    
vocab_size: vocabulary size i.e. len(vocab_to_int)+1
    
max_summary_length: the maximum length of a summary in a batch
    
batch_size: number of input sequences in a batch
    
    
Three components

TrainingHelper: reads a sequence of integers from the encoding layer.

BasicDecoder: processes the sequence with the decoding cell, and an output layer, which is a fully connected layer. initial_state set to zero state.

dynamic_decode: creates our outputs that will be used for training.

In [51]:
def training_decoding_layer(dec_embed_input, summary_length, dec_cell, output_layer,
                            vocab_size, max_summary_length,batch_size):
    training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,
                                                        sequence_length=summary_length,
                                                        time_major=False)

    training_decoder = tf.contrib.seq2seq.BasicDecoder(cell=dec_cell,
                                                       helper=training_helper,
                                                       initial_state=dec_cell.zero_state(dtype=tf.float32, batch_size=batch_size),
                                                       output_layer = output_layer)

    training_logits = tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                           output_time_major=False,
                                                           impute_finished=True,
                                                           maximum_iterations=max_summary_length)
    return training_logits

### Create infer decoding layer

parameters

embeddings: the CN's word_embedding_matrix

start_token: the id of <GO

end_token: the id of <EOS
dec_cell: the decoder RNN cells' output with attention wapper

output_layer: fully connected layer to apply to the RNN output

max_summary_length: the maximum length of a summary in a batch

batch_size: number of input sequences in a batch

GreedyEmbeddingHelper argument start_tokens: int32 vector shaped {batch_size, the start tokens.

In [52]:
def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, output_layer,
                             max_summary_length, batch_size):
    '''Create the inference logits'''
    
    start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), [batch_size], name='start_tokens')
    
    inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings,
                                                                start_tokens,
                                                                end_token)
                
    inference_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                        inference_helper,
                                                        dec_cell.zero_state(dtype=tf.float32, batch_size=batch_size),
                                                        output_layer)
                
    inference_logits = tf.contrib.seq2seq.dynamic_decode(inference_decoder,
                                                            output_time_major=False,
                                                            impute_finished=True,
                                                            maximum_iterations=max_summary_length)
    
    return inference_logits

### Create Decoding layer

3 parts: decoding cell, attention, and getting our logits.

#### Decoding Cell:

Just a two layer LSTM with dropout.

#### Attention:

Using Bhadanau, since trains faster than Luong.

#### AttentionWrapper

applies the attention mechanism to our decoding cell.

parameters

dec_embed_input: output of embedding_lookup for a batch of inputs

embeddings: the CN's word_embedding_matrix

enc_output: encoder layer output, containing the forward and the backward rnn output

enc_state: encoder layer state, a tuple containing the forward and the backward final states of bidirectional rnn.

vocab_size: vocabulary size i.e. len(vocab_to_int)+1

text_length: the actual lengths for each of the input text sequences in the batch

summary_length: the actual lengths for each of the input summary sequences in the batch

max_summary_length: the maximum length of a summary in a batch

rnn_size: The number of units in the LSTM cell

vocab_to_int: vocab_to_int the dictionary

keep_prob: RNN dropout input keep probability

batch_size: number of input sequences in a batch

num_layers: number of decoder RNN layer

In [53]:
def lstm_cell(lstm_size, keep_prob):
    cell = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    return tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob = keep_prob)

def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, text_length, summary_length,
                   max_summary_length, rnn_size, vocab_to_int, keep_prob, batch_size, num_layers):
    '''Create the decoding cell and attention for the training and inference decoding layers'''
    dec_cell = tf.contrib.rnn.MultiRNNCell([lstm_cell(rnn_size, keep_prob) for _ in range(num_layers)])
    output_layer = Dense(vocab_size,kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))
    attn_mech = tf.contrib.seq2seq.BahdanauAttention(rnn_size,
                                                     enc_output,
                                                     text_length,
                                                     normalize=False,
                                                     name='BahdanauAttention')
    dec_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell,attn_mech,rnn_size)
    with tf.variable_scope("decode"):
        training_logits = training_decoding_layer(dec_embed_input,summary_length,dec_cell,
                                                  output_layer,
                                                  vocab_size,
                                                  max_summary_length,
                                                  batch_size)
    with tf.variable_scope("decode", reuse=True):
        inference_logits = inference_decoding_layer(embeddings,
                                                    vocab_to_int['<GO>'],
                                                    vocab_to_int['<EOS>'],
                                                    dec_cell,
                                                    output_layer,
                                                    max_summary_length,
                                                    batch_size)
    return training_logits, inference_logits

In [54]:
def seq2seq_model(input_data, target_data, keep_prob, text_length, summary_length, max_summary_length, 
                  vocab_size, rnn_size, num_layers, vocab_to_int, batch_size):
    '''Use the previous functions to create the training and inference logits'''
    
    # Use Numberbatch's embeddings and the newly created ones as our embeddings
    embeddings = word_embedding_matrix
    enc_embed_input = tf.nn.embedding_lookup(embeddings, input_data)
    enc_output, enc_state = encoding_layer(rnn_size, text_length, num_layers, enc_embed_input, keep_prob)
    dec_input = process_encoding_input(target_data, vocab_to_int, batch_size) #shape=(batch_size, senquence length) each seq start with index of<GO>
    dec_embed_input = tf.nn.embedding_lookup(embeddings, dec_input)
    training_logits, inference_logits  = decoding_layer(dec_embed_input, 
                                                        embeddings,
                                                        enc_output,
                                                        enc_state, 
                                                        vocab_size, 
                                                        text_length, 
                                                        summary_length, 
                                                        max_summary_length,
                                                        rnn_size, 
                                                        vocab_to_int, 
                                                        keep_prob, 
                                                        batch_size,
                                                        num_layers)
    return training_logits, inference_logits

### Pad sentences for batch

Pad so the actual lengths for each of the sequences in the batch have the same length.

In [55]:
def pad_sentence_batch(sentence_batch):
    """Pad sentences with <PAD> so that each sentence of a batch has the same length"""
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [vocab_to_int['<PAD>']] * (max_sentence - len(sentence)) for sentence in sentence_batch]

### Function to generate batch data for training

In [56]:
def get_batches(summaries, texts, batch_size):
    """Batch summaries, texts, and the lengths of their sentences together"""
    for batch_i in range(0, len(texts)//batch_size):
        start_i = batch_i * batch_size
        summaries_batch = summaries[start_i:start_i + batch_size]
        texts_batch = texts[start_i:start_i + batch_size]
        pad_summaries_batch = np.array(pad_sentence_batch(summaries_batch))
        pad_texts_batch = np.array(pad_sentence_batch(texts_batch))
        
        # Need the lengths for the _lengths parameters
        pad_summaries_lengths = []
        for summary in pad_summaries_batch:
            pad_summaries_lengths.append(len(summary))
        
        pad_texts_lengths = []
        for text in pad_texts_batch:
            pad_texts_lengths.append(len(text))
        
        yield pad_summaries_batch, pad_texts_batch, pad_summaries_lengths, pad_texts_lengths

#### Just to test "get_batches" function

Here we generate a batch with size of 5

Checkout those "44963" they are PAD>s, also all sequences' lengths are the same.

In [57]:
print("'<PAD>' has id: {}".format(vocab_to_int['<PAD>']))
sorted_summaries_samples = sorted_summaries[7:50]
sorted_texts_samples = sorted_texts[7:50]
pad_summaries_batch_samples, pad_texts_batch_samples, pad_summaries_lengths_samples, pad_texts_lengths_samples = next(get_batches(
    sorted_summaries_samples, sorted_texts_samples, 5))
print("pad summaries batch samples:\n\r {}".format(pad_summaries_batch_samples))

'<PAD>' has id: 44963
pad summaries batch samples:
 [[   26 44964]
 [23721 44964]
 [   11 44964]
 [   14 44964]
 [   26 44964]]


In [65]:
# Set the Hyperparameters
epochs = 10
batch_size = 64
rnn_size = 256
num_layers = 2
learning_rate = 0.005
keep_probability = 0.95

### Build graph

In [59]:
# Build the graph
train_graph = tf.Graph()
# Set the graph to default to ensure that it is ready for training
with train_graph.as_default():
    
    # Load the model inputs    
    input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length = model_inputs()

    # Create the training and inference logits
    training_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]),
                                                      targets, 
                                                      keep_prob,   
                                                      text_length,
                                                      summary_length,
                                                      max_summary_length,
                                                      len(vocab_to_int)+1,
                                                      rnn_size, 
                                                      num_layers, 
                                                      vocab_to_int,
                                                      batch_size)
    
    # Create tensors for the training logits and inference logits
    training_logits = tf.identity(training_logits[0].rnn_output, 'logits')
    inference_logits = tf.identity(inference_logits[0].sample_id, name='predictions')
    
    # Create the weights for sequence_loss, the sould be all True across since each batch is padded
    masks = tf.sequence_mask(summary_length, max_summary_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(learning_rate)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
print("Graph is built.")
graph_location = "./graph"
print(graph_location)
train_writer = tf.summary.FileWriter(graph_location)
train_writer.add_graph(train_graph)

Graph is built.
./graph


# Training the Model

In [71]:
# Train the Model
learning_rate_decay = 0.95
min_learning_rate = 0.0005
display_step = 20 # Check training loss after every 20 batches
stop_early = 0 
stop = 3 # If the update loss does not decrease in 3 consecutive update checks, stop training
per_epoch = 3 # Make 3 update checks per epoch
update_check = (len(sorted_texts)//batch_size//per_epoch)-1

update_loss = 0 
batch_loss = 0
summary_update_loss = [] # Record the update losses for saving improvements in the model

checkpoint = "./best_model.ckpt" 
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    # If we want to continue training a previous session
    #loader = tf.train.import_meta_graph("./" + checkpoint + '.meta')
    #loader.restore(sess, checkpoint)
    
    for epoch_i in range(1, epochs+1):
        update_loss = 0
        batch_loss = 0
        for batch_i, (summaries_batch, texts_batch, summaries_lengths, texts_lengths) in enumerate(
                get_batches(sorted_summaries, sorted_texts, batch_size)):
            start_time = time.time()
            _, loss = sess.run(
                [train_op, cost],
                {input_data: texts_batch,
                 targets: summaries_batch,
                 lr: learning_rate,
                 summary_length: summaries_lengths,
                 text_length: texts_lengths,
                 keep_prob: keep_probability})

            batch_loss += loss
            update_loss += loss
            end_time = time.time()
            batch_time = end_time - start_time

            if batch_i % display_step == 0 and batch_i > 0:
                print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds: {:>4.2f}'
                      .format(epoch_i,
                              epochs, 
                              batch_i, 
                              len(sorted_texts) // batch_size, 
                              batch_loss / display_step, 
                              batch_time*display_step))
                batch_loss = 0

            if batch_i % update_check == 0 and batch_i > 0:
                print("Average loss for this update:", round(update_loss/update_check,3))
                summary_update_loss.append(update_loss)
                
                # If the update loss is at a new minimum, save the model
                if update_loss <= min(summary_update_loss):
                    print('New Record!') 
                    stop_early = 0
                    saver = tf.train.Saver() 
                    saver.save(sess, checkpoint)

                else:
                    print("No Improvement.")
                    stop_early += 1
                    if stop_early == stop:
                        break
                update_loss = 0
            
                    
        # Reduce learning rate, but not below its minimum value
        learning_rate *= learning_rate_decay
        if learning_rate < min_learning_rate:
            learning_rate = min_learning_rate
        
        if stop_early == stop:
            print("Stopping Training.")
            break

Epoch   1/10 Batch   20/499 - Loss:  9.189, Seconds: 405.00


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\hp\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-71-79eda766686c>", line 35, in <module>
    keep_prob: keep_probability})
  File "C:\Users\hp\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 889, in run
    run_metadata_ptr)
  File "C:\Users\hp\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1120, in _run
    feed_dict_tensor, options, run_metadata)
  File "C:\Users\hp\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1317, in _do_run
    options, run_metadata)
  File "C:\Users\hp\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1323, in _do_call
    return fn(*args)
  File "C:\Users\hp\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1302, in _run_fn
    status, run_metadata)
KeyboardInterrupt

During handling

KeyboardInterrupt: 

## Making Our Own Summaries

To see the quality of the summaries that this model can generate, you can either create your own review, or use a review from the dataset. You can set the length of the summary to a fixed value, or use a random value.

In [77]:
def text_to_seq(text):
    '''Prepare the text for the model'''
    
    text = clean_text(text)
    return [vocab_to_int.get(word, vocab_to_int['<UNK>']) for word in text.split()]

input_sentences: a list of reviews strings we are going to summarize

generagte_summary_length: a int or list, if a list must be same length as input_sentences

In [None]:
random = np.random.randint(0,len(clean_texts))
input_sentence = clean_texts[random]
text = text_to_seq(clean_texts[random])

checkpoint = "best_model.ckpt.data-00000-of-00001"

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph('best_model.ckpt.meta')
    loader.restore(sess, checkpoint)

    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    text_length = loaded_graph.get_tensor_by_name('text_length:0')
    summary_length = loaded_graph.get_tensor_by_name('summary_length:0')
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')
    
    #Multiply by batch_size to match the model's input parameters
    answer_logits = sess.run(logits, {input_data: [text]*batch_size, 
                                      summary_length: [np.random.randint(5,8)], 
                                      text_length: [len(text)]*batch_size,
                                      keep_prob: 1.0})[0] 

# Remove the padding from the tweet
pad = vocab_to_int["<PAD>"] 

print('Original Text:', input_sentence)

print('\nText')
print('  Word Ids:    {}'.format([i for i in text]))
print('  Input Words: {}'.format(" ".join([int_to_vocab[i] for i in text])))

print('\nSummary')
print('  Word Ids:       {}'.format([i for i in answer_logits if i != pad]))
print('  Response Words: {}'.format(" ".join([int_to_vocab[i] for i in answer_logits if i != pad])))

INFO:tensorflow:Restoring parameters from best_model.ckpt.data-00000-of-00001


INFO:tensorflow:Restoring parameters from best_model.ckpt.data-00000-of-00001
