In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
from pickle import dump, load

In [2]:
reviews = pd.read_csv("Reviews.csv")

In [3]:
reviews.shape

(568454, 10)

In [5]:
reviews.head(1)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...


In [7]:
reviews['UserId'].value_counts()

A3OXHLG6DIBRW8    448
A1YUL9PCJR3JTY    421
AY12DBB0U420B     389
A281NPSIMI1C2R    365
A1Z54EM24Y40LL    256
A1TMAVN4CEM8U8    204
A2MUGFV2TDQ47K    201
A3TVZM3ZIXG8YW    199
A3PJZ8TU8FDQ1K    178
AQQLWCMRNDFGI     176
A2SZLNSI5KOQJT    175
A29JUMRL1US6YP    172
AZV26LP92E6WU     167
AY1EF0GOH80EK     162
A31N6KB160O508    162
A2FRFAQCWZJT3Q    161
A1UQBFCERIP7VJ    157
AKMEY1BSHSDG7     155
A1LZJZIHUPLDV4    154
A1WX42M589VAMQ    151
A3D6OI36USYOU1    150
A2Y8IDC1FKGNJC    150
A3HPCRD9RX351S    149
A2PNOU7NXB1JE4    143
AKZKG2Z7CNV27     143
A33AQPJYH7UUXR    142
A35R32TA60XD57    141
A1IU7S4HCK1XK0    140
A1X1CEGHTHMBL1    135
A36WGHR8TO5DKT    134
                 ... 
A3OYGQFYTLXDEI      1
A1FHH2PQNN65UA      1
ARYX2NRJ45MJE       1
A2DAAQI25T42JS      1
A2DDDKF5NZMTFY      1
A2OTS926107OSB      1
A35VZGIUVQVLX9      1
A3P7JE6K4D7KK0      1
A3MWZPIQ0LAST2      1
A1JT9S5P9NKA29      1
A1OS5209JKYNHO      1
A2TG6PN9JDALGE      1
A1CKP0D0XGGVCH      1
A28K6G5NGB1PIC      1
A3MIHY1FCJ

In [9]:
reviews.loc[reviews['UserId'] == 'A3OXHLG6DIBRW8']

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
369,370,B002O3VHXU,A3OXHLG6DIBRW8,"C. F. Hill ""CFH""",1,1,5,1282176000,Very Smooth Coffee - Highly Recommended,"Green Mountain ""Nantucket Blend"" K-Cups make a..."
813,814,B004ET7MG8,A3OXHLG6DIBRW8,"C. F. Hill ""CFH""",3,4,2,1272240000,Odd Fake Flavor - Not Recommended,"Trident ""Strawberry Twist"" sugarless gum is ve..."
3306,3307,B005K4Q1VI,A3OXHLG6DIBRW8,"C. F. Hill ""CFH""",24,25,5,1321401600,Really Good Hot Cocoa - Highly Recommended,These Grove Square Hot Cocoa flavors are by fa...
3416,3417,B005K4Q1VI,A3OXHLG6DIBRW8,"C. F. Hill ""CFH""",5,5,5,1321401600,Really Good Hot Cocoa - Highly Recommended,These Grove Square Hot Cocoa flavors are by fa...
3926,3927,B000VSDFRG,A3OXHLG6DIBRW8,"C. F. Hill ""CFH""",2,2,5,1237161600,Great Diabetic Friendly Candy - Highly Recomme...,"Hershey ""Sugar Free Caramel Filled Chocolates""..."
6160,6161,B007J6GGII,A3OXHLG6DIBRW8,"C. F. Hill ""CFH""",0,0,4,1332633600,True Bacon Flavor,"These David's Signature Beyond Gourmet ""Bacon""..."
7773,7774,B007J6KEPY,A3OXHLG6DIBRW8,"C. F. Hill ""CFH""",0,0,5,1332633600,Outstanding Flavor - Highly Recommended,"These David's Signature Beyond Gourmet ""Pomegr..."
10715,10716,B00389Q4XW,A3OXHLG6DIBRW8,"C. F. Hill ""CFH""",0,0,4,1283126400,Good Hot Cocoa - Recommended,The Green Mountain Hot Cocoa is a good choice ...
15318,15319,B00503DP0O,A3OXHLG6DIBRW8,"C. F. Hill ""CFH""",0,0,4,1311897600,Very Good Tasting Snack Bars - Recommended,We have several relatives with dietary restric...
15792,15793,B007TJGZ5E,A3OXHLG6DIBRW8,"C. F. Hill ""CFH""",0,0,5,1282176000,Very Smooth Coffee - Highly Recommended,"Green Mountain ""Nantucket Blend"" K-Cups make a..."


<b>Note</b>: Before dropping is it worth noting that certain users have significantly more reviews written than others. However in the grand scheme, since there are 568454 reviews, most likely the 448 reviews of the most frequent reviewer won't bias our model. If it does, i will limit the number of reviews taken for each user.

<h3>About the Data</h3>
<br>
Notice that there are 10 columns in this dataset for a full summary of a review including helpfulness and rating. However, since we are trying to train a summazier, those additional fields are unneccessary. We only need the Summary and Text columns for this project.

In [35]:
reviews = reviews.drop(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time'], 1)

<h3>Further cleaning</h3>
<br>
We need to drop rows in which either summary or text is empty as we cannot use them for training or testing.

In [61]:
reviews.isna().sum()

Summary    27
Text        0
dtype: int64

In [63]:
reviews.dropna(inplace=True)

In [64]:
reviews.isna().sum()

Summary    0
Text       0
dtype: int64

<h3>Cleaning the text data</h3>
<br>

We want to remove text that would not help our model learn to summarize. This would include stopwords, contractions and web symbols like links and html elements

In [65]:
# https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
# function to remove contractions from reviews

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase


def remove_web_symbols(text):
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    return text


In [66]:
from string import digits

def clean_review(review):
    # remove contractions
    # remove urls, digits
    # remove stopwords
    
    review = decontracted(review)
    review = remove_web_symbols(review)
    
    remove_digits = str.maketrans('', '', digits)
    review = review.translate(remove_digits)
    
    # NLTK stopwords
    stops = set(stopwords.words("english"))
    
    review = review.split()
    review = [w for w in review if not w in stops]
    review = " ".join(review)
        
    return review.lower()

In [68]:
test = clean_review("HOW'd you do on this I'm great")

In [69]:
test

'how would i great'

In [73]:
clean_reviews = list()
clean_summaries = list()

for summary in reviews.Summary:
    if summary == '':
        continue
    clean_summaries.append(clean_review(summary))
    
for review in reviews.Text:
    if review == '':
        continue
    clean_reviews.append(clean_review(review))

In [74]:
print(len(clean_summaries))
print(len(clean_reviews))

568427
568427


In [78]:
for i in range(5):
    print(clean_summaries[i])
    print("\n")
    
for i in range(5):
    print(clean_reviews[i])
    print("\n")

good quality dog food


not advertised


delight says


cough medicine


great taffy


i bought several vitality canned dog food products found good quality the product looks like stew processed meat smells better my labrador finicky appreciates product better


product arrived labeled jumbo salted peanuts peanuts actually small sized unsalted not sure error vendor intended represent product jumbo


this confection around centuries it light pillowy citrus gelatin nuts case filberts and cut tiny squares liberally coated powdered sugar and tiny mouthful heaven not chewy flavorful i highly recommend yummy treat if familiar story c s lewis the lion the witch the wardrobe treat seduces edmund selling brother sisters witch


if looking secret ingredient robitussin i believe i found i got addition root beer extract i ordered good made cherry soda the flavor medicinal


great taffy great price there wide assortment yummy taffy delivery quick if taffy lover deal




<h3>Saving cleaned data</h3>
<br>
<p>Since the dataset is quite large, we will save the cleaned text data into a pickle file for reusability.</p>

In [89]:
review_data = list()

for i, review in enumerate(clean_reviews):
    review_data.append({'review': review, 'summary': clean_summaries[i]})
    
dump(review_data, open('review_dataset.pkl', 'wb'))

In [90]:
batch_size = 64
epochs = 100
latent_dim = 256
num_samples = 10000

In [91]:
review_dataset = load(open('review_dataset.pkl', 'rb'))

In [92]:
print(len(review_dataset))

568427


In [97]:
review_dataset[0]

{'review': 'i bought several vitality canned dog food products found good quality the product looks like stew processed meat smells better my labrador finicky appreciates product better',
 'summary': 'good quality dog food'}

<h3>Now we need to prepare the data to feed to our model</h3>

In [143]:
# add symbols to start and end of the ouput 
# REASONING: We need the start token so that our decoder has something to start with.
# This is important since the decoder is going to generate characters based its outputs (and hidden states) so letting the system know 
# when it is starting is important. The end token is also important to help the model decode aribitrary length. If we don't have the eos, we don't know when the model has finished decoding and
# we will keep processing random tokens that the system has not accounted for in its summary.

# TEST: does it help to add the sos and eos to the input sentences

input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

i = 0

for review_summary in review_dataset:
    
    if i == 5000:
        break
    else:
        i += 1
    
    input_text = review_summary['review']
    target_text = review_summary['summary']
    
    # add sos and eos tokens
    target_text = '\t' + target_text + '\n'
    
    input_texts.append(input_text)
    target_texts.append(target_text)
    
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
    
    for char in target_text:
        if char not in target_characters:
            target_characters.add(char)



KeyboardInterrupt: 

In [148]:
len(target_characters)

41

In [149]:
len(input_characters)

60

In [150]:
# THIS WAS AFTER 30 MIN OF RUNNING THE CODE ABOVE

len(input_texts)
len(target_texts)

176870

In [151]:
input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))

num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)

max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

print("Number of encoder tokens: ", num_encoder_tokens)
print("Number of decoder tokens: ", num_decoder_tokens)
print("Max encoder sequence ", max_encoder_seq_length)
print("Max decoder sequence ", max_decoder_seq_length)


Number of encoder tokens:  60
Number of decoder tokens:  41
Max encoder sequence  12482
Max decoder sequence  140


In [152]:
# create mappings between token and index for training

input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])

target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])

In [153]:
input_token_index

{' ': 0,
 '<': 1,
 '>': 2,
 '\\': 3,
 '^': 4,
 '`': 5,
 'a': 6,
 'b': 7,
 'c': 8,
 'd': 9,
 'e': 10,
 'f': 11,
 'g': 12,
 'h': 13,
 'i': 14,
 'j': 15,
 'k': 16,
 'l': 17,
 'm': 18,
 'n': 19,
 'o': 20,
 'p': 21,
 'q': 22,
 'r': 23,
 's': 24,
 't': 25,
 'u': 26,
 'v': 27,
 'w': 28,
 'x': 29,
 'y': 30,
 'z': 31,
 '{': 32,
 '}': 33,
 '~': 34,
 '\x8c': 35,
 '¢': 36,
 '£': 37,
 '¦': 38,
 '§': 39,
 '«': 40,
 '\xad': 41,
 '®': 42,
 '°': 43,
 'µ': 44,
 '·': 45,
 'º': 46,
 '¼': 47,
 '½': 48,
 '¾': 49,
 'â': 50,
 'å': 51,
 'æ': 52,
 'ç': 53,
 'ê': 54,
 'ë': 55,
 'î': 56,
 'ï': 57,
 'ô': 58,
 'û': 59}

In [154]:
target_token_index

{' ': 0,
 '<': 1,
 '>': 2,
 '\\': 3,
 '^': 4,
 '`': 5,
 'a': 6,
 'b': 7,
 'c': 8,
 'd': 9,
 'e': 10,
 'f': 11,
 'g': 12,
 'h': 13,
 'i': 14,
 'j': 15,
 'k': 16,
 'l': 17,
 'm': 18,
 'n': 19,
 'o': 20,
 'p': 21,
 'q': 22,
 'r': 23,
 's': 24,
 't': 25,
 'u': 26,
 'v': 27,
 'w': 28,
 'x': 29,
 'y': 30,
 'z': 31,
 '{': 32,
 '}': 33,
 '~': 34,
 '«': 35,
 '®': 36,
 '»': 37,
 '½': 38,
 'û': 39,
 'ý': 40}

In [155]:
# set up encoder and decoder vectors
# 3 dimensions: 1. training example  2. character index   3. token value

data_len = len(input_texts)

encoder_input_data = np.zeros((data_len, max_encoder_seq_length, num_encoder_tokens), dtype='float32')

decoder_input_data = np.zeros((data_len, max_decoder_seq_length, num_decoder_tokens), dtype='float32')

decoder_target_data = np.zeros((data_len, max_decoder_seq_length, num_decoder_tokens), dtype='float32')


In [156]:
encoder_input_data.shape

(176870, 12482, 60)

In [157]:
decoder_input_data.shape

(176870, 140, 41)

In [158]:
decoder_target_data.shape

(176870, 140, 41)

<h3>Encoding the character level data for the encoder and decoder</h3>

In [159]:
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    
    for t, char in enumerate(target_text):
        decoder_input_data[i, t, target_token_index[char]] = 1.
    
        if t > 0:
            # for the decoder output, it needs to always be 1 step ahead of the input
            # as such we just use the current index at the timestep before
            
            decoder_target_data[i, t-1, target_token_index[char]] = 1.

In [176]:
encoder_input_data.shape

(176870, 12482, 60)

In [161]:
decoder_input_data.shape

(176870, 140, 41)

In [162]:
decoder_target_data.shape

(176870, 140, 41)

<b>Note</b>: Good, the shapes should still be identical to before we one hot encoded the encoder and the 2 decoder vectors

<h2>Model</h2>

In [163]:
from keras.models import Model
from keras.layers import Input, Dense, LSTM

Using TensorFlow backend.


<h3>Description of the model</h3>
<br>
We will be using an lstm to get encode the inputs. We will be discarding its outputs as we only want the final hidden states. Then we pass the encoded input states into another LSTM. This one will only keep the decoder ouputs. Finally, the decoded outputs will be passed to a dense layer in which we will use a softmax to predict the next characters. 

<b>Note to self</b>: The reason we use shape=(None, num_encoder_tokens) instead of a hardcoded value (in place of the None) is because this acts as a placeholder. As such, the model will learn figure out how many rows of inputs it is receiving which makes it more flexible. The num_encoder_tokens is because we are passing in a character at a time and there are num_encoder_token of them to be passed)

In [177]:
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# We will only be passing on the final states of the input LSTM
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs,_,_ = decoder(decoder_inputs, initial_state=encoder_states)

decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [178]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, None, 60)     0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, None, 41)     0                                            
__________________________________________________________________________________________________
lstm_5 (LSTM)                   [(None, 256), (None, 324608      input_5[0][0]                    
__________________________________________________________________________________________________
lstm_6 (LSTM)                   [(None, None, 256),  305152      input_6[0][0]                    
                                                                 lstm_5[0][1]                     
          

<h3>Training the model</h3>

In [None]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=batch_size, epochs=epochs, validation_split=0.2)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 141496 samples, validate on 35374 samples
Epoch 1/100


In [None]:
model.save('text_summarization_model.h5')

<h3>Inference</h3>
<br>
<p>This is to test the outputs of our summarizer model. The inference is as follows:<br>

<ol>
    <li> Encode inputs and retrieve initial decoder state</li>
<li> Run one step of the decoder with the initial state and set the target to sos token. Output will be the predicted next character</li>
    <li> Rinse and repeat with the current target token and current states</li>
</ol>    

<br>

<b>Note</b>: The decoder input state only needs to be set once with the encoder state values and then we rely on the decoders own hidden states for the remainder of the character prediction
    
</p>

In [1]:
# can instantitate model as long as there is an input and hidden states
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(hsape=(latent_dim,))
decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder(decoder_inputs, initial_state=decoder_state_inputs)

decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_state_inputs,
    [decoder_outputs] + decoder_states
)

NameError: name 'Model' is not defined

In [None]:
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items()
)

reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items()
)

In [None]:
def decode_sequence(input_seq):
    input_states = encoder_model.predict(input_seq)
    
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    
    # start the sequence with the start of sentence token
    target_seq[0,0, target_token_index['\t']] = 1
    
    end_of_sentence = False
    
    decoded_sentence = ""
    
    while not end_of_sentence:
        output_tokens, h, c = decoder_model.predict([target_seq] + input_states)
        
        # since we are using a softmax for our activation, the largest value is the most probable character
        predicted_token_index = np.argmax(output_tokens[0, -1, :])
        predicted_char = reverse_target_char_index[predicted_token_index]
        decoded_sentence += predicted_char
        
        # end condition will be if the predicted output char was the eos ('\n') or if the length is greater than the max ouput seq length
        if(sampled_char == '\n' or len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True
            
        # reset the target_seq to be the value of the predicted char
        target_seq = np.zeros((1,1, num_decoder_tokens))
        target_seq[0, 0, predicted_token_index] = 1
        
        state_values = [h,c]
    
    return decoded_sentence
    

<h2>Testing Model Predictions</h2>


In [None]:
for seq_index in range(50):
    input_seq = encoder_input_data[seq_index: seq_index+1]
    decoded_sentence = decode_sentence(input_seq)
    print("- - -")
    print("Input Sentence: ", input_texts[input_seq])
    print("Predicted Summary:", decoded_sentence)
    print("Actual Summary:", output_texts[input_seq])

<h1>Conclusion</h1>
<br>
In this notebook I practiced seq2seq modelling through a text summarization task at a character level. The next steps for improvements involve adding an embedding layer to learn character embeddings. From my knowledge of character embeddings, this should help the model generalize with 

<br>

The reason I chose to use a character level model is beacause it was easier to train on such a large dataset. However, I am curious as to how this model would fare at the word level. This is an experiment that i wish to conduct.