In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf 
import numpy as np
from nltk.corpus import stopwords #provides list of english stopwords
stop = stopwords.words('english')

In [5]:
#PRINT VERSION!!!
tf.__version__

'2.0.0'

In [6]:
train = pd.read_csv('reviews.csv')#,  nrows=1000)  #, nrows=100000 sep='\t',

In [7]:
train.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [8]:
train = train[['Summary','Text']]

In [9]:
train['text_length'] = train['Text'].str.count(' ')

In [10]:
train['text_length'].describe()

count    568454.000000
mean         81.005522
std          80.807102
min           2.000000
25%          33.000000
50%          57.000000
75%          99.000000
max        3525.000000
Name: text_length, dtype: float64

In [11]:
train['summary_length'] = train['Summary'].str.count(' ')
train['summary_length'].describe()

count    568427.000000
mean          3.128462
std           2.619420
min           0.000000
25%           1.000000
50%           3.000000
75%           4.000000
max          41.000000
Name: summary_length, dtype: float64

In [12]:
train = train.loc[train['summary_length']<8]

In [13]:
train = train.loc[train['text_length']<30]

In [14]:
print(train.shape)
print(train.head())

(109053, 4)
                   Summary                                               Text  \
4              Great taffy  Great taffy at a great price.  There was a wid...   
7   Wonderful, tasty taffy  This taffy is so good.  It is very soft and ch...   
8               Yay Barley  Right now I'm mostly just sprouting this so my...   
9         Healthy Dog Food  This is a very healthy dog food. Good for thei...   
13       fresh and greasy!  good flavor! these came securely packed... the...   

    text_length  summary_length  
4            29             1.0  
7            27             2.0  
8            25             1.0  
9            24             2.0  
13           14             2.0  


In [15]:
train['text_lower'] = train['Text'].str.lower()
train['text_no_punctuation'] = train['text_lower'].str.replace('[^\w\s]','')
#train['english_no_stopwords'] = train['english_no_punctuation'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
#train["english_no_stopwords"] = train["english_no_stopwords"].fillna("fillna")
#train["english_no_stopwords"] = train["english_no_stopwords"] 

In [16]:
train['summary_lower'] = train["Summary"].str.lower()
train['summary_no_punctuation'] =  '_start_' + ' ' +train['summary_lower'].str.replace('[^\w\s]','')+ ' ' +'_end_'

**VERY IMPORTANT TRICK!! NOTICE THAT WE ADD "_start_" and "_end_" EXACTLY AT THE BEGINNING AND THE END OF EACH SENTENCE TO HAVE SOME KIND OF'DELIMITERS' THAT WILL TELL OUR DECODER TO START AND FINISH. BECAUSE WE DON'T HAVE GENERAL SIGNALS OF START AND FINISH IN NATURAL LANGUAGE. BASICALLY '_end_' REFLECTS THE POINT IN WHICH OUR OUTPUT SENTENCE IS MORE LIKELY TO END.**

In [17]:
max_features1 = 5000
maxlen1 = 30

max_features2 = 5000
maxlen2 = 8

In [18]:
tok1 = tf.keras.preprocessing.text.Tokenizer(num_words=max_features1) 
tok1.fit_on_texts(list(train['text_no_punctuation'].astype(str))) #fit to cleaned text
tf_train_text =tok1.texts_to_sequences(list(train['text_no_punctuation'].astype(str)))
tf_train_text =tf.keras.preprocessing.sequence.pad_sequences(tf_train_text, maxlen=maxlen1) #let's execute pad step 

In [19]:
#the processing has to be done for both 
#two different tokenizers

In [20]:
tok2 = tf.keras.preprocessing.text.Tokenizer(num_words=max_features2, filters = '*') 
tok2.fit_on_texts(list(train['summary_no_punctuation'].astype(str))) #fit to cleaned text
tf_train_summary = tok2.texts_to_sequences(list(train['summary_no_punctuation'].astype(str)))
tf_train_summary = tf.keras.preprocessing.sequence.pad_sequences(tf_train_summary, maxlen=maxlen2, padding ='post') 

# Define Model Architecture

In [21]:
vectorized_summary = tf_train_summary
# For Decoder Input, you don't need the last word as that is only for prediction
# when we are training using Teacher Forcing.
decoder_input_data = vectorized_summary[:, :-1]

# Decoder Target Data Is Ahead By 1 Time Step From Decoder Input Data (Teacher Forcing)
decoder_target_data = vectorized_summary[:, 1:]

print(f'Shape of decoder input: {decoder_input_data.shape}')
print(f'Shape of decoder target: {decoder_target_data.shape}')

vectorized_text = tf_train_text
# Encoder input is simply the body of the issue text
encoder_input_data = vectorized_text
doc_length = encoder_input_data.shape[1]
print(f'Shape of encoder input: {encoder_input_data.shape}')


Shape of decoder input: (109053, 7)
Shape of decoder target: (109053, 7)
Shape of encoder input: (109053, 30)


In [22]:
vocab_size_encoder = len(tok1.word_index) + 1 #remember vocab size?
vocab_size_decoder = len(tok2.word_index) + 1

### Define Model Architecture

In [23]:
#arbitrarly set latent dimension for embedding and hidden units
latent_dim = 50



In [24]:
encoder_inputs = tf.keras.Input(shape=(doc_length,), name='Encoder-Input')

# Word embeding for encoder (English text)
x = tf.keras.layers.Embedding(vocab_size_encoder, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)


#Batch normalization is used so that the distribution of the inputs 
#to a specific layer doesn't change over time
x = tf.keras.layers.BatchNormalization(name='Encoder-Batchnorm-1')(x)


# We do not need the `encoder_output` just the hidden state.
_, state_h = tf.keras.layers.GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

# Encapsulate the encoder as a separate entity so we can just 
#  encode without decoding if we want to.
encoder_model = tf.keras.Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')

seq2seq_encoder_out = encoder_model(encoder_inputs)

########################
#### Decoder Model ####
decoder_inputs = tf.keras.Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

# Word Embedding For Decoder (Italian text)
dec_emb = tf.keras.layers.Embedding(vocab_size_decoder, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)
#again batch normalization
dec_bn = tf.keras.layers.BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

# Set up the decoder, using `decoder_state_input` as initial state.
decoder_gru = tf.keras.layers.GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')
decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out) #the decoder "decodes" the encoder output.
x = tf.keras.layers.BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

# Dense layer for prediction
decoder_dense = tf.keras.layers.Dense(vocab_size_decoder, activation='softmax', name='Final-Output-Dense')
decoder_outputs = decoder_dense(x)

########################
#### Seq2Seq Model ####
seq2seq_Model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

seq2seq_Model.compile(optimizer=tf.keras.optimizers.Nadam(lr=0.001), loss='sparse_categorical_crossentropy')

** Examine Model Architecture Summary **

In [25]:
#from seq2seq_utils import viz_model_architecture
seq2seq_Model.summary()
#viz_model_architecture(seq2seq_Model)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder-Input (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
Decoder-Word-Embedding (Embeddi (None, None, 50)     692150      Decoder-Input[0][0]              
__________________________________________________________________________________________________
Encoder-Input (InputLayer)      [(None, 30)]         0                                            
__________________________________________________________________________________________________
Decoder-Batchnorm-1 (BatchNorma (None, None, 50)     200         Decoder-Word-Embedding[0][0]     
______________________________________________________________________________________________

# Train Model

In [26]:
batch_size = 64
epochs = 2 
history = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),
          batch_size=batch_size,  epochs=epochs ,  validation_split=0.12) 

Train on 95966 samples, validate on 13087 samples
Epoch 1/2
Epoch 2/2


In [27]:
#test_text = ['apparently they used too much synthetic flavors that it just burns your tongue also theres too much oil  almost made me chok']
#test_text = ['this stuff is awesome  for best flavor boil it in water drain the water add spice packet and then add hot water']
test_text = ['this product is great  gives you so much energy and tastes great  try this cafe latte and all the other flavors and you will not be disappointed']

In [28]:
#seq2seq_Model = tf.keras.models.load_model('seq2seq_subsample_1_epochs.h5')
#seq2seq_Model = tf.keras.models.load_model('seq2seq_full_data_3_epochs.h5')

In [29]:
#max_len_title = 30
# get the encoder's features for the decoder
tok1.fit_on_texts(test_text)

In [30]:
raw_tokenized = tok1.texts_to_sequences(test_text)
raw_tokenized = tf.keras.preprocessing.sequence.pad_sequences(raw_tokenized, maxlen=maxlen1)

In [31]:
body_encoding = encoder_model.predict(raw_tokenized) #predict the encoder state of the new sentence

In [32]:
latent_dim = seq2seq_Model.get_layer('Decoder-Word-Embedding').output_shape[-1]

In [33]:
#remember the get layer methodo for getting the embedding (word clusters)
decoder_inputs = seq2seq_Model.get_layer('Decoder-Input').input 
dec_emb = seq2seq_Model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
dec_bn = seq2seq_Model.get_layer('Decoder-Batchnorm-1')(dec_emb)


In [34]:
gru_inference_state_input = tf.keras.Input(shape=(latent_dim,), name='hidden_state_input')



In [35]:
gru_out, gru_state_out = seq2seq_Model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])

In [36]:
# Reconstruct dense layers
dec_bn2 = seq2seq_Model.get_layer('Decoder-Batchnorm-2')(gru_out)
dense_out = seq2seq_Model.get_layer('Final-Output-Dense')(dec_bn2)

In [37]:
decoder_model = tf.keras.Model([decoder_inputs, gru_inference_state_input],
                          [dense_out, gru_state_out])

In [38]:
# we want to save the encoder's embedding before its updated by decoder
#   because we can use that as an embedding for other tasks.
original_body_encoding = body_encoding

In [39]:
state_value = np.array(tok2.word_index['_start_']).reshape(1, 1)

In [40]:
state_value

array([[1]])

In [41]:
decoded_sentence = []
stop_condition = False

In [42]:
vocabulary_inv = dict((v, k) for k, v in tok2.word_index.items())
#vocabulary_inv[0] = "<PAD/>"
#vocabulary_inv[1] = "unknown"

In [43]:
vocabulary_inv

{1: '_start_',
 2: '_end_',
 3: 'great',
 4: 'good',
 5: 'the',
 6: 'coffee',
 7: 'best',
 8: 'love',
 9: 'tea',
 10: 'for',
 11: 'product',
 12: 'and',
 13: 'it',
 14: 'delicious',
 15: 'my',
 16: 'a',
 17: 'not',
 18: 'this',
 19: 'taste',
 20: 'yummy',
 21: 'excellent',
 22: 'very',
 23: 'dog',
 24: 'price',
 25: 'flavor',
 26: 'i',
 27: 'of',
 28: 'is',
 29: 'to',
 30: 'these',
 31: 'tasty',
 32: 'stuff',
 33: 'but',
 34: 'ever',
 35: 'favorite',
 36: 'yum',
 37: 'like',
 38: 'snack',
 39: 'food',
 40: 'awesome',
 41: 'loves',
 42: 'in',
 43: 'too',
 44: 'chocolate',
 45: 'wonderful',
 46: 'dogs',
 47: 'chips',
 48: 'as',
 49: 'so',
 50: 'them',
 51: 'treats',
 52: 'just',
 53: 'are',
 54: 'nice',
 55: 'hot',
 56: 'healthy',
 57: 'free',
 58: 'with',
 59: 'tasting',
 60: 'cookies',
 61: 'buy',
 62: 'treat',
 63: 'quality',
 64: 'tastes',
 65: 'perfect',
 66: 'better',
 67: 'you',
 68: 'on',
 69: 'sweet',
 70: 'value',
 71: 'at',
 72: 'green',
 73: 'really',
 74: 'what',
 75: 'no',


In [44]:
while not stop_condition:
    #print(1)
    preds, st = decoder_model.predict([state_value, body_encoding])

    pred_idx = np.argmax(preds[:, :, 2:]) + 2
    pred_word_str = vocabulary_inv[pred_idx]
    print(pred_word_str)
    if pred_word_str == '_end_' or len(decoded_sentence) >= maxlen2:
        stop_condition = True
        break
    decoded_sentence.append(pred_word_str)

    # update the decoder for the next word
    body_encoding = st
    state_value = np.array(pred_idx).reshape(1, 1)
    #print(state_value)

great
product
_end_


In [103]:
train.tail()

Unnamed: 0,Summary,Text,text_length,summary_length,text_lower,text_no_punctuation,summary_lower,summary_no_punctuation
568439,a-ok,We need this for a recipe my wife is intereste...,22,0.0,we need this for a recipe my wife is intereste...,we need this for a recipe my wife is intereste...,a-ok,_start_ aok _end_
568442,Great Cafe Latte,This product is great. Gives you so much ener...,28,2.0,this product is great. gives you so much ener...,this product is great gives you so much energ...,great cafe latte,_start_ great cafe latte _end_
568448,Very large ground spice jars.,My only complaint is that there's so much of i...,28,4.0,my only complaint is that there's so much of i...,my only complaint is that theres so much of it...,very large ground spice jars.,_start_ very large ground spice jars _end_
568449,Will not do without,Great for sesame chicken..this is a good if no...,25,3.0,great for sesame chicken..this is a good if no...,great for sesame chickenthis is a good if not ...,will not do without,_start_ will not do without _end_
568453,Great Honey,"I am very satisfied ,product is as advertised,...",20,1.0,"i am very satisfied ,product is as advertised,...",i am very satisfied product is as advertised i...,great honey,_start_ great honey _end_


In [68]:
train['text_no_punctuation'][568442]

'this product is great  gives you so much energy and tastes great  try this cafe latte and all the other flavors and you will not be disappointed'