In [0]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from tensorflow import keras

In [2]:
tf.__version__

'2.2.0-rc2'

In [14]:
from google.colab import drive

drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [0]:
reviews_df = pd.read_csv('/content/drive/My Drive/Amazon_Reviews.csv')

In [16]:
reviews_df['Score'].value_counts()

5    363122
4     80655
1     52268
3     42640
2     29769
Name: Score, dtype: int64

In [223]:
neg_df = reviews_df[reviews_df['Score'] <= 4].sample(20000, random_state=15)

neg_df = neg_df.dropna(subset=['Summary'])

neg_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
215069,215070,B0030VBRCG,A3GVWQSH9KRU14,mom of a foodie kid,0,0,1,1300406400,SUPER SOUR!!!,My 5.5 yr old son LOVES apricots and sweet pot...
205060,205061,B0002PHFKQ,AJU5NCWYOM5T2,Beatriz V. Austin,1,1,4,1325548800,Orange flower taste in a bottle,This orange flower water is pretty good. You h...
287143,287144,B000EMK4VE,A1JY64GQC3B8WV,R. Davis,1,2,3,1222473600,"Ok, but it is just tea",Tea is not just flavored colored water. Now th...
174199,174200,B000LL0RKG,A3CJ63L7LVTH53,Michelle,12,13,4,1186185600,Delicious and Convenient -- No Waste!,I like Silk soymilk because it stays fresh in ...
215811,215812,B002Z04ZNQ,A3CT3C010B9BBY,James L. Bimler,0,0,3,1302134400,First time Coconut juice taster,I wasn't really sure what to expect here. I'm ...


## Load CSV of CFPB Complaints and Summaries

In [0]:
cfpb_df = pd.read_csv('/content/drive/My Drive/cfpb_summaries_4.csv')

sum_df = cfpb_df.dropna(subset=['Summary'])

# Build test_df
test_df = sum_df.sample(10, random_state=15)

test_df['Text'] = test_df['Consumer complaint narrative']

test_df = test_df[['Text', 'Summary']]

# Remove rows that were sampled for test_df
to_remove = test_df.index.tolist()

sum_df = sum_df.drop(to_remove)

test_df = test_df.reset_index(drop=True)

In [226]:
print(f'\nNumber of cfpb summaries:', sum_df.shape[0], '\nNumber of amazon summaries:', neg_df.shape[0], '\n')

print(f'Percent cfpb', 100*round(sum_df.shape[0]/(sum_df.shape[0] + neg_df.shape[0]), 3), '%\n')


Number of cfpb summaries: 514 
Number of amazon summaries: 19995 

Percent cfpb 2.5 %



## Build Combined DataFrame w/ 'X' Copies of CFPB Text & Summaries

In [0]:
sum_df['Text'] = sum_df['Consumer complaint narrative']

tmp_df = sum_df[['Text', 'Summary']]

tmp_df_2 = neg_df[['Text', 'Summary']]

comb_df = tmp_df.append(tmp_df_2)

comb_df.reset_index(inplace=True, drop=True)

for i in range(5):

  comb_df = tmp_df.append(comb_df).reset_index(drop=True)

In [228]:
comb_df.shape

(23079, 2)

## Clean Texts

In [229]:
clean = [re.sub('[^A-Za-z.,\s\']', '', str(text)) for text in comb_df['Text']]

split_word_nars = [nar.split() for nar in clean]

"""Contractions Import"""

import sys
sys.path.append('/content/drive/My Drive')
from english_contractions import replace_contraction

""" Stop Words Import """

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stops = set(stopwords.words('english'))

""" Loops """

new_words = []

for nar in split_word_nars:

  nar_words = []

  for word in nar:

    if re.search('\w+[.]', word):

      splitted = word.split('.')

      tmp_words = replace_contraction(splitted[0].lower())

      for w in tmp_words.split():

        nar_words.append(w)

      nar_words.append('.')
    
    elif re.search('\w+[,]', word):
      
      splitted = word.split(',')

      tmp_words = replace_contraction(splitted[0].lower())

      for w in tmp_words.split():

        nar_words.append(w)

      nar_words.append(',')
    
    elif re.match('[.]', word):
      
      placeholder = 1
    
    else:

      tmp_words = replace_contraction(word)

      for w in tmp_words.split():

        nar_words.append(w)
  
  nar_words = [word for word in nar_words if not word in stops]

  new_words.append(' '.join(nar_words))

texts = new_words

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Clean Summaries

In [0]:
clean = [re.sub('[^A-Za-z.,\s\']', '', str(text)) for text in comb_df['Summary']]

split_word_nars = [nar.split() for nar in clean]

"""Contractions Import"""

import sys
sys.path.append('/content/drive/My Drive')
from english_contractions import replace_contraction

""" Loops """

new_words = []

for nar in split_word_nars:

  nar_words = []

  for word in nar:

    if re.search('\w+[.]', word):

      splitted = word.split('.')

      tmp_words = replace_contraction(splitted[0].lower())

      for w in tmp_words.split():

        nar_words.append(w)

      nar_words.append('.')
    
    elif re.search('\w+[,]', word):
      
      splitted = word.split(',')

      tmp_words = replace_contraction(splitted[0].lower())

      for w in tmp_words.split():

        nar_words.append(w)

      nar_words.append(',')
    
    elif re.match('[.]', word):
      
      placeholder = 1
    
    else:

      tmp_words = replace_contraction(word)

      for w in tmp_words.split():

        nar_words.append(w)

  new_words.append(' '.join(nar_words))

summaries = new_words

## Clean Test Text and Summaries

In [231]:
# Texts

clean = [re.sub('[^A-Za-z.,\s\']', '', str(text)) for text in test_df['Text']]

split_word_nars = [nar.split() for nar in clean]

"""Contractions Import"""

import sys
sys.path.append('/content/drive/My Drive')
from english_contractions import replace_contraction

""" Stop Words Import """

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stops = set(stopwords.words('english'))

""" Loops """

new_words = []

for nar in split_word_nars:

  nar_words = []

  for word in nar:

    if re.search('\w+[.]', word):

      splitted = word.split('.')

      tmp_words = replace_contraction(splitted[0].lower())

      for w in tmp_words.split():

        nar_words.append(w)

      nar_words.append('.')
    
    elif re.search('\w+[,]', word):
      
      splitted = word.split(',')

      tmp_words = replace_contraction(splitted[0].lower())

      for w in tmp_words.split():

        nar_words.append(w)

      nar_words.append(',')
    
    elif re.match('[.]', word):
      
      placeholder = 1
    
    else:

      tmp_words = replace_contraction(word)

      for w in tmp_words.split():

        nar_words.append(w)
  
  nar_words = [word for word in nar_words if not word in stops]

  new_words.append(' '.join(nar_words))

test_texts = new_words

# Summaries

clean = [re.sub('[^A-Za-z.,\s\']', '', text) for text in test_df['Summary']]

split_word_nars = [nar.split() for nar in clean]

new_words = []

for nar in split_word_nars:

  nar_words = []

  for word in nar:

    if re.search('\w+[.]', word):

      splitted = word.split('.')

      tmp_words = replace_contraction(splitted[0].lower())

      for w in tmp_words.split():

        nar_words.append(w)

      nar_words.append('.')
    
    elif re.search('\w+[,]', word):
      
      splitted = word.split(',')

      tmp_words = replace_contraction(splitted[0].lower())

      for w in tmp_words.split():

        nar_words.append(w)

      nar_words.append(',')
    
    elif re.match('[.]', word):
      
      placeholder = 1
    
    else:

      tmp_words = replace_contraction(word)

      for w in tmp_words.split():

        nar_words.append(w)

  new_words.append(' '.join(nar_words))

test_summaries = new_words

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Build Clean Df

In [232]:
comb_df['Cleaned_text'], comb_df['Cleaned_summary'] = texts, summaries

clean_df = comb_df[['Cleaned_text', 'Cleaned_summary']]

target_texts = []

for target_text in clean_df['Cleaned_summary']:

  target_text = '_START_ ' + str(target_text) + ' _END_'

  target_texts.append(target_text)

clean_df['Summary'] = target_texts

# test_df

test_df['Cleaned_text'], test_df['Cleaned_summary'] = test_texts, test_summaries

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


## Train/Val Split

In [0]:
max_len_text = 100

max_len_summary = 7

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(clean_df['Cleaned_text'], clean_df['Summary'], test_size=0.15, random_state=0, shuffle=True)

## AttentionLayer Class

In [0]:
import os
from tensorflow.python.keras.layers import Layer
from tensorflow.python.keras import backend as K


class AttentionLayer(Layer):
    """
    This class implements Bahdanau attention (https://arxiv.org/pdf/1409.0473.pdf).
    There are three sets of weights introduced W_a, U_a, and V_a
     """

    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        assert isinstance(input_shape, list)
        # Create a trainable weight variable for this layer.

        self.W_a = self.add_weight(name='W_a',
                                   shape=tf.TensorShape((input_shape[0][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        self.U_a = self.add_weight(name='U_a',
                                   shape=tf.TensorShape((input_shape[1][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        self.V_a = self.add_weight(name='V_a',
                                   shape=tf.TensorShape((input_shape[0][2], 1)),
                                   initializer='uniform',
                                   trainable=True)

        super(AttentionLayer, self).build(input_shape)  # Be sure to call this at the end

    def call(self, inputs, verbose=False):
        """
        inputs: [encoder_output_sequence, decoder_output_sequence]
        """
        assert type(inputs) == list
        encoder_out_seq, decoder_out_seq = inputs
        if verbose:
            print('encoder_out_seq>', encoder_out_seq.shape)
            print('decoder_out_seq>', decoder_out_seq.shape)

        def energy_step(inputs, states):
            """ Step function for computing energy for a single decoder state """

            assert_msg = "States must be a list. However states {} is of type {}".format(states, type(states))
            assert isinstance(states, list) or isinstance(states, tuple), assert_msg

            """ Some parameters required for shaping tensors"""
            en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]
            de_hidden = inputs.shape[-1]

            """ Computing S.Wa where S=[s0, s1, ..., si]"""
            # <= batch_size*en_seq_len, latent_dim
            reshaped_enc_outputs = K.reshape(encoder_out_seq, (-1, en_hidden))
            # <= batch_size*en_seq_len, latent_dim
            W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a), (-1, en_seq_len, en_hidden))
            if verbose:
                print('wa.s>',W_a_dot_s.shape)

            """ Computing hj.Ua """
            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)  # <= batch_size, 1, latent_dim
            if verbose:
                print('Ua.h>',U_a_dot_h.shape)

            """ tanh(S.Wa + hj.Ua) """
            # <= batch_size*en_seq_len, latent_dim
            reshaped_Ws_plus_Uh = K.tanh(K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))
            if verbose:
                print('Ws+Uh>', reshaped_Ws_plus_Uh.shape)

            """ softmax(va.tanh(S.Wa + hj.Ua)) """
            # <= batch_size, en_seq_len
            e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a), (-1, en_seq_len))
            # <= batch_size, en_seq_len
            e_i = K.softmax(e_i)

            if verbose:
                print('ei>', e_i.shape)

            return e_i, [e_i]

        def context_step(inputs, states):
            """ Step function for computing ci using ei """
            # <= batch_size, hidden_size
            c_i = K.sum(encoder_out_seq * K.expand_dims(inputs, -1), axis=1)
            if verbose:
                print('ci>', c_i.shape)
            return c_i, [c_i]

        def create_inital_state(inputs, hidden_size):
            # We are not using initial states, but need to pass something to K.rnn funciton
            fake_state = K.zeros_like(inputs)  # <= (batch_size, enc_seq_len, latent_dim
            fake_state = K.sum(fake_state, axis=[1, 2])  # <= (batch_size)
            fake_state = K.expand_dims(fake_state)  # <= (batch_size, 1)
            fake_state = K.tile(fake_state, [1, hidden_size])  # <= (batch_size, latent_dim
            return fake_state

        fake_state_c = create_inital_state(encoder_out_seq, encoder_out_seq.shape[-1])
        fake_state_e = create_inital_state(encoder_out_seq, encoder_out_seq.shape[1])  # <= (batch_size, enc_seq_len, latent_dim

        """ Computing energy outputs """
        # e_outputs => (batch_size, de_seq_len, en_seq_len)
        last_out, e_outputs, _ = K.rnn(
            energy_step, decoder_out_seq, [fake_state_e],
        )

        """ Computing context vectors """
        last_out, c_outputs, _ = K.rnn(
            context_step, e_outputs, [fake_state_c],
        )

        return c_outputs, e_outputs

    def compute_output_shape(self, input_shape):
        """ Outputs produced by the layer """
        return [
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[1][2])),
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1]))
        ]

## (Optional) Load Model and Tokenizer

In [0]:
import pickle

# model

drive_path = '/content/drive/'

file_path = 'My Drive/saved_keras_rnns/'

name = 'enc_dec_main_u5.h5'

model = keras.models.load_model(drive_path + file_path + name, custom_objects={'AttentionLayer': AttentionLayer})

# tokenizer

name = 'tok_enc_dec_main_u5.pkl'

(x_tokenizer, y_tokenizer) = pickle.load(open(drive_path + file_path + name, 'rb'))

# build pre-trained model

config = model.get_config()

weights = model.get_weights()

model = tf.keras.Model.from_config(config, custom_objects={'AttentionLayer': AttentionLayer})

model.set_weights(weights)



## Tokenize Text

### Reviews

In [234]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# x_tokenizer = tf.keras.preprocessing.text.Tokenizer()

# x_tokenizer.fit_on_texts(list(X_train))

# convert text sequences into integer sequences
X_train = x_tokenizer.texts_to_sequences(X_train)
X_val = x_tokenizer.texts_to_sequences(X_val)

# padding zero up to maximum length
X_train = pad_sequences(X_train, maxlen=max_len_text, padding='post')
X_val = pad_sequences(X_val, maxlen=max_len_text, padding='post')

x_voc_size = len(x_tokenizer.word_index) + 1

x_voc_size

32279

### Summaries

In [235]:
# y_tokenizer = tf.keras.preprocessing.text.Tokenizer()

# y_tokenizer.fit_on_texts(list(y_train))

# convert summary sequences into integer sequences
y_train = y_tokenizer.texts_to_sequences(y_train)
y_val = y_tokenizer.texts_to_sequences(y_val)

# padding zero upto maximum length
y_train = pad_sequences(y_train, maxlen=max_len_summary, padding='post')
y_val = pad_sequences(y_val, maxlen=max_len_summary, padding='post')

y_voc_size = len(y_tokenizer.word_index) + 1

y_voc_size



7108

## Build Model

In [236]:
import tensorflow_addons as tfa

n_neurons = 256

embedding_dim = 512

dropout_rate = 0

# Encoder 
encoder_inputs = tf.keras.layers.Input(shape=[max_len_text,])
enc_emb_layer = tf.keras.layers.Embedding(x_voc_size, embedding_dim, trainable=True)
enc_emb = enc_emb_layer(encoder_inputs)

# LSTM 1 
encoder_lstm_1 = tf.keras.layers.LSTM(n_neurons, return_sequences=True, return_state=True,
                                      dropout=dropout_rate, recurrent_dropout=dropout_rate)
encoder_output_1, state_h1, state_c1 = encoder_lstm_1(enc_emb)

# LSTM 2 
encoder_lstm_2 = tf.keras.layers.LSTM(n_neurons, return_sequences=True, return_state=True,
                                      dropout=dropout_rate, recurrent_dropout=dropout_rate)
encoder_output_2, state_h2, state_c2 = encoder_lstm_2(encoder_output_1)

# LSTM 3 
encoder_lstm_3 = tf.keras.layers.LSTM(n_neurons, return_sequences=True, return_state=True,
                                      dropout=dropout_rate, recurrent_dropout=dropout_rate)
encoder_outputs, state_h, state_c = encoder_lstm_3(encoder_output_2)

# Set up the decoder
decoder_inputs = tf.keras.layers.Input(shape=[None,])
dec_emb_layer = tf.keras.layers.Embedding(y_voc_size, embedding_dim, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

# LSTM using encoder_states as initial state
decoder_lstm = tf.keras.layers.LSTM(n_neurons, return_sequences=True, return_state=True,
                                    dropout=dropout_rate, recurrent_dropout=dropout_rate)
decoder_output_1, decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# Attention Layer
attn_layer = AttentionLayer(name='attention_layer')
attn_out, attn_states = attn_layer([encoder_outputs, decoder_output_1])
decoder_concat_input = tf.keras.layers.Concatenate(axis=-1, name='concat_layer')([decoder_output_1, attn_out]) # Concat outputs

# Dense layer 1
decoder_concat_input = tf.keras.layers.Dropout(0.5)(decoder_concat_input)
decoder_dense_1 = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(1024, activation='sigmoid'))
decoder_output_2 = decoder_dense_1(decoder_concat_input)

# Dense output
decoder_dense = tf.keras.layers.Dense(y_voc_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_output_2)

# Define the model
model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()

Model: "model_26"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_47 (InputLayer)           [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding_28 (Embedding)        (None, 100, 512)     16526848    input_47[0][0]                   
__________________________________________________________________________________________________
lstm_56 (LSTM)                  [(None, 100, 256), ( 787456      embedding_28[0][0]               
__________________________________________________________________________________________________
input_48 (InputLayer)           [(None, None)]       0                                            
___________________________________________________________________________________________

## Train Model

### Build Target Set

In [0]:
y_train_post = y_train[:,:-1]

y_train_post_2 = y_train.reshape(y_train.shape[0], y_train.shape[1], -1)[:,1:]

y_val_post = y_val[:,:-1]

y_val_post_2 = y_val.reshape(y_val.shape[0], y_val.shape[1], -1)[:,1:]

### Establish Callbacks

In [238]:
# Early Stopping

early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Checkpointing Model Weights

import os

checkpoint_path = 'checkpoints/cp-{epoch:03d}.ckpt'

checkpoint_dir = os.path.dirname(checkpoint_path)

checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, verbose=1,
                                                   save_weights_only=True, period=1)

latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)

latest_checkpoint



'checkpoints/cp-005.ckpt'

### Train Model

In [239]:
batch_size = 256

epochs = 5

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

model.load_weights(latest_checkpoint)

history = model.fit([X_train, y_train_post], y_train_post_2,
                    validation_data=([X_val, y_val_post], y_val_post_2),
                    batch_size=batch_size, epochs=epochs, callbacks=[early_stop, checkpoint_cb])

Epoch 1/5
Epoch 00001: saving model to checkpoints/cp-001.ckpt
Epoch 2/5
Epoch 00002: saving model to checkpoints/cp-002.ckpt
Epoch 3/5
Epoch 00003: saving model to checkpoints/cp-003.ckpt
Epoch 4/5
Epoch 00004: saving model to checkpoints/cp-004.ckpt
Epoch 5/5
Epoch 00005: saving model to checkpoints/cp-005.ckpt


## Save Model and Tokenizers

### Model

In [0]:
drive_path = '/content/drive/'

file_path = 'My Drive/saved_keras_rnns/'

name = 'enc_dec_main_u5.h5'

model.save(drive_path + file_path + name)

### Tokenizers

In [0]:
import pickle

name = 'tok_enc_dec_main_u5.pkl'

pickle.dump((x_tokenizer, y_tokenizer), open(drive_path + file_path + name, 'wb'))

## Load Model and Tokenizers

### Model

In [0]:
drive_path = '/content/drive/'

file_path = 'My Drive/saved_keras_rnns/'

name = 'enc_dec_main.h5'

model = keras.models.load_model(drive_path + file_path + name, custom_objects={'AttentionLayer': AttentionLayer})



### Tokenizers

In [0]:
name = 'tok_enc_dec_main.pkl'

tokenizers_tuple = pickle.load(open(drive_path + file_path + name, 'rb'))

## Get Word Indices

In [0]:
reverse_source_word_index = x_tokenizer.index_word

reverse_target_word_index = y_tokenizer.index_word

target_word_index = y_tokenizer.word_index

## Model

In [0]:
# encoder inference
encoder_model = tf.keras.Model(inputs=encoder_inputs, outputs=[encoder_outputs, state_h, state_c])

# decoder inference
# Below tensors will hold the states of the previous time step
decoder_state_input_h = tf.keras.layers.Input(shape=(n_neurons,))
decoder_state_input_c = tf.keras.layers.Input(shape=(n_neurons,))
decoder_hidden_state_input = tf.keras.layers.Input(shape=(max_len_text, n_neurons))

# Get the embeddings of the decoder sequence
dec_emb_2 = dec_emb_layer(decoder_inputs)

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs_2, state_h2, state_c2 = decoder_lstm(dec_emb_2, initial_state=[decoder_state_input_h, decoder_state_input_c])

# Attention inference
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs_2])
decoder_inf_concat = tf.keras.layers.Concatenate(axis=-1, name='concat')([decoder_outputs_2, attn_out_inf])

# Dense layer
decoder_outputs_3 = decoder_dense_1(decoder_inf_concat)

# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs_4 = decoder_dense(decoder_outputs_3)

# Final decoder model
decoder_model = tf.keras.Model([decoder_inputs] + [decoder_hidden_state_input, decoder_state_input_h, decoder_state_input_c],
                               [decoder_outputs_4] + [state_h2, state_c2])

## Decode Sequence

In [0]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))

    # Chose the 'start' word as the first word of the target sequence
    target_seq[0, 0] = target_word_index['start']

    stop_condition = False

    decoded_sentence = ''

    while not stop_condition:

        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        sampled_token = reverse_target_word_index[sampled_token_index]

        if sampled_token != 'end':

            decoded_sentence += ' ' + sampled_token

            # Exit condition: either hit max length or find stop word.
        if (sampled_token == 'end') or (len(decoded_sentence.split()) >= (max_len_summary-1)):
                
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))

        target_seq[0, 0] = sampled_token_index

        # Update internal states
        e_h, e_c = h, c

    return decoded_sentence

In [0]:
def seq2summary(input_seq):

    new_string = ''

    for i in input_seq:

      if ((i != 0 and i != target_word_index['start']) and i != target_word_index['end']):

        new_string = new_string + reverse_target_word_index[i] + ' '

    return new_string

def seq2text(input_seq):

    new_string = ''

    for i in input_seq:

      if i != 0:

        new_string = new_string + reverse_source_word_index[i] + ' '
        
    return new_string

## Show Predictions

### Tokenize Test Text

In [0]:
test_text = x_tokenizer.texts_to_sequences(test_df['Cleaned_text'])

test_text = pad_sequences(test_text, maxlen=max_len_text, padding='post')

test_sum = y_tokenizer.texts_to_sequences(test_df['Cleaned_summary'])

test_sum = pad_sequences(test_sum, maxlen=max_len_summary, padding='post')

### Prediction Generator

In [245]:
for i in range(len(test_text)):

  print('Review:', seq2text(test_text[i]))

  print('Original summary:', seq2summary(test_sum[i]))

  print('Predicted summary:', decode_sequence(test_text[i].reshape(1, max_len_text)))

  print('\n')

Review: i called multiple occasions find intrest the worker told find website i tried look but i cannot also i stopped attending prism misleading programs when i started impression i working towards xxxx degree fact certificate i constantly try get intouch someone help i xxxx balance payment credit keeps going interest please help 
Original summary: misleading programs student loan help 
Predicted summary:  misleading marketing ad


Review: my mother died xxxxxxxx i inherited xxxx acres family owned along mortgage whitch suspected fradulant loan property worth neer amount money i contacted b of a told thought fradulant fha loan i didnt hear anything fraud department b of a two years got forcloser notice xxxx xxxx collection agency b of after contacting stated going sell property i imeditaly pay amount loan full i conacted congressmens office put contact hud since fha loan this found b of a paid full xxxxxxxx since mortage insurance loan so b of a paid xxxxxxxx still wanting pay loan ag