In [1]:
##############################
#Preparing movie review data #
##############################

In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd

# Load the movie-review table from a local CSV file (read as UTF-8).
# The dataset-creation step further down reads a 'sentiment' column from it.
df = pd.read_csv('movie_data.csv', encoding='utf-8')

In [3]:
#Before feeding the data into the RNN - need to apply preprocessing steps
# 1) Create a TF dataset object - split into training, test, and validation partitions
# 2) Identify the unique words in the training dataset
# 3) Map each unique word to a unique integer and encode the review text into encoded integers
# 4) Divide the dataset into mini-batches as input to the model

In [2]:
#Step 1) Creating a tensorflow dataset

# Select the labels and the remaining column(s) WITHOUT mutating `df`.
# (df.pop('sentiment') removed the column in place, so re-running this cell
# raised KeyError; selecting/dropping keeps the cell idempotent.)
target = df['sentiment']
ds_raw = tf.data.Dataset.from_tensor_slices(
    (df.drop(columns='sentiment').values, target.values))

## Inspection: first 50 characters of the text and the label of 3 examples
for ex in ds_raw.take(3):
    tf.print(ex[0].numpy()[0][:50], ex[1])

b'In 1974, the teenager Martha Moxley (Maggie Grace)' 1
b'OK... so... I really like Kris Kristofferson and h' 0
b'***SPOILER*** Do not read this, if you think about' 0


In [3]:
# Split the 50,000 samples: 25,000 for testing, 20,000 for training, 5,000 for validation.
# reshuffle_each_iteration=False keeps one fixed shuffled order, so the
# take()/skip() partitions below do not change between iterations.
tf.random.set_seed(1)
ds_raw = ds_raw.shuffle(
    buffer_size=50000, reshuffle_each_iteration=False)

# Everything after the first 25,000 shuffled samples is train + validation
ds_raw_train_valid = ds_raw.skip(25000)
ds_raw_test = ds_raw.take(25000)

# Of that remainder: first 20,000 -> training, the rest -> validation
ds_raw_valid = ds_raw_train_valid.skip(20000)
ds_raw_train = ds_raw_train_valid.take(20000)


In [4]:
# Step 2: find unique tokens (words)

from collections import Counter

# Count token occurrences over the training split only
token_counts = Counter()
tokenizer = tfds.features.text.Tokenizer()

for example in ds_raw_train:
    # example[0] is the feature tensor; its first entry is the review text
    token_counts.update(tokenizer.tokenize(example[0].numpy()[0]))

print('Vocab-size:', len(token_counts))


Vocab-size: 87007


In [5]:
# Step 3: encoding unique tokens to integers
# Only the words (the Counter's keys) are used by the encoder; the counts are ignored
encoder = tfds.features.text.TokenTextEncoder(token_counts)

# Quick sanity check on a short sentence
example_str = 'This is an example!'
print(encoder.encode(example_str))


[232, 9, 270, 1123]


In [6]:
# Next step is to use the .map() method to transform the text using the encoder but
# first need to do the below

# Step 3-A: Define the function for transformation
# Because the text data is enclosed in a tensor object, we can't pass the encoder to .map() directly
# We need to create two functions

# The first function assumes eager execution, so it can call .numpy() on the input tensor
# The .numpy() method reads the text data held in the tensor object

def encode(text_tensor, label):
    """Encode one review with the module-level `encoder`.

    Reads the text from the first entry of `text_tensor` via .numpy() (so it
    must run eagerly) and returns the pair (encoded text, label); the label is
    passed through unchanged.
    """
    raw_text = text_tensor.numpy()[0]
    return encoder.encode(raw_text), label

In [7]:
# Function 2) A wrapper for the first function using tf.py_function - convert it to a TF operator
# .map() method can then be used

#Step 3-B: Wrap the encode function to a TF op.

def encode_map_fn(text, label):
    """Wrap `encode` in tf.py_function so it can be passed to Dataset.map().

    Both outputs (encoded text and label) are declared as tf.int64.
    """
    output_types = (tf.int64, tf.int64)
    return tf.py_function(func=encode, inp=[text, label], Tout=output_types)

# Apply the wrapped encoder to every split, turning the review text into token ids
ds_train, ds_valid, ds_test = (
    split.map(encode_map_fn)
    for split in (ds_raw_train, ds_raw_valid, ds_raw_test))

# Peek at the shapes of a few encoded training examples:
tf.random.set_seed(1)

for example in ds_train.shuffle(buffer_size=1000).take(5):
    print('Sequence length:', example[0].shape)

Sequence length: (24,)
Sequence length: (179,)
Sequence length: (262,)
Sequence length: (535,)
Sequence length: (130,)


In [8]:
# An issue that remains after converting the sequences of words into sequences of integers is
# that they have different lengths - RNNs are more efficient if they are the same length

#Can use padded_batch() to combine different sizes sequences into mini-batches
# takes the maximum (largest) sequence and pad the smaller ones to be the same length (using 0s)

# Work with a small subset of 8 encoded reviews
ds_subset = ds_train.take(count=8)

for example in ds_subset:
    print('Individual size:', example[0].shape)

# The printed shapes show that all 8 samples have different lengths

Individual size: (119,)
Individual size: (688,)
Individual size: (308,)
Individual size: (204,)
Individual size: (326,)
Individual size: (240,)
Individual size: (127,)
Individual size: (453,)


In [9]:
# Group the subset into mini-batches of 4; within each batch the sequences
# are padded up to the longest one ([-1] = pad the sequence axis as needed)
ds_batched = ds_subset.padded_batch(batch_size=4, padded_shapes=([-1], []))

for batch in ds_batched:
    print('Batch dimension:', batch[0].shape)
    
#Here we can see the 2 batches of 4 samples each, with 688 and 453 columns (batch[0].shape[1])
# 688 is the longest sequence among the first 4 samples, so the others in that batch are padded to match
# the same holds for 453 in the second batch

Batch dimension: (4, 688)
Batch dimension: (4, 453)


In [10]:
# Apply the same padded batching to all samples of every split, with a batch size of 32
train_data, valid_data, test_data = (
    split.padded_batch(32, padded_shapes=([-1], []))
    for split in (ds_train, ds_valid, ds_test))

#Now the data is in a suitable format for the RNN model
# First we will go over feature embedding which is optional but recommended 
# for reducing dimensionality of the word vectors

In [None]:
##########################################
# Embedding layers for sentence encoding #
##########################################

In [11]:
from tensorflow.keras.layers import Embedding

# Minimal example: a model consisting of a single embedding layer
model = tf.keras.Sequential([
    Embedding(
        input_dim=100,    # unique integer values that the model will receive as input
        output_dim=6,     # output size of the embedding feature
        input_length=20,  # length of the sequence inputs - can be set to None for variable
        name='embed-layer'),
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embed-layer (Embedding)      (None, 20, 6)             600       
Total params: 600
Trainable params: 600
Non-trainable params: 0
_________________________________________________________________


In [None]:
#########################
# Building an RNN Model #
#########################

In [12]:
# We begin by building an RNN model, starting with an embedding layer with input_dim = 1000 and output_dim = 32.
# Then two recurrent layers of SimpleRNN (regular, fully connected recurrent layer - not gated or LSTM)
# Lastly, a non-recurrent dense layer as the output layer with a single output value as the prediction

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import Dense

# Embedding -> two stacked SimpleRNN layers -> single-unit dense output.
# The first recurrent layer returns the full sequence so the second one
# receives one input per time step.
model = Sequential([
    Embedding(input_dim=1000, output_dim=32),
    SimpleRNN(32, return_sequences=True),
    SimpleRNN(32),
    Dense(1),
])

model.summary()



#The above is just an example of how to build an RNN model
# Below we will build one for the sentiment analysis

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          32000     
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, None, 32)          2080      
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 32)                2080      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 36,193
Trainable params: 36,193
Non-trainable params: 0
_________________________________________________________________


In [None]:
################################################
# Building an RNN model for sentiment analysis #
################################################

In [None]:
# We have long sequences (input data length) so we are going to use a LSTM layer
# to account for long-term effects - we will also put the LSTM layer inside
# a Bidirectional wrapper - makes the recurrent layers pass through the input sequences
# from both directions - start to end as well as the reverse

In [13]:
# Size of each learned word vector
embedding_dim = 20
# Number of distinct training tokens plus two extra ids
# NOTE(review): presumably the extras cover the padding value 0 and the
# encoder's out-of-vocabulary id - confirm against TokenTextEncoder
vocab_size = len(token_counts) + 2

# Fix the TensorFlow random seed before the model is created
tf.random.set_seed(1)

# Build the model
bi_lstm_model = tf.keras.Sequential([ #Start with the embedding - input_dim is the vocabulary size (unique tokens + 2)
    tf.keras.layers.Embedding(
        input_dim = vocab_size,
        output_dim = embedding_dim,
        name = 'embed-layer'),
    
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, name = 'lstm-layer'), # LSTM with 64 units, wrapped to run in both directions
        name = 'bidir-lstm'),
    
    tf.keras.layers.Dense(64, activation = 'relu'), # first dense layer - relu activation - feature size = 64
    
    tf.keras.layers.Dense(1, activation = 'sigmoid') # output layer - a single sigmoid unit
])

bi_lstm_model.summary()

## Compile and train:
bi_lstm_model.compile(
    optimizer = tf.keras.optimizers.Adam(1e-3),
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=False), 
    metrics = ['accuracy'])
# Since the last layer uses a sigmoid activation, the outputs are already probabilities - hence from_logits=False

history = bi_lstm_model.fit( # train on the training batches, validating after each epoch
    train_data,
    validation_data = valid_data,
    epochs = 10)

## Evaluate on the test data

test_results = bi_lstm_model.evaluate(test_data)

# test_results[0] is the loss; test_results[1] is the 'accuracy' metric requested in compile()
print('Test acc.: {:.2f}%'.format(test_results[1]*100))


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embed-layer (Embedding)      (None, None, 20)          1740180   
_________________________________________________________________
bidir-lstm (Bidirectional)   (None, 128)               43520     
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 1,792,021
Trainable params: 1,792,021
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test acc.: 80.81%


In [17]:
#Creating a preprocessing wrapper that only takes the last portion of each review
# doing this so we can use SimpleRNN, which fails on the task above because the sequence lengths are too long
# The helper function below, preprocess_datasets(), combines preprocessing steps 2-4

from collections import Counter

def preprocess_datasets(
    ds_raw_train,
    ds_raw_valid,
    ds_raw_test,
    max_seq_length = None, #optional - how many tokens from each review - if 100, then only the last 100 tokens
    batch_size = 32):
    """Tokenize, integer-encode and batch the raw train/valid/test datasets.

    Combines preprocessing steps 2-4: the vocabulary is built from the
    training split only, every split is encoded with that vocabulary, and the
    encoded sequences are grouped into padded mini-batches.

    Args:
        ds_raw_train, ds_raw_valid, ds_raw_test: datasets of (features, label)
            pairs where ``features.numpy()[0]`` is the review text.
        max_seq_length: if given, only the last ``max_seq_length`` tokens of
            each review are kept (both for counting the vocabulary and in the
            encoded output). ``None`` keeps the whole review.
        batch_size: number of examples per padded batch.

    Returns:
        Tuple ``(train_data, valid_data, test_data, n_tokens)`` where
        ``n_tokens`` is the number of unique tokens counted in the training
        split.

    Raises:
        ValueError: if ``max_seq_length`` is given but smaller than 1.
    """
    # Reject non-positive lengths up front: seq[-0:] returns the WHOLE sequence
    # and seq[-(-k):] drops the first k tokens, so such values would silently
    # do something other than "keep the last N tokens".
    if max_seq_length is not None and max_seq_length < 1:
        raise ValueError(
            'max_seq_length must be None or a positive integer; got {!r}'
            .format(max_seq_length))

    def keep_tail(sequence):
        # Keep only the last max_seq_length items (no-op when no limit is set)
        if max_seq_length is None:
            return sequence
        return sequence[-max_seq_length:]

    ##(step 1 is already done)
    ## Step 2: find unique tokens (training split only)
    tokenizer = tfds.features.text.Tokenizer()
    token_counts = Counter()

    for example in ds_raw_train:
        tokens = tokenizer.tokenize(example[0].numpy()[0])
        token_counts.update(keep_tail(tokens))

    print('Vocab-size:', len(token_counts))

    #Step 3: encoding the texts
    encoder = tfds.features.text.TokenTextEncoder(
        token_counts)

    def encode(text_tensor, label):
        # Runs eagerly inside tf.py_function, so .numpy() is available here
        text = text_tensor.numpy()[0]
        return keep_tail(encoder.encode(text)), label

    def encode_map_fn(text, label):
        # Expose the Python encoder as a TF op so it can be used with .map()
        return tf.py_function(encode, inp=[text, label],
                              Tout=(tf.int64, tf.int64))

    ds_train = ds_raw_train.map(encode_map_fn)
    ds_valid = ds_raw_valid.map(encode_map_fn)
    ds_test = ds_raw_test.map(encode_map_fn)

    #Step 4: batching the datasets (sequences padded to the longest in each batch)
    train_data = ds_train.padded_batch(
        batch_size, padded_shapes = ([-1], []))

    valid_data = ds_valid.padded_batch(
        batch_size, padded_shapes = ([-1], []))

    test_data = ds_test.padded_batch(
        batch_size, padded_shapes = ([-1], []))

    return (train_data, valid_data, test_data,
            len(token_counts))
    
    
    


In [15]:
#Now another helper function, build_rnn_model(), for building models with different architecture
# more easily

from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import GRU

def build_rnn_model(embedding_dim, vocab_size,
                   recurrent_type = 'SimpleRNN',
                   n_recurrent_units = 64,
                   n_recurrent_layers = 1,
                   bidirectional = True):
    """Build an uncompiled Keras Sequential model for binary classification.

    Architecture: Embedding -> ``n_recurrent_layers`` recurrent layers
    (optionally wrapped in Bidirectional) -> Dense(64, relu) ->
    Dense(1, sigmoid).

    Args:
        embedding_dim: output size of the embedding layer.
        vocab_size: input_dim of the embedding layer.
        recurrent_type: one of 'SimpleRNN', 'LSTM' or 'GRU'.
        n_recurrent_units: number of units in every recurrent layer.
        n_recurrent_layers: how many recurrent layers to stack.
        bidirectional: wrap each recurrent layer in Bidirectional if True.

    Returns:
        The assembled (not yet compiled) tf.keras.Sequential model.

    Raises:
        ValueError: if ``recurrent_type`` is not a supported layer type.
    """
    # Validate before doing anything else. Previously an unknown type fell
    # through the if/elif chain and failed with an UnboundLocalError on
    # `recurrent_layer` instead of a meaningful message.
    supported_types = ('SimpleRNN', 'LSTM', 'GRU')
    if recurrent_type not in supported_types:
        raise ValueError(
            'recurrent_type must be one of {}; got {!r}'.format(
                supported_types, recurrent_type))

    tf.random.set_seed(1)

    # (layer class, layer-name template) for each supported type.
    # The name templates are kept verbatim so existing layer names do not change.
    layer_specs = {
        'SimpleRNN': (SimpleRNN, 'simprnn-layer-{}'),
        'LSTM': (LSTM, 'lstm-layer-{}'),
        'GRU': (GRU, 'gru-rnn_layer{}'),
    }
    layer_cls, name_template = layer_specs[recurrent_type]

    #build the model
    model = tf.keras.Sequential()

    model.add(Embedding(
        input_dim = vocab_size,
        output_dim= embedding_dim,
        name = 'embed-layer'))

    for i in range(n_recurrent_layers):
        # Every layer except the last must emit the full sequence so that
        # the next recurrent layer receives one input per time step.
        return_sequences = (i < n_recurrent_layers-1)

        recurrent_layer = layer_cls(
            units = n_recurrent_units,
            return_sequences=return_sequences,
            name = name_template.format(i))

        if bidirectional:
            recurrent_layer = Bidirectional(
                recurrent_layer, name = 'bidir-'+
                recurrent_layer.name)

        model.add(recurrent_layer)

    model.add(tf.keras.layers.Dense(64, activation = 'relu'))
    model.add(tf.keras.layers.Dense(1, activation = 'sigmoid'))

    return model

In [18]:
#With the above simple functions - it is possible to compare different RNN models with
# different input sequence lengths

#Below is an example where the maximum length is set at 100 tokens

# Settings for this run: keep only the last 100 tokens of every review
batch_size = 32
embedding_dim = 20
max_seq_length = 100

# Rebuild the batched datasets with truncated sequences; n is the number of unique tokens
train_data, valid_data, test_data, n = preprocess_datasets(
    ds_raw_train,
    ds_raw_valid,
    ds_raw_test,
    max_seq_length=max_seq_length,
    batch_size=batch_size,
)

vocab_size = n + 2

# One bidirectional SimpleRNN layer with 64 units
rnn_model = build_rnn_model(
    embedding_dim=embedding_dim,
    vocab_size=vocab_size,
    recurrent_type='SimpleRNN',
    n_recurrent_units=64,
    n_recurrent_layers=1,
    bidirectional=True,
)

rnn_model.summary()

Vocab-size: 58063
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embed-layer (Embedding)      (None, None, 20)          1161300   
_________________________________________________________________
bidir-simprnn-layer-0 (Bidir (None, 128)               10880     
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 1,180,501
Trainable params: 1,180,501
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Binary cross-entropy on probabilities (the model ends in a sigmoid unit), Adam with lr = 1e-3
rnn_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy'],
)

# Train for 10 epochs, validating on the validation batches after each epoch
history = rnn_model.fit(
    x=train_data,
    validation_data=valid_data,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
