In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Bidirectional, Dense
import tensorflow_datasets as tfds

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_csv('../datasets/Movie_data.csv', encoding='utf-8')
df.head(2)

Unnamed: 0,review,sentiment
0,at a Saturday matinee in my home town. I went ...,0
1,I love this movie. It is the first film Master...,1


- Creating dataset.

In [3]:
target = df.pop('sentiment')
data_raw = tf.data.Dataset.from_tensor_slices((df.values, target.values))

In [4]:
for sample in data_raw.take(3):
    tf.print(sample[0].numpy()[0][:70], sample[1])

b'at a Saturday matinee in my home town. I went with an older friend (he' 0
b'I love this movie. It is the first film Master P have ever done. It is' 1
b'In the voice over which begins the film, Hughie(Billy Connolly), a roa' 1


In [5]:
tf.random.set_seed(1)

data_raw = data_raw.shuffle(50000, reshuffle_each_iteration=False)

ds_raw_test = data_raw.take(25000)
ds_train_valid = data_raw.skip(25000)
ds_raw_train = ds_train_valid.take(20000)
ds_raw_valid = ds_train_valid.skip(20000)

- Finding unique words in training set.

In [6]:
from collections import Counter

tokenizer = tfds.deprecated.text.Tokenizer()

token_counts = Counter()

In [7]:
for sample in ds_raw_train:
    tokens = tokenizer.tokenize(sample[0].numpy()[0])
    token_counts.update(tokens)
    
print('Vocab size: {}'.format(len(token_counts)))

Vocab size: 86963


- Encode the unique words into integers.

In [8]:
encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)
sample_str = 'This is an example string :))'
print(encoder.encode(sample_str))

[8, 9, 169, 995, 2725]


- Convenient functions.

In [9]:
def encode(text_tensor, label):
    text = text_tensor.numpy()[0]
    encoded_text = encoder.encode(text)
    return encoded_text, label

def encode_map_fn(text, label):
    return tf.py_function(encode, inp=[text, label], Tout=[tf.int64, tf.int64])

In [10]:
ds_train = ds_raw_train.map(encode_map_fn)
ds_valid = ds_raw_valid.map(encode_map_fn)
ds_test  = ds_raw_test.map(encode_map_fn)

In [11]:
for sample in ds_train.shuffle(1000).take(5):
    print('Sequence length: {}'.format(sample[0].shape))

Sequence length: (240,)
Sequence length: (109,)
Sequence length: (203,)
Sequence length: (134,)
Sequence length: (117,)


- Unifying the shape. (Although RNNs can handle that.)

In [12]:
take = ds_train.take(8)
for sample in take:
    print("Individual shape: ", sample[0].shape)

Individual shape:  (152,)
Individual shape:  (77,)
Individual shape:  (178,)
Individual shape:  (130,)
Individual shape:  (172,)
Individual shape:  (164,)
Individual shape:  (100,)
Individual shape:  (112,)


In [13]:
take_batched = take.padded_batch(batch_size=4, padded_shapes=([-1], []))
for batch in take_batched:
    print('Batch dimension', batch[0].shape)

Batch dimension (4, 178)
Batch dimension (4, 172)


In [14]:
train_data = ds_train.padded_batch(batch_size=32, padded_shapes=([-1], []))
valid_data = ds_valid.padded_batch(batch_size=32, padded_shapes=([-1], []))
test_data  = ds_test.padded_batch(batch_size=32, padded_shapes=([-1], []))

- Embedding layers for sentence encoding

In [15]:
model = tf.keras.Sequential([
    Embedding(input_dim=100,
              output_dim=6,
              input_length=20,
              name='embed_layer')
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embed_layer (Embedding)     (None, 20, 6)             600       
                                                                 
Total params: 600 (2.34 KB)
Trainable params: 600 (2.34 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### Building RNN.
- Quite straightforward.

In [16]:
model = Sequential()
model.add(Embedding(input_dim=1000, output_dim=32))
model.add(SimpleRNN(32, return_sequences=True))
model.add(SimpleRNN(32))
model.add(Dense(1))


model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          32000     
                                                                 
 simple_rnn (SimpleRNN)      (None, None, 32)          2080      
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 32)                2080      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 36193 (141.38 KB)
Trainable params: 36193 (141.38 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


#### RNN for Sentiment Analysis.
- Since there are very long sequences, using an `LSTM layer` will account for
long-term effects. In addition, putting the `LSTM layer` inside a `Bidirectional
wrapper`, makes the recurrent layers pass through the input sequences
from both directions, start to end, as well as the reverse direction

In [17]:
embedding_dim = 20
vocab_size = len(token_counts) + 2

lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=vocab_size,
                         output_dim=embedding_dim,
                         name='embed_layer'))
lstm_model.add(Bidirectional(
    tf.keras.layers.LSTM(64, name='lstm_layer'),
    name='bidire_layer'
))

lstm_model.add(Dense(64, activation='relu'))
lstm_model.add(Dense(1, activation='sigmoid'))

In [18]:
lstm_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embed_layer (Embedding)     (None, None, 20)          1739300   
                                                                 
 bidire_layer (Bidirectiona  (None, 128)               43520     
 l)                                                              
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1791141 (6.83 MB)
Trainable params: 1791141 (6.83 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [19]:
lstm_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-1),
                   loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                   metrics=['accuracy']
    )

In [20]:
history = lstm_model.fit(train_data, 
                         validation_data=valid_data,
                         epochs=10)

Epoch 1/10
  3/625 [..............................] - ETA: 8:55 - loss: 2.0222 - accuracy: 0.5729

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

KeyboardInterrupt: 

In [None]:
test_results = lstm_model.evaluate(test_data)
print('Test Acc: {:.4f}%'.format(test_results[1] * 100))

In [None]:
def preprocess_datasets(ds_raw_train, ds_raw_valid, ds_raw_test,
                        max_seq_length=None, batch_size=32):
    ## (step 1 is already done)
    # ## Step 2: find unique tokens
    
    tokenizer = tfds.features.text.Tokenizer()
    token_counts = Counter()
    
    for example in ds_raw_train:
        tokens = tokenizer.tokenize(example[0].numpy()[0])
        
        if max_seq_length is not None:
            tokens = tokens[-max_seq_length:]
            token_counts.update(tokens)
        
        print('Vocab-size:', len(token_counts))
            
        ## Step 3: encoding the texts
        encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)
            
        def encode(text_tensor, label):
            text = text_tensor.numpy()[0]
            encoded_text = encoder.encode(text)
            
            if max_seq_length is not None:
                encoded_text = encoded_text[-max_seq_length:]
                
            return encoded_text, label
        
        def encode_map_fn(text, label):
            return tf.py_function(encode, inp=[text, label],Tout=(tf.int64, tf.int64))
        
        
        ds_train = ds_raw_train.map(encode_map_fn)
        ds_valid = ds_raw_valid.map(encode_map_fn)
        ds_test = ds_raw_test.map(encode_map_fn)
        
        ## Step 4: batching the datasets
        train_data = ds_train.padded_batch(batch_size, padded_shapes=([-1],[]))
        valid_data = ds_valid.padded_batch(batch_size, padded_shapes=([-1],[]))
        test_data = ds_test.padded_batch(batch_size, padded_shapes=([-1],[]))
    
    return (train_data, valid_data, test_data, len(token_counts))

In [None]:
from tensorflow.keras.layers import GRU, LSTM

def build_rnn_model(embedding_dim, vocab_size,
                    recurrent_type='SimpleRNN',
                    n_recurrent_units=64,
                    n_recurrent_layers=1,
                    bidirectional=True):
    tf.random.set_seed(1)
    
    # build the model
    model = tf.keras.Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, 
                        name='embed-layer'))
    
    for i in range(n_recurrent_layers):
        return_sequences = (i < n_recurrent_layers-1)
        
        if recurrent_type == 'SimpleRNN':
            recurrent_layer = SimpleRNN(units=n_recurrent_units,
                                        return_sequences=return_sequences,
                                        name='simprnn-layer-{}'.format(i))    
        elif recurrent_type == 'LSTM':
            recurrent_layer = LSTM(units=n_recurrent_units,
                                   return_sequences=return_sequences,
                                   name='lstm-layer-{}'.format(i))
        elif recurrent_type == 'GRU':
            recurrent_layer = GRU(units=n_recurrent_units,
                                  return_sequences=return_sequences,
                                  name='gru-layer-{}'.format(i))
            
        if bidirectional:
            recurrent_layer = Bidirectional(recurrent_layer, name='bidir-' + recurrent_layer.name)
    
        model.add(recurrent_layer)
        
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    return model

In [None]:
batch_size = 32
embedding_dim = 20

max_seq_length = 100
train_data, valid_data, test_data, n = preprocess_datasets(ds_raw_train, ds_raw_valid, 
                                                           ds_raw_test,max_seq_length=max_seq_length,
                                                           batch_size=batch_size)

vocab_size = n + 2
rnn_model = build_rnn_model(embedding_dim, vocab_size,recurrent_type='SimpleRNN',
                            n_recurrent_units=64, n_recurrent_layers=1, bidirectional=True)

rnn_model.summary()

In [None]:
results = rnn_model.evaluate(test_data)
print('Test Acc.: {:.2f}%'.format(results[1]*100))