In [1]:
# imports
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras import callbacks
from tensorflow.keras import utils
# Fix the global random seed (27) once, before any data or model code runs,
# so that repeated runs of the notebook are comparable.
utils.set_random_seed(27)

In [2]:
# loading data
# Fetch the 'train' and 'test' splits of the IMDB reviews dataset as
# (text, label) pairs, together with the dataset metadata object (`info`).
(train_ds, test_ds), info = tfds.load(
    name='imdb_reviews',
    split=['train', 'test'],
    shuffle_files=True,
    as_supervised=True,
    with_info=True,
)

In [3]:
# looking at data
# Print the first five (text, label) examples of the training split.
for text, label in train_ds.take(5).as_numpy_iterator():
  print('Text:', text, '\n', 'Label:', label, '\n')

Text: b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it." 
 Label: 0 

Text: b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish.

In [4]:
# getting size of train, test sets
# Example counts are read from the dataset metadata, not by iterating the data.
train_size = info.splits['train'].num_examples
test_size = info.splits['test'].num_examples
print('Train set size:', train_size)
print('Test set size:', test_size)

Train set size: 25000
Test set size: 25000


In [5]:
# getting dtype of train, test sets
# Show the TensorFlow dtype that the metadata records for each feature.
for title, key in (('Text', 'text'), ('Label', 'label')):
  print(title, info.features[key].tf_dtype)

Text <dtype: 'string'>
Label <dtype: 'int64'>


In [6]:
# getting info of target feature
# Look the label feature up once and report its class count and class names.
label_feature = info.features['label']
print('Number of classes:', label_feature.num_classes)
print('Class names:', label_feature.names)

Number of classes: 2
Class names: ['neg', 'pos']


In [7]:
# text vectorization
# Lower-case, strip punctuation, split on whitespace, and map every review to
# exactly 100 integer token ids. The vocabulary is capped at 1000 + 2 entries
# (the +2 presumably covers the reserved padding and out-of-vocabulary ids).
Vectorizer = layers.TextVectorization(
    max_tokens=1000+2,
    pad_to_max_tokens=True,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    output_mode='int',
    output_sequence_length=100,
)
# The vocabulary is learned from the training text only; labels are dropped.
reviews = train_ds.map(lambda review, _label: review)
Vectorizer.adapt(reviews)
# Preview: the token ids produced for the first training review.
Vectorizer(reviews.take(1).get_single_element())

<tf.Tensor: shape=(100,), dtype=int64, numpy=
array([ 11,  14,  34, 412, 384,  18,  90,  28,   1,   8,  33,   1,   1,
        42, 487,   1, 191,  24,  85, 152,  19,  11, 217, 316,  28,  65,
       240, 214,   8, 489,  54,  65,  85, 112,  96,  22,   1,  11,  93,
       642, 743,  11,  18,   7,  34, 394,   1, 170,   1, 408,   2,  88,
         1, 137,  66, 144,  51,   2,   1,   1,  66, 245,  65,   1,  16,
         1,   1,   1,   1,   1,   1,   3,  40,   1,   1,  17,   1,  14,
       158,  19,   4,   1, 891,   1,   8,   4,  18,  12,  14,   1,   5,
        99, 146,   1,  10, 237, 704,  12,  48,  24])>

In [8]:
# building input pipeline
# Order matters: cache the raw examples FIRST, then shuffle and batch.
# Placing cache() after shuffle()/batch() stores the first epoch's shuffled
# batches, so every later epoch would replay exactly the same batches in the
# same order. With cache() first, shuffle() draws a fresh order each epoch.
# The shuffle buffer spans the whole (still unbatched) training set.
train_ds = (
    train_ds
    .cache()
    .shuffle(len(train_ds))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)
# The test split is only batched and prefetched; its order is left untouched.
test_ds = test_ds.batch(32).prefetch(tf.data.AUTOTUNE)

In [9]:
# model building
# End-to-end binary classifier: raw review string -> token ids -> embedding ->
# two stacked LSTMs (each followed by batch normalization) -> dense head with
# a single sigmoid output.
model = Sequential()
model.add(layers.Input(shape=(1,), dtype=tf.string))
# Adapted TextVectorization layer: emits 100 token ids per review.
model.add(Vectorizer)
model.add(layers.Embedding(input_dim=Vectorizer.vocabulary_size(), output_dim=2, input_length=100))
# No `input_shape` here: this LSTM is not the first layer, so its input shape
# is inferred from the embedding output. The previous value (32, 100, 1) also
# wrongly included the batch size and a feature size of 1 instead of 2.
model.add(layers.LSTM(units=32, activation='tanh', return_sequences=True, dropout=0.4))
model.add(layers.BatchNormalization())
# Second LSTM returns only its final state, collapsing the sequence axis.
model.add(layers.LSTM(units=16, activation='tanh', dropout=0.4))
model.add(layers.BatchNormalization())
model.add(layers.Dense(units=8, activation='relu'))
model.add(layers.Dense(units=1, activation='sigmoid'))

In [10]:
# model summary
# Print the layer-by-layer table of output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 100)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 100, 2)            2004      
                                                                 
 lstm (LSTM)                 (None, 100, 32)           4480      
                                                                 
 batch_normalization (BatchN  (None, 100, 32)          128       
 ormalization)                                                   
                                                                 
 lstm_1 (LSTM)               (None, 16)                3136      
                                                                 
 batch_normalization_1 (Batc  (None, 16)               6

In [11]:
# model config
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is
# tracked as the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [12]:
# model training
# Single pass over the training data (epochs=1 is the Keras default, spelled
# out here so the training budget is visible). No validation data is used.
history = model.fit(
    train_ds,
    epochs=1,
    verbose=1,
)



In [13]:
# model evaluation
# `scores` holds the loss first and the compiled metric (accuracy) second;
# the accuracy is reported as a percentage rounded to two decimals.
scores = model.evaluate(test_ds, verbose=0)
accuracy_pct = round(scores[1] * 100, 2)
print('Accuracy:', accuracy_pct)

Accuracy: 69.7
