In [3]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [4]:
# Data loading: pull the `imdb_reviews` dataset from TensorFlow Datasets for
# text classification.
# as_supervised=True makes each example a (text, label) pair; with_info=True
# additionally returns the dataset metadata object.
dataset, info = tfds.load(
    'imdb_reviews',
    with_info=True,
    as_supervised=True,
)
train_dataset = dataset['train']
test_dataset = dataset['test']

In [5]:
# Input pipeline settings.
BUFFER_SIZE = 10000  # size of the shuffle buffer used for the training split
BATCH_SIZE = 100     # number of examples per batch for both splits

# The training split is shuffled, then batched; the test split is only batched.
# Both pipelines end with prefetch(tf.data.AUTOTUNE).
# NOTE: both names are rebound to the batched pipelines, so running this cell a
# second time without re-running the loading cell would batch the batches.
train_dataset = (
    train_dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = (
    test_dataset
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [6]:
# Preprocessing
# TFDS yields raw text, so it has to be vectorized before it can be fed to
# the model.

# TextVectorization layer whose vocabulary is capped at VOCAB_SIZE tokens.
VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)

# Learn the vocabulary from the training text only; the labels are dropped.
encoder.adapt(train_dataset.map(lambda review, _label: review))
# The encoder can now turn text into sequences of integer token indices.
# Words outside the learned vocabulary are replaced with the layer's
# out-of-vocabulary token.

In [7]:
# Model development
#
# input -> encoder -> embedding -> bidirectional LSTM -> dense -> dense (1 output)

model = tf.keras.Sequential()

# Raw strings -> integer token indices.
model.add(encoder)

# Token indices -> 64-dimensional vectors. The embedding table has one row per
# vocabulary entry; mask_zero=True enables masking so the variable sequence
# lengths are handled.
model.add(
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        mask_zero=True,
    )
)

# Bidirectional wrapper around a 64-unit LSTM.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

# Dense head: a 64-unit ReLU layer followed by a single-unit output layer
# with no activation.
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(1))

# Report, layer by layer, whether each layer supports masking.
print([lyr.supports_masking for lyr in model.layers])

[False, True, True, True, True]


In [8]:
# Compile and train.
# The loss is built with from_logits=True, i.e. it is applied to the model's
# raw (un-activated) output.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# Train for 10 epochs, validating against test_dataset; validation_steps=30
# limits each validation pass to 30 batches.
history = model.fit(
    train_dataset,
    epochs=10,
    validation_data=test_dataset,
    validation_steps=30,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
import numpy as np

# Run one hand-written review through the trained model.
sample_text = ('This was a great movie. Nolan is a really good director')
predictions = model.predict(np.array([sample_text]))

# The model's final Dense(1) layer has no activation and the loss was compiled
# with from_logits=True, so `predictions` holds raw logits, not probabilities.
# A logit of 0.0 corresponds to a probability of 0.5, so the decision boundary
# is 0.0: print 1 for a positive logit, 0 otherwise.
if predictions[0][0] > 0.0:
    print(1)
else:
    print(0)

1
