In [1]:
import io
import os
import re
import shutil
import string
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

from tensorflow import keras
from datetime import datetime

In [2]:
# Load data
# Download the IMDB reviews archive and extract it next to the notebook.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file("aclImdb_v1.tar.gz", url, untar=True, cache_dir='.', cache_subdir='')
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
train_dir = os.path.join(dataset_dir, 'train')

# Remove the 'unsup' folder from the training directory (presumably so that
# only the labelled class folders remain; the loader reports 2 classes).
# Guarded so that re-running this cell does not raise FileNotFoundError when
# the folder has already been deleted.
remove_dir = os.path.join(train_dir, 'unsup')
if os.path.isdir(remove_dir):
  shutil.rmtree(remove_dir)

seed = 123
batch_size = 1024
# Both subsets use the same seed and validation_split so the 80/20 split is
# consistent. Read from train_dir (not a hard-coded relative path) so the
# loader uses the very directory that was cleaned above.
train_ds = tf.keras.preprocessing.text_dataset_from_directory(train_dir, batch_size=batch_size, validation_split=0.2, subset='training', seed=seed)
val_ds = tf.keras.preprocessing.text_dataset_from_directory(train_dir, batch_size=batch_size, validation_split=0.2, subset='validation', seed=seed)

# Cache the datasets and let tf.data choose the prefetch buffer size.
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [3]:
# Preprocessing
# Custom standardization: lowercase the text, then replace the HTML break
# tags '<br />' and every punctuation character with a space.
def custom_standardization(input_data):
  """Return `input_data` lowercased, with '<br />' tags and punctuation replaced by spaces."""
  # Character class matching any single character of string.punctuation.
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)
  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, '<br />', ' ')
  text = tf.strings.regex_replace(text, punctuation_pattern, ' ')
  return text
# Size of the vocabulary and number of tokens in each output sequence.
vocab_size = 10000
sequence_length = 100

# The text vectorization layer normalizes, splits, and maps strings to
# integers, using the custom standardization function defined in this
# notebook. A fixed output sequence length is set because the samples
# are not all of the same length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length,
)

# Build the vocabulary from the training text only: drop the labels,
# then adapt the layer on the resulting text-only dataset.
text_ds = train_ds.map(lambda text, label: text)
vectorize_layer.adapt(text_ds)

In [20]:
# Dimensionality of the learned word embeddings.
embedding_dim = 16

# Generate a model: raw strings are vectorized into token ids, embedded,
# pooled into a single vector per sample and classified.
model = keras.Sequential([
    vectorize_layer,
    keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        name="embedding",
    ),
    # To deal with the fact that sentences do not have the same length.
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(units=16, activation='relu'),
    keras.layers.Dense(units=1, activation='sigmoid'),
])

In [21]:
# Configure training: Adam optimizer, binary cross-entropy loss, and binary
# accuracy as the reported metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['binary_accuracy'],
)

In [22]:
# Train the model. train_ds is a tf.data.Dataset that is already batched when
# it is created, so batch_size must not be passed to fit() as well: Keras
# documents that it should not be specified for dataset inputs.
# The History object is kept so the training curves can be inspected later.
history = model.fit(train_ds, epochs=30, validation_data=val_ds)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40


KeyboardInterrupt: ignored