In [12]:
import numpy as np
import re
import string
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [2]:
# !curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# !tar -xf aclImdb_v1.tar.gz

In [3]:
data_path = r'./Datasets/aclImdb/'
batch_size = 32

In [4]:
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(f'{data_path}train', 
                                                                  validation_split=0.2, 
                                                                  subset='training',
                                                                  seed=42)
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(f'{data_path}train', 
                                                                validation_split=0.2, 
                                                                subset='validation',
                                                                seed=42)
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(f'{data_path}test')

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [5]:
print(f'Number of batches: {tf.data.experimental.cardinality(raw_train_ds).numpy(), tf.data.experimental.cardinality(raw_val_ds).numpy(), tf.data.experimental.cardinality(raw_test_ds).numpy()}')

Number of batches: (625, 157, 782)


In [6]:
def custom_standardization(input_data):
    lower_case = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lower_case, '<br />', ' ')
    return tf.strings.regex_replace(stripped_html, '%s' % re.escape(string.punctuation), '')

In [7]:
max_features = 20000
embedding_dim = 128
sequence_length = 500

In [8]:
vectorization_layer = TextVectorization(standardize=custom_standardization,
                                        max_tokens = max_features,
                                        output_mode = 'int',
                                        output_sequence_length=sequence_length)

In [9]:
text_ds = raw_train_ds.map(lambda x, y: x)
vectorization_layer.adapt(text_ds)

In [11]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorization_layer(text), label

train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

# Do async pre-fetching of the data for better performance on GPU
train_ds = train_ds.cache().prefetch(buffer_size=10)
val_ds = val_ds.cache().prefetch(buffer_size=10)
test_ds = test_ds.cache().prefetch(buffer_size=10)

In [13]:
inputs = tf.keras.Input(shape=(None, ), dtype='int64')
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)
x = layers.Conv1D(128, 7, padding='valid', activation='relu', strides=3)(x)
x = layers.Conv1D(128, 7, padding='valid', activation='relu', strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(128, activation='relu')(x)
x = layers.Dropout(0.5)(x)
predictions = layers.Dense(1, activation='sigmoid', name='predictions')(x)
model = tf.keras.Model(inputs, predictions)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [14]:
model.fit(train_ds, validation_data=val_ds, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f575a8086d0>

In [15]:
model.evaluate(test_ds)



[0.43074753880500793, 0.8644400238990784]

## End to end model

In [17]:
inputs = tf.keras.Input(shape=(1, ), dtype='string')
indices = vectorization_layer(inputs)
outputs = model(indices)

end_to_end_model = tf.keras.Model(inputs, outputs)
end_to_end_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
end_to_end_model.evaluate(raw_test_ds)



[0.43074774742126465, 0.8644400238990784]