In [1]:
# after installation and kernel upgrade restart needed
!pip install -q --upgrade keras-nlp
!pip install -q --upgrade keras

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.15.0 requires keras<2.16,>=2.15.0, but you have keras 3.0.5 which is incompatible.[0m[31m
[0m

In [7]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import keras_nlp
import tensorflow as tf
import shutil
import keras

AUTOTUNE = tf.data.AUTOTUNE
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

dataset = tf.keras.utils.get_file('aclImdb_v1.tar.gz', url,
                                  untar=True, cache_dir='.',
                                  cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), "aclImdb")

# set training and testing data paths
train_dir = os.path.join(dataset_dir, "train")
test_dir = os.path.join(dataset_dir, "test")

# remove unused folders to make it easier to load the data
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)

In [3]:
# create datasets
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, batch_size=batch_size, validation_split=0.2,
    subset='training', seed=seed)

val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, batch_size=batch_size, validation_split=0.2,
    subset='validation', seed=seed)

test_ds = tf.keras.utils.text_dataset_from_directory(
    test_dir, batch_size=batch_size)

class_labels = train_ds.class_names
print("\nClass names:", class_labels)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.

Class names: ['neg', 'pos']


In [4]:
# concat reviews texts from train and validation datasets discarding labels 
reviews = tf.concat(([review for review, _ in train_ds.unbatch()], 
                     [review for review, _ in val_ds.unbatch()]), axis=0)
# TextVectorization layer allows efficiently extract vocabulary from the text 
# while optionally appling standardization to it
vectorizer = tf.keras.layers.TextVectorization()
vectorizer.adapt(reviews)
# compute the number of tokens per each review
bows = vectorizer(reviews)
counts = tf.math.count_nonzero(bows, axis=1, keepdims=True)
# get a mean and standard deviation of reviews word counts 
mean, std = tf.math.reduce_mean(counts), tf.math.reduce_std(tf.cast(counts, dtype=tf.float32))
min, max = tf.reduce_min(counts), tf.reduce_max(counts)

print("Number of unique words in both train and valid datasets:", vectorizer.vocabulary_size())
print(f"Reviews contain {mean} words on average, with standard deviation of {std} words")
print(f"Minimum/maximum review word count: {min}/{max}")

Number of unique words in both train and valid datasets: 121894
Reviews contain 232 words on average, with standard deviation of 173.0606689453125 words
Minimum/maximum review word count: 10/2469


## Custom vocabulary from the IMDB dataset

In [8]:
def create_vocab(vocab_size, dataset, **kwargs):
    vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
        dataset.map(lambda x, y: x),
        vocabulary_size=vocab_size,
        lowercase=True,
        strip_accents=True,
        reserved_tokens=["[PAD]", "[START]", "[END]", "[MASK]", "[UNK]"],
        **kwargs,
    )
    return vocab

def create_tokenizer(vocab, **kwargs):
    tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
        vocabulary=vocab,
        lowercase=True,
        strip_accents=True,
        oov_token="[UNK]",
        **kwargs,
    )
    return tokenizer

def create_packer(tokenizer, **kwargs):
    packer = keras_nlp.layers.StartEndPacker(
        start_value=tokenizer.token_to_id("[START]"),
        end_value=tokenizer.token_to_id("[END]"),
        pad_value=tokenizer.token_to_id("[PAD]"),
        sequence_length=512,
        **kwargs,
    )
    return packer

def preprocess(x, y):
    token_ids = packer(tokenizer(x))
    return token_ids, y

def get_prepr_dataset(dataset):
    return dataset.map(preprocess, num_parallel_calls=AUTOTUNE).prefetch(AUTOTUNE)

In [9]:
vocabulary = create_vocab(vocab_size=30_000, dataset=train_ds)
tokenizer = create_tokenizer(vocab=vocabulary)
packer = create_packer(tokenizer)

train_prepr = get_prepr_dataset(train_ds)
val_prepr = get_prepr_dataset(val_ds)

print(train_prepr.unbatch().take(1).get_single_element())

(<tf.Tensor: shape=(512,), dtype=int32, numpy=
array([    1,  1437,   406,    16,  2578,   406,    25,   100,    96,
         147,   328,    98,    96,   294,    16,    97,   130,   267,
          24,    16,   102,    11,    61,  5943,    99,    96,   175,
         385,   646,   125,   518,    97,    96,   283,   105,   102,
          11,    61,    43,  1089,    17,  7418,   279,   601,    18,
          32,   101,    19,    34,    32,   101,    19,    34,   646,
          99,    96,  8873,    16,   131,    11,    61,   140,   690,
         273,   263,   113,   219,    18,    18,    18,  4373,  5147,
         395,    43,  2993, 10362,   868,  1055,  3114,  4702,    97,
         118,  1037,   554,    11,    61,   494,   100, 15302,    18,
        1824,    43,  3884,    16,   345,    35,   239,    16,    43,
         208,   501,   868,  3314,   655,    43,  3496,   123,    96,
        1402,   121,  2214,    16,    97,  3078,   102,    99,   261,
          43,  1101,   113,    96, 17469,  

## Design a tiny transformer

In [20]:
def create_model(vocab):
    # model inputs 
    token_id_input = keras.Input(shape=(None,), dtype="int32", name="token_ids",)
    # positional encoding + token encoding
    outputs = keras_nlp.layers.TokenAndPositionEmbedding(
        vocabulary_size=len(vocab),
        sequence_length=packer.sequence_length,
        embedding_dim=64)(token_id_input)
    # 2-headed transformer encoder
    outputs = keras_nlp.layers.TransformerEncoder(
        num_heads=2,
        intermediate_dim=128,
        dropout=0.1)(outputs)
    # the "[START]" token (id = 0) is used for classification
    outputs = keras.layers.Dense(2)(outputs[:, 0, :])
    model = keras.Model(inputs=token_id_input, outputs=outputs,)
    return model

def compile_model(model, lr_rate):
    model.compile(
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=keras.optimizers.AdamW(lr_rate),
        metrics=[keras.metrics.SparseCategoricalAccuracy()],
        jit_compile=True)

def get_performance(model, dataset, metric="sparse_categorical_accuracy"):
    """Evaluates model on a given dataset and returns specified metric value"""
    return model.evaluate(dataset, return_dict=True, verbose=0)[metric]


model = create_model(vocabulary)
model.summary()

## Train the transformer directly on the classification objective

In [15]:
class TriangularScheduler(keras.optimizers.schedules.LearningRateSchedule):
    """Linear ramp up for 'warmup' steps, then linear decay to 0 at 'total steps'."""
    def __init__(self, rate, warmup, total):
        self.rate = rate
        self.warmup = warmup
        self.total = total

    def get_config(self):
        config = {"rate": self.rate, "warmup": self.warmup, "total": self.total}
        return config

    def __call__(self, step):
        step = keras.ops.cast(step, dtype="float32")
        rate = keras.ops.cast(self.rate, dtype="float32")
        warmup = keras.ops.cast(self.warmup, dtype="float32")
        total = keras.ops.cast(self.total, dtype="float32")

        warmup_rate = rate * step / self.warmup
        cooldown_rate = rate * (total - step) / (total - warmup)
        triangular_rate = keras.ops.minimum(warmup_rate, cooldown_rate)
        return keras.ops.maximum(triangular_rate, 0.0)

epochs = 15
batch_size = 32
steps_per_epoch = train_ds.cardinality().numpy()
num_train_steps = steps_per_epoch * epochs
warmup_steps = int(0.1 * num_train_steps)
initial_learning_rate = 5e-5

warmup_schedule = TriangularScheduler(initial_learning_rate, warmup_steps, num_train_steps)

In [30]:
compile_model(model, lr_rate=warmup_schedule)

e_stop = keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)
model.fit(
    train_prepr, validation_data=val_prepr, epochs=15,
    callbacks=[e_stop],
)

Epoch 1/15
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 17ms/step - loss: 0.7141 - sparse_categorical_accuracy: 0.4912 - val_loss: 0.6889 - val_sparse_categorical_accuracy: 0.5076
Epoch 2/15
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - loss: 0.6874 - sparse_categorical_accuracy: 0.5404 - val_loss: 0.5697 - val_sparse_categorical_accuracy: 0.7522
Epoch 3/15
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - loss: 0.5183 - sparse_categorical_accuracy: 0.7478 - val_loss: 0.4467 - val_sparse_categorical_accuracy: 0.7786
Epoch 4/15
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - loss: 0.3627 - sparse_categorical_accuracy: 0.8421 - val_loss: 0.3343 - val_sparse_categorical_accuracy: 0.8538
Epoch 5/15
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - loss: 0.2869 - sparse_categorical_accuracy: 0.8826 - val_loss: 0.3115 - val_sparse_categorical_accurac

<keras.src.callbacks.history.History at 0x7f67a0259fd0>

In [31]:
model.evaluate(imdb_test_prepr)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - loss: 0.3019 - sparse_categorical_accuracy: 0.8815


[0.31072551012039185, 0.8772000074386597]

In [21]:
# create a range of vocab sizes to explore
vocab_sizes = [10_000, 30_000, 50_000, 70_000, 100_000]

# dictionary to save models performances
performance = {}

In [26]:
# 10_000
# data tokenization & packing
vocabulary = create_vocab(vocab_size=vocab_sizes[0], dataset=train_ds)
tokenizer = create_tokenizer(vocab=vocabulary)
packer = create_packer(tokenizer)
# get preprocessed training and validation data using new vocabulary 
train_prepr = get_prepr_dataset(train_ds)
val_prepr = get_prepr_dataset(val_ds)
# set up model and compile it
model = create_model(vocabulary)
compile_model(model, lr_rate=warmup_schedule)
# train the model using triangular scheduling
e_stop = keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)
model.fit(
    train_prepr, validation_data=val_prepr, epochs=1, callbacks=[e_stop],
)
# get performance on training/validation sets to compare later 
performance[vocab_sizes[0]] = (get_performance(model, train_prepr), get_performance(model, val_prepr))
print(f"{vocab_sizes[0]}-vocabulary model performance on the validation set: {performance[vocab_sizes[0]][1]}")

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 19ms/step - loss: 0.7273 - sparse_categorical_accuracy: 0.5052 - val_loss: 0.6898 - val_sparse_categorical_accuracy: 0.5076
10000-vocabulary model performance on the validation set: 0.5076000094413757
