KerasNLP is a natural language processing library that supports users through their entire development cycle. Our workflows are built from modular components that have state-of-the-art preset weights and architectures when used out-of-the-box and are easily customizable when more control is needed.

In [None]:
# !pip install -q --upgrade keras-nlp

In [18]:
import keras_nlp

In [5]:
import os

# Select the Keras backend. This has to run before `import keras` below,
# because Keras picks its backend when it is first imported.
os.environ["KERAS_BACKEND"] = "tensorflow"  # or "jax" or "torch"


import keras

# Use mixed precision to speed up all training in this guide.
keras.mixed_precision.set_global_policy("mixed_float16")

In [None]:
# !curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# !tar -xf aclImdb_v1.tar.gz
# !# Remove unsupervised examples
# !rm -r aclImdb/train/unsup

In [11]:
# Machine-specific root of the extracted IMDB dataset; point it at wherever
# aclImdb was unpacked. Keep the trailing slash: later cells append "train"
# and "test" to it by plain string concatenation.
data_path = 'D:/a27_YEARS_OLD/deep_learning/tensoflow/aclImdb_v1/aclImdb/'

In [None]:
# Reviews per batch; the test dataset is built with the same value.
BATCH_SIZE = 16

# Batched training split, read from the `train` sub-directory of `data_path`.
imdb_train = keras.utils.text_dataset_from_directory(
    directory=f"{data_path}train", batch_size=BATCH_SIZE
)

Found 25000 files belonging to 2 classes.


In [13]:
# Batched test split, read from the `test` sub-directory of `data_path`.
imdb_test = keras.utils.text_dataset_from_directory(
    directory=f"{data_path}test", batch_size=BATCH_SIZE
)

Found 25000 files belonging to 2 classes.


In [19]:
# Inspect the first review. The dataset yields batches, so `unbatch()` is used
# to get back to single examples before taking one.
# The printed element is a (review text tensor, label tensor) pair.
print(imdb_train.unbatch().take(1).get_single_element())

(<tf.Tensor: shape=(), dtype=string, numpy=b"This was the best documentary I've ever seen!! I just saw Lords of Dogtown and wanted to know more about Stacy Peralta, and was surprised and happy to find out this was one of his films as well. Great Job Stacy! I was kicking back at work last week, bored O*&^%less and this movie came on. Growing up in Orange County in the 80's I surfed up and down the local beaches and so did my dad when he was a teenager. I grew up at the beach, my parents took me every weekend, I body surfed, boogeyboarded then moved up from there. This movie just captivated me. It was way before my time but it was awesome to see what these guys went through..TRUE PIONEERS! This movie is a collectors item.">, <tf.Tensor: shape=(), dtype=int32, numpy=1>)


Sentiment classifier: predict whether a review is positive or negative

In [20]:
# List the names exposed by the installed keras_nlp package, to see which
# submodules (for example `models`) are available in this environment.
print(dir(keras_nlp))

['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', 'encoders', 'layers']


In [None]:
# Load a ready-made classifier from the "bert_tiny_en_uncased_sst2" preset.
# NOTE(review): the recorded output of this cell is an AttributeError, and the
# recorded `dir(keras_nlp)` output lists only `encoders` and `layers` (no
# `models`). Presumably the installed keras-nlp is too old — confirm and
# upgrade before relying on this cell.
classifier = keras_nlp.models.BertClassifier.from_preset("bert_tiny_en_uncased_sst2")
# Note: batched inputs expected so must wrap string in iterable


AttributeError: module 'keras_nlp.layers' has no attribute 'BertClassifier'

In [None]:
# classifier.predict(["I love modular workflows in keras-nlp!"])

In [None]:
# classifier.evaluate(imdb_test)

In [None]:
# Build a classifier from the "bert_tiny_en_uncased" preset, configured for
# 2 output classes (the IMDB dataset loaded earlier has 2 classes).
classifier = keras_nlp.models.BertClassifier.from_preset(
    "bert_tiny_en_uncased",
    num_classes=2,
)

In [None]:
# Train for one epoch on the raw-text training dataset, validating on the test
# split. The datasets are passed as loaded; no separate preprocessing step is
# applied to them before this call.
classifier.fit(
    imdb_train,
    validation_data=imdb_test,
    epochs=1,
)

Fine tuning 

In [None]:
import tensorflow as tf

# Standalone preprocessor from the "bert_tiny_en_uncased" preset, so that
# preprocessing can be run once up front instead of inside the classifier.
# NOTE(review): `sequence_length=512` is presumably the length every packed
# example is padded/truncated to — confirm against the BertPreprocessor docs.
preprocessor = keras_nlp.models.BertPreprocessor.from_preset(
    "bert_tiny_en_uncased",
    sequence_length=512,
)

In [None]:
# Run the preprocessor over every batch of the train and test data with
# `map()`. `tf.data.AUTOTUNE` and `prefetch()` are performance knobs; see
# https://www.tensorflow.org/guide/data_performance and
# https://www.tensorflow.org/api_docs/python/tf/data/AUTOTUNE for details.
#
# Note: only call `cache()` if your training data fits in CPU memory!
imdb_train_cached = (
    imdb_train.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
    .cache()
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)
imdb_test_cached = (
    imdb_test.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
    .cache()
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [None]:
# Build the classifier with `preprocessor=None`: the cached datasets passed to
# `fit()` below were already run through the preprocessor, so the classifier
# is not given one of its own.
classifier = keras_nlp.models.BertClassifier.from_preset(
    "bert_tiny_en_uncased", preprocessor=None, num_classes=2
)
# Train for three epochs on the preprocessed, cached datasets.
classifier.fit(
    imdb_train_cached,
    validation_data=imdb_test_cached,
    epochs=3,
)

In [None]:
# Load the tokenizer belonging to the "bert_tiny_en_uncased" preset and
# tokenize two sample sentences. As the last expression in the cell, the
# result is displayed by the notebook.
tokenizer = keras_nlp.models.BertTokenizer.from_preset("bert_tiny_en_uncased")
tokenizer(["I love modular workflows!", "Libraries over frameworks!"])

MultiSegmentPacker is a KerasNLP layer that packs one or more segments of tokenized text into a single input sequence, preparing data for BERT-style transformer models.
What does MultiSegmentPacker do?
It takes tokenized segments (e.g., a question and its answer) and:
- Concatenates them into a single sequence.
- Adds special tokens: start_value ([CLS]) at the beginning and end_value ([SEP]) after each segment to mark segment boundaries.
- Truncates or pads the sequence to a uniform length (sequence_length).
- Returns the packed token ids together with segment ids that record which segment each token came from.

In [None]:
# Write your own packer or use one of our `Layers`
# The start and end markers are the tokenizer's own [CLS] and [SEP] token ids,
# so the packed output uses the same special tokens as the preset vocabulary.
packer = keras_nlp.layers.MultiSegmentPacker(
    start_value=tokenizer.cls_token_id,
    end_value=tokenizer.sep_token_id,
    # Note: This cannot be longer than the preset's `sequence_length`, and there
    # is no check for a custom preprocessor!
    sequence_length=64,
)

# packer = keras_nlp.layers.MultiSegmentPacker(
#     start_value=tokenizer.cls_token_id,  # 101
#     end_value=tokenizer.sep_token_id     # 102
# )

# # Input segments
# segment1 = tf.constant([1, 2, 3])  # "Hello"
# segment2 = tf.constant([4, 5, 6])  # "World"

# # Pack segments
# packed_sequence = packer([segment1, segment2])

# print(packed_sequence)  # [101, 1, 2, 3, 102, 4, 5, 6, 102]

In [None]:
# The function below takes a text sample `x` and its corresponding
# label `y` as input and converts the text into the dictionary of
# features that a BERT model expects; the label is passed through.

In [None]:
def preprocessor(x, y):
    """Tokenize and pack text into a BERT-style feature dictionary.

    Uses the module-level `tokenizer` and `packer`.

    Args:
        x: Text tensor taken from the dataset.
        y: Label paired with `x`; returned unchanged.

    Returns:
        A `(features, y)` pair, where `features` maps "token_ids",
        "segment_ids" and "padding_mask" to tensors.
    """
    packed_ids, packed_segments = packer(tokenizer(x))
    features = {
        "token_ids": packed_ids,
        "segment_ids": packed_segments,
        # True wherever the packed id is non-zero, False on id 0.
        "padding_mask": packed_ids != 0,
    }
    return features, y


In [None]:
# Apply the custom `preprocessor` function to both splits and prefetch.
imdb_train_preprocessed = imdb_train.map(
    preprocessor, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(buffer_size=tf.data.AUTOTUNE)
imdb_test_preprocessed = imdb_test.map(
    preprocessor, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(buffer_size=tf.data.AUTOTUNE)

# Preprocessed example:
#   .unbatch()             splits each batch back into individual examples.
#   .take(1)               keeps only the first example.
#   .get_single_element()  returns that one example as tensors; it requires
#                          the dataset to contain exactly one element.
print(imdb_train_preprocessed.unbatch().take(1).get_single_element())

Fine tuning

In [None]:
# Load the preprocessor and the bare backbone from the same preset.
# Note: this rebinds `preprocessor`, replacing the custom function of the
# same name defined earlier in the notebook.
preprocessor = keras_nlp.models.BertPreprocessor.from_preset("bert_tiny_en_uncased")
backbone = keras_nlp.models.BertBackbone.from_preset("bert_tiny_en_uncased")

In [None]:
# Build the preprocessed datasets with the preset preprocessor. Each pipeline
# applies three transformations in order:
#   map      - run `preprocessor` on every element; tf.data.AUTOTUNE lets
#              tf.data choose the degree of parallelism.
#   cache    - keep the preprocessed data in memory so later iterations can
#              read it back instead of recomputing it.
#   prefetch - overlap computation and I/O by preparing the next batch while
#              the current one is processed; tf.data.AUTOTUNE sizes the
#              prefetch buffer automatically.
imdb_train_preprocessed = (
    imdb_train.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
    .cache()
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)
imdb_test_preprocessed = (
    imdb_test.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
    .cache()
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [None]:
# Freeze the backbone so its weights are not updated during training; only
# layers added on top of it will be trained.
backbone.trainable = False
# Reuse the backbone's own symbolic inputs as the inputs of the new model.
inputs = backbone.input

In [None]:
# Call the backbone on its inputs and keep the "sequence_output" entry of the
# dictionary it returns.
sequence = backbone(inputs)["sequence_output"]
# inputs: the backbone's input structure (`backbone.input`).
# sequence_output: presumably the per-token last hidden state, shaped
# (batch, sequence, hidden); the later `[:, cls_token_index, :]` slice is
# consistent with that — confirm against the BertBackbone docs.

In [None]:
# Stack two TransformerEncoder layers on top of the backbone output. A new
# layer object is constructed on every iteration, so the two layers have
# separate weights (this is not a single layer applied twice).
for _ in range(2):
    sequence = keras_nlp.layers.TransformerEncoder(
        num_heads=2,
        intermediate_dim=512,
        dropout=0.1,
    )(sequence)

# TransformerEncoder parameters
# num_heads=2: Number of attention heads.
# intermediate_dim=512: Dimensionality of the intermediate (feed-forward) layer.
# dropout=0.1: Dropout rate (fraction of activations randomly zeroed during training).
# TransformerEncoder architecture
# Self-Attention: Computes attention weights between sequence elements.
# Feed-Forward Network (FFN): Transforms attention-weighted outputs.
# Layer Normalization: Normalizes activations.
# Residual Connection: Adds input to output.
# Stacking two encoders
# First layer: Processes the sequence output of the backbone.
# Second layer: Refines the output of the first layer.


In [None]:
# Use [CLS] token output to classify
outputs = keras.layers.Dense(2)(sequence[:, backbone.cls_token_index, :])
# keras.layers.Dense(2):
# Creates a Dense layer with 2 output units and no activation, so `outputs`
# are raw logits (the loss used at compile time sets from_logits=True).
# sequence[:, backbone.cls_token_index, :]:
# sequence: The output of the encoder layers stacked on the backbone.
# backbone.cls_token_index: The index of the CLS token in the sequence (usually 0).
# The leading `:` keeps every example in the batch; the trailing `:` selects
# all features (hidden dimensions) for the CLS token.

In [None]:
# Wrap the backbone inputs and the classification logits into one model.
model = keras.Model(inputs, outputs)

In [None]:
# The sparse categorical loss and metric take integer class labels;
# `from_logits=True` matches the activation-free Dense output layer.
# `jit_compile=True` requests XLA compilation of the train/eval steps.
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.AdamW(5e-5),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
    jit_compile=True,
)

In [None]:
# Print the layer and parameter overview, e.g. to check that the frozen
# backbone's weights are reported as non-trainable.
model.summary()

In [None]:

# Train for three epochs on the preprocessed datasets, validating on the
# preprocessed test split.
model.fit(
    imdb_train_preprocessed,
    validation_data=imdb_test_preprocessed,
    epochs=3,
)

Preprocessing

In [None]:
# All BERT `en` models have the same vocabulary, so reuse preprocessor from
# "bert_tiny_en_uncased"
preprocessor = keras_nlp.models.BertPreprocessor.from_preset(
    "bert_tiny_en_uncased",
    sequence_length=256,
)
# Pull out the preprocessor's own packer and tokenizer for use in later cells.
# Note: this rebinds the notebook-level `packer` and `tokenizer` names.
packer = preprocessor.packer
tokenizer = preprocessor.tokenizer

In [None]:
# keras.Layer to replace some input tokens with the "[MASK]" token
masker = keras_nlp.layers.MaskedLMMaskGenerator(
    vocabulary_size=tokenizer.vocabulary_size(),
    mask_selection_rate=0.25,
    mask_selection_length=64,
    mask_token_id=tokenizer.token_to_id("[MASK]"),
    unselectable_token_ids=[
        tokenizer.token_to_id(x) for x in ["[CLS]", "[PAD]", "[SEP]"]
    ],
)
# Randomly selects tokens to mask based on mask_selection_rate.
# Replaces selected tokens with [MASK] token.
# Ensures unselectable_token_ids ([CLS], [PAD], [SEP]) are never masked.
# NOTE(review): mask_selection_length=64 presumably caps the number of masked
# positions per sequence — confirm against the MaskedLMMaskGenerator docs.
# Example:
# [CLS] This is a sample sentence [SEP].
# [CLS] This [MASK] a sample [MASK] [SEP].

# Improved language understanding: Masked LM training enhances language model's ability to predict missing tokens.

In [None]:
def preprocess(inputs, label):
    """Build a masked-language-modeling training triple from raw text.

    Uses the module-level `preprocessor` and `masker`.

    Args:
        inputs: Text passed to `preprocessor`.
        label: Label paired with the text in the dataset; unused, since the
            training targets come from the masking layer instead.

    Returns:
        A `(features, labels, weights)` tuple usable with
        `keras.Model.fit()`.
    """
    packed = preprocessor(inputs)
    masked = masker(packed["token_ids"])
    # Token ids and mask positions come from the masking layer; segment ids
    # and the padding mask are carried over from the preprocessor output.
    features = {
        "token_ids": masked["token_ids"],
        "segment_ids": packed["segment_ids"],
        "padding_mask": packed["padding_mask"],
        "mask_positions": masked["mask_positions"],
    }
    return features, masked["mask_ids"], masked["mask_weights"]

In [None]:
# Masked-language-modeling datasets for pretraining and validation.
pretrain_ds = (
    imdb_train
    .map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)
pretrain_val_ds = (
    imdb_test
    .map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

# Tokens with ID 103 are "masked"
print(pretrain_ds.unbatch().take(1).get_single_element())

In [None]:
# BERT backbone
# Built from explicit hyperparameters rather than `from_preset`, so no
# pretrained weights are loaded here. The vocabulary size is taken from the
# tokenizer so the embedding table matches the token ids it produces.
backbone = keras_nlp.models.BertBackbone(
    vocabulary_size=tokenizer.vocabulary_size(),
    num_layers=2,
    num_heads=2,
    hidden_dim=128,
    intermediate_dim=512,
)

In [None]:
# Language modeling head
# It is given the backbone's token embedding layer, which it reuses to
# project encoded vectors back to vocabulary logits.
mlm_head = keras_nlp.layers.MaskedLMHead(
    token_embedding=backbone.token_embedding,
)

In [None]:
# Symbolic inputs for the pretraining model. The keys match the feature
# dictionary returned by `preprocess`; `shape=(None,)` leaves the sequence
# length unspecified.
inputs = {
    "token_ids": keras.Input(shape=(None,), dtype=tf.int32, name="token_ids"),
    "segment_ids": keras.Input(shape=(None,), dtype=tf.int32, name="segment_ids"),
    "padding_mask": keras.Input(shape=(None,), dtype=tf.int32, name="padding_mask"),
    "mask_positions": keras.Input(shape=(None,), dtype=tf.int32, name="mask_positions"),
}

In [None]:
# Encoded token sequence
# NOTE(review): `inputs` also carries "mask_positions", which is consumed by
# the MLM head in a later cell; the backbone is presumably expected to ignore
# the extra key — confirm this works on the installed keras-nlp version.
sequence = backbone(inputs)["sequence_output"]


In [None]:
# Predict an output word for each masked input token.
# We use the input token embedding to project from our encoded vectors to
# vocabulary logits, which has been shown to improve training efficiency.
# `mask_positions` tells the head which sequence positions to predict.
outputs = mlm_head(sequence, mask_positions=inputs["mask_positions"])

In [None]:
# Define and compile our pretraining model.
pretraining_model = keras.Model(inputs, outputs)
pretraining_model.summary()
# `weighted_metrics` (rather than `metrics`) is used so that the per-position
# weights returned by `preprocess` are applied to the accuracy as well.
pretraining_model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.AdamW(learning_rate=5e-4),
    weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()],
    jit_compile=True,
)

In [None]:
# Pretrain on IMDB dataset
# The datasets yield (features, labels, weights) triples produced by
# `preprocess`.
pretraining_model.fit(
    pretrain_ds,
    validation_data=pretrain_val_ds,
    epochs=3,  # Increase to 6 for higher accuracy
)

Build and train your own transformer from scratch

In [None]:
# Learn a WordPiece vocabulary from the training reviews. The lambda drops
# the labels so only the text is used. The reserved tokens are listed
# explicitly so they are part of the vocabulary; later cells look up their
# ids with `token_to_id`.
vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    imdb_train.map(lambda x, y: x),
    vocabulary_size=20_000,
    lowercase=True,
    strip_accents=True,
    reserved_tokens=["[PAD]", "[START]", "[END]", "[MASK]", "[UNK]"],
)

In [None]:
# Tokenizer over the freshly computed vocabulary. `lowercase` and
# `strip_accents` use the same values as when the vocabulary was computed.
# Note: this rebinds the notebook-level `tokenizer` name.
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    lowercase=True,
    strip_accents=True,
    oov_token="[UNK]",
)

In [None]:
# Packer configured with the ids of the custom vocabulary's "[START]",
# "[END]" and "[PAD]" tokens and a sequence_length of 512.
# Note: this rebinds the notebook-level `packer` name.
packer = keras_nlp.layers.StartEndPacker(
    start_value=tokenizer.token_to_id("[START]"),
    end_value=tokenizer.token_to_id("[END]"),
    pad_value=tokenizer.token_to_id("[PAD]"),
    sequence_length=512,
)

In [None]:

def preprocess(x, y):
    """Tokenize and pack text, passing the label through.

    Uses the module-level `tokenizer` and `packer`.

    Args:
        x: Text tensor taken from the dataset.
        y: Label paired with `x`; returned unchanged.

    Returns:
        A `(token_ids, y)` pair.
    """
    tokenized = tokenizer(x)
    return packer(tokenized), y

In [None]:

# Tokenized and packed train/validation datasets for the from-scratch model.
imdb_preproc_train_ds = (
    imdb_train
    .map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)
imdb_preproc_val_ds = (
    imdb_test
    .map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [None]:
# Show one preprocessed (token_ids, label) example as a sanity check.
print(imdb_preproc_train_ds.unbatch().take(1).get_single_element())

In [None]:
# Symbolic input holding token ids; `shape=(None,)` leaves the sequence
# length unspecified.
token_id_input = keras.Input(
    shape=(None,),
    dtype="int32",
    name="token_ids",
)

# Embed the token ids together with their positions. The vocabulary size and
# maximum sequence length are taken from `vocab` and `packer`, so they stay
# consistent with the preprocessing.
outputs = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=len(vocab),
    sequence_length=packer.sequence_length,
    embedding_dim=64,
)(token_id_input)

# A single Transformer encoder layer on top of the embeddings.
outputs = keras_nlp.layers.TransformerEncoder(
    num_heads=2,
    intermediate_dim=128,
    dropout=0.1,
)(outputs)

In [None]:
# Use "[START]" token to classify
# The slice takes sequence position 0 for every example (presumably where the
# packer's start token sits) and feeds it to an activation-free Dense layer
# with 2 output units.
outputs = keras.layers.Dense(2)(outputs[:, 0, :])

In [None]:
# Assemble the from-scratch classifier: token ids in, 2-unit output.
# Note: this rebinds the notebook-level `model` name.
model = keras.Model(
    inputs=token_id_input,
    outputs=outputs,
)

model.summary()

In [None]:
# The sparse categorical loss and metric take integer class labels;
# `from_logits=True` matches the activation-free Dense output layer.
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.AdamW(5e-5),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
    jit_compile=True,
)

In [None]:
# Train the from-scratch model for three epochs on the tokenized and packed
# datasets.
model.fit(
    imdb_preproc_train_ds,
    validation_data=imdb_preproc_val_ds,
    epochs=3,
)

Fine-tuning

In [None]:
def preprocess(sentences, labels):
    """Tokenize `sentences` with the module-level `tokenizer`.

    Args:
        sentences: Text to tokenize.
        labels: Labels paired with `sentences`; returned unchanged.

    Returns:
        A `(tokenized_sentences, labels)` pair.
    """
    tokenized = tokenizer(sentences)
    return tokenized, labels


# We use prefetch() to pre-compute preprocessed batches on the fly on our CPU.
# NOTE(review): `sst_train_ds` and `sst_val_ds` are not defined anywhere in
# the visible part of this notebook; they must be created (presumably from the
# SST-2 data) before this cell can run — confirm where they come from.
finetune_ds = sst_train_ds.map(
    preprocess, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)
finetune_val_ds = sst_val_ds.map(
    preprocess, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)

# Preview a single input example.
print(finetune_val_ds.take(1).get_single_element())

In [None]:
import os

# NOTE(review): Keras picks its backend when it is first imported. This
# notebook already imports keras near the top with KERAS_BACKEND set to
# "tensorflow", so within the same session this assignment likely has no
# effect unless the kernel is restarted first — confirm the intended backend.
os.environ["KERAS_BACKEND"] = "jax"  # or "tensorflow" or "torch"


import keras_nlp
import tensorflow as tf
import keras

In [None]:
# Hyperparameters for the pretraining / fine-tuning workflow.
# NOTE(review): in the visible cells only SEQ_LENGTH, FINETUNING_LEARNING_RATE
# and FINETUNING_EPOCHS are referenced; the rest presumably belong to
# pretraining cells that are not part of this notebook — confirm.

# Preprocessing params.
PRETRAINING_BATCH_SIZE = 128
FINETUNING_BATCH_SIZE = 32
SEQ_LENGTH = 128
MASK_RATE = 0.25
PREDICTIONS_PER_SEQ = 32

# Model params.
NUM_LAYERS = 3
MODEL_DIM = 256
INTERMEDIATE_DIM = 512
NUM_HEADS = 4
DROPOUT = 0.1
NORM_EPSILON = 1e-5

# Training params.
PRETRAINING_LEARNING_RATE = 5e-4
PRETRAINING_EPOCHS = 8
FINETUNING_LEARNING_RATE = 5e-5
FINETUNING_EPOCHS = 3

In [None]:
# Reload the encoder model from disk so we can restart fine-tuning from scratch.
# `compile=False` loads the model without its compile state; the fine-tuning
# model built on top of it is compiled separately.
# NOTE(review): no visible cell saves "encoder_model.keras"; the file must
# already exist in the working directory — confirm.
encoder_model = keras.models.load_model("encoder_model.keras", compile=False)

In [None]:
# Take as input the tokenized input.
# Each example is a fixed-length sequence of SEQ_LENGTH int32 token ids.
inputs = keras.Input(shape=(SEQ_LENGTH,), dtype="int32")

In [None]:
# Encode and pool the tokens.
encoded_tokens = encoder_model(inputs)
# NOTE(review): indexing with `[0]` presumes the loaded encoder returns a
# sequence of outputs whose first entry is the per-token encoding — confirm
# against the saved model before relying on this.
pooled_tokens = keras.layers.GlobalAveragePooling1D()(encoded_tokens[0])

In [None]:
# Predict an output label.
# A single sigmoid unit: the output is a value in (0, 1), matching the
# binary cross-entropy loss used at compile time.
outputs = keras.layers.Dense(1, activation="sigmoid")(pooled_tokens)

In [None]:
# Define our fine-tuning model: token ids in, sigmoid output.
# (It is compiled in the next cell.)
finetuning_model = keras.Model(inputs, outputs)

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit; the
# learning rate comes from FINETUNING_LEARNING_RATE.
finetuning_model.compile(
    loss="binary_crossentropy",
    optimizer=keras.optimizers.AdamW(FINETUNING_LEARNING_RATE),
    metrics=["accuracy"],
)

In [None]:
# Finetune the model for the SST-2 task.
# NOTE(review): the model input is fixed at SEQ_LENGTH tokens, so the
# fine-tuning datasets must yield sequences of exactly that length — confirm
# that the tokenizer used to build them pads/truncates accordingly.
finetuning_model.fit(
    finetune_ds,
    validation_data=finetune_val_ds,
    epochs=FINETUNING_EPOCHS,
)