In [None]:
# pipenv install --python 3.10
# pipenv shell

**Run this cell only when running on Google Colab.**

In [3]:
# create folder
from pathlib import Path

trained_model_folder = Path("./trained_models/")

if not trained_model_folder.exists():
    trained_model_folder.mkdir()

# install tree
!apt-get install tree

# install libraries
%pip install -q --upgrade keras-nlp  # install keras-nlp before keras
%pip install -q --upgrade keras

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/465.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━[0m [32m317.4/465.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.3/465.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.8/589.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m70.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m57.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━

**Setup**

In [4]:
import keras
from keras import layers
import keras_nlp
from keras_nlp import layers as nlp_layers

**Download the data**

In [5]:
# Path is otherwise only imported in the Colab-only setup cell; importing it
# here keeps this cell working on a fresh kernel when that cell is skipped.
from pathlib import Path

# Download the IMDB review archive and unpack it. With cache_dir="./" the
# files are stored below ./datasets (get_file's default cache sub-directory).
keras.utils.get_file(
    origin="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    cache_dir="./",
    extract=True
)

imdb_dir = Path("./datasets/aclImdb")

# NOTE(review): where get_file(extract=True) unpacks the archive appears to
# differ between Keras releases — confirm against the installed version.
# Fail here with a clear message rather than in a later cell.
if not imdb_dir.is_dir():
    raise FileNotFoundError(
        f"Expected the extracted dataset at {imdb_dir}, but it is missing; "
        "check where keras.utils.get_file unpacked the archive."
    )

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
[1m84125825/84125825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 0us/step


In [9]:
# list only the directories (-d) of the extracted dataset
!tree -d datasets/aclImdb/


[01;34mdatasets/aclImdb/[0m
├── [01;34mtest[0m
│   ├── [01;34mneg[0m
│   └── [01;34mpos[0m
└── [01;34mtrain[0m
    ├── [01;34mneg[0m
    ├── [01;34mpos[0m
    └── [01;34munsup[0m

7 directories


Remove the unsupervised training data; we don't need it here.

In [10]:
!rm -r datasets/aclImdb/train/unsup


In [11]:
# list the directories again to check that train/unsup is gone
!tree -d datasets/aclImdb/


[01;34mdatasets/aclImdb/[0m
├── [01;34mtest[0m
│   ├── [01;34mneg[0m
│   └── [01;34mpos[0m
└── [01;34mtrain[0m
    ├── [01;34mneg[0m
    └── [01;34mpos[0m

6 directories


Take a quick look at one review.

In [12]:
# print the raw text of one review from the negative training folder
!cat datasets/aclImdb/train/neg/21_4.txt

What was with all the Turkish actors? No offense but I thought it was all for nothing for all these actors. The film had no script to test any actors acting skill or ability. It demanded next to nothing I bought this film to see Michael Madsen. He is one of my favorite actors but this film was another failure for him. The script was so bad. Their was just nothing to sink your teeth into and all the characters were two dimensional. Madsen tried to act like a hard ass but the script and direction didn't even allow him to do enough with his character to make it more interesting or 3 dimensional.<br /><br />Even the sound effects of the gunfight at the beginning of the film sounded like the noise of paint ball guns when they are fired in a skirmish. It was really weird and they didn't sound like real guns. A video game had better sound effects than this film. There was also a really annoying bloke at the beginning of the film who was a member of the robbery gang. He had this American whini

Prepare the validation set by moving 20% of the training files of each class into a separate `validation` folder.

In [13]:
import os, shutil, random

# Move a fixed 20% of each class from train/ into validation/.
validation_dir = imdb_dir / "validation"
train_dir = imdb_dir / "train"
validation_fraction = 0.2

for category in ("neg", "pos"):
    category_validation_dir = validation_dir / category
    if category_validation_dir.exists():
        # The split was already made for this class. Running it again would
        # move a further 20% of the remaining training files, so skip.
        continue
    category_validation_dir.mkdir(parents=True)

    # os.listdir returns entries in arbitrary order; sort first so that the
    # seeded shuffle really selects the same files on every run and machine.
    files = sorted(os.listdir(train_dir / category))
    random.Random(1234).shuffle(files)  # use seed to ensure same dataset through different runs

    num_validation_samples = int(validation_fraction * len(files))
    # Slice from an explicit start index: files[-0:] would select *every* file
    # when the count rounds down to zero.
    validation_files = files[len(files) - num_validation_samples:]
    for file in validation_files:
        shutil.move(train_dir / category / file,
                    category_validation_dir / file)


In [14]:
# list the directories again to check that the validation folders exist
!tree -d datasets/aclImdb/

[01;34mdatasets/aclImdb/[0m
├── [01;34mtest[0m
│   ├── [01;34mneg[0m
│   └── [01;34mpos[0m
├── [01;34mtrain[0m
│   ├── [01;34mneg[0m
│   └── [01;34mpos[0m
└── [01;34mvalidation[0m
    ├── [01;34mneg[0m
    └── [01;34mpos[0m

9 directories


In [15]:
batch_size = 32

# Build the three batched text datasets in one pass (train, validation, test).
# Labels come from the sub-folder names: 0 for negative, 1 for positive.
train_dataset, validation_dataset, test_dataset = (
    keras.utils.text_dataset_from_directory(split_directory, batch_size=batch_size)
    for split_directory in (
        "datasets/aclImdb/train",
        "datasets/aclImdb/validation",
        "datasets/aclImdb/test",
    )
)

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


Take a look at one batch of data.

In [16]:
# Report the structure of the first batch only, then stop iterating.
for inputs, targets in train_dataset:
    batch_report = (
        ("inputs.shape: ", inputs.shape),
        ("inputs.dtype:", inputs.dtype),
        ("targets.shape:", targets.shape),
        ("targets.dtype:", targets.dtype),
        ("inputs[0]:", inputs[0]),
        ("targets[0]:", targets[0]),
    )
    for caption, value in batch_report:
        print(caption, value)
    break


inputs.shape:  (32,)
inputs.dtype: <dtype: 'string'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor(b"I am a huge fan of Say Anything, Jerry Maguire, and Almost Famous (I wasn't that big on Singles), so it's safe to say that I look forward to anything that Cameron Crowe attaches his name to. I went to see Vanilla Sky having been told that it was a very weird movie and that I probably wouldn't like it if I was expecting anything similar to Crowe's other films. Well, having just seen it, let me say that the former was correct, and the latter couldn't have been more wrong. It is a very weird movie, and nothing really comes together until the end. Anyone who tells you that they saw it coming halfway into the movie is either lying to you or is unable to detach their hindsight from their memory. Anyway, the movie was stellar, and I look forward to owning it as soon as the DVD is released. I was moved by the film, and felt emotionally spent by the end. This is an ex

Fit (adapt) a text vectorization layer on the training text only; the labels are not needed for this step.

In [17]:
# vocabulary cap and fixed output length (in tokens) of every vectorized review
max_tokens = 20_000
max_length = 600

text_vectorization = layers.TextVectorization(
    output_mode="int",
    max_tokens=max_tokens,
    output_sequence_length=max_length,
)

# the vocabulary is learned from the raw text alone, so strip the labels first
text_only_train_dataset = train_dataset.map(lambda text, label: text)
text_vectorization.adapt(text_only_train_dataset)

Prepare the integer-sequence datasets.

In [18]:
def vectorize_example(text, label):
    """Replace the raw text of a batch with its integer token ids; keep the label."""
    return text_vectorization(text), label


# apply the same mapping to every split
int_train_dataset = train_dataset.map(vectorize_example, num_parallel_calls=4)
int_validation_dataset = validation_dataset.map(vectorize_example, num_parallel_calls=4)
int_test_dataset = test_dataset.map(vectorize_example, num_parallel_calls=4)

Build a TransformerEncoder-based model for text classification.

In [19]:
# Derive the embedding sizes from the TextVectorization settings instead of
# repeating the literals, so the model cannot drift out of sync with the
# vectorizer (values are unchanged: 20_000 tokens, 600 positions).
vocabulary_size = max_tokens
sequence_length = max_length
embed_dimension = 32

num_heads = 2
dense_layer_dimension = 32

# one variable-length sequence of integer token ids per example
inputs = keras.Input(shape=(None,), dtype="int64")

# token embedding plus position embedding
# NOTE(review): the vectorizer pads short reviews up to max_length, and no
# padding mask is requested here — confirm whether masking is wanted.
x = nlp_layers.TokenAndPositionEmbedding(vocabulary_size=vocabulary_size,
                                         sequence_length=sequence_length,
                                         embedding_dim=embed_dimension
                                         )(inputs)

x = nlp_layers.TransformerEncoder(intermediate_dim=dense_layer_dimension,
                                  num_heads=num_heads
                                  )(x)

# collapse the sequence axis to one vector per review
# (GlobalMaxPooling1D was the alternative tried here)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)  # 0.5 was the alternative rate tried here
# single sigmoid unit: binary sentiment (0 = negative, 1 = positive)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()

Train the transformer encoder model.

In [20]:
# write a checkpoint only when the monitored metric improves on the best so far
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath="trained_models/transformer_encoder.keras",
    save_best_only=True,
)
callbacks = [checkpoint_callback]

model.fit(
    int_train_dataset,
    epochs=6,
    validation_data=int_validation_dataset,
    callbacks=callbacks,
)


Epoch 1/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 110ms/step - accuracy: 0.6131 - loss: 0.8031 - val_accuracy: 0.8220 - val_loss: 0.4059
Epoch 2/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 81ms/step - accuracy: 0.8047 - loss: 0.4238 - val_accuracy: 0.8396 - val_loss: 0.3771
Epoch 3/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 80ms/step - accuracy: 0.8381 - loss: 0.3649 - val_accuracy: 0.8420 - val_loss: 0.3500
Epoch 4/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 83ms/step - accuracy: 0.8660 - loss: 0.3195 - val_accuracy: 0.8266 - val_loss: 0.3779
Epoch 5/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 82ms/step - accuracy: 0.8827 - loss: 0.2839 - val_accuracy: 0.8504 - val_loss: 0.3408
Epoch 6/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 82ms/step - accuracy: 0.9035 - loss: 0.2481 - val_accuracy: 0.8546 - val_loss: 0.3385
Epoch 7/20
[1

<keras.src.callbacks.history.History at 0x7db5a667ce50>

Evaluate the best saved transformer encoder model on the test set.

In [21]:
# reload the checkpoint written during training and score it on the test split
checkpoint_path = "trained_models/transformer_encoder.keras"
model = keras.models.load_model(checkpoint_path)

# evaluate() returns the loss followed by the compiled metrics; index 1 is accuracy
test_results = model.evaluate(int_test_dataset)
test_accuracy = test_results[1]
print(f"Test accuracy: {test_accuracy:.3f}")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - accuracy: 0.8502 - loss: 0.3496
Test accuracy: 0.851
