[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/Pinkwjp/sentiment-analysis-with-transformer-on-IMDB/blob/master/transformer.ipynb)

In [1]:
# pipenv install --python 3.10
# pipenv shell

**Run this cell only when running on Colab.**

In [2]:
# create folder
from pathlib import Path

trained_model_folder = Path("./trained_models/")

if not trained_model_folder.exists():
    trained_model_folder.mkdir()

# install tree
!apt-get install tree

# install libraries
%pip install -q --upgrade keras-nlp  # install keras-nlp before keras
%pip install -q --upgrade keras

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  tree
0 upgraded, 1 newly installed, 0 to remove and 39 not upgraded.
Need to get 47.9 kB of archives.
After this operation, 116 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tree amd64 2.0.2-1 [47.9 kB]
Fetched 47.9 kB in 1s (50.9 kB/s)
Selecting previously unselected package tree.
(Reading database ... 121753 files and directories currently installed.)
Preparing to unpack .../tree_2.0.2-1_amd64.deb ...
Unpacking tree (2.0.2-1) ...
Setting up tree (2.0.2-1) ...
Processing triggers for man-db (2.10.2-1) ...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.3/465.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m64.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

**Setup**

In [3]:
import keras
from keras import layers
import keras_nlp
from keras_nlp import layers as nlp_layers

**Download the data**

In [4]:
# Path is otherwise only imported in the Colab-only setup cell; importing it
# here keeps this cell working when that cell is skipped (e.g. a local run).
from pathlib import Path

# Download the IMDB review archive into the working directory and unpack it.
keras.utils.get_file(
    origin="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    cache_dir="./",
    extract=True
)

# Root folder of the unpacked dataset, used by the following cells.
imdb_dir = Path("./datasets/aclImdb")

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
[1m84125825/84125825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 0us/step


In [5]:
# Show only the directory layout (-d) of the unpacked dataset.
!tree -d datasets/aclImdb/


[01;34mdatasets/aclImdb/[0m
├── [01;34mtest[0m
│   ├── [01;34mneg[0m
│   └── [01;34mpos[0m
└── [01;34mtrain[0m
    ├── [01;34mneg[0m
    ├── [01;34mpos[0m
    └── [01;34munsup[0m

7 directories


Remove the unsupervised training data; we don't need it here.

In [6]:
!rm -r datasets/aclImdb/train/unsup


In [7]:
# List the directories again to check that train/unsup has been removed.
!tree -d datasets/aclImdb/


[01;34mdatasets/aclImdb/[0m
├── [01;34mtest[0m
│   ├── [01;34mneg[0m
│   └── [01;34mpos[0m
└── [01;34mtrain[0m
    ├── [01;34mneg[0m
    └── [01;34mpos[0m

6 directories


quick look at one review

In [8]:
# Print the raw text of one review from the negative training folder.
!cat datasets/aclImdb/train/neg/21_4.txt

What was with all the Turkish actors? No offense but I thought it was all for nothing for all these actors. The film had no script to test any actors acting skill or ability. It demanded next to nothing I bought this film to see Michael Madsen. He is one of my favorite actors but this film was another failure for him. The script was so bad. Their was just nothing to sink your teeth into and all the characters were two dimensional. Madsen tried to act like a hard ass but the script and direction didn't even allow him to do enough with his character to make it more interesting or 3 dimensional.<br /><br />Even the sound effects of the gunfight at the beginning of the film sounded like the noise of paint ball guns when they are fired in a skirmish. It was really weird and they didn't sound like real guns. A video game had better sound effects than this film. There was also a really annoying bloke at the beginning of the film who was a member of the robbery gang. He had this American whini

prepare validation set

In [9]:
import os, shutil, random

# Move 20% of the training reviews of each class into a separate validation folder.
validation_dir = imdb_dir / "validation"
train_dir = imdb_dir / "train"

# Only split once: re-running the cell would otherwise fail on mkdir() and,
# worse, move another 20% of the remaining training files.
if not validation_dir.exists():
    for category in ("neg", "pos"):
        (validation_dir / category).mkdir(parents=True)
        # os.listdir returns entries in arbitrary order, so sort before the
        # seeded shuffle; otherwise the seed alone does not pin the split.
        files = sorted(os.listdir(train_dir / category))
        random.Random(1234).shuffle(files)  # use seed to ensure same dataset through different runs
        num_validation_samples = int(0.2 * len(files))
        # Slice from an explicit start index: files[-0:] would select every file
        # when the computed sample count is 0.
        validation_files = files[len(files) - num_validation_samples:]
        for file in validation_files:
            shutil.move(train_dir / category / file,
                        validation_dir / category / file)


In [10]:
# List the directories again to check the new validation/ folder and its neg/ and pos/ subfolders.
!tree -d datasets/aclImdb/

[01;34mdatasets/aclImdb/[0m
├── [01;34mtest[0m
│   ├── [01;34mneg[0m
│   └── [01;34mpos[0m
├── [01;34mtrain[0m
│   ├── [01;34mneg[0m
│   └── [01;34mpos[0m
└── [01;34mvalidation[0m
    ├── [01;34mneg[0m
    └── [01;34mpos[0m

9 directories


In [11]:
batch_size = 32

# Build one batched text dataset per split, in the order train, validation, test.
# Labels: 0 for negative, 1 for positive.
train_dataset, validation_dataset, test_dataset = (
    keras.utils.text_dataset_from_directory(split_directory, batch_size=batch_size)
    for split_directory in (
        "datasets/aclImdb/train",
        "datasets/aclImdb/validation",
        "datasets/aclImdb/test",
    )
)

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


take a look at the batch data

In [12]:
# Inspect the first batch only: shapes, dtypes and the first sample/label pair.
for inputs, targets in train_dataset.take(1):
    print("inputs.shape: ", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])


inputs.shape:  (32,)
inputs.dtype: <dtype: 'string'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor(b'This episode of Twilight Zone combines a silent section (1890) with melodramatic acting and sight gags, an homage to the early Buster Keaton films. Lots of slapstick: Buster falling on a bulkhead door, falling in a puddle, running around pants-less. Silly scientist\'s invention of a Time Helmet, reminiscent of a Flash Gordon idea of what the future would be. Cheap prices, like $1.95 for ladies hats, or 17 cents a pound for beef seem outrageously high to Buster. Even the world of 1890 is too much for Buster/Mulligan. How shocking when he is mistakenly transported to the "modern" world of 1960! Buster was trying to go backwards! The "scientist" of that time wants to return to a calmer world, the 1890 that he has studied and admired. They go back together, and Buster/Mulligan is now happy and the "scientist" regrets not having electronic equipment, modern beds o

train a text vectorization layer with unlabeled data

In [13]:
max_length = 600
max_tokens = 20_000


def _drop_labels(texts, labels):
    """Keep only the review texts; the vectorizer is fitted without labels."""
    return texts


text_only_train_dataset = train_dataset.map(_drop_labels)

# Integer-encoding vectorizer: vocabulary capped at max_tokens, every
# output sequence fixed to max_length entries.
text_vectorization = layers.TextVectorization(
    output_mode="int",
    max_tokens=max_tokens,
    output_sequence_length=max_length,
)

# Fit the vocabulary on the training texts.
text_vectorization.adapt(text_only_train_dataset)

prepare integer sequence datasets

In [14]:
def _vectorize_batch(texts, labels):
    """Apply the fitted text vectorizer to the texts; labels pass through unchanged."""
    return text_vectorization(texts), labels


# Apply the same vectorization to every split, in the order train, validation, test.
int_train_dataset, int_validation_dataset, int_test_dataset = (
    split_dataset.map(_vectorize_batch, num_parallel_calls=4)
    for split_dataset in (train_dataset, validation_dataset, test_dataset)
)

use a TransformerEncoder-based model for text classification

In [19]:
# Reuse the text-vectorization settings instead of repeating the numbers, so the
# embedding table and the position embeddings cannot drift out of sync with
# what the vectorizer produces.
vocabulary_size = max_tokens    # vocabulary cap used by the TextVectorization layer
sequence_length = max_length    # fixed output length of the TextVectorization layer
embed_dimension = 32

num_heads = 2
dense_layer_dimension = 32

# Batches of integer token ids.
inputs = keras.Input(shape=(None,), dtype="int64")

# Token embedding plus learned position embedding.
x = nlp_layers.TokenAndPositionEmbedding(vocabulary_size=vocabulary_size,
                                         sequence_length=sequence_length,
                                         embedding_dim=embed_dimension
                                         )(inputs)

# Single transformer encoder block.
x = nlp_layers.TransformerEncoder(intermediate_dim=dense_layer_dimension,
                                  num_heads=num_heads
                                  )(x)

# Pool over the sequence axis, then a small dense classification head.
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
# One sigmoid unit: score for the positive class (labels are 0/1).
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()

train the transformer encoder model

In [20]:
# Checkpoint callback: with save_best_only=True only the best model seen so far
# is kept at the given path.
best_model_checkpoint = keras.callbacks.ModelCheckpoint(
    filepath="trained_models/transformer_encoder.keras",
    save_best_only=True,
)
callbacks = [best_model_checkpoint]

# Train for 6 epochs, validating on the held-out validation split after each one.
model.fit(
    int_train_dataset,
    epochs=6,
    validation_data=int_validation_dataset,
    callbacks=callbacks,
)


Epoch 1/6
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 91ms/step - accuracy: 0.5888 - loss: 0.6653 - val_accuracy: 0.8018 - val_loss: 0.4367
Epoch 2/6
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 84ms/step - accuracy: 0.8002 - loss: 0.4437 - val_accuracy: 0.8236 - val_loss: 0.3873
Epoch 3/6
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 84ms/step - accuracy: 0.8355 - loss: 0.3798 - val_accuracy: 0.8338 - val_loss: 0.3732
Epoch 4/6
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 84ms/step - accuracy: 0.8604 - loss: 0.3225 - val_accuracy: 0.8416 - val_loss: 0.3662
Epoch 5/6
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 85ms/step - accuracy: 0.8862 - loss: 0.2788 - val_accuracy: 0.8340 - val_loss: 0.3984
Epoch 6/6
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 83ms/step - accuracy: 0.9055 - loss: 0.2361 - val_accuracy: 0.8492 - val_loss: 0.3719


<keras.src.callbacks.history.History at 0x791b3708b5e0>

evaluate the transformer encoder model

In [21]:
# Reload the checkpoint written during training and score it on the test split.
model = keras.models.load_model("trained_models/transformer_encoder.keras")

# evaluate() returns [loss, accuracy] for the single compiled metric.
test_loss, test_accuracy = model.evaluate(int_test_dataset)
print(f"Test accuracy: {test_accuracy:.3f}")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.8414 - loss: 0.3647
Test accuracy: 0.839


## ***The following is for comparison***

Try to closely follow an example on the Keras website:  
https://keras.io/examples/nlp/text_classification_with_transformer/

In [22]:
vocab_size = 20000  # Only consider the top 20k words
maxlen = 200  # Only consider the first 200 words of each movie review

# Pre-tokenized IMDB reviews; the second tuple is used as validation data below.
(x_train, y_train), (x_val, y_val) = keras.datasets.imdb.load_data(num_words=vocab_size)
print(f"{len(x_train)} Training sequences")
print(f"{len(x_val)} Validation sequences")

# Pad both splits to the same fixed length.
x_train, x_val = (
    keras.utils.pad_sequences(sequences, maxlen=maxlen)
    for sequences in (x_train, x_val)
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step
25000 Training sequences
25000 Validation sequences


In [23]:
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

inputs = layers.Input(shape=(maxlen,))

# Embedding and encoder layers, spelled out with keyword arguments.
embedding_layer = nlp_layers.TokenAndPositionEmbedding(
    vocabulary_size=vocab_size,
    sequence_length=maxlen,
    embedding_dim=embed_dim,
)
x = embedding_layer(inputs)
encoder = nlp_layers.TransformerEncoder(
    intermediate_dim=ff_dim,
    num_heads=num_heads,
)
x = encoder(x)

# Classification head: average over the sequence, then a small dense stack.
for head_layer in (
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.1),
    layers.Dense(20, activation="relu"),
    layers.Dropout(0.1),
):
    x = head_layer(x)

# Two softmax units: one probability per class.
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()

In [None]:
# TODO: run the example model on my dataset to see whether the size of the training set makes the difference

In [24]:
# Integer class labels with a 2-unit softmax output -> sparse categorical cross-entropy.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Checkpoint callback: with save_best_only=True only the best model seen so far
# is kept at the given path.
best_model_checkpoint = keras.callbacks.ModelCheckpoint(
    filepath="trained_models/new_transformer_encoder.keras",
    save_best_only=True,
)
callbacks = [best_model_checkpoint]

history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=2,
    validation_data=(x_val, y_val),
    callbacks=callbacks,
)


Epoch 1/2
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 24ms/step - accuracy: 0.7158 - loss: 0.5174 - val_accuracy: 0.8810 - val_loss: 0.2805
Epoch 2/2
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 8ms/step - accuracy: 0.9245 - loss: 0.1998 - val_accuracy: 0.8712 - val_loss: 0.3056
