In [2]:
import tensorflow as tf
from tensorflow.keras import models, layers
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import numpy as np
import random

In [4]:
# Fetch the plain-text IMDB reviews as (text, label) pairs. The "train" split
# is cut 80/20 into training and validation subsets; "test" is kept whole.
dataset, info = tfds.load(
    "imdb_reviews/plain_text",
    with_info=True,
    as_supervised=True,
    split=["train[:80%]", "train[80%:]", "test"],
)

# One dataset per requested split, in the order the splits were requested.
(
    dataset_train_original,
    dataset_validate_original,
    dataset_test_original,
) = dataset

# Keep `info` as the final expression so the dataset metadata is displayed.
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset.
    This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_path='/home/sam/tensorflow_datasets/imdb_reviews/plain_text/1.0.0',
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_shards=1>,
        'train': <SplitInfo

In [11]:
# Report how many examples each split holds.
# The third label previously read "tetst:"; it is spelled "test:" here.
print("train:", len(dataset_train_original))
print("valid:", len(dataset_validate_original))
print("test:", len(dataset_test_original))


train: 20000
valid: 5000
tetst: 25000


In [None]:
# Training pipeline: cache the examples, shuffle them, then emit batches of 128.
# The 20_000-element shuffle buffer matches the 20000 training examples
# reported when the split sizes were printed.
dataset_train = (
    dataset_train_original
    .cache()
    .shuffle(20_000)
    .batch(128)
)

# Validation pipeline: same caching and batch size, but no shuffling.
dataset_validate = dataset_validate_original.cache().batch(128)

In [None]:
# Tokenizer settings: vocabulary cap and fixed length of every token sequence.
vocabulary_size = 10_000
sequence_length = 128

# Turns raw review strings into fixed-length sequences of integer token ids.
encoder = layers.TextVectorization(
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    max_tokens=vocabulary_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Build the vocabulary from the training texts only; the labels are dropped
# before the texts are fed to adapt() in batches of 512.
encoder.adapt(
    dataset_train_original
    .map(lambda review, _label: review)
    .batch(512)
)

In [None]:
def render_history(history):
    """Plot the per-epoch curves stored in a training history.

    One figure is drawn per metric ("loss" first, then "accuracy"). Each
    figure shows the training series and, when it was recorded, the matching
    ``val_``-prefixed validation series.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value sequences (for example the value returned by
            ``model.fit``).

    Returns:
        None. Every figure is shown and then closed.
    """
    for metric in ("loss", "accuracy"):
        # A run without validation data (or without this metric) has no such
        # key; skip the missing series instead of raising KeyError.
        series_names = [
            name for name in (metric, f"val_{metric}") if name in history.history
        ]
        if not series_names:
            continue

        for name in series_names:
            plt.plot(history.history[name], label=name)
        plt.title(metric)
        plt.xlabel("epoch")
        plt.ylabel(metric)
        plt.legend()
        plt.show()
        # Close every figure (the second one used to be left open) so that no
        # figure lingers into whatever is plotted next.
        plt.close()

In [None]:
# Functional API. f(g(h(...)))
# Single-head self-attention sentiment classifier:
# raw string -> token ids -> embeddings -> attention -> dense -> sigmoid.

embedding_size = 32

model_input = layers.Input(shape=[], dtype=tf.string)
encoded = encoder(model_input)
embedded = layers.Embedding(vocabulary_size, embedding_size)(encoded)

# Three independent linear projections of the same embeddings (self-attention).
queries = layers.Dense(32, activation="linear")(embedded)
keys    = layers.Dense(32, activation="linear")(embedded)
values  = layers.Dense(32, activation="linear")(embedded)

attention_layer = layers.Attention()

# layers.Attention takes its inputs in the order [query, value, key] -- the
# value comes BEFORE the key. Passing [queries, keys, values] would silently
# use the `keys` projection as the value and the `values` projection as the key.
attention, attention_score = attention_layer(
    [queries, values, keys],
    return_attention_scores=True
)

linear = layers.Dense(32, activation="linear")(attention)
flattened = layers.Flatten()(linear)
model_output = layers.Dense(1, activation="sigmoid")(flattened)


# Full classifier, plus two probe models that share its layers and expose the
# attention output and the attention scores. Each is smoke-tested on one
# sentence and its output shape is printed.
model = models.Model(model_input, model_output)
y = model.predict(["this arbitrary movie sa arbitrarily good"])
print(y.shape)

attention_model = models.Model(model_input, attention)
y = attention_model.predict(["this arbitrary movie sa arbitrarily good"])
print(y.shape)

attention_score_model = models.Model(model_input, attention_score)
y = attention_score_model.predict(["this arbitrary movie sa arbitrarily good"])
print(y.shape)

In [None]:
# Probe the classifier with a clearly negative review; the returned score is
# this cell's displayed output.
model.predict(x=["wost movie ever made. so bad"])

In [None]:
# Binary-classification setup: Adam optimizer, cross-entropy loss, accuracy tracked.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Train for 10 epochs with the validation split passed as validation_data;
# the returned History object is kept for plotting.
history = model.fit(
    x=dataset_train,
    validation_data=dataset_validate,
    epochs=10,
)



In [None]:
# Plot the loss and accuracy curves recorded while fitting the model.
render_history(history=history)

In [None]:
# Probe model that exposes the embedding layer's output for a raw string input.
embedding_model = models.Model(model_input, embedded)

# Named `sample_review` rather than `input` so the built-in input() is not
# shadowed for the rest of the kernel session.
sample_review = ["this is a great movie. i really like the vibes that it emits. except for the actor xy, who did nor really study shakespearean theatre. a great move for a lovely and solemnd night at home. while having a nice cup of tea in the snow."]
output = model.predict(sample_review)
print(output)

# Embeddings, transposed so token positions run along the x axis.
output_embedding = embedding_model.predict(sample_review)
plt.imshow(output_embedding[0].transpose(), cmap="inferno")
plt.title("Token embeddings")
plt.xlabel("token position")
plt.ylabel("embedding dimension")
plt.show()
plt.close()


# Attention output, transposed the same way as the embeddings.
output_attention = attention_model.predict(sample_review)
plt.imshow(output_attention[0].transpose(), cmap="inferno")
plt.title("Attention output")
plt.xlabel("token position")
plt.ylabel("feature dimension")
plt.show()
plt.close()

# Attention scores: one row per query position, one column per key position.
output_attention_score = attention_score_model.predict(sample_review)
plt.imshow(output_attention_score[0], cmap="inferno")
plt.title("Attention scores")
plt.xlabel("key token position")
plt.ylabel("query token position")
plt.show()
plt.close()