In [2]:
from pathlib import Path
from tensorflow import keras

# Where the IMDb sentiment archive lives and what the archive file is called.
DOWNLOAD_ROOT = "http://ai.stanford.edu/~amaas/data/sentiment/"
FILENAME = "aclImdb_v1.tar.gz"

# Download the archive (extract=True also unpacks it) and keep the local path
# of the downloaded file for the next cell.
filepath = keras.utils.get_file(
    fname=FILENAME,
    origin=f"{DOWNLOAD_ROOT}{FILENAME}",
    extract=True,
)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [6]:
# Root of the unpacked dataset: the "aclImdb" directory that sits next to the
# downloaded archive.
path = Path(filepath).parent.joinpath("aclImdb")

In [11]:
import os

# Print the dataset's directory tree, showing at most three files (in sorted
# order) per directory followed by "..." when there are more.
for dirname, _subdirs, filenames in os.walk(path):
    # Nesting depth relative to the dataset root drives the indentation.
    depth = len(Path(dirname).parts) - len(path.parts)
    print("    " * depth + Path(dirname).parts[-1] + os.sep)

    child_prefix = "    " * (depth + 1)
    ordered = sorted(filenames)
    for shown in ordered[:3]:
        print(child_prefix + shown)
    if len(ordered) > 3:
        print(child_prefix + "...")

aclImdb/
    README
    imdb.vocab
    imdbEr.txt
    test/
        labeledBow.feat
        urls_neg.txt
        urls_pos.txt
        neg/
            0_2.txt
            10000_4.txt
            10001_1.txt
            ...
        pos/
            0_10.txt
            10000_7.txt
            10001_9.txt
            ...
    train/
        labeledBow.feat
        unsupBow.feat
        urls_neg.txt
        ...
        neg/
            0_3.txt
            10000_4.txt
            10001_4.txt
            ...
        unsup/
            0_0.txt
            10000_0.txt
            10001_0.txt
            ...
        pos/
            0_9.txt
            10000_8.txt
            10001_10.txt
            ...


In [12]:
def review_patch(dirpath):
    """Return the paths of all ``*.txt`` files directly inside ``dirpath``.

    Args:
        dirpath: a ``pathlib.Path`` pointing at a directory of review files.

    Returns:
        A list of path strings, in whatever order ``Path.glob`` yields them.
    """
    return list(map(str, dirpath.glob("*.txt")))

# Collect the review file paths for each (split, sentiment) directory.
# The official "test" folder is later divided into test and validation sets,
# hence the "test_val_" prefix.
train_dir = path / "train"
test_dir = path / "test"

train_pos, train_neg = (review_patch(train_dir / label) for label in ("pos", "neg"))
test_val_pos, test_val_neg = (review_patch(test_dir / label) for label in ("pos", "neg"))

In [14]:
# Number of reviews in train/pos, train/neg, test/pos and test/neg.
# (The last entry previously repeated test_val_pos, so the negative test
# reviews were never actually counted.)
len(train_pos), len(train_neg), len(test_val_pos), len(test_val_neg)

(12500, 12500, 12500, 12500)

In [15]:
import numpy as np
# Split the official test reviews into a test set (first N_TEST_PER_CLASS
# reviews of each class) and a validation set (the remainder).
SPLIT_SEED = 42
N_TEST_PER_CLASS = 5000

# Sort before shuffling: the lists come from a glob whose order depends on the
# filesystem, and the shuffle is in place, so without the sort a re-run of this
# cell (or a run on another machine) would start from a different order and
# the seed alone would not make the split reproducible.
rng = np.random.default_rng(SPLIT_SEED)
for review_paths in (test_val_neg, test_val_pos):
    review_paths.sort()
    rng.shuffle(review_paths)

test_pos = test_val_pos[:N_TEST_PER_CLASS]
test_neg = test_val_neg[:N_TEST_PER_CLASS]
val_pos = test_val_pos[N_TEST_PER_CLASS:]
val_neg = test_val_neg[N_TEST_PER_CLASS:]

In [16]:
import tensorflow as tf

def load_data(pos, neg):
    """Read review files into memory and build the matching label list.

    Args:
        pos: iterable of paths to positive reviews (labelled 1).
        neg: iterable of paths to negative reviews (labelled 0).

    Returns:
        A ``(data, labels)`` tuple of two equally long lists: the text of each
        review (all positives first, then all negatives) and its 0/1 label.
    """
    data = []
    labels = []

    for filepaths, label in ((pos, 1), (neg, 0)):
        for filepath in filepaths:
            # Decode explicitly as UTF-8: relying on the platform's default
            # encoding makes the result machine-dependent and can raise
            # UnicodeDecodeError where the default is not UTF-8.
            with open(filepath, "r", encoding="utf-8") as f:
                data.append(f.read())
            labels.append(label)

    return data, labels
def create_dataset(data, labels):
    """Wrap parallel ``data`` / ``labels`` sequences in a ``tf.data.Dataset``.

    Each element of the returned dataset is a ``(review, label)`` pair taken
    from the same position of the two inputs.
    """
    reviews = tf.constant(data)
    targets = tf.constant(labels)
    return tf.data.Dataset.from_tensor_slices((reviews, targets))

In [24]:
# Build one (review, label) dataset per split, in the order train, val, test.
training_set, val_set, test_set = (
    create_dataset(*load_data(pos_paths, neg_paths))
    for pos_paths, neg_paths in (
        (train_pos, train_neg),
        (val_pos, val_neg),
        (test_pos, test_neg),
    )
)

In [23]:
# Peek at the first three (review, label) pairs, separated by a blank line.
for review, label in training_set.take(3):
    print(review, label, "", sep="\n")

tf.Tensor(b'For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer. The Moroni character is an absolute scream. Watch for Alan "The Skipper" Hale jr. as a police Sgt.', shape=(), dtype=string)
tf.Tensor(1, shape=(), dtype=int32)

tf.Tensor(b'Bizarre horror movie filled with famous faces but stolen by Cristina Raines (later of TV\'s "Flamingo Road") as a pretty but somewhat unstable model with a gummy smile who is slated to pay for her attempted suicides by guarding the Gateway to Hell! The scenes with Raines modeling are very well captured, the mood music is perfect, Deborah Raffin is charming as Cristina\'s pal, but when Raines moves into a creepy Brooklyn Heights brownstone (inhabited by a blind priest on the top floor), things really start cooking. The neighbors, including a fantastically wicked Burgess Meredith and kinky couple Sylvia Miles & Beverly D\'

In [25]:
BATCH_SIZE = 64

# Only the training data is shuffled. The buffer of 25000 matches the 12500
# positive plus 12500 negative training reviews counted earlier in the notebook.
# NOTE: this cell rebinds the dataset names, so run it only once per kernel --
# a second run would batch the already-batched datasets.
training_set = (
    training_set
    .shuffle(25000)
    .batch(BATCH_SIZE)
    .prefetch(1)
)
val_set = val_set.batch(BATCH_SIZE).prefetch(1)
test_set = test_set.batch(BATCH_SIZE).prefetch(1)

In [48]:
def preprocess(X_batch, n_words=100, n_characters=500):
    """Turn a batch of raw review strings into a fixed-width token matrix.

    Each string is cut to its first ``n_characters``, lower-cased, has
    ``<br>``-style tags and every character outside ``a-z`` replaced by a
    space, and is then split on whitespace. Rows are cropped or padded with
    ``b"<>"`` to exactly ``n_words`` tokens.

    Args:
        X_batch: string tensor; the shape arithmetic below assumes it is 1-D
            (one review per entry).
        n_words: number of token columns in the result.
        n_characters: how much of each review to keep before tokenizing.

    Returns:
        A string tensor of shape ``(len(X_batch), n_words)``.
    """
    # [batch] * [1, 0] + [0, n_words]  ->  [batch, n_words]
    target_shape = tf.shape(X_batch) * tf.constant([1, 0]) + tf.constant([0, n_words])

    truncated = tf.strings.substr(X_batch, 0, n_characters)
    lowered = tf.strings.lower(truncated)
    without_breaks = tf.strings.regex_replace(lowered, b"<br\\s*/?>", b" ")
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-z]", b" ")
    tokens = tf.strings.split(letters_only)  # ragged: one row of words per review
    return tokens.to_tensor(shape=target_shape, default_value=b"<>")

In [49]:
# Two tiny reviews used to sanity-check the text-processing helpers.
example = tf.constant(
    [
        "It's a great, great movie! I loved it.",
        "It was terrible, run away!",
    ]
)
preprocess(X_batch=example, n_words=5)

<tf.Tensor: shape=(2, 5), dtype=string, numpy=
array([[b'it', b's', b'a', b'great', b'great'],
       [b'it', b'was', b'terrible', b'run', b'away']], dtype=object)>

In [51]:
from collections import Counter

def get_vocab(data_sample, max_size=2000):
    """Build a vocabulary from the most frequent tokens in ``data_sample``.

    The sample is tokenized with ``preprocess`` using its default limits, and
    the padding token ``b"<>"`` is left out of the counts.

    Args:
        data_sample: batch of raw review strings accepted by ``preprocess``.
        max_size: maximum number of real words to keep.

    Returns:
        A list starting with ``b"<>"`` followed by up to ``max_size`` words,
        most frequent first.
    """
    padding = b"<>"
    token_rows = preprocess(data_sample).numpy()
    counter = Counter(
        token
        for row in token_rows
        for token in row
        if token != padding
    )
    return [padding] + [token for token, _ in counter.most_common(max_size)]

# Sanity check on the two-sentence example batch.
get_vocab(data_sample=example)
    

[b'<>',
 b'it',
 b'great',
 b's',
 b'a',
 b'movie',
 b'i',
 b'loved',
 b'was',
 b'terrible',
 b'run',
 b'away']

In [63]:
class TextVectorization(keras.layers.Layer):
    """Layer that maps raw review strings to fixed-length rows of word ids.

    The layer must be fitted with ``adapt()`` before it is used. Words are
    tokenized with ``preprocess`` and looked up in a
    ``tf.lookup.StaticVocabularyTable`` built from the vocabulary returned by
    ``get_vocab``: the padding token ``b"<>"`` is first in that vocabulary and
    therefore gets id 0, known words get the following ids, and words outside
    the vocabulary are assigned to one of ``n_oov_buckets`` extra ids.

    Args:
        max_vocabulary_size: maximum number of real words kept by ``adapt()``.
        n_oov_buckets: number of out-of-vocabulary buckets in the lookup table.
        dtype: forwarded to ``keras.layers.Layer`` (inputs are strings).
        **kwargs: forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, max_vocabulary_size=2000, n_oov_buckets=200, dtype=tf.string, **kwargs):
        super().__init__(dtype=dtype, **kwargs)
        self.max_vocabulary_size = max_vocabulary_size
        self.n_oov_buckets = n_oov_buckets

    def adapt(self, data_sample):
        """Build the vocabulary and lookup table from ``data_sample``.

        Args:
            data_sample: batch of raw review strings accepted by ``get_vocab``.
        """
        self.vocab = get_vocab(data_sample, self.max_vocabulary_size)
        words = tf.constant(self.vocab)
        # Ids follow vocabulary order, so the padding token at index 0 gets id 0.
        word_ids = tf.range(len(self.vocab), dtype=tf.int64)
        vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
        self.table = tf.lookup.StaticVocabularyTable(vocab_init, self.n_oov_buckets)

    def call(self, inputs):
        """Tokenize ``inputs`` and return the int64 id of every token.

        Raises:
            RuntimeError: if ``adapt()`` has not been called yet.
        """
        # Fail with an actionable message instead of an AttributeError on
        # ``self.table`` when the layer is used before being adapted.
        if getattr(self, "table", None) is None:
            raise RuntimeError(
                "TextVectorization.adapt() must be called before the layer is used."
            )
        preprocessed_inputs = preprocess(inputs)
        return self.table.lookup(preprocessed_inputs)
        


In [64]:
# Fit the layer on the toy batch, then encode that same batch to check the ids.
text_vectorization = TextVectorization()
text_vectorization.adapt(data_sample=example)
text_vectorization(example)

<tf.Tensor: shape=(2, 100), dtype=int64, numpy=
array([[ 1,  3,  4,  2,  2,  5,  6,  7,  1,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0],
       [ 1,  8,  9, 10, 11,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0]])>

In [67]:
max_vocab_size = 2000
n_oov_buckets = 200

text_vectorization = TextVectorization(
    max_vocabulary_size=max_vocab_size,
    n_oov_buckets=n_oov_buckets,
)

# Drop the labels and stitch the review batches back into one array so the
# vocabulary is computed over every training review.
training_reviews_batches = training_set.map(lambda review, label: review)
training_reviews = np.concatenate(
    [batch for batch in training_reviews_batches.as_numpy_iterator()],
    axis=0,
)

text_vectorization.adapt(training_reviews)


In [68]:
class BagOfWords(keras.layers.Layer):
    """Layer that turns rows of token ids into per-token occurrence counts.

    Args:
        n_tokens: total number of distinct token ids (the one-hot depth).
        dtype: forwarded to ``keras.layers.Layer``.
        **kwargs: forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, n_tokens, dtype=tf.int32, **kwargs):
        super().__init__(dtype=dtype, **kwargs)
        self.n_tokens = n_tokens

    def call(self, inputs):
        """Count how often each id occurs along axis 1, leaving out id 0."""
        counts = tf.reduce_sum(tf.one_hot(inputs, self.n_tokens), axis=1)
        # Column 0 holds the count of the <> padding token; drop it.
        return counts[:, 1:]

In [69]:
# Id space: one id for the <> padding token, the vocabulary words, and the
# out-of-vocabulary buckets.
n_tokens = 1 + max_vocab_size + n_oov_buckets
bag_of_words = BagOfWords(n_tokens=n_tokens)

In [70]:
# Bag-of-words classifier: raw strings -> token ids -> word counts -> small
# dense network with a sigmoid output for the binary sentiment label.
bow_layers = [
    text_vectorization,
    bag_of_words,
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(30, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid"),
]
model = keras.models.Sequential(bow_layers)

model.compile(
    optimizer="nadam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [72]:
# Train the bag-of-words model, evaluating on the validation set each epoch.
model.fit(
    training_set,
    validation_data=val_set,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x29fb35430>

In [74]:
def compute_mean_embedding(inputs):
    """Sum each review's word vectors and divide by sqrt(number of words).

    A position counts as a word when its vector has at least one non-zero
    component, so all-zero vectors are treated as padding.

    NOTE(review): this relies on padding positions really being all-zero
    vectors in ``inputs`` -- confirm that the layer feeding this function
    guarantees that.

    Args:
        inputs: float tensor indexed as (review, word position, embedding
            dimension).

    Returns:
        A tensor with one row per review: the sum of its word vectors divided
        by the square root of its word count. A review with no words yields a
        zero vector.
    """
    not_pad = tf.math.count_nonzero(inputs, axis=-1)
    n_words = tf.math.count_nonzero(not_pad, axis=-1, keepdims=True)
    sqrt_n_words = tf.math.sqrt(tf.cast(n_words, tf.float32))
    # n_words is an integer count, so its square root is either 0 or >= 1.
    # Clamping to 1 changes nothing for non-empty reviews and turns the
    # 0 / 0 = NaN of an empty review into a zero vector.
    safe_sqrt_n_words = tf.maximum(sqrt_n_words, 1.0)
    return tf.reduce_sum(inputs, axis=1) / safe_sqrt_n_words

embedding_size = 20

# Embedding classifier: raw strings -> token ids -> word embeddings ->
# one pooled vector per review -> small dense network.
embedding_layers = [
    text_vectorization,
    keras.layers.Embedding(
        input_dim=n_tokens,
        output_dim=embedding_size,
        mask_zero=True,
    ),
    keras.layers.Lambda(compute_mean_embedding),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(30, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid"),
]
model = keras.models.Sequential(embedding_layers)

model.compile(
    optimizer="nadam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.fit(
    training_set,
    validation_data=val_set,
    epochs=5,
)

Epoch 1/5


2022-04-10 17:03:58.360669: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-04-10 17:04:09.696558: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x177864940>