In [1]:
from tensorflow.keras.layers import TextVectorization
# "int" output mode: every input text comes back as a sequence of integer
# vocabulary indices (one index per token).
text_vectorization = TextVectorization(output_mode="int")

In [2]:
# Toy corpus: three short lines are enough to build a small vocabulary.
dataset = ["I write, erase, rewrite", "Erase again, and then", "A poppy blooms."]

# adapt() scans the corpus and builds the layer's token -> index vocabulary.
text_vectorization.adapt(dataset)

In [4]:
# List of tokens learned by adapt(); a token's position in the list is the
# integer the layer emits for it. As the output below shows, index 0 is ''
# (presumably the padding/mask slot -- confirm in the TextVectorization docs)
# and index 1 is '[UNK]'.
vocabulary = text_vectorization.get_vocabulary()
# Bare last expression so the notebook displays the list.
vocabulary

['',
 '[UNK]',
 'erase',
 'write',
 'then',
 'rewrite',
 'poppy',
 'i',
 'blooms',
 'and',
 'again',
 'a']

In [5]:
# Round trip: encode a sentence to vocabulary indices, then map the indices
# back to tokens to see what the layer kept.
test_sentence = "I write, rewrite, and still rewrite again"
encoded_sentence = text_vectorization(test_sentence)
print(encoded_sentence)

# index -> token lookup table built from the vocabulary list
inverse_vocab = {index: token for index, token in enumerate(vocabulary)}
decoded_sentence = " ".join(
    inverse_vocab[int(token_id)] for token_id in encoded_sentence
)
print(decoded_sentence)

tf.Tensor([ 7  3  5  9  1  5 10], shape=(7,), dtype=int64)
i write rewrite and [UNK] rewrite again


## Now use a bag-of-words model on the IMDB dataset

In [6]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  0 80.2M    0  224k    0     0  87914      0  0:15:56  0:00:02  0:15:54 87984
  0 80.2M    0  576k    0     0   162k      0  0:08:25  0:00:03  0:08:22  162k
  1 80.2M    1 1024k    0     0   225k      0  0:06:04  0:00:04  0:06:00  225k
  1 80.2M    1 1456k    0     0   262k      0  0:05:12  0:00:05  0:05:07  314k
  2 80.2M    2 1920k    0     0   293k      0  0:04:40  0:00:06  0:04:34  384k
  2 80.2M    2 2352k    0     0   311k      0  0:04:23  0:00:07  0:04:16  431k
  3 80.2M    3 2864k    0     0   334k      0  0:04:05  0:00:08  0:03:57  457k
  4 80.2M    4 3440k    0     0   360k      0  0:03

In [7]:
import os, shutil, random
from pathlib import Path

# Carve a validation set out of the training data by moving 20% of the files of
# each class from aclImdb/train/<class> to aclImdb/val/<class>.
base_dir = Path('aclImdb')
val_dir = base_dir / "val"
train_dir = base_dir / "train"

for category in ("neg", "pos"):
    category_val_dir = val_dir / category
    if category_val_dir.exists():
        # The split for this class was already made by an earlier run. Skipping
        # keeps the cell re-runnable: no FileExistsError from makedirs, and no
        # second 20% slice silently moved out of the training set.
        continue

    # os.listdir() returns names in arbitrary, platform-dependent order; sort
    # first so the seeded shuffle picks the same files on every machine.
    files = sorted(os.listdir(train_dir / category))
    random.Random(1337).shuffle(files)

    num_val_samples = int(0.2 * len(files))
    # Slice from an explicit start index: files[-num_val_samples:] would select
    # *every* file when num_val_samples is 0 (files[-0:] is the whole list).
    val_files = files[len(files) - num_val_samples:]

    os.makedirs(category_val_dir)
    for fname in val_files:
        shutil.move(train_dir / category / fname, category_val_dir / fname)

In [8]:
from tensorflow import keras
batch_size = 32


def _load_split(split):
    """Load the text files under aclImdb/<split> as a batched dataset."""
    return keras.utils.text_dataset_from_directory(
        f"aclImdb/{split}", batch_size=batch_size
    )


# One dataset per split; each call reports how many files/classes it found.
train_ds = _load_split("train")
val_ds = _load_split("val")
test_ds = _load_split("test")

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [10]:
# Peek at a single batch to see the shapes/dtypes the raw-text dataset yields.
for inputs, targets in train_ds.take(1):
    print(
        f"{inputs.shape=}",
        f"{inputs.dtype=}",
        f"{targets.shape=}",
        f"{targets.dtype=}",
        f"{inputs[0]=}",
        f"{targets[0]=}",
        sep="\n",
    )

inputs.shape=TensorShape([32])
inputs.dtype=tf.string
targets.shape=TensorShape([32])
targets.dtype=tf.int32
inputs[0]=<tf.Tensor: shape=(), dtype=string, numpy=b'A good idea, badly implemented. While that could summarize 99% of the SciFi channel\'s movies, it really applies here. I love movies where a good back story is slowly revealed, and I like action movies, and I like all of the main actors, so this could have been great. However, despite some good acting, this movie fails due to Bill Platt\'s bad writing and directing.<br /><br />Another review made the good point of needing to know where you\'re going so you can get there. This movie doesn\'t. It\'s put together in such a haphazard way that you know the words "second draft" are not in Bill Platt\'s vocabulary. There is one scene that is entirely unnecessary and could be removed without anyone noticing. This scene even begins and ends with them driving a car, so you could cut from one car scene to the other and never have missed

Now let's encode the text with a bag-of-words model: each review becomes a binary (multi-hot) vector whose length equals the vocabulary size, with a 1 at every index whose word appears in the review and a 0 everywhere else.

In [12]:
# Multi-hot bag-of-words encoder with the vocabulary capped at 2000 entries.
text_vectorization = TextVectorization(max_tokens=2000, output_mode="multi_hot")


def _drop_labels(texts, labels):
    """Keep only the raw text of a (text, label) batch."""
    return texts


def _to_multi_hot(texts, labels):
    """Replace the raw text of a batch with its multi-hot encoding."""
    return text_vectorization(texts), labels


# The vocabulary is learned from the training texts only.
text_only_train_ds = train_ds.map(_drop_labels)
text_vectorization.adapt(text_only_train_ds)

# Apply the same fitted encoder to every split; labels pass through unchanged.
binary_1gram_train_ds = train_ds.map(_to_multi_hot, num_parallel_calls=4)
binary_1gram_val_ds = val_ds.map(_to_multi_hot, num_parallel_calls=4)
binary_1gram_test_ds = test_ds.map(_to_multi_hot, num_parallel_calls=4)


In [13]:
# Peek at a single encoded batch to confirm the shape/dtype after vectorization.
for inputs, targets in binary_1gram_train_ds.take(1):
    print(
        f"{inputs.shape=}",
        f"{inputs.dtype=}",
        f"{targets.shape=}",
        f"{targets.dtype=}",
        f"{inputs[0]=}",
        f"{targets[0]=}",
        sep="\n",
    )

inputs.shape=TensorShape([32, 2000])
inputs.dtype=tf.int64
targets.shape=TensorShape([32])
targets.dtype=tf.int32
inputs[0]=<tf.Tensor: shape=(2000,), dtype=int64, numpy=array([1, 1, 1, ..., 0, 0, 0], dtype=int64)>
targets[0]=<tf.Tensor: shape=(), dtype=int32, numpy=0>
