In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
from pathlib import Path

# Fetch the IMDb archive (cached under the current directory) and point
# `path` at the "aclImdb" folder sitting next to the downloaded file.
root = "https://ai.stanford.edu/~amaas/data/sentiment/"
filename = "aclImdb_v1.tar.gz"
filepath = tf.keras.utils.get_file(
    fname=filename,
    origin=f"{root}{filename}",
    extract=True,
    cache_dir=".",
)
path = Path(filepath).with_name("aclImdb")
path

WindowsPath('datasets/aclImdb')

In [3]:
def tree(path, level=0, indent=4, max_files=3):
    """Print a directory tree rooted at `path`.

    Sub-directories are listed first (and recursed into), followed by at
    most `max_files` regular entries; a "..." line marks truncated listings.

    Args:
        path: a `pathlib.Path` pointing to the directory to display.
        level: current nesting depth; callers normally leave it at 0, in
            which case the root path itself is printed first.
        indent: number of spaces per nesting level.
        max_files: maximum number of non-directory entries printed per
            directory (applied at every depth).
    """
    if level == 0:
        print(f"{path}/")
        level += 1
    # Partition the sorted entries in one pass (one is_dir() call per entry).
    sub_dirs, filepaths = [], []
    for sub_path in sorted(path.iterdir()):
        (sub_dirs if sub_path.is_dir() else filepaths).append(sub_path)
    indent_str = " " * indent * level
    for sub_dir in sub_dirs:
        print(f"{indent_str}{sub_dir.name}/")
        # Forward max_files so the caller's limit also applies to nested dirs.
        tree(sub_dir, level + 1, indent, max_files)
    for filepath in filepaths[:max_files]:
        print(f"{indent_str}{filepath.name}")
    if len(filepaths) > max_files:
        print(f"{indent_str}...")

In [4]:
# Show the layout of the extracted dataset (default: 3 files per directory).
tree(path)

datasets\aclImdb/
    test/
        neg/
            0_2.txt
            10000_4.txt
            10001_1.txt
            ...
        pos/
            0_10.txt
            10000_7.txt
            10001_9.txt
            ...
        labeledBow.feat
        urls_neg.txt
        urls_pos.txt
    train/
        neg/
            0_3.txt
            10000_4.txt
            10001_4.txt
            ...
        pos/
            0_9.txt
            10000_8.txt
            10001_10.txt
            ...
        unsup/
            0_0.txt
            10000_0.txt
            10001_0.txt
            ...
        labeledBow.feat
        unsupBow.feat
        urls_neg.txt
        ...
    imdb.vocab
    imdbEr.txt
    README


In [5]:
def review_paths(dirpath):
    """Return the paths of all `*.txt` files directly inside `dirpath`.

    Args:
        dirpath: a `pathlib.Path` pointing to a directory of review files.

    Returns:
        A list of path strings in sorted order. `Path.glob()` yields entries
        in an arbitrary, filesystem-dependent order, so sorting keeps any
        later index-based split reproducible across machines and runs.
    """
    return [str(path) for path in sorted(dirpath.glob("*.txt"))]

# Collect the review file paths per split and per sentiment class
# (positive first, then negative, for each split).
train_pos, train_neg = (
    review_paths(path / "train" / sentiment) for sentiment in ("pos", "neg"))
test_valid_pos, test_valid_neg = (
    review_paths(path / "test" / sentiment) for sentiment in ("pos", "neg"))

# Number of reviews in each of the four lists.
tuple(map(len, (train_pos, train_neg, test_valid_pos, test_valid_neg)))

(12500, 12500, 12500, 12500)

In [6]:
# Shuffle BOTH classes (in place) before slicing, using a seeded generator so
# the test/validation split is reproducible and does not depend on the order
# in which the files were listed.
split_rng = np.random.default_rng(42)
split_rng.shuffle(test_valid_pos)
split_rng.shuffle(test_valid_neg)

# First 5,000 reviews of each class go to the test set, the rest to validation.
test_pos = test_valid_pos[:5000]
test_neg = test_valid_neg[:5000]
valid_pos = test_valid_pos[5000:]
valid_neg = test_valid_neg[5000:]

In [7]:
def imdb_dataset_fit_in_memory(filepaths_positive, filepaths_negative):
    """Load every review into memory and wrap them in a `tf.data.Dataset`.

    Args:
        filepaths_positive: paths of the positive review files (label 1).
        filepaths_negative: paths of the negative review files (label 0).

    Returns:
        A dataset of `(review, label)` pairs built with
        `from_tensor_slices`; all negative reviews come first, followed by
        all positive reviews.
    """
    reviews = []
    labels = []
    for filepaths, label in ((filepaths_negative, 0), (filepaths_positive, 1)):
        for filepath in filepaths:
            # Decode explicitly as UTF-8: without it, open() falls back to the
            # platform default encoding (e.g. cp1252 on Windows), which can
            # mangle or fail on non-ASCII characters in the reviews.
            with open(filepath, encoding="utf-8") as review_file:
                reviews.append(review_file.read())
            labels.append(label)
    return tf.data.Dataset.from_tensor_slices(
        (tf.constant(reviews), tf.constant(labels)))

def imdb_dataset(filepaths_positive, filepaths_negative, n_read_threads=5):
    """Build a streaming `(review, label)` dataset from review files.

    Each file list is read through a `tf.data.TextLineDataset` using
    `n_read_threads` parallel reads; negatives are labeled 0, positives 1.

    Returns:
        The positive dataset concatenated with the negative dataset
        (positives first).
    """
    def labeled_lines(filepaths, label):
        # Read the files line by line and pair every line with `label`.
        lines = tf.data.TextLineDataset(
            filepaths, num_parallel_reads=n_read_threads)
        return lines.map(lambda review: (review, label))

    negatives = labeled_lines(filepaths_negative, 0)
    positives = labeled_lines(filepaths_positive, 1)
    return positives.concatenate(negatives)

In [8]:
# Peek at the first three (review, label) pairs, one blank line between pairs.
for X, y in imdb_dataset_fit_in_memory(train_pos, train_neg).take(3):
    print(X, y, sep="\n", end="\n\n")

tf.Tensor(b"Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.", shape=(), dtype=string)
tf.Tensor(0, shape=(), dtype=int32)

tf.Tensor(b"Airport '77 starts as a brand new luxury 747 plane is loaded up with valuable paintings & such belonging to rich businessman Philip Stevens (James Stewart) who is flying them & a bunch of VIP's to his estate in preparation of it being opened to the public a

In [9]:
# Time ten passes over the streaming dataset (-r1: a single timing run).
%timeit -r1 for X, y in imdb_dataset(train_pos, train_neg).repeat(10): pass

1min 3s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [10]:
# Same measurement with .cache() (called without a filename) inserted before .repeat(10).
%timeit -r1 for X, y in imdb_dataset(train_pos, train_neg).cache().repeat(10): pass

39.2 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [11]:
batch_size = 32

# Training data: shuffle (fixed seed), then batch and prefetch one batch.
train_set = (
    imdb_dataset(train_pos, train_neg)
    .shuffle(25000, seed=42)
    .batch(batch_size)
    .prefetch(1)
)

# Validation and test data: batched and prefetched, no shuffling.
valid_set = imdb_dataset(valid_pos, valid_neg)
valid_set = valid_set.batch(batch_size).prefetch(1)
test_set = imdb_dataset(test_pos, test_neg)
test_set = test_set.batch(batch_size).prefetch(1)

In [12]:
max_tokens = 1000

# Keep only the review text (drop the labels) to fit the vocabulary on.
sample_reviews = train_set.map(lambda text, _label: text)

# TF-IDF bag-of-words encoder limited to `max_tokens` vocabulary entries.
text_vectorization = tf.keras.layers.TextVectorization(
    output_mode="tf_idf",
    max_tokens=max_tokens,
)
text_vectorization.adapt(sample_reviews)

In [13]:
# First ten entries of the vocabulary learned by adapt().
text_vectorization.get_vocabulary()[:10]

['[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i']

In [14]:
# Binary sentiment classifier on top of the TF-IDF encoding:
# one hidden ReLU layer followed by a sigmoid output unit.
tf.random.set_seed(42)
model = tf.keras.Sequential([
    text_vectorization,
    tf.keras.layers.Dense(units=100, activation="relu"),
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])
model.compile(
    optimizer="nadam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.fit(train_set, validation_data=valid_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x15e9c3405b0>

In [15]:
def compute_mean_embedding(inputs, mask=None):
    """Sum word vectors over axis 1 and divide by sqrt(number of words).

    Args:
        inputs: float tensor whose axis 1 indexes the words of a sequence and
            whose last axis holds each word's vector (the example in this
            notebook uses shape [batch, words, dims]).
        mask: optional boolean tensor of shape [batch, words]. When given,
            masked-out positions are zeroed before summing and counting.
            Keras `Lambda` forwards the upstream mask to functions that
            declare a `mask` argument, so padding flagged by
            `Embedding(mask_zero=True)` is ignored even though its embedding
            vector is not zero.

    Returns:
        A [batch, dims] tensor. A word counts if its vector has at least one
        non-zero component; sequences with no such word yield zeros (the
        divisor is clamped to 1) instead of NaN.
    """
    if mask is not None:
        # Zero out padded positions so they affect neither the sum nor the count.
        inputs = inputs * tf.expand_dims(tf.cast(mask, inputs.dtype), axis=-1)
    not_pad = tf.math.count_nonzero(inputs, axis=-1)
    n_words = tf.math.count_nonzero(not_pad, axis=-1, keepdims=True)
    # Clamp to 1 to avoid 0/0 = NaN for sequences without any word.
    n_words = tf.maximum(n_words, 1)
    sqrt_n_words = tf.math.sqrt(tf.cast(n_words, tf.float32))
    return tf.reduce_sum(inputs, axis=1) / sqrt_n_words

In [16]:
# Two sequences of three 3-D word vectors each; all-zero rows stand for padding.
# The first sequence has 2 non-zero word vectors, the second has 1, so the sums
# are divided by sqrt(2) and sqrt(1) respectively.
another_example = tf.constant([[[1., 2., 3.], [4., 5., 0.], [0., 0., 0.]],
                               [[6., 0., 0.], [0., 0., 0.], [0., 0., 0.]]])
compute_mean_embedding(another_example)

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[3.535534 , 4.9497476, 2.1213205],
       [6.       , 0.       , 0.       ]], dtype=float32)>

In [17]:
embedding_size = 20
tf.random.set_seed(42)

# Re-create the vectorizer in "int" mode so it outputs token ids
# (this rebinds `text_vectorization`, previously the TF-IDF encoder).
text_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=max_tokens, output_mode="int")
text_vectorization.adapt(sample_reviews)

model = tf.keras.Sequential([
    text_vectorization,
    # NOTE(review): per the Keras docs, mask_zero=True only attaches a mask
    # for token id 0 (padding); it does not turn the pad embedding into a
    # zero vector. Confirm compute_mean_embedding treats padding as intended.
    tf.keras.layers.Embedding(input_dim=max_tokens,
                              output_dim=embedding_size,
                              mask_zero=True),  # id 0 (<pad>) is masked
    tf.keras.layers.Lambda(compute_mean_embedding),
    tf.keras.layers.Dense(100, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

In [18]:
# Train the embedding-based model with the same settings as the TF-IDF model.
model.compile(
    optimizer="nadam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.fit(train_set, validation_data=valid_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x15e9c3f5330>

In [19]:
import tensorflow_datasets as tfds

# Load IMDb from TensorFlow Datasets. Note that this rebinds `train_set` and
# `test_set`, which earlier cells bound to the batched file-based pipelines.
datasets = tfds.load(name="imdb_reviews")
train_set = datasets["train"]
test_set = datasets["test"]

[1mDownloading and preparing dataset imdb_reviews (80.23 MiB) to C:\Users\JCA\tensorflow_datasets\imdb_reviews\plain_text\0.1.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling...:   0%|          | 0/10 [00:00<?, ? shard/s]

Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling...:   0%|          | 0/10 [00:00<?, ? shard/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling...:   0%|          | 0/20 [00:00<?, ? shard/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

Reading...: 0 examples [00:00, ? examples/s]

Writing...:   0%|          | 0/2500 [00:00<?, ? examples/s]

UnknownError: Failed to rename: C:\Users\JCA\tensorflow_datasets\imdb_reviews\plain_text\0.1.0.incomplete6K5MN7 to: C:\Users\JCA\tensorflow_datasets\imdb_reviews\plain_text\0.1.0 : Access is denied.
; Input/output error

In [20]:
# Show the text and label of the first example (examples are indexed by key).
for example in train_set.take(1):
    print(example["text"], example["label"], sep="\n")

TypeError: tuple indices must be integers or slices, not str