In [22]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf

In [None]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file("aclImdb_v1", url,
                                    untar=True, cache_dir='.',
                                    cache_subdir='')

In [23]:
# Without downloading again
dataset = r'.\aclImdb_v1'

In [24]:
dataset_dir = os.path.join(os.path.abspath(dataset), 'aclImdb')

In [34]:
batch_size = 32
seed = 42
validation_split = 0.2

train_dir = os.path.join(dataset_dir, 'train')
test_dir = os.path.join(dataset_dir, 'test')

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=validation_split,
    subset='training',
    seed=seed)

raw_validation_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=validation_split,
    subset='validation',
    seed=seed)

raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    test_dir,
    batch_size=batch_size)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 4647 files belonging to 2 classes.


In [None]:
import numpy as np

# Still need to understand how to use vectors from the file
with open(os.path.join(dataset_dir, 'imdb.vocab'), 'rb') as words_file:
        words = [x.strip() for x in words_file.readlines()]
with open(os.path.join(dataset_dir, 'imdbEr.txt'), 'rb') as vectors_file:
    vectors = np.array([float(x.strip()) for x in vectors_file.readlines()])

In [None]:
vocab = list(zip(words, vectors))
vocab[0]

In [35]:
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

max_features = 10000
sequence_length = 250

vectorize_layer = tf.keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)


In [None]:
# Understand why it doesn't work
vectorize_layer = tf.keras.layers.TextVectorization(standardize=custom_standardization, vocabulary=words, output_mode='tf_idf', idf_weights=vectors)

In [36]:
words_only = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(words_only)

In [37]:
vectorize_layer.finalize_state()

In [None]:
# Print example from data
for text_batch, label_batch in raw_train_ds.take(1):
  for i in range(1):
    print("Review", text_batch.numpy()[i])
    print("Label", label_batch.numpy()[i])

In [38]:
model = tf.keras.models.Sequential([
    vectorize_layer,
    tf.keras.layers.Dense(250, activation='relu', name='hidden_layer1'),
    tf.keras.layers.Dropout(0.2, name='dropout_layer1'),
    tf.keras.layers.Dense(250, activation='relu', name='hidden_layer2'),
    tf.keras.layers.Dense(2, name='output_layer')
])

# loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='mean')

In [39]:
model.compile(loss=loss_fn,
              metrics=['accuracy'])

In [10]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

In [31]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)
validation_ds = raw_validation_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = raw_test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [40]:
model.fit(raw_train_ds, epochs=10, validation_data=raw_validation_ds)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.5031 - loss: 192.8603 - val_accuracy: 0.4916 - val_loss: 0.6964
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.5017 - loss: 0.7777 - val_accuracy: 0.5076 - val_loss: 0.6932
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.5015 - loss: 0.7277 - val_accuracy: 0.4924 - val_loss: 0.6932
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - accuracy: 0.5047 - loss: 0.7126 - val_accuracy: 0.4924 - val_loss: 0.6940
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - accuracy: 0.5087 - loss: 0.7172 - val_accuracy: 0.4920 - val_loss: 0.6936
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - accuracy: 0.5042 - loss: 0.7185 - val_accuracy: 0.4922 - val_loss: 0.6938
Epoch 7/10
[1m625/

<keras.src.callbacks.history.History at 0x2b24849fa10>

In [43]:
model.evaluate(raw_test_ds)

[1m146/146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.0014 - loss: 0.7014


[0.7015230655670166, 0.003227889072149992]