In [None]:
import tensorflow as tf
from tensorflow import keras
from pathlib import Path
import string
import re


In [None]:
# Locate the review files under <cwd>/datasets/movie reviews/{train,test}/{neg,pos}/*.txt
datapath = Path.cwd() / 'datasets' / 'movie reviews'
train_path = datapath / 'train'
test_path = datapath / 'test'

# Number of files per class taken from the front of each test listing for validation.
n_validation_per_class = 7500


def list_review_paths(directory):
    """Return the ``*.txt`` files directly inside ``directory`` as a sorted list of strings.

    ``Path.glob`` yields entries in an arbitrary, filesystem-dependent order.
    Sorting makes the listing (and therefore the positional validation/test
    split below) reproducible across runs and machines.

    Args:
        directory: ``str`` or ``Path`` of the folder to scan (non-recursive).

    Returns:
        list[str]: matching file paths, sorted lexicographically.
    """
    return sorted(str(path) for path in Path(directory).glob('*.txt'))


train_paths_neg = list_review_paths(train_path / 'neg')
train_paths_pos = list_review_paths(train_path / 'pos')

test_paths_neg = list_review_paths(test_path / 'neg')
test_paths_pos = list_review_paths(test_path / 'pos')

# Split the test listing per class: the first n_validation_per_class files
# become the validation set, the remainder stays as the test set.
validation_paths_neg = test_paths_neg[:n_validation_per_class]
validation_paths_pos = test_paths_pos[:n_validation_per_class]
test_paths_neg = test_paths_neg[n_validation_per_class:]
test_paths_pos = test_paths_pos[n_validation_per_class:]



In [None]:
# Build a labelled tf.data pipeline straight from the review text files
def create_imdb_dataset(*, filepaths_positive, filepaths_negative, n_readers=5):
    """Create a dataset of ``(review_line, label)`` pairs from text files.

    Every line of every file becomes one element. Lines read from
    ``filepaths_negative`` are paired with label 0, lines read from
    ``filepaths_positive`` with label 1; the negative elements come first,
    followed by the positive ones.

    Args:
        filepaths_positive: file paths whose lines get label 1.
        filepaths_negative: file paths whose lines get label 0.
        n_readers: forwarded as ``num_parallel_reads`` to ``TextLineDataset``.

    Returns:
        The negative dataset concatenated with the positive dataset.
    """
    def read_with_label(filepaths, label):
        # One element per text line, each tagged with the constant label.
        lines = tf.data.TextLineDataset(filepaths, num_parallel_reads=n_readers)
        return lines.map(lambda review: (review, label))

    negatives = read_with_label(filepaths_negative, 0)
    positives = read_with_label(filepaths_positive, 1)
    return negatives.concatenate(positives)

batch_size = 32

# Training data: shuffled with a 25000-element buffer, then batched and prefetched.
train_set = (
    create_imdb_dataset(
        filepaths_negative=train_paths_neg,
        filepaths_positive=train_paths_pos,
    )
    .shuffle(25000)
    .batch(batch_size)
    .prefetch(1)
)

# Test and validation data: same batching/prefetching, no shuffling.
test_set = (
    create_imdb_dataset(
        filepaths_negative=test_paths_neg,
        filepaths_positive=test_paths_pos,
    )
    .batch(batch_size)
    .prefetch(1)
)
valid_set = (
    create_imdb_dataset(
        filepaths_negative=validation_paths_neg,
        filepaths_positive=validation_paths_pos,
    )
    .batch(batch_size)
    .prefetch(1)
)


In [None]:
# Custom standardization passed to the TextVectorization layer
def standardize_text(input):
    """Normalize raw review text.

    Steps, in order: lowercase the text, replace every ``<br />`` with a
    single space, then delete every character found in ``string.punctuation``.

    Args:
        input: string tensor (or value accepted by ``tf.strings.lower``).

    Returns:
        The cleaned string tensor.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_class = f'[{re.escape(string.punctuation)}]'

    text = tf.strings.lower(input)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    return tf.strings.regex_replace(text, punctuation_class, '')

vocab_size = 10000
sequence_length = 100

text_vectorization_layer = keras.layers.TextVectorization(
    max_tokens=vocab_size,
    standardize=standardize_text,
    output_sequence_length=sequence_length,
)
# Fit the layer on the training reviews only; the labels are dropped first.
text_vectorization_layer.adapt(train_set.map(lambda review, _label: review))


In [None]:
# binary classifier model
# Length of each word-embedding vector; used as the second argument of the
# Embedding layer when the model is assembled.
embedding_dim = 16

# compute mean embedding and multiply by sqrt(number of words)
def compute_mean_embedding(inputs):
    """Return the mean word embedding of each sample scaled by sqrt(number of words).

    Since ``mean * sqrt(n) == sum / sqrt(n)``, the embeddings are summed over
    the word axis and divided by ``sqrt(n)``.

    A position is counted as a word when its vector has at least one nonzero
    component; all-zero vectors are treated as padding.

    NOTE(review): in the model this function receives the output of an
    ``Embedding`` layer created without ``mask_zero``; whether padded
    positions actually arrive here as all-zero vectors should be confirmed,
    otherwise they are counted as words.

    Args:
        inputs: float32 tensor indexed as (sample, word, embedding component).

    Returns:
        Tensor indexed as (sample, embedding component). Samples that contain
        no words yield zeros instead of NaN.
    """
    # Per position: how many components are nonzero (0 => padding).
    not_pad = tf.math.count_nonzero(inputs, axis=-1)
    # Per sample: how many positions are real words; keepdims so it broadcasts.
    n_words = tf.math.count_nonzero(not_pad, axis=-1, keepdims=True)
    sqrt_n_words = tf.math.sqrt(tf.cast(n_words, tf.float32))
    # divide_no_nan returns 0 where the denominator is 0, so an all-padding
    # sample no longer produces 0 / 0 = NaN that would poison the loss.
    return tf.math.divide_no_nan(tf.reduce_sum(inputs, axis=1), sqrt_n_words)
    
    
# Small hand-made batch to try compute_mean_embedding on:
# 2 samples x 3 word slots x 3 components, with all-zero rows as padding.
another_example = tf.constant(
    [
        [
            [1., 2., 3.],
            [4., 5., 0.],
            [0., 0., 0.],
        ],
        [
            [6., 0., 0.],
            [0., 0., 0.],
            [0., 0., 0.],
        ],
    ]
)
aaa = compute_mean_embedding(another_example)



In [None]:
# Assemble the classifier layer by layer:
# raw text -> token ids -> embeddings -> pooled embedding -> dense head.
model = keras.models.Sequential()
model.add(text_vectorization_layer)
# NOTE(review): no mask_zero is set on the Embedding layer; confirm how padded
# token ids are expected to be handled by compute_mean_embedding.
model.add(keras.layers.Embedding(vocab_size, embedding_dim, name='embedding'))
model.add(keras.layers.Lambda(compute_mean_embedding))
model.add(keras.layers.Dense(100, activation='relu'))
# Single sigmoid unit for the binary output.
model.add(keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# Configure training, then fit for 5 epochs with valid_set as validation data.
model.compile(
    optimizer="nadam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.fit(
    train_set,
    validation_data=valid_set,
    epochs=5,
)