In [None]:
import keras
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from keras.layers import Embedding, LSTM, Dense, Dropout, TextVectorization
from keras.utils import pad_sequences

In [None]:
# --- Dataset locations (placeholder paths) ---
train_dir, test_dir = 'path/to/train/dir', 'path/to/test/dir'

# --- Text / embedding hyper-parameters ---
# Vocabulary cap: only the top 20,000 words are considered.
max_tokens = 20_000
# Sequence cap: only the first 200 words of each movie review are considered.
max_len = 200
# Dimensionality of each word-embedding vector.
embedding_dim = 100


In [None]:
# Build the (text, label) datasets from the train and test directories,
# in that order, with 32 examples per batch.
train_ds, test_ds = (
    tf.keras.preprocessing.text_dataset_from_directory(directory, batch_size=32)
    for directory in (train_dir, test_dir)
)

In [None]:
# Let tf.data pick the prefetch buffer size at runtime.
AUTOTUNE = tf.data.AUTOTUNE

# Training data: cache the elements, then prefetch upcoming batches.
train_ds = train_ds.cache()
train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)

# Test data: same pipeline as the training data.
test_ds = test_ds.cache()
test_ds = test_ds.prefetch(buffer_size=AUTOTUNE)

In [None]:
# Text-to-integer preprocessing layer: the vocabulary is capped at
# `max_tokens` and every output sequence has length `max_len`.
vectorizer = TextVectorization(
    output_mode="int",
    max_tokens=max_tokens,
    output_sequence_length=max_len,
)

# Build the vocabulary from the training texts only; the labels are dropped
# before the data reaches `adapt`.
vectorizer.adapt(train_ds.map(lambda text, _label: text))

In [None]:
# Convert the raw (text, label) batches of `train_ds` / `test_ds` into
# (token-id sequence, label) batches by running the text through `vectorizer`.
# The level of map parallelism is left to tf.data (AUTOTUNE) rather than
# being hard-coded.
train_int = train_ds.map(
    lambda x, y: (vectorizer(x), y), num_parallel_calls=tf.data.AUTOTUNE
)
test_int = test_ds.map(
    lambda x, y: (vectorizer(x), y), num_parallel_calls=tf.data.AUTOTUNE
)

In [None]:
# Load the pre-trained GloVe embeddings into a {word: vector} dictionary.
# Each line of the file is parsed as "<word> <coef> <coef> ...".
path_to_glove_file = "path/to/glove.6B.100d.txt"
embeddings_index = {}
# The encoding is given explicitly so that parsing does not depend on the
# platform's default locale encoding (GloVe text files are distributed as UTF-8).
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Split off the word; everything after the first whitespace run is
        # the space-separated coefficient list.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs
print(f"Found {len(embeddings_index)} word vectors.")

In [None]:
# Retrieve the vocabulary indexed by the TextVectorization layer.
vocabulary = vectorizer.get_vocabulary()
# Mapping from each word to its index in the vocabulary.
word_index = dict(zip(vocabulary, range(len(vocabulary))))

# Matrix to be filled with the GloVe vectors: row i belongs to the word
# whose vocabulary index is i.
embedding_matrix = np.zeros((max_tokens, embedding_dim))

for word, i in word_index.items():
    if i >= max_tokens:
        # The matrix has no row for this index.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    # Words not found in the embedding index keep their all-zero row.

In [None]:
# Define the model architecture:
#   integer token ids -> frozen GloVe embedding -> LSTM(64) -> sigmoid unit.
# The layer classes are the ones imported from `keras.layers` at the top of
# the notebook (no `layers` module alias is imported).
inputs = keras.Input(shape=(None,), dtype="int64")
x = Embedding(
    max_tokens,
    embedding_dim,
    # Start from the pre-computed GloVe matrix and keep it fixed in training.
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)(inputs)
x = LSTM(64)(x)
outputs = Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, and
# accuracy as the reported metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train for 50 epochs, evaluating on `test_int` at the end of every epoch.
# No `batch_size` is passed: `train_int` is a tf.data.Dataset, and Keras
# expects the batch size to come from the dataset itself for such inputs.
# NOTE(review): the test split doubles as validation data here — confirm
# this is intended rather than holding out a separate validation split.
history = model.fit(
    train_int,
    epochs=50,
    validation_data=test_int,
    verbose=1,
)

In [None]:
# Plot the accuracy curves (LSTM): one line per series recorded in `history`.
for metric_key, series_label in (('accuracy', 'Training'),
                                 ('val_accuracy', 'Validation')):
    plt.plot(history.history[metric_key], label=series_label)
plt.title('Training and Validation Accuracy per Epoch')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='upper left')
plt.show()

In [None]:
# Plot the loss curves: training series first, then validation.
for metric_key, series_label in (('loss', 'Training'),
                                 ('val_loss', 'Validation')):
    plt.plot(history.history[metric_key], label=series_label)
plt.title('Training and Validation Loss per Epoch')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(loc='upper left')
plt.show()