In [5]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import matplotlib.pyplot as plt
import os
import string 
import shutil
import re

#Load Data

## Download the IMDB movie reviews

In [6]:
# IMDB movie-review sentiment archive.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Download into the current working directory (cache_dir=".", no cache
# sub-folder) and unpack the tarball there.
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin=url,
    untar=True,
    cache_dir=".",
    cache_subdir="",
)

# The rest of the notebook expects the extracted data in an `aclImdb` folder
# located next to the path returned by get_file.
dataset_dir = os.path.join(os.path.dirname(dataset), "aclImdb")

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


Exception: ignored

In [None]:
# Top-level contents of the extracted dataset.
print(os.listdir(dataset_dir))

train_dir = os.path.join(dataset_dir, 'train')
print(os.listdir(train_dir))

# Print one positive review as a sanity check. The encoding is explicit so the
# read does not depend on the platform's default locale encoding.
sample_file = os.path.join(train_dir, "pos/1181_9.txt")
with open(sample_file, encoding="utf-8") as f:
    print(f.read())

# Remove the `unsup` folder before the labeled dataset is built from the train
# directory (text_dataset_from_directory treats each sub-folder as a class).
# The existence check keeps this cell re-runnable: an unguarded rmtree raises
# FileNotFoundError the second time the cell is executed.
remove_dir = os.path.join(train_dir, 'unsup')
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

# Split into train, validation, and test sets

In [None]:
batch_size = 32
# The same seed is passed for both subsets so the training and validation
# splits are drawn from one consistent shuffle of the files.
seed = 42

# Reuse the directories resolved from the download step instead of hard-coded
# relative paths, so this cell reads the same data the earlier cells listed.
test_dir = os.path.join(dataset_dir, 'test')


def _preview_reviews(dataset, count=3):
    """Print up to `count` reviews with their labels from the first batch.

    Args:
        dataset: a dataset yielding (text_batch, label_batch) pairs that also
            exposes `class_names`, as returned by text_dataset_from_directory.
        count: maximum number of examples to print.
    """
    for text_batch, label_batch in dataset.take(1):
        # Convert each batch to NumPy once instead of once per printed value.
        reviews = text_batch.numpy()
        labels = label_batch.numpy()
        for review, label in zip(reviews[:count], labels[:count]):
            print("Review", review)
            print("Label", label, dataset.class_names[label])


print("----train----")
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset="training",
    seed=seed)
_preview_reviews(raw_train_ds)


print("\n----Validation----")
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)
_preview_reviews(raw_val_ds)


print("\n----test----")
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    test_dir,
    batch_size=batch_size)

# Standardize + tokenize + vectorize input data

In [None]:
def custom_standardization(input_data):
    """Normalize raw review text before tokenization.

    Lower-cases the text, replaces every `<br />` tag with a space, and then
    deletes all characters listed in `string.punctuation`.

    Args:
        input_data: string tensor holding the raw text.

    Returns:
        String tensor with the cleaned text.
    """
    # Regex character class built from the escaped punctuation characters.
    punctuation_pattern = '[{}]'.format(re.escape(string.punctuation))

    cleaned = tf.strings.lower(input_data)
    cleaned = tf.strings.regex_replace(cleaned, '<br />', ' ')
    cleaned = tf.strings.regex_replace(cleaned, punctuation_pattern, '')
    return cleaned
    
max_features = 10000   # passed to the layer as max_tokens
sequence_length = 250  # passed to the layer as output_sequence_length

# Text -> integer token ids, using the custom standardization defined above.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length,
)

# adapt() only needs the review text, so drop the labels from the training set
# before handing it to the layer.
train_text = raw_train_ds.map(lambda review, _label: review)
print(train_text)
print(type(train_text))
vectorize_layer.adapt(train_text)

In [None]:
def vectorize_text(text, label):
    """Map raw text to token ids with `vectorize_layer`; the label passes through.

    A trailing axis is appended to `text` before it is handed to the layer.

    Returns:
        A `(token_ids, label)` tuple.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

# Pull one batch and look at how its first review is vectorized.
text_batch, label_batch = next(iter(raw_train_ds))
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

# Fetch the vocabulary once and reuse it, instead of asking the layer for the
# whole list again on every lookup.
vocabulary = vectorize_layer.get_vocabulary()
print("165 ---> ", vocabulary[165])
print("134 ---> ", vocabulary[134])
print('Vocabulary size: {}'.format(len(vocabulary)))

In [None]:
# Let tf.data pick the prefetch buffer size.
AUTOTUNE = tf.data.experimental.AUTOTUNE

# For every split: vectorize the text, cache the result, and prefetch batches.
train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
test_ds = raw_test_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)

#Model architecture

In [None]:
embedding_dim = 16

# Embedding -> dropout -> average over the sequence -> dropout -> one output unit.
model = tf.keras.Sequential([layers.Embedding(max_features + 1, embedding_dim),
                             layers.Dropout(0.2),
                             layers.GlobalAveragePooling1D(),
                             layers.Dropout(0.2),
                             layers.Dense(1)])

model.summary()

# The final Dense layer has no activation, so the model outputs logits: the
# loss is configured with from_logits=True and the accuracy threshold is 0.0
# (a logit of 0 corresponds to a probability of 0.5).
# `metrics` is given as a list, the form documented for Model.compile; a bare
# metric object is not accepted by every Keras release.
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=[tf.metrics.BinaryAccuracy(threshold=0.0)])

## Training

In [None]:
epochs = 10  # number of passes over train_ds

# Keep the returned History object; its `.history` dict is read later for plotting.
history = model.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=epochs,
)


In [None]:
# Score the trained model on the vectorized test split and report both values.
loss, accuracy = model.evaluate(x=test_ds)
for metric_label, metric_value in (("Loss:", loss), ("Accuracy:", accuracy)):
    print(metric_label, metric_value)

##Create a plot of accuracy and loss over time

In [None]:
history_dict = history.history
print(history_dict.keys())

acc = history_dict['binary_accuracy']
val_acc = history_dict['val_binary_accuracy']
# Use distinct names here: `loss` and `epochs` already hold the test loss and
# the epoch count from earlier cells, and rebinding them to a list / range
# would make their meaning depend on which cell ran last.
train_loss = history_dict['loss']
val_loss = history_dict['val_loss']

# x-axis values: 1-based epoch numbers.
epoch_numbers = range(1, len(acc) + 1)

# Explicit figure/axes interface: one row, two panels (loss left, accuracy right).
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(15, 5))

# "bo" is for "blue dot"
loss_ax.plot(epoch_numbers, train_loss, 'bo', label='Training loss')
# b is for "solid blue line"
loss_ax.plot(epoch_numbers, val_loss, 'b', label='Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

acc_ax.plot(epoch_numbers, acc, 'bo', label='Training acc')
acc_ax.plot(epoch_numbers, val_acc, 'b', label='Validation acc')
acc_ax.set_title('Training and validation accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend(loc='lower right')

plt.show()

#Export the model

In [None]:
# Wrap the trained classifier so it takes raw strings and returns probabilities:
# the vectorization layer goes in front, a sigmoid is applied to the logits behind.
export_model = tf.keras.Sequential()
export_model.add(vectorize_layer)
export_model.add(model)
export_model.add(layers.Activation('sigmoid'))

# The outputs are now probabilities, hence from_logits=False.
export_model.compile(
    loss=losses.BinaryCrossentropy(from_logits=False),
    optimizer='adam',
    metrics=["accuracy"],
)

# Evaluate directly on the raw (unvectorized) test text.
loss, accuracy = export_model.evaluate(raw_test_ds)
print(accuracy)

##Inference on new data

In [None]:
examples = [
  "The movie was great!",
  "The movie was okay.",
  "The movie was terrible..."
]

# Hand predict() a string tensor rather than a bare Python list: Keras
# releases differ in how they interpret a plain list of strings, while a
# rank-1 string tensor is unambiguously a batch of three inputs.
export_model.predict(tf.constant(examples))