In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers,losses
import os
import string
import re
import shutil

Loading movie review dataset

In [2]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file("aclImdb_v1", url,
                                    untar=True, cache_dir='.',
                                    cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [3]:
os.listdir(dataset_dir)

['README', 'test', 'imdb.vocab', 'train', 'imdbEr.txt']

In [4]:
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['unsup',
 'pos',
 'neg',
 'labeledBow.feat',
 'urls_unsup.txt',
 'urls_neg.txt',
 'urls_pos.txt',
 'unsupBow.feat']

In [5]:
pos_file = os.path.join(train_dir,'pos')
# os.listdir(pos_file)

positive review sample

In [6]:
sample_file = os.path.join(pos_file, '7789_10.txt')
with open(sample_file) as f:
  print(f.read())

Allison Dean's performance is what stands out in my mind watching this film. She balances out the melancholy tone of the film with an iridescent energy. I would like to see more of her.


In [7]:
neg_file = os.path.join(train_dir,'neg')
# os.listdir(neg_file)

negative review sample

In [8]:
sample_file = os.path.join(neg_file, '6064_3.txt')
with open(sample_file) as f:
  print(f.read())

Well well well. As good as John Carpenter's season 1 outing in "Masters of Horror" was, this is the complete opposite. He certainly proved he was still a master of horror with "Cigarette Burns" but "Pro-Life" is perhaps the worst I have seen from him.<br /><br />It's stupid, totally devoid of creepy atmosphere and tension and it overstays it's welcome, despite the less-than-an-hour running time. The script is nonsense, the characters are irritable and un-appealing and the conclusion is beyond absurd.<br /><br />And for those suckers who actually bought the DVD (one of them being me); did you see how Carpenter describes the film? He's actually proud of it and he talks about it as his best work for a long time, and he praises the script. And in the commentary track, where he notices an obvious screw up that made it to the final cut, he just says he didn't feel it essential to rectify the mistake and he just let it be there. I fear the old master has completely lost his touch. I sincerely

Remove unwanted files

In [9]:
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)

Dataset split (training_dataset with validation split)

In [10]:
batch_size = 32
seed = 42

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


few examples with label

In [11]:
for text_batch, label_batch in raw_train_ds.take(1):
  for i in range(2):
    print("Review", text_batch.numpy()[i])
    print("Label", label_batch.numpy()[i])

Review b'"Pandemonium" is a horror movie spoof that comes off more stupid than funny. Believe me when I tell you, I love comedies. Especially comedy spoofs. "Airplane", "The Naked Gun" trilogy, "Blazing Saddles", "High Anxiety", and "Spaceballs" are some of my favorite comedies that spoof a particular genre. "Pandemonium" is not up there with those films. Most of the scenes in this movie had me sitting there in stunned silence because the movie wasn\'t all that funny. There are a few laughs in the film, but when you watch a comedy, you expect to laugh a lot more than a few times and that\'s all this film has going for it. Geez, "Scream" had more laughs than this film and that was more of a horror film. How bizarre is that?<br /><br />*1/2 (out of four)'
Label 0
Review b"David Mamet is a very interesting and a very un-equal director. His first movie 'House of Games' was the one I liked best, and it set a series of films with characters whose perspective of life changes as they get into 

In [12]:
print("Label 0 corresponds to", raw_train_ds.class_names[0])
print("Label 1 corresponds to", raw_train_ds.class_names[1])

Label 0 corresponds to neg
Label 1 corresponds to pos


Validation split

In [13]:
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


Test split

In [14]:
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/test',
    batch_size=batch_size)

Found 25000 files belonging to 2 classes.


Before using the dataset for training it needs to standardize and vectorize

custom standardization

In [15]:
def standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  return tf.strings.regex_replace(stripped_html,'[%s]' % re.escape(string.punctuation),'')

Setting sequence length for truncate sequence in layer

In [16]:
max_features = 10000
sequence_length = 250

Vectorization layer

In [17]:
vectorize_layer = layers.TextVectorization(
    standardize= standardization,
    max_tokens= max_features,
    output_mode='int',
    output_sequence_length= sequence_length)

Label is not need for vectorization

In [18]:
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

In [19]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

In [20]:
text_batch, label_batch = next(iter(raw_train_ds))
first_review, first_label = text_batch[9], label_batch[9]
print("Review", first_review)
print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

Review tf.Tensor(b"When I was a kid we always used to be babysat, and we always used to rent a film or see a film at the cinema. This is one of the films we watched. This is one of the stupidest films I've ever seen, I think it might even be a Walt Disney Pictures film! A martian is dropped on earth, turns into a human, befriends a human, and is trying everything he can to get back home. But he is distracted by the wonders of the Earth. The only good comment I can give is the choice of actors, Back to the Future's Christopher Lloyd as the martian, Uncle Martin, Dumb and Dumber's Jeff Daniels as Tim O'Hara, Elizabeth Hurley as Brace Channing and Daryl Hannah as Lizzie. But apart from that it's complete crap. Poor!", shape=(), dtype=string)
Label neg
Vectorized review (<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
array([[  51,   10,   13,    4,  554,   71,  204,  330,    6,   26,    1,
           3,   71,  204,  330,    6,  862,    4,   19,   41,   67,    4,
          19,   31,    2, 

In [21]:
print("43 ---> ",vectorize_layer.get_vocabulary()[43])
print(" 907 ---> ",vectorize_layer.get_vocabulary()[907])
print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))

43 --->  has
 907 --->  brings
Vocabulary size: 10000


Vectorizing the dataset

In [22]:
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

Dataset performance
To avoid bottlenecks dataset is stored in cache after it is loaded from the disk

In [23]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [24]:
embedding_dim = 16

Creating a neural network

In [25]:
model = tf.keras.Sequential([
  layers.Embedding(max_features + 1, embedding_dim),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(1)])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          160016    
                                                                 
 dropout (Dropout)           (None, None, 16)          0         
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense (Dense)               (None, 1)                 17        
                                                                 
Total params: 160033 (625.13 KB)
Trainable params: 160033 (625.13 KB)
Non-trainable params: 0 (0.00 Byte)
________________

In [26]:
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=['accuracy'])

In [27]:
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Evaluating the model

In [28]:
loss, accuracy = model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.3105068504810333
Accuracy:  0.8605999946594238


In [40]:
# history_dict = history.history
# history_dict.keys()
# acc = history_dict['accuracy']
# val_acc = history_dict['val_accuracy']
# loss = history_dict['loss']
# val_loss = history_dict['val_loss']

In [41]:
# epochs = range(1, len(acc) + 1)

# plt.plot(epochs, acc, 'ro', label='Training acc')
# # b is for "solid blue line"
# plt.plot(epochs, val_acc, 'b', label='Validation acc')
# plt.title('Training and validation acc')
# plt.xlabel('Epochs')
# plt.ylabel('acc')
# plt.legend()

exporting model

In [34]:
export_model = tf.keras.Sequential([
  vectorize_layer,
  model,
  layers.Activation('sigmoid')
])

export_model.compile(
    loss=losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
)

# Test it with `raw_test_ds`, which yields raw strings
loss, accuracy = export_model.evaluate(raw_test_ds)
print(accuracy)

0.8728399872779846


Predicting

In [39]:
examples = [
  "great",
  " okay",
  "terrible",
  "Awesome"
]

export_model.predict(examples)



array([[0.6148526 ],
       [0.43930304],
       [0.35352564],
       [0.57363814]], dtype=float32)