In [1]:
import os
import io
import shutil
from warnings import filterwarnings

import re
import string

import tensorflow as tf

from jupyterthemes import jtplot
import matplotlib.pyplot as plt
import seaborn as sns

LOG_DIR = '../logs/'
sns.set_style('whitegrid')
filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='../logs/')
jtplot.style(context='talk', theme='chesterish', grid=True, ticks=True, figsize=(12, 6))

In [2]:
DATA_DIR = "../Data/Imdb/"
train_dir = os.path.join(DATA_DIR, 'train')
os.listdir(train_dir)

['labeledBow.feat',
 'neg',
 'pos',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [3]:
# # The train directory also has additional folders which should be removed before creating training dataset.
# remove_dir = os.path.join(train_dir, 'unsup')
# shutil.rmtree(remove_dir)

## Create Datasets:

- use train directory to create train dataset and a validation dataset with split of 20% for validation.

In [4]:
BATCH_SIZE = 1024
SEED = 123
train_ds = tf.keras.utils.text_dataset_from_directory(
    '../Data/Imdb/train/', batch_size=BATCH_SIZE, 
    validation_split=.2, subset='training', seed=SEED
)
val_ds = tf.keras.utils.text_dataset_from_directory(
    '../Data/Imdb/train', batch_size=BATCH_SIZE, 
    validation_split=.2, subset='validation', seed=SEED
) 

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [5]:
# -- Checking first five movie reviews from the datasets(tesnroflow dataset object)
for text, label in train_ds.take(1):
    for i in range(2):
        print(label[i].numpy(), text.numpy()[i], sep=": ", end="\n")

0: b"Oh My God! Please, for the love of all that is holy, Do Not Watch This Movie! It it 82 minutes of my life I will never get back. Sure, I could have stopped watching half way through. But I thought it might get better. It Didn't. Anyone who actually enjoyed this movie is one seriously sick and twisted individual. No wonder us Australians/New Zealanders have a terrible reputation when it comes to making movies. Everything about this movie is horrible, from the acting to the editing. I don't even normally write reviews on here, but in this case I'll make an exception. I only wish someone had of warned me before I hired this catastrophe"
1: b'This movie is SOOOO funny!!! The acting is WONDERFUL, the Ramones are sexy, the jokes are subtle, and the plot is just what every high schooler dreams of doing to his/her school. I absolutely loved the soundtrack as well as the carefully placed cynicism. If you like monty python, You will love this film. This movie is a tad bit "grease"esk (witho

## Configure the dataset for performance:

- `.cache()` keeps data in memory after it's loaded off disk. This will ensure the dataset does not become a bottleneck while training your model. If your dataset is too large to fit into memory, you can also use this method to create a performant on-disk cache, which is more efficient to read than many small files.<br><br>

- `.prefetch()` overlaps data preprocessing and model execution while training.

In [6]:
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

## Using the Embedding layer

In [7]:
# - Embed a 1,000 word vocabulary into 5 dimensions.
embedding_layers = tf.keras.layers.Embedding(1000, 5)

In [8]:
# -- test out embedding layer 
# -- its a tensor of integers, of shape (samples, sequence_length)
print(embedding_layers(tf.constant([1, 2, 3, 55])))

tf.Tensor(
[[ 0.03979614  0.00108985 -0.00313186  0.04942242 -0.0214565 ]
 [-0.01830754  0.04851105 -0.01170666 -0.02710489  0.02220693]
 [-0.01305484 -0.02601485 -0.02959191 -0.02536185 -0.04980961]
 [ 0.00181361 -0.03030678 -0.00508846 -0.01310108  0.03067455]], shape=(4, 5), dtype=float32)


In [9]:
res = embedding_layers(tf.constant([[0, 1, 2], [3, 4, 5]]))
print(res)

tf.Tensor(
[[[ 8.9626312e-03  4.5869675e-02 -2.3183061e-02  4.0337298e-02
   -4.2887103e-02]
  [ 3.9796140e-02  1.0898486e-03 -3.1318553e-03  4.9422417e-02
   -2.1456504e-02]
  [-1.8307544e-02  4.8511054e-02 -1.1706661e-02 -2.7104890e-02
    2.2206929e-02]]

 [[-1.3054837e-02 -2.6014853e-02 -2.9591907e-02 -2.5361849e-02
   -4.9809612e-02]
  [ 1.8116523e-02  2.8409135e-02 -3.0370235e-02  4.2130638e-02
    4.8429970e-02]
  [-2.8947508e-02 -3.7939988e-02 -7.5232238e-05  2.9212918e-02
   -3.4238055e-02]]], shape=(2, 3, 5), dtype=float32)


- This returned tensor has one more axis, than the input, the embedding vectors are alinged along the new last axis.
- When given a batch of sequences as input, an embedding layer returns a 3D floating point tensor, of shape (samples, sequence_length, embedding_dimensionality).

## Text Preprocessing:

In [10]:
def custom_standardization(input_data):
    """ custom standardization function to strip HTML break tags """
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '', '')
    return tf.strings.regex_replace(
        stripped_html, '[%s]' % re.escape(string.punctuation), ''
    )

In [11]:
# -- vocabulary size & number of words in a sequence
vocab_size = 10000
sequence_length = 100
# -- using the text-vectorization layer to normalize, split, and map strings to integers.
# -- layer uses custom standardization defined above.
# -- set maximum sequence length as all samples are not of the same length.
vectorize_layer = tf.keras.layers.TextVectorization(
    standardize=custom_standardization, 
    max_tokens=vocab_size, 
    output_mode='int', 
    output_sequence_length=sequence_length
)
# -- make text only dataset (no labels) and call adapt to build the vocabulary.
text_ds = train_ds.map(lambda x, y: x)
vectorize_layer.adapt(text_ds)

In [12]:
# -- creating a function to see the results of using this layer to preprocess some data.
def vec_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

text_batch, label_batch = next(iter(train_ds))
first_review, first_label = text_batch[0], label_batch[0]
processed_review = custom_standardization(first_review)
print(f'Review: {processed_review}\n' 
      f'Label: {first_label}\n\n'
      f'Vectorized review: {vec_text(processed_review, first_label)}'
     )

Review: b'the original animated dark knight returns in this ace adventure movie that rivals mask of phantasm in its coolness theres a lot of style and intelligence in mystery of the batwoman so much more than batman forever or batman and robinbr br theres a new crimefighter on the streets of gotham she dresses like a bat but shes not a grownup batgirl and batman is denying any affiliation with her meanwhile bruce wayne has to deal with the usual romances and detective work but the penguin bain and the local mob makes things little more complicatedbr br i didnt have high hopes for this un since being strongly let down but the weak batman sub zero robin isnt featured so much herebut i was delighted with the imaginative and exciting set pieces the clever plot and a cheeky sense of humor this is definitely a movie no fan of batman should be without keep your ears open for a really catchy song called betcha neva which is featured prominently throughoutbr br its a shame the dvd isnt so great

In [13]:
standardized_ = custom_standardization(first_review.numpy())
print(standardized_)

tf.Tensor(b'the original animated dark knight returns in this ace adventure movie that rivals mask of phantasm in its coolness theres a lot of style and intelligence in mystery of the batwoman so much more than batman forever or batman and robinbr br theres a new crimefighter on the streets of gotham she dresses like a bat but shes not a grownup batgirl and batman is denying any affiliation with her meanwhile bruce wayne has to deal with the usual romances and detective work but the penguin bain and the local mob makes things little more complicatedbr br i didnt have high hopes for this un since being strongly let down but the weak batman sub zero robin isnt featured so much herebut i was delighted with the imaginative and exciting set pieces the clever plot and a cheeky sense of humor this is definitely a movie no fan of batman should be without keep your ears open for a really catchy song called betcha neva which is featured prominently throughoutbr br its a shame the dvd isnt so gre

## Create a classification model: Continuous Bag of Words model.
- Now vectorize_layer can be used as the first layer of your end-to-end classification model, feeding transformed strings into the Embedding layer.

In [14]:
embedding_dim = 16
model = tf.keras.Sequential([
    vectorize_layer, 
    tf.keras.layers.Embedding(vocab_size, embedding_dim, name='embedding'),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1)
])

## Compile & Train the model:

In [15]:
model.compile(optimizer='adam', 
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy']
             )

model.fit(train_ds, validation_data=val_ds, epochs=15, callbacks=[tensorboard_callback])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x277974b6fd0>

In [16]:
# -- Visualize the graph of performance
# (env) d:drive>python -m tensorboard.main --logdir=logs/   <-- worked for me
# %load_ext tensorboard
# %tensorboard --logdir logs

## Retrieve the trained word embeddings and save them to disk

In [17]:
# -- Write the weights to disk
# 1. file of vectors containing the embedding : tsv- tab separated format
# 2. file of metadata containing words.
# -----------------------------------------------------------------------
# weights = model.get_layer('embedding').get_weights()[0]
# vocab = vectorize_layer.get_vocabulary()

# out_v = io.open(LOG_DIR, 'w', encoding='utf-8')
# out_m = io.open(LOG_DIR, 'w', encoding='utf-8')

# for idx, word in enumerate(vocab):
#     if idx == 0:
#         continue
#     vec = weights[idx]
#     out_v.write('\t'.join([str(x) for x in vec]) + '\n')
#     out_m.write(word + "\n")
# out_v.close()
# out_m.close()

In [18]:
loss, accuracy = model.evaluate(val_ds)
print(f"Accuracy: {accuracy:.2f} | Loss: {loss:.2f}")

Accuracy: 0.77 | Loss: 0.44


In [19]:
# -- fetch the test dataset and make predictions on that.
# -- Improve the accuracy of the mode as the model is overfitting on the training.