In [None]:
import os, sys
# Put the parent of the current working directory on the import path
# (presumably where the KerasTools package imported below lives -- verify
# against the repository layout). Guarded so that re-running the cell does
# not add the same path twice.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
import KerasTools as KT
import numpy as np

import tensorflow as tf
# Restrict TensorFlow's log output to errors so the notebook stays readable.
# The top-level `tf.logging` module no longer exists in TensorFlow 2.x; the
# same API is exposed as `tf.compat.v1.logging` in later 1.x releases and in
# 2.x. Prefer the compat path when present and fall back to the legacy name
# on older 1.x installs, so the cell runs on a fresh kernel either way.
_tf_logging = tf.compat.v1.logging if hasattr(tf.compat, "v1") else tf.logging
_tf_logging.set_verbosity(_tf_logging.ERROR)

In [None]:
# Step 1

# Load the IMDB dataset: 50000 reviews from the Internet Movie Database,
# each with a binary label ('good movie' / 'bad movie').
#
# The data comes pre-split into a training set and a test set of 25000
# entries each, and the words are already tokenized into integers.
#
# Note: Preprocessing the IMDB dataset has some computational overhead, so it
# is loaded through a wrapper function (KT.datasets.imdb.load_data) that uses
# a cache.

train_split, test_split = KT.datasets.imdb.load_data(num_words=10000)
train_data, train_labels = train_split
test_data, test_labels = test_split

In [None]:
# Step 2

# Show the raw data of one training-set entry (entry 0 by default).
# train_data[entry]:   tokenized movie review
# train_labels[entry]: binary sentiment label ('0' = bad movie, '1' = good movie)
#
# The entry number is formatted into the messages rather than hard-coded, so
# the printed text stays correct when `entry` is changed.
entry = 0
print("Raw training entry No {}: {}".format(entry, train_data[entry]))
print()
print("Raw training label No {}: '{}'".format(entry, train_labels[entry]))

In [None]:
# Step 3

# Use the tokenizer's word-encoding dictionary to reconstruct the original review text.
#
# Note: Tokens 0, 1, and 2 are reserved for 'padding', 'start of sequence', and
# 'unknown word', so every raw index is shifted by 3 before it is used as a key.

raw_word_index = KT.datasets.imdb.get_word_index()  # word -> raw index
# Despite its name, `word_index` is the inverse lookup: token -> word.
word_index = {v+3:k for k,v in raw_word_index.items()}
word_index[0] = '-PAD-'
word_index[1] = '-START-'
word_index[2] = '-UNK-'

# Reconstruct the selected training entry as a string. Tokens missing from the
# dictionary are rendered with the unknown-word marker; the fallback has to be
# a string because str.join() raises TypeError on non-string items.
print(" ".join(word_index.get(w, word_index[2]) for w in train_data[entry]))