# Classify Movie Reviews using Embeddings

In [1]:
# Install the pinned TensorFlow 2.0 alpha build; the cells below import `tensorflow.keras`.
!pip install tensorflow==2.0.0.alpha0

Collecting tensorflow==2.0.0.alpha0
[?25l  Downloading https://files.pythonhosted.org/packages/29/39/f99185d39131b8333afcfe1dcdb0629c2ffc4ecfb0e4c14ca210d620e56c/tensorflow-2.0.0a0-cp36-cp36m-manylinux1_x86_64.whl (79.9MB)
[K    100% |████████████████████████████████| 79.9MB 367kB/s 
Collecting tb-nightly<1.14.0a20190302,>=1.14.0a20190301 (from tensorflow==2.0.0.alpha0)
[?25l  Downloading https://files.pythonhosted.org/packages/a9/51/aa1d756644bf4624c03844115e4ac4058eff77acd786b26315f051a4b195/tb_nightly-1.14.0a20190301-py3-none-any.whl (3.0MB)
[K    100% |████████████████████████████████| 3.0MB 11.1MB/s 
Collecting google-pasta>=0.1.2 (from tensorflow==2.0.0.alpha0)
[?25l  Downloading https://files.pythonhosted.org/packages/64/bb/f1bbc131d6294baa6085a222d29abadd012696b73dcbf8cf1bf56b9f082a/google_pasta-0.1.5-py3-none-any.whl (51kB)
[K    100% |████████████████████████████████| 61kB 24.4MB/s 
Collecting tf-estimator-nightly<1.14.0.dev2019030116,>=1.14.0.dev2019030115 (from tensor

In [6]:
# Pin NumPy to 1.16.2. The preinstalled 1.16.3 (see the uninstall log below)
# stopped allowing pickled data in `np.load` by default, which triggers an
# error in this notebook -- presumably when `imdb.load_data` reads its
# archive; TODO confirm the exact call site.
!pip install numpy==1.16.2

Collecting numpy==1.16.2
[?25l  Downloading https://files.pythonhosted.org/packages/35/d5/4f8410ac303e690144f0a0603c4b8fd3b986feb2749c435f7cdbb288f17e/numpy-1.16.2-cp36-cp36m-manylinux1_x86_64.whl (17.3MB)
[K    100% |████████████████████████████████| 17.3MB 2.1MB/s 
[31mdatascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.[0m
[31malbumentations 0.1.12 has requirement imgaug<0.2.7,>=0.2.5, but you'll have imgaug 0.2.8 which is incompatible.[0m
[?25hInstalling collected packages: numpy
  Found existing installation: numpy 1.16.3
    Uninstalling numpy-1.16.3:
      Successfully uninstalled numpy-1.16.3
Successfully installed numpy-1.16.2


In [0]:
import tensorflow as tf
from tensorflow.keras import datasets, preprocessing, models, layers
import numpy as np

## Import IMDB Reviews

In [0]:
# Vocabulary size: work with a dictionary of 20,000 words.
NUM_WORDS = 20_000

# Short alias for the Keras IMDB movie-review dataset module.
imdb = datasets.imdb

In [0]:
# Load the IMDB reviews, already encoded as lists of integer word ids and
# limited to a vocabulary of NUM_WORDS words.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=NUM_WORDS,
)

## Get Word Index To See Words

In [4]:
# Word -> integer id lookup. The first indices (0-3) are reserved for special
# tokens, so every id from the published index is shifted up by 3.
word_index = {word: rank + 3 for word, rank in imdb.get_word_index().items()}
word_index.update({
    '<PAD>': 0,
    '<START>': 1,
    '<UNK>': 2,
    '<UNUSED>': 3,
})

# Inverse lookup (id -> word), used to turn an encoded review back into text.
reverse_word_index = {idx: word for word, idx in word_index.items()}

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


## Inspect First Review

In [5]:
# Decode the first training review back into space-separated words.
print(*(reverse_word_index[word_id] for word_id in train_data[0]))

<START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for retail and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also congratulations to the two little boy's that played the <UNK> of norman and paul they were just brilliant children are often left out of the praising list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be p

## Was this a positive review?

In [6]:
# Show the label stored for the first training review.
# NOTE(review): 1 presumably means "positive" and 0 "negative" -- confirm against the Keras IMDB docs.
print(train_labels[0])

1


## Note: Reviews have variable length

In [7]:
# Report how many word ids each of the first three training reviews contains.
print('Len 0: {}, Len 1: {}, Len 2: {}'.format(*map(len, train_data[:3])))

Len 0: 218, Len 1: 189, Len 2: 141


Variable length is handled by cutting every review to a fixed number of words and padding the reviews that are shorter than that cutoff. (By default, `pad_sequences` both truncates and pads at the start of a sequence.)

In [0]:
# Fixed review length, in word ids, that both splits are truncated/padded to.
LEN_WORDS = 300

# Apply the same padding to the training and test reviews.
train_data, test_data = (
    preprocessing.sequence.pad_sequences(split, maxlen=LEN_WORDS)
    for split in (train_data, test_data)
)

In [9]:
# Re-check the first three training reviews now that they have been padded.
print('Len 0: {}, Len 1: {}, Len 2: {}'.format(*map(len, train_data[:3])))

Len 0: 300, Len 1: 300, Len 2: 300


In [13]:
# Print the dimensions of the padded training array.
print(train_data.shape)

(25000, 300)


## Sequential Model with Dense Layers

In [30]:
# Baseline: a fully connected network that receives each padded review (one
# integer word id per position) directly as its input features.
dense_model = models.Sequential([
    # The input width follows LEN_WORDS so it stays in sync with the padding
    # length instead of repeating the literal 300.
    layers.Dense(300, input_shape=(LEN_WORDS,), activation='relu'),
    layers.Dense(300, activation='relu'),
    layers.Dense(300, activation='relu'),
    layers.Dense(300, activation='relu'),
    # One sigmoid unit: a single score in (0, 1) per review.
    layers.Dense(1, activation='sigmoid')
])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
dense_model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_22 (Dense)             (None, 300)               90300     
_________________________________________________________________
dense_23 (Dense)             (None, 300)               90300     
_________________________________________________________________
dense_24 (Dense)             (None, 300)               90300     
_________________________________________________________________
dense_25 (Dense)             (None, 300)               90300     
_________________________________________________________________
dense_26 (Dense)             (None, 1)                 301       
Total params: 361,501
Trainable params: 361,501
Non-trainable params: 0
_________________________________________________________________
None


In [31]:
# Train the dense baseline as a binary classifier (Adam + binary cross-entropy).
dense_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# validation_data makes every epoch also report loss/accuracy on the held-out
# split, which was loaded but never used; it is for monitoring only and does
# not take part in the weight updates. Keeping the returned History in a
# variable stops its repr from being echoed as the cell output and leaves the
# per-epoch metrics available for later inspection.
dense_history = dense_model.fit(
    train_data,
    train_labels,
    epochs=10,
    validation_data=(test_data, test_labels),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb044caf390>

## Introduce Embeddings

In [27]:
# Size of the vector learned for each vocabulary word.
DIMENSION = 16

# Embedding model: look up a DIMENSION-sized vector for each of the LEN_WORDS
# word ids, average those vectors over the sequence, then classify the
# averaged vector with a single sigmoid unit.
e_model = models.Sequential([
    layers.Embedding(NUM_WORDS, DIMENSION, input_length=LEN_WORDS),
    layers.GlobalAveragePooling1D(),
    layers.Dense(1, activation='sigmoid')
])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
e_model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 16)           320000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 17        
Total params: 320,017
Trainable params: 320,017
Non-trainable params: 0
_________________________________________________________________
None


In [32]:
# Train the embedding model as a binary classifier (Adam + binary cross-entropy).
e_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# validation_data makes every epoch also report loss/accuracy on the held-out
# split, which was loaded but never used; it is for monitoring only and does
# not take part in the weight updates. Keeping the returned History in a
# variable stops its repr from being echoed as the cell output and leaves the
# per-epoch metrics available for later inspection.
e_history = e_model.fit(
    train_data,
    train_labels,
    epochs=10,
    validation_data=(test_data, test_labels),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb03d06b9e8>