In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds

In [3]:
imdb,info=tfds.load('imdb_reviews',with_info=True,as_supervised=True)

In [4]:
train_batch,test_batch=imdb['train'],imdb['test']

## Data Preprocessing

In [5]:
training_cat,training_label=[],[]
testing_cat,testing_label=[],[]
for cat,label in train_batch:
    training_cat.append(str(cat.numpy()))
    training_label.append(label.numpy())
    
for cat,label in test_batch:
    testing_cat.append(str(cat.numpy()))
    testing_label.append(label.numpy())



In [7]:
### Label conversion to numpy
training_label=np.array(training_label)
testing_label=np.array(testing_label)

In [8]:
testing_label.shape

(25000,)

In [11]:
training_cat[1]

"b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development was constant. Constantly slow and boring. Things seemed to happen, but with no explanation of what was causing them or why. I admit, I may have missed part of the film, but i watched the majority of it and everything just seemed to happen of its own accord without any real concern for anything else. I cant recommend this film at all.'"

### Tokenization

In [12]:
## Hyper parameters
vocab_size=10000
embedded_dim=16
truncating='post'
oov_name='<OOV>'
max_word_len=120

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [14]:
token=Tokenizer(vocab_size,oov_token=oov_name)
token.fit_on_texts(training_cat)
word_index=token.word_index

In [21]:
word_index['fellow']

1508

In [22]:
train_seq=token.texts_to_sequences(training_cat)
test_seq=token.texts_to_sequences(testing_cat)

### Padding sequence

In [23]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [24]:
train_pad_seq=pad_sequences(train_seq,maxlen=max_word_len,padding='post',truncating=truncating)
test_pad_seq=pad_sequences(test_seq,maxlen=max_word_len,padding='post',truncating=truncating)

In [25]:
train_pad_seq.shape

(25000, 120)

In [26]:
train_pad_seq[2]

array([   1, 6175,    2,    1, 4916, 4029,    9,    4,  912, 1622,    3,
       1969, 1307,    3, 2384, 8836,  201,  746,  361,   15,   34,  208,
        308,    6,   83,    8,    8,   19,  214,   22,  352,    4,    1,
        990,    2,   82,    5, 3608,  545,    1,    6,    1,  539,    4,
          1,  434,    4,    1,    3,    6,    1,    2, 1176,  539,   95,
          1, 8111,   10,   46,   22,    2, 1996,   16, 1153,    5,    2,
        511,    8,    8,  163,   62, 2624, 7315,   13,  586,   22,    2,
       2297,  507,    5,    2, 3652,  317,    2,    1, 1835, 3445,  451,
       4030,    3, 1168,  985,    6,   28, 4091, 3608,  545,   16,    1,
          2, 2297, 2430,   16,    2,  299, 1357, 1259,    8,    8, 2297,
        803,   29, 2871,   16,    4,    1, 3028,  564,    5,  746],
      dtype=int32)

### Model Training

In [27]:
from tensorflow.keras.layers import Dense, Flatten,Embedding

In [33]:
model=keras.Sequential()
model.add(Embedding(input_dim=vocab_size,output_dim=embedded_dim,input_length=max_word_len))
model.add(Flatten())
model.add(Dense(units=6,activation=tf.nn.relu))
model.add(Dense(units=1,activation=tf.nn.sigmoid))
model.compile(optimizer='adam',
              loss=keras.losses.BinaryCrossentropy(from_logits=True),
                           metrics=['accuracy'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
flatten_2 (Flatten)          (None, 1920)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 11526     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [37]:
## Call backs
class MyCallback(keras.callbacks.Callback):
    def on_epoch_end(self,epoch,logs={}):
        if logs.get('accuracy')>0.80:
            print('\n')
            print('Crossed 80% accuracy. Stopping training')
            self.model_stop_training=True

In [38]:
callback=MyCallback()

In [36]:
model.fit(train_pad_seq,training_label,epochs=10,validation_data=(test_pad_seq,testing_label),callbacks=[callback])

Train on 25000 samples, validate on 25000 samples
Epoch 1/10

Crossed 80% accuracy. Stopping training
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f9973c71850>