##### Copyright 2018 The TensorFlow Authors.

In [0]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Text classification with an RNN for TensorFlow Lite

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/tutorials/text/text_classification_rnn"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/text/text_classification_rnn.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/docs/blob/master/site/en/tutorials/text/text_classification_rnn.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
  <td>
    <a href="https://storage.googleapis.com/tensorflow_docs/docs/site/en/tutorials/text/text_classification_rnn.ipynb"><img src="https://www.tensorflow.org/images/download_logo_32px.png" />Download notebook</a>
  </td>
</table>

This text classification tutorial trains a [recurrent neural network](https://developers.google.com/machine-learning/glossary/#recurrent_neural_network) on the [IMDB large movie review dataset](http://ai.stanford.edu/~amaas/data/sentiment/) for sentiment analysis.

## Setup

In [0]:
!pip install tensorflow
!pip install tensorflow-text
import tensorflow_datasets as tfds
import tensorflow as tf
import tensorflow_text as text

## Setup input pipeline


The IMDB large movie review dataset is a *binary classification* dataset—all the reviews have either a *positive* or *negative* sentiment.

Download the dataset using [TFDS](https://www.tensorflow.org/datasets).


In [0]:
# Fetch the IMDB reviews through TFDS together with the dataset metadata.
# `as_supervised=True` makes every example a 2-tuple instead of a dict.
dataset, info = tfds.load('imdb_reviews', as_supervised=True, with_info=True)
train_examples = dataset['train']
test_examples = dataset['test']

# Peek at a few raw examples before any encoding is applied.
for ex in train_examples.take(4):
  print(ex)

Create our custom encoder based on `tfds.features.text.TextEncoder`.

In [0]:
import binascii
import sys
import tensorflow_text as text

class HashedTextEncoder(tfds.features.text.TextEncoder):
  """Text encoder that maps each word to a hashed integer id.

  Words are hashed with a Python port of SuperFastHash and the result is
  masked to 13 bits, so no vocabulary file is required. Id 0 is left free
  for padding, which is why every hash is shifted up by one before it is
  returned. The mapping is one-way: ids cannot be decoded back into text.
  """

  # Final hashes are reduced to their low 13 bits, i.e. the range 0..8191.
  _HASH_MASK = 0x1FFF

  def __init__(self):
    """Constructs HashedTextEncoder (takes no arguments)."""

  def encode(self, s):
    """Converts text into a list of word ids.

    Args:
      s: the text to encode; passed through `tf.compat.as_text` first.

    Returns:
      A list with one int per non-empty, space-separated, lower-cased word.
      Each id is the word's masked hash plus one, so ids lie in 1..8192.
    """
    s = tf.compat.as_text(s).lower()
    # Splitting on a single space yields empty strings for repeated spaces;
    # those are skipped rather than hashed.
    ids = [self.superFastHash(word) for word in s.split(" ") if word]
    return self.pad_incr(ids)

  def pad_incr(self, ids):
    """Add 1 to ids to account for pad."""
    return [i + 1 for i in ids]

  def decode(self, ids):
    """Not supported: hashing discards the original words.

    Raises:
      NotImplementedError: always.
    """
    raise NotImplementedError(
        "HashedTextEncoder cannot decode ids: hashing is one-way.")

  @classmethod
  def load_from_file(cls, filename_prefix=None):
    """Not supported: the encoder has no state to load.

    Raises:
      NotImplementedError: always.
    """
    raise NotImplementedError(
        "HashedTextEncoder has no vocabulary file to load.")

  def save_to_file(self, filename_prefix=None):
    """Not supported: the encoder has no state to save.

    Raises:
      NotImplementedError: always.
    """
    raise NotImplementedError(
        "HashedTextEncoder has no vocabulary file to save.")

  @property
  def vocab_size(self):
    """Number of distinct ids, counting the padding id 0.

    Hashes span 0.._HASH_MASK and are shifted up by one in `pad_incr`, so
    ids span 0.._HASH_MASK + 1, which is _HASH_MASK + 2 values (8193).
    """
    return self._HASH_MASK + 2

  def get16bits(self, data):
    """Returns the first two bytes of `data` as a little-endian integer.

    Args:
      data: a bytes-like object.

    Returns:
      data[0] + (data[1] << 8); when fewer than two bytes are available
      only the bytes that are present contribute.
    """
    return int.from_bytes(data[:2], "little")

  def superFastHash(self, data):
    """Hashes a word with SuperFastHash and keeps the low 13 bits.

    Args:
      data: the word as a `str`; non-ASCII characters are dropped before
        hashing.

    Returns:
      An int in 0..8191. A word with no ASCII characters hashes to 0.
    """
    # Start by stripping out every non-ASCII character.
    data = data.encode("ascii", "ignore")

    length = len(data)
    if length == 0:
      return 0

    # The hash is seeded with the byte length of the word.
    h = length
    rem = length & 3

    # Main loop: mix the input four bytes at a time.
    for _ in range(length >> 2):
      h += self.get16bits(data) & 0xFFFFFFFF
      tmp = (self.get16bits(data[2:]) << 11) ^ h
      h = ((h << 16) & 0xFFFFFFFF) ^ tmp
      data = data[4:]
      h += h >> 11
      h = h & 0xFFFFFFFF

    # Fold in the 1-3 trailing bytes that did not fill a 4-byte group.
    # NOTE(review): in these branches `h` is not masked to 32 bits before
    # the right shifts, so rare overflow cases may differ from a C
    # `uint32_t` implementation. Left unchanged to keep ids stable --
    # confirm against whatever hash the consumer of the model uses.
    if rem == 3:
      h += self.get16bits(data)
      h ^= (h << 16) & 0xFFFFFFFF
      h ^= (data[2] << 18) & 0xFFFFFFFF
      h += h >> 11
    elif rem == 2:
      h += self.get16bits(data)
      h ^= (h << 11) & 0xFFFFFFFF
      h += h >> 17
    elif rem == 1:
      h += data[0]
      h ^= (h << 10) & 0xFFFFFFFF
      h += h >> 1

    # Final avalanche rounds.
    h = h & 0xFFFFFFFF
    h ^= (h << 3) & 0xFFFFFFFF
    h += h >> 5
    h = h & 0xFFFFFFFF
    h ^= (h << 4) & 0xFFFFFFFF
    h += h >> 17
    h = h & 0xFFFFFFFF
    h ^= (h << 25) & 0xFFFFFFFF
    h += h >> 6

    # Shorter version: throw away the top bits.
    return h & self._HASH_MASK

This text encoder converts words to hashes.

In [0]:
encoder = HashedTextEncoder()

# The sample deliberately ends with a non-ASCII character so the encoder's
# filtering of such characters is exercised.
sample_string = 'Hello TensorFlow, this is a @fun test. Fichier non trouvé'
encoded_string = encoder.encode(sample_string)
print('Encoded string is {}'.format(encoded_string))

# There is no decode step to show: hashing cannot be reversed.

## Prepare the data for training

Now run the encoder on the dataset by wrapping it in `tf.py_function` and passing that to the dataset's map method.

In [0]:
def encode(text_tensor, label):
  """Encodes one example's text with the module-level `encoder`.

  Args:
    text_tensor: a tensor exposing `.numpy()`, holding the raw text.
    label: the example's label; returned untouched.

  Returns:
    A `(ids, label)` tuple where `ids` is the list produced by
    `encoder.encode`.
  """
  raw_text = text_tensor.numpy()
  return encoder.encode(raw_text), label

You want to use `Dataset.map` to apply this function to each element of the dataset. `Dataset.map` runs in graph mode.

*   Graph tensors do not have a value.
*   In graph mode you can only use TensorFlow Ops and functions.

So you can't `.map` this function directly: You need to wrap it in a `tf.py_function`. The `tf.py_function` will pass regular tensors (with a value and a `.numpy()` method to access it), to the wrapped python function.

In [0]:
# Size of the buffer handed to `Dataset.shuffle` when the batches are built.
BUFFER_SIZE = 50000
# Number of examples per batch handed to `Dataset.padded_batch`.
BATCH_SIZE = 16

def encode_map_fn(text, label):
  """`Dataset.map`-compatible wrapper around the Python `encode` function.

  Args:
    text: the text component of a dataset element.
    label: the label component of a dataset element.

  Returns:
    An `(ids, label)` pair of int64 tensors: `ids` is declared as a vector
    of unknown length and `label` as a scalar.
  """
  ids, wrapped_label = tf.py_function(
      encode,
      inp=[text, label],
      Tout=(tf.int64, tf.int64))

  # `tf.py_function` returns tensors without static shapes, and
  # `tf.data.Datasets` work best when every component has one, so the
  # shapes are declared by hand.
  ids.set_shape([None])
  wrapped_label.set_shape([])

  return ids, wrapped_label


# Training split: encode every example, then shuffle and batch. Each batch
# is padded so that all of its sequences share one length.
train_encoded = train_examples.map(encode_map_fn)
train_batches = train_encoded.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)

# Test split: same pipeline.
test_encoded = test_examples.map(encode_map_fn)
test_batches = test_encoded.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)

Let's take a look at one of these batches to see how the encoded data looks.

In [0]:
# Print the leading rows of one encoded batch together with its labels.
for train_example, train_label in train_batches.take(1):
  leading_rows = train_example[:10]
  print('Encoded text:', leading_rows.numpy())
  print('Label:', train_label.numpy())

# The second dimension differs between batches because each batch is padded
# independently.
for example_batch, label_batch in train_batches.take(2):
  print("Batch shape:", example_batch.shape)
  print("label shape:", label_batch.shape)

## Create the model

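Build a `tf.keras.Sequential` model. The first layer is an embedding layer whose input dimension must cover every id the encoder can emit: hashes are masked to 13 bits (0–8191) and then incremented by one because 0 is reserved for padding, so ids go up to 8192 and the layer needs 8193 entries. The `input_length` parameter is needed so that we don't get the following error when converting the model to TensorFlow Lite: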

`None is only supported in the 1st dimension. Tensor 'embedding_input' has invalid shape '[None, None]'.`

A recurrent neural network (RNN) processes sequence input by iterating through the elements. RNNs pass the outputs from one timestep to their input—and then to the next.

The `tf.keras.layers.Bidirectional` wrapper can also be used with an RNN layer. This propagates the input forward and backwards through the RNN layer and then concatenates the output. This helps the RNN to learn long range dependencies.

In [0]:
# Alternative recurrent architecture, kept for reference only:
#
# model = tf.keras.Sequential([
#    tf.keras.layers.Embedding(32767,64, input_length=32),
#    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
#    tf.keras.layers.Dense(64, activation='relu'),
#    tf.keras.layers.Dense(1)
#])

# Simpler model from https://www.tensorflow.org/tutorials/keras/text_classification
# assembled one layer at a time: embed the ids, average over the sequence,
# and project to a single output.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(8193, 16, input_length=16))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(1))

model.summary()

Please note that we chose to use a Keras Sequential model here since every layer in the model has a single input and produces a single output. If you want to use a stateful RNN layer, you might want to build your model with the Keras functional API or model subclassing so that you can retrieve and reuse the RNN layer states. Please check the [Keras RNN guide](https://www.tensorflow.org/guide/keras/rnn#rnn_state_reuse) for more details.

Compile the Keras model to configure the training process:

In [0]:
# The loss is told to expect raw logits; accuracy is tracked during training.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

## Train the model

We train on a reduced data set to speed up verification of the technique. The whole set can be used once we know the process works.

In [0]:
# One epoch over the first 30 training batches; validation draws from the
# first 30 test batches and is limited to 2 steps.
history = model.fit(
    train_batches.take(30),
    validation_data=test_batches.take(30),
    validation_steps=2,
    epochs=1,
)

In [0]:
# Evaluate on every test batch and report the two metrics.
test_loss, test_acc = model.evaluate(test_batches)

for template, value in (('Test Loss: {}', test_loss),
                        ('Test Accuracy: {}', test_acc)):
  print(template.format(value))

In [0]:
# Save the trained model under the relative path below (resolved against the
# notebook's current working directory).
model.save("saved_model/textclassification_model") 

The above model does not mask the padding applied to the sequences. This can lead to skew if it is trained on padded sequences and tested on un-padded sequences. Ideally you would [use masking](../../guide/keras/masking_and_padding) to avoid this, but as you can see below it only has a small effect on the output.

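The model outputs a logit (its final `Dense(1)` layer has no activation and the loss is configured with `from_logits=True`), so if the prediction is >= 0.0 it is positive, otherwise it is negative.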

## Test the model

In [0]:
def pad_to_size(vec, size):
  """Returns a copy of `vec` right-padded with zeros up to `size` items.

  The input is left untouched; earlier versions extended `vec` in place,
  which silently changed the caller's list.

  Args:
    vec: a sequence of ids.
    size: the minimum length of the result.

  Returns:
    A new list starting with the items of `vec`, followed by as many zeros
    as are needed to reach `size`. If `vec` already has `size` or more
    items, no padding is added and nothing is truncated.
  """
  missing = max(size - len(vec), 0)
  return list(vec) + [0] * missing

In [0]:
def sample_predict(sample_pred_text, pad):
  """Runs the module-level `model` on a single piece of text.

  Args:
    sample_pred_text: the text to classify.
    pad: when truthy, the encoded ids are zero-padded to 64 entries via
      `pad_to_size` before prediction.

  Returns:
    Whatever `model.predict` returns for a batch holding this one example.
  """
  token_ids = encoder.encode(sample_pred_text)
  if pad:
    token_ids = pad_to_size(token_ids, 64)

  # Cast to float32 and add a leading batch dimension of size one.
  batch = tf.expand_dims(tf.cast(token_ids, tf.float32), 0)
  return model.predict(batch)

In [0]:
# Score a positive-sounding review exactly as encoded, with no padding added.
sample_pred_text = ('The movie was cool. The animation and the graphics '
                    'were out of this world. I would recommend this movie.')

predictions = sample_predict(sample_pred_text, pad=False)
print(predictions)

In [0]:
# Score one positive-sounding and one negative-sounding review, both with
# padding enabled.
sample_pred_text = ('The movie was fantastic. The animation and the graphics '
                    'were out of this world. I would recommend this movie. Loved every minute of it. A cast of famous people')
sample_pred_text2 = ('This was rubbish, wont be going again. Hated it. Totally pants')

for padded_sample in (sample_pred_text, sample_pred_text2):
  predictions = sample_predict(padded_sample, pad=True)
  print(predictions)

# Export the model

Install TinyML Gen to convert the model to C format. Internally it uses the TFLiteConverter to create the TFLite image and then formats it as a large C array.

In [0]:
!pip install tinymlgen

In [0]:
from tinymlgen import port

# Convert the trained Keras model with tinymlgen's `port`; the result is kept
# in `c_code` and written to a file in a later cell.
c_code = port(model,optimize=True,pretty_print=True)

# Only the length is printed here, not the contents.
print(len(c_code))

In [0]:
# The c_code string was too big to dump to the screen, so it is saved to
# Google Drive instead. Mounting makes Drive available under /content/gdrive
# (Colab only).
from google.colab import drive 
drive.mount('/content/gdrive') 

In [0]:
# Write the generated C source to the mounted Google Drive. The `with` block
# guarantees the file is closed even if the write fails part-way through.
with open(r"/content/gdrive/My Drive/text_model.h", "w") as c_file:
  c_file.write(c_code)