##### Copyright 2018 The TensorFlow Authors.

In [0]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Text classification with an RNN for TensorFlow Lite

This text classification tutorial trains a [recurrent neural network](https://developers.google.com/machine-learning/glossary/#recurrent_neural_network) on the [IMDB large movie review dataset](http://ai.stanford.edu/~amaas/data/sentiment/) for sentiment analysis.

It is based on a number of TensorFlow examples combined with my own code.

## Setup

In [0]:
!pip install tensorflow
!pip install tensorflow-text
import tensorflow_datasets as tfds
import tensorflow as tf
import tensorflow_text as text

## Setup input pipeline


The IMDB large movie review dataset is a *binary classification* dataset—all the reviews have either a *positive* or *negative* sentiment.

Download the dataset using [TFDS](https://www.tensorflow.org/datasets).


In [0]:
# Fetch the IMDB reviews through TFDS, together with the dataset metadata.
dataset, info = tfds.load(
    'imdb_reviews',
    as_supervised=True,
    with_info=True,
)
train_examples = dataset['train']
test_examples = dataset['test']

# Peek at a few raw examples before any encoding is applied.
for ex in train_examples.take(4):
  print(ex)

Create our custom encoder based on `tfds.features.text.TextEncoder`.

In [0]:
import binascii
import sys
import tensorflow_text as text
from math import floor

class HashedTextEncoder(tfds.features.text.TextEncoder):
  """Encodes text as a list of small integer word hashes (PySuperFastHash).

  The text is lower-cased and split on single spaces. Each non-empty piece is
  hashed with `superFastHash`, which keeps only the low 10 bits, and the
  result is shifted up by one so that id 0 stays free for padding. Ids are
  therefore in the range 1..1024.

  The mapping is one-way: `decode` and the file (de)serialisation hooks are
  not supported.
  """

  # NOTE(review): newer TFDS releases expose this base class under
  # `tfds.deprecated.text` -- confirm against the installed version.

  # Mask applied to the final hash value; keeps the low 10 bits (0..1023).
  HASH_MASK = 0x3FF

  def __init__(self, max_words=16):
    """Constructs HashedTextEncoder.

    Args:
      max_words: number of leading space-separated pieces of the input that
        are considered by `encode`. `None` disables the limit.
    """
    self._max_words = max_words

  def encode(self, s):
    """Converts `s` into a list of hashed word ids.

    Args:
      s: text to encode (anything accepted by `tf.compat.as_text`).

    Returns:
      A list of ints, one per non-empty piece among the first `max_words`
      space-separated pieces of the lower-cased text.
    """
    s = tf.compat.as_text(s).lower()
    # The slice is taken BEFORE empty pieces are dropped, so runs of spaces
    # still count towards the limit.
    words = s.split(" ")[:self._max_words]
    ids = [self.superFastHash(word) for word in words if word]
    return self.pad_incr(ids)

  def pad_incr(self, ids):
    """Add 1 to ids to account for pad."""
    return [i + 1 for i in ids]

  def decode(self, ids):
    """Not supported: hashing cannot be reversed."""
    raise NotImplementedError

  @classmethod
  def load_from_file(cls, filename_prefix=None):
    """Not supported: the encoder has no state to load."""
    raise NotImplementedError

  def save_to_file(self, filename_prefix=None):
    """Not supported: the encoder has no state to save."""
    raise NotImplementedError

  @property
  def vocab_size(self):
    """Number of distinct ids `encode` can emit, including the pad id 0."""
    # Hash values span 0..HASH_MASK; +1 for the pad shift, +1 for id 0 itself.
    return self.HASH_MASK + 2

  def get16bits(self, data):
    """Returns the first two bytes of `data` as a little-endian integer."""
    return int.from_bytes(data[:2], "little")

  def superFastHash(self, data):
    """Hashes the string `data` down to a value in 0..HASH_MASK.

    Non-ASCII characters are dropped first; if nothing is left the hash is 0.

    NOTE(review): some intermediate sums are not masked to 32 bits before
    being shifted right, so carries above bit 31 can reach the result. This
    is kept exactly as it was so that ids stay stable for trained models.
    """
    # Start by stripping out non-ASCII data.
    data = data.encode("ascii", "ignore")

    h = length = len(data)
    if length == 0:
      return 0

    rem = length & 3   # trailing bytes that do not fill a 4-byte group
    length >>= 2       # number of complete 4-byte groups

    # Main loop: consume the input four bytes at a time.
    while length > 0:
      h += self.get16bits(data) & 0xFFFFFFFF
      tmp = (self.get16bits(data[2:]) << 11) ^ h
      h = ((h << 16) & 0xFFFFFFFF) ^ tmp
      data = data[4:]
      h += h >> 11
      h = h & 0xFFFFFFFF
      length -= 1

    # Mix in the 1-3 left-over bytes.
    if rem == 3:
      h += self.get16bits(data)
      h ^= (h << 16) & 0xFFFFFFFF
      h ^= (data[2] << 18) & 0xFFFFFFFF
      h += h >> 11
    elif rem == 2:
      h += self.get16bits(data)
      h ^= (h << 11) & 0xFFFFFFFF
      h += h >> 17
    elif rem == 1:
      h += data[0]
      h ^= (h << 10) & 0xFFFFFFFF
      h += h >> 1

    # Final mixing of the accumulated value.
    h = h & 0xFFFFFFFF
    h ^= (h << 3) & 0xFFFFFFFF
    h += h >> 5
    h = h & 0xFFFFFFFF
    h ^= (h << 4) & 0xFFFFFFFF
    h += h >> 17
    h = h & 0xFFFFFFFF
    h ^= (h << 25) & 0xFFFFFFFF
    h += h >> 6

    # Shorter version: throw away the top bits.
    return h & self.HASH_MASK

This text encoder converts words to hashes.

In [0]:
# The accented character exercises the non-ASCII filtering, and the sentence
# is long enough to exercise the "too long; didn't read" word limit.
sample_string = ('Hello TensorFlow, this is a @fun test. Fichier non trouvé, '
                 'now check that the too long didnt read function is also working')

encoder = HashedTextEncoder()
encoded_string = encoder.encode(sample_string)

print('Encoded string is {}'.format(encoded_string))

# A hash function is one-way, so the ids cannot be turned back into text.

## Prepare the data for training

Now run the encoder on the dataset by wrapping it in `tf.py_function` and passing that to the dataset's map method.

In [0]:
def encode(text_tensor, label):
  """Hashes one review with the notebook-level `encoder`.

  Args:
    text_tensor: tensor exposing `.numpy()`, holding the review text.
    label: passed through unchanged.

  Returns:
    A `(encoded_text, label)` tuple.
  """
  raw_text = text_tensor.numpy()
  return encoder.encode(raw_text), label

You want to use `Dataset.map` to apply this function to each element of the dataset. `Dataset.map` runs in graph mode.

*   Graph tensors do not have a value.
*   In graph mode you can only use TensorFlow Ops and functions.

So you can't `.map` this function directly: You need to wrap it in a `tf.py_function`. The `tf.py_function` will pass regular tensors (with a value and a `.numpy()` method to access it), to the wrapped python function.

Note that this means there is delayed execution of these functions and you don't see them run until you access them or process them through a model.

Question: Can we downsize the int here, or do we wait until the quantisation step? https://stackoverflow.com/questions/22725043/convert-dtype-from-int64-to-int32

In [0]:
def encode_map_fn(text, label):
  """Wraps `encode` in `tf.py_function` so it can be used with `Dataset.map`.

  Args:
    text: review text tensor.
    label: label tensor.

  Returns:
    A `(encoded_text, label)` pair of int64 tensors with static shapes set.
  """
  outputs = tf.py_function(encode, inp=[text, label], Tout=(tf.int64, tf.int64))
  encoded_text, label = outputs

  # py_function leaves the output shapes unknown, and `tf.data.Datasets` work
  # best when every component has a shape, so set them by hand: the label is
  # a scalar and the encoded text is a vector of unknown length.
  label.set_shape([])
  encoded_text.set_shape([None])

  return encoded_text, label

In [0]:
BUFFER_SIZE = 50000
BATCH_SIZE = 100


def _shuffle_and_batch(encoded):
  """Shuffles `encoded`, then groups it into padded batches."""
  shuffled = encoded.shuffle(BUFFER_SIZE)
  return shuffled.padded_batch(BATCH_SIZE)


# Encode every example with the hashing encoder.
train_encoded = train_examples.map(encode_map_fn)
test_encoded = test_examples.map(encode_map_fn)

train_batches = _shuffle_and_batch(train_encoded)
test_batches = _shuffle_and_batch(test_encoded)

Let's take a look at one of these batches to see how the data now looks.

In [0]:
# Pull a single batch and show its first 10 encoded rows plus all its labels.
for train_example, train_label in train_batches.take(1):
  print('Encoded text:', train_example[:10].numpy())
  print('Label:', train_label.numpy())

# Pull one batch again and report the tensor shapes.
# NOTE(review): train_batches is shuffled, so this is presumably not the same
# batch as the one printed above.
for example_batch, label_batch in train_batches.take(1):
  print("Batch shape:", example_batch.shape)
  print("label shape:", label_batch.shape)

## Create the model

Build a `tf.keras.Sequential` model. The first (embedding) layer needs an input dimension of 1025: the hashes are 10 bits wide (0–1023), the encoder adds 1 to each of them, and 0 is used for padding.

When using the Embedding layer, the `input_length` parameter is needed so that we don't get the following error when converting the model to TensorFlow Lite.

https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding

`None is only supported in the 1st dimension. Tensor 'embedding_input' has invalid shape '[None, None]'.`

Otherwise use the input_shape parameter

A recurrent neural network (RNN) processes sequence input by iterating through the elements. RNNs pass the outputs from one timestep to their input—and then to the next.

The `tf.keras.layers.Bidirectional` wrapper can also be used with an RNN layer. This propagates the input forward and backwards through the RNN layer and then concatenates the output. This helps the RNN to learn long range dependencies.

In [0]:
# Original model.
# model = tf.keras.Sequential([
#    tf.keras.layers.Embedding(32767,64, input_length=32),
#    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
#    tf.keras.layers.Dense(64, activation='relu'),
#    tf.keras.layers.Dense(1)
#])

# This one seems to cause problems with optimisation steps.
# model = tf.keras.Sequential([
#    tf.keras.layers.Embedding(1025,16, input_length=16),
#    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
#    tf.keras.layers.Dense(16, activation='relu'),
#    tf.keras.layers.Dense(1)
#])

# Experiment that does not seem to work, can't get our tensors in the right shape for bidirectional
# Could it be done by modifying the map function?
# model = tf.keras.Sequential([
#  tf.keras.layers.Reshape((-1,3),input_shape=(16,1)),
#  tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
#  tf.keras.layers.Dense(64, activation='relu'),
#  tf.keras.layers.Dense(1)
#  ])

# Simpler model from https://www.tensorflow.org/tutorials/keras/text_classification
# The active model: 1025 possible input ids, 8-dimensional embeddings and
# sequences of 16 ids, averaged over the sequence and fed through two dense
# layers. The last layer has no activation, so the model outputs logits.
model = tf.keras.Sequential([
  tf.keras.layers.Embedding(1025, 8,input_length=16),
  tf.keras.layers.GlobalAveragePooling1D(),
  tf.keras.layers.Dense(8, activation='relu'),
  tf.keras.layers.Dense(1)
  ])

# Print the layer-by-layer overview of the model.
model.summary()

Please note that we choose the Keras Sequential model here since all the layers in the model have a single input and produce a single output. If you want to use a stateful RNN layer, you might want to build your model with the Keras functional API or model subclassing so that you can retrieve and reuse the RNN layer states. Please check the [Keras RNN guide](https://www.tensorflow.org/guide/keras/rnn#rnn_state_reuse) for more details.

Compile the Keras model to configure the training process:

Optimisers - https://www.tensorflow.org/api_docs/python/tf/keras/optimizers

In [0]:
# The model emits raw logits, hence from_logits=True on the loss.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(1e-3),
    metrics=['accuracy'],
)

## Train the model

To speed up verification of the technique, validation runs on reduced data: `.take(20)` is applied to the test batches and the number of validation steps is reduced. We can use the whole set once we know the process works.

In [0]:
# Train on all training batches for 30 epochs. Validation only sees a small
# slice of the test data (take(20), with validation_steps=5) to keep it fast.
history = model.fit(train_batches, epochs=30,
                    validation_data=test_batches.take(20), 
                    validation_steps=5)

In [0]:
# Score the trained model on the complete set of test batches.
test_loss, test_acc = model.evaluate(test_batches)

for template, value in (('Test Loss: {}', test_loss),
                        ('Test Accuracy: {}', test_acc)):
  print(template.format(value))

In [0]:
# Save the trained Keras model under the path "textclassification_model"
# (relative to the notebook's working directory).
model.save("textclassification_model") 

The above model does not mask the padding applied to the sequences. This can lead to skew if it is trained on padded sequences and tested on un-padded sequences. Ideally you would [use masking](../../guide/keras/masking_and_padding) to avoid this, but as you can see below it only has a small effect on the output.

If the prediction is >= 0.5, it is positive else it is negative.

## Test the model

In [0]:
def pad_to_size(vec, size):
  """Returns a copy of `vec` right-padded with zeros up to `size` elements.

  The input is left untouched, so re-running a cell that pads the same list
  gives the same result every time.

  Args:
    vec: sequence of ids to pad.
    size: target length.

  Returns:
    A new list. If `vec` already has `size` or more elements it is copied
    unchanged (nothing is truncated).
  """
  missing = max(size - len(vec), 0)
  return list(vec) + [0] * missing

In [0]:
def sample_predict(sample_pred_text, pad):
  """Runs the notebook-level `model` on a single piece of text.

  Args:
    sample_pred_text: text to classify.
    pad: when truthy, the encoded ids are zero-padded to 16 entries first.

  Returns:
    The result of `model.predict` for a batch holding just this sample.
  """
  token_ids = encoder.encode(sample_pred_text)
  if pad:
    token_ids = pad_to_size(token_ids, 16)

  # Add a leading batch dimension before handing the ids to the model.
  batch = tf.expand_dims(token_ids, 0)
  return model.predict(batch)

In [0]:
# Predict on a sample text without padding: the encoded ids are passed to the
# model as they come out of the encoder.

sample_pred_text = ('The movie was cool. The animation and the graphics '
                    'were out of this world. I would recommend this movie.')
predictions = sample_predict(sample_pred_text, pad=False)
print(predictions)

In [0]:
# Predict on sample texts with padding: the encoded ids are zero-padded to
# the fixed length used by sample_predict before being fed to the model.

sample_pred_text = ('The movie was fantastic. The animation and the graphics '
                    'were out of this world. I would recommend this movie. Loved every minute of it. A cast of famous people')
predictions = sample_predict(sample_pred_text, pad=True)
print(predictions)

# A clearly negative review.
print(sample_predict('This was rubbish, wont be going again. Hated it. Totally pants', pad=True))

# A short, clearly positive review.
print(sample_predict('Amazing film, loved seeing this', pad=True))


# Export the model

Convert the model to TFLite then format as a big C array.

Based on https://github.com/eloquentarduino/tinymlgen/blob/master/tinymlgen/tinymlgen.py

Ref https://blog.tensorflow.org/2019/06/tensorflow-integer-quantization.html

In [0]:
!pip install hexdump

In [0]:
#Experimenting with optimisations

import re
import hexdump
import tensorflow as tf

def port(model, optimize=True, variable_name='model_data', pretty_print=False):
    """Converts a Keras model to TFLite and renders it as C source text.

    Args:
        model: Keras model to convert.
        optimize: `True` applies the default TFLite optimisations, a falsy
            value applies none, and any other value is assigned to
            `converter.optimizations` as given.
        variable_name: name of the generated C array; the length constant is
            named `<variable_name>_len`.
        pretty_print: when true, the array is spread over several lines with
            12 bytes per line.

    Returns:
        A string with a preprocessor preamble that defines
        `DATA_ALIGN_ATTRIBUTE`, the `const unsigned char` array holding the
        model bytes, and a `const int` with the number of bytes.
    """
    converter = tf.lite.TFLiteConverter.from_keras_model(model)
    if optimize:
        if isinstance(optimize, bool):
            # OPTIMIZE_FOR_SIZE is deprecated; DEFAULT is its replacement.
            optimizers = [tf.lite.Optimize.DEFAULT]
        else:
            optimizers = optimize

        converter.optimizations = optimizers
    tflite_model = converter.convert()

    # The converted model is a bytes object, so it can be formatted directly
    # without going through an intermediate hex dump.
    c_array = ', '.join('0x%02x' % byte for byte in tflite_model)
    c = 'const unsigned char %s[] DATA_ALIGN_ATTRIBUTE = {%s};' % (variable_name, c_array)
    if pretty_print:
        c = c.replace('{', '{\n\t').replace('}', '\n}')
        # Break the line after every group of 12 array entries.
        c = re.sub(r'(0x..?, ){12}', lambda x: '%s\n\t' % x.group(0), c)
    c += '\nconst int %s_len = %d;' % (variable_name, len(tflite_model))
    preamble = '''
#ifdef __has_attribute
#define HAVE_ATTRIBUTE(x) __has_attribute(x)
#else
#define HAVE_ATTRIBUTE(x) 0
#endif
#if HAVE_ATTRIBUTE(aligned) || (defined(__GNUC__) && !defined(__clang__))
#define DATA_ALIGN_ATTRIBUTE __attribute__((aligned(4)))
#else
#define DATA_ALIGN_ATTRIBUTE
#endif
'''
    return preamble + c

In [0]:
# Convert the trained model and render it as pretty-printed C source text.
c_code = port(model,optimize=True,pretty_print=True)

# Length of the generated source text in characters (not the model size).
print(len(c_code))

The file size needs to be < 400K to fit onto the device. Check the `model_data_len` value at the bottom of the file.
`const int model_data_len = 109840` plus a tiny bit of code comes to 90% of the available space.
But perhaps it also needs to be smaller than the available RAM to be able to run? For example, the sine model is just 2640 bytes.

In [0]:
# Write the generated C source to a header file. The context manager closes
# the file even if the write fails part-way through.
with open(r"text_model.h", "w+") as c_file:
  n = c_file.write(c_code)

# Testing the TFLite model

It is possible to reload the model back into the notebook and test it here.

Arena Size?

https://github.com/edgeimpulse/tflite-find-arena-size


Debugging TFLite
