### **1.** Preprocessing

First, we need to install `tensorflow_text`.


In [1]:
pip install -q -U "tensorflow-text==2.8.*"

[K     |████████████████████████████████| 4.9 MB 5.5 MB/s 
[K     |████████████████████████████████| 462 kB 42.7 MB/s 
[?25h

In [2]:
import tensorflow as tf 
import tensorflow_hub as hub
import tensorflow_text as text

I run this notebook on Colab, so instead of importing the module, we copy its code here.

In [26]:
import os
def preprocess_for_tfrecord(tfrecord):
    """Parse one serialized example into a ``(review, sentiment)`` pair.

    Args:
        tfrecord: A single serialized example, as yielded by a
            ``tf.data.TFRecordDataset``.

    Returns:
        A tuple ``(review, sentiment)``: the ``"review"`` feature parsed as a
        scalar string (``""`` when missing) and the ``"sentiment"`` feature
        parsed as a scalar int64 (``0`` when missing).
    """
    review_spec = tf.io.FixedLenFeature([], tf.string, default_value="")
    sentiment_spec = tf.io.FixedLenFeature([], tf.int64, default_value=0)
    schema = {"review": review_spec, "sentiment": sentiment_spec}

    parsed = tf.io.parse_single_example(tfrecord, schema)
    review, sentiment = parsed["review"], parsed["sentiment"]
    return review, sentiment

def load_tfrecord(filepaths, n_read_threads=5, shuffle_buffer_size=10000,
                  n_parse_threads=5, batch_size=32, cache=True):
    """Build a batched ``tf.data`` pipeline of parsed examples from TFRecord files.

    Pipeline order: read records -> optional cache -> optional shuffle ->
    parse with ``preprocess_for_tfrecord`` -> batch -> prefetch one batch.

    Args:
        filepaths: TFRecord file path(s) handed to ``tf.data.TFRecordDataset``.
        n_read_threads: Value passed as ``num_parallel_reads`` when reading.
        shuffle_buffer_size: Buffer size for record-level shuffling; a falsy
            value (``0`` / ``None``) disables shuffling.
        n_parse_threads: Value passed as ``num_parallel_calls`` when parsing.
        batch_size: Number of examples per batch.
        cache: When true, cache the records right after reading.

    Returns:
        A dataset yielding batches of ``(review, sentiment)``.
    """
    records = tf.data.TFRecordDataset(filepaths, num_parallel_reads=n_read_threads)
    # The cache sits before the parse step, so it stores the serialized records.
    records = records.cache() if cache else records
    # Shuffling happens on individual records, after the optional cache.
    records = records.shuffle(shuffle_buffer_size) if shuffle_buffer_size else records
    parsed = records.map(preprocess_for_tfrecord, num_parallel_calls=n_parse_threads)
    batched = parsed.batch(batch_size)
    # prefetch(1) prepares the next batch while the current one is being consumed.
    return batched.prefetch(1)


def list_files_in_path(pathfile):
    """Return the regular files directly inside ``pathfile``, sorted by name.

    Sub-directories (for example a ``.ipynb_checkpoints`` folder created by
    Jupyter) are skipped so that only readable files reach the caller, and
    the result is sorted because ``os.listdir`` returns entries in arbitrary
    order.

    Args:
        pathfile: Path of the directory to list.

    Returns:
        A list of ``os.path.join(pathfile, name)`` strings, one per regular
        file; empty when the directory contains no files.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. the directory does not
            exist).
    """
    candidates = (os.path.join(pathfile, name) for name in sorted(os.listdir(pathfile)))
    return [path for path in candidates if os.path.isfile(path)]


In [27]:
# Change these paths if you do not run this notebook on Colab.
train_set = load_tfrecord(list_files_in_path("/content/datasets/tfrecords/train"))
# Validation data is only evaluated, so its order does not matter: a falsy
# shuffle_buffer_size makes load_tfrecord skip the shuffle step.
valid_set = load_tfrecord(list_files_in_path("/content/datasets/tfrecords/validation"),
                          shuffle_buffer_size=None)

### **2.** Build the model

What is BERT?
First, recall the embedding layer. If we give the embedding layer a word, the output is well defined: a given word is always mapped to the same vector, no matter where it stands in the sentence.

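Now recall `additional_models.ipynb`. There we used a `PositionalEncoding` layer whose output is added to the output of the embedding. This positional encoding depends on the position of the word in the sentence. BERT goes one step further: its encoder produces contextual embeddings, so the vector of a word also depends on the other words in the sentence.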




## **2.1** Download the BERT encoder and preprocessing model

In [28]:
# TF Hub handle of the text preprocessing model for uncased English BERT (version 3).
preprocess_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
# TF Hub handle of the uncased English BERT encoder (version 4).
# Presumably L-12/H-768/A-12 means 12 layers, hidden size 768, 12 attention heads — confirm on the model page.
encoder_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

In [29]:
# Wrap the two TF Hub models as Keras layers so they can be called inside the model.
# trainable=False is the hub.KerasLayer default, spelled out here: the BERT weights stay frozen.
bert_preprcess = hub.KerasLayer(handle=preprocess_url, trainable=False)
bert_encoder = hub.KerasLayer(handle=encoder_url, trainable=False)

## **2.2** Build the Model

In [31]:
# Each example is one raw review string (scalar string input named "text").
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name="text")

# Turn the strings into the encoder's input dict, then run the BERT encoder on it.
preprocess_text = bert_preprcess(text_input)
bert_outputs = bert_encoder(preprocess_text)

# Classification head on the encoder's "pooled_output": dropout, then one sigmoid unit.
dropout_layer = tf.keras.layers.Dropout(rate=0.5, name="dropout1")(
    bert_outputs["pooled_output"]
)
output_layer = tf.keras.layers.Dense(units=1, activation="sigmoid", name="output")(
    dropout_layer
)

model = tf.keras.Model(inputs=text_input, outputs=output_layer)

In [32]:
# Print the model's layers with their output shapes, parameter counts and connections.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_type_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [37]:
from tensorflow.keras.optimizers import Adam
# Binary cross-entropy loss with accuracy as the reported metric; Adam runs with a learning rate of 0.01.
model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss="BinaryCrossentropy",
    metrics=["accuracy"],
)

I stopped the training manually after the first epoch (hence the `KeyboardInterrupt` in the output below).

In [38]:
# train_set and valid_set are tf.data datasets that load_tfrecord has already
# batched, so batch_size must not be passed to fit(); change the batch size
# where the datasets are built instead.
model.fit(train_set,
          epochs=5,
          validation_data=valid_set)

Epoch 1/5
Epoch 2/5
  73/1250 [>.............................] - ETA: 13:03 - loss: 0.6922 - accuracy: 0.6421

KeyboardInterrupt: ignored