#Sentiment analysis using BERT


In [1]:
import datetime
import shutil

import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

from google.cloud import aiplatform

tf.get_logger().setLevel('ERROR')

## Downloading dataset

In [2]:
import os

url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

path = tf.keras.utils.get_file(
    "aclImdb_v1.tar.gz",
    url,
    untar=True,
    cache_dir='.',
    cache_subdir=''
    )

dataset = os.path.join(os.path.dirname(path), 'aclImdb')

train_dataset = os.path.join(dataset, 'train')
test_dataset = os.path.join(dataset, 'test')

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
[1m84125825/84125825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 0us/step


In [3]:
path = "/content/aclImdb_v1_extracted/"
AUTOTUNE = tf.data.AUTOTUNE

batch_size = 32
seed = 42

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    path + 'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed
)

class_name = raw_train_ds.class_names
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

val_ds = tf.keras.utils.text_dataset_from_directory(
    path + 'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed
)

val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

test_ds = tf.keras.utils.text_dataset_from_directory(
    path + 'aclImdb/test',
    batch_size=batch_size
)

test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 75000 files belonging to 3 classes.
Using 60000 files for training.
Found 75000 files belonging to 3 classes.
Using 15000 files for validation.
Found 25000 files belonging to 2 classes.


In [10]:
for t, l in train_ds.take(1):
  for i in range(10):
    print(f"Review: {t.numpy()[i]}\nLabel: {l.numpy()[i]}\n")

Review: b'There is this father-son conversation in the climax of \'KALPURUSH\'. I quote the English DVD-subtitle version. Shumonto tells his father: "I may not have become someone, but when I see two people in love, I smile. And when I see someone eating alone, I cry." Ashvini, his father, replies wistfully: "I wish I could\'ve lived my life like you did." These 2 lines, perhaps, comprise the gist of this new film by Buddhadev Dasgupta - director of teeny-weeny gems like \'Tahader Katha\', \'Bagh Bahadur\', \'Uttara\' & \'Mondo Meyer Upakhyan\' - which took nearly 3 years to reach the cinemas in India.<br /><br />The film opens with a man called Ashvini following a younger man called Shumonto, who, we are told, is his son. It seems that the father is stalking - or haunting, rather - his son. As the film progresses and we meet Shumonto\'s ambitious wife, Supriya, and his mother, Koyel, who seems to be tied up with something in her past, we realise that the son is, indeed, haunted by his

##Fine tuning a BERT model

In [6]:
encoder =  ( "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1" )

preprocessor =  ("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3" )

print(f"Encoder available: {encoder}")
print(f"Preprocessor available: {preprocessor}")

Encoder available: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Preprocessor available: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


In [7]:
bert_preprocess = hub.KerasLayer(preprocessor)

In [15]:
testing_text = ["this is a sample text."]
testing_text_preprocessed = bert_preprocess(testing_text)

print(f"Keys of testing_text_preprocessed: {testing_text_preprocessed.keys()}")
print(f"Value of input_word_ids: {testing_text_preprocessed['input_word_ids']}")
print(f"Value of input_mask: {testing_text_preprocessed['input_mask']}")
print(f"Value of input_type_ids: {testing_text_preprocessed['input_type_ids']}")

Keys of testing_text_preprocessed: dict_keys(['input_type_ids', 'input_mask', 'input_word_ids'])
Value of input_word_ids: [[ 101 2023 2003 1037 7099 3793 1012  102    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]]
Value of input_mask: [[1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

###Preprocessing outputs
This BERT model outputs a dictionary containing:

*   input_word_ids : token number
*   input_mask     : indexing bolean
*   input_type_ids :



In [16]:
bert_model = hub.KerasLayer(encoder)

In [84]:
def build_classifier(dropout_rate=0.1):
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name="text")
    preprocessing_layer = hub.KerasLayer(
        preprocessor, name="preprocessing"
    )
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(
        encoder, trainable=True, name="BERT_encoder"
    )
    outputs = encoder(encoder_inputs)
    net = outputs["pooled_output"]
    net = tf.keras.layers.Dropout(dropout_rate)(net)
    net = tf.keras.layers.Dense(1, activation="sigmoid", name="classifier")(net)
    return tf.keras.Model(text_input, net)

In [85]:
loss = tf.keras.losses.BinaryCrossentropy()
metrics = tf.metrics.BinaryAccuracy()

In [86]:
import sys
import os

module_path = '/content/models'

# Add the directory to sys.path
if module_path not in sys.path:
    sys.path.append(module_path)

import official.nlp


In [87]:
epochs = 10
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 0.1

optimizer = official.nlp.optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw'
)

In [88]:
model = build_classifier()

ValueError: Exception encountered when calling layer 'preprocessing' (type KerasLayer).

A KerasTensor is symbolic: it's a placeholder for a shape an a dtype. It doesn't have any actual numerical value. You cannot convert it to a NumPy array.

Call arguments received by layer 'preprocessing' (type KerasLayer):
  • inputs=<KerasTensor shape=(None,), dtype=string, sparse=False, name=text>
  • training=None