# Project - BERT model
NLP - Sentiment Analysis of the "Large Movie Review Dataset" using a BERT model

CSE - 6363 - 003 : Machine Learning 

Team - 18 : Members -
Preeti Singh - 1002013566
Sai Sarath Reddy Koppula - 1002081785
Renu Aakanksha Veesam - 1002113666

# To implement the BERT model, we followed this TensorFlow tutorial as a reference.

- Ref.: https://www.tensorflow.org/text/tutorials/classify_text_with_bert

In [None]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text


import matplotlib.pyplot as plt

# Restrict TensorFlow's logger to ERROR level so lower-severity messages are not printed
tf.get_logger().setLevel('ERROR')

In [None]:
# Fetch the IMDB "Large Movie Review Dataset" archive from the Stanford website.
imdb_archive_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# keras' get_file downloads the archive into the current directory
# (cache_dir=".", cache_subdir="") and unpacks it (untar=True).
imdb_data = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin=imdb_archive_url,
    untar=True,
    cache_dir=".",
    cache_subdir="",
)

In [None]:
# The original dataset ships as a folder tree; the paths below locate the
# pieces of that tree relative to where the archive was downloaded.
main_directory = os.path.join(os.path.dirname(imdb_data), 'aclImdb')

# Training Data Directory
training_data_directory = os.path.join(main_directory, "train")

# Remove the 'unsup' folder, which is not used in this notebook.
# The existence check keeps this cell re-runnable: once the folder has been
# deleted, a second shutil.rmtree call would raise FileNotFoundError.
unsup_directory = os.path.join(training_data_directory, "unsup")
if os.path.isdir(unsup_directory):
    shutil.rmtree(unsup_directory)

In [None]:
# Input-pipeline settings shared by the training and testing datasets
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

# Training Dataset
# text_dataset_from_directory builds a labeled text dataset from a directory structure.
# NOTE(review): only the 'training' part of the 80/20 split is loaded here; the
# remaining 20% is not loaded or used by any of the visible cells.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed,
)
class_names = raw_train_ds.class_names
# Cache the batches and prefetch them while the model is busy
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)


# Testing Dataset
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/test',
    batch_size=batch_size,
)
test_ds = raw_test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
# Using the Small-BERT Model
# Loading the Preprocessing Component (the uncased English BERT preprocessor from TF Hub)
# NOTE(review): this object is not referenced by the other visible cells;
# get_BERT_classifier_model() creates its own preprocessing layer.
bert_preprocess_model = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')

In [None]:
# Using the Small-BERT Model
# Loading the BERT Model (Small-BERT encoder, handle suffix L-4_H-512_A-8, from TF Hub)
# NOTE(review): this object is not referenced by the other visible cells;
# get_BERT_classifier_model() creates its own encoder layer.
bert_model = hub.KerasLayer('https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1')

In [None]:
def get_BERT_classifier_model(
    preprocess_handle='https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    encoder_handle='https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
    dropout_rate=0.1,
    trainable=True,
) -> "tf.keras.Model":
    """Build a BERT-based classifier that maps raw text to a single logit.

    The model chains: string input -> TF Hub preprocessing layer -> TF Hub
    BERT encoder -> dropout on the encoder's ``pooled_output`` -> a
    one-unit dense layer without activation.

    Args:
        preprocess_handle: TF Hub handle of the text preprocessing model.
            Defaults to the uncased English BERT preprocessor.
        encoder_handle: TF Hub handle of the BERT encoder. Defaults to the
            Small-BERT L-4_H-512_A-8 encoder. It should be an encoder that
            accepts the output of ``preprocess_handle``.
        dropout_rate: Dropout rate applied to the pooled encoder output.
        trainable: Whether the encoder weights are updated during training
            (fine-tuning) or kept frozen.

    Returns:
        A ``tf.keras.Model`` whose input is named ``'text'`` and whose output
        comes from the ``'classifier'`` dense layer. Because that layer has
        no activation, the output is a raw logit, not a probability.
    """
    # Text Input Layer: a batch of scalar strings
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

    # Text Preprocessing Layer: turns raw strings into the encoder's inputs
    preprocessing_layer = hub.KerasLayer(preprocess_handle, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)

    # Text Encoding Layer
    encoder = hub.KerasLayer(encoder_handle, trainable=trainable, name='BERT_encoder')
    outputs = encoder(encoder_inputs)

    # Use the pooled representation of the whole input as the feature vector
    net = outputs['pooled_output']

    # Adding a Dropout Layer
    net = tf.keras.layers.Dropout(dropout_rate)(net)

    # Final Prediction Layer: one unit, no activation, so it emits a logit
    net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)

    # Everything from the text input to the dense prediction layer forms the Keras model
    return tf.keras.Model(text_input, net)

In [None]:
# Creating the model object (text input -> BERT encoder -> single-logit classifier)
model = get_BERT_classifier_model()

Model Architecture Visualization

In [None]:
# Render a diagram of the model's layers.
# NOTE(review): plot_model typically needs pydot and Graphviz installed — confirm in the target environment.
tf.keras.utils.plot_model(model)

In [None]:
# create_optimizer lives in the TensorFlow Model Garden package
# (pip install tf-models-official); the name `optimization` must be imported
# explicitly before it can be used below.
from official.nlp import optimization

# Training the Model for 10 epochs
epochs = 10

# Number of batches in one pass over the training dataset
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
# Total number of optimizer steps over the whole training run
num_train_steps = steps_per_epoch * epochs
# The first 10% of the steps are used as warm-up steps
num_warmup_steps = int(0.1*num_train_steps)

# Learning rate of 0.00003
init_lr = 3e-5

# AdamW optimizer configured with the initial learning rate, total steps and warm-up steps
optimizer = optimization.create_optimizer(init_lr=init_lr, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, optimizer_type='adamw')

In [None]:
# Sentiment prediction has two classes, so the loss is binary cross-entropy.
# from_logits=True because the classifier layer has no activation.
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)

# Accuracy is the tracked metric
accuracy_metric = tf.metrics.BinaryAccuracy()

model.compile(optimizer=optimizer, loss=loss_fn, metrics=accuracy_metric)

In [None]:
# Train on the training dataset for `epochs` epochs; `history` keeps the per-epoch metrics.
# NOTE(review): no validation data is passed, so only training metrics are recorded.
history = model.fit(x=train_ds, epochs=epochs)

In [None]:
# Evaluate the trained model on the test dataset; evaluate() yields the loss
# followed by the compiled metric (binary accuracy).
loss, accuracy = model.evaluate(test_ds)

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

In [None]:
# Example Prediction with BERT
# A single hand-written review, kept in a list so it can be passed to the model as a batch of one.
examples = ['this is such a terrible movie, we had a aweful experience. The direction is bad and the movie run time was too long.']

In [None]:
# Wrap the example strings in a string tensor, run the model, and squash
# its raw logits into the (0, 1) range with a sigmoid.
example_batch = tf.constant(examples)
results = tf.sigmoid(model(example_batch))

In [None]:
# Sigmoid score of the first (and only) example
print(results[0][0])

# From the result it can be seen that the provided example belongs to the negative class.
# NOTE(review): this reading assumes scores below 0.5 correspond to the negative
# class (label 0) — confirm against class_names.

In [None]:
import matplotlib.pyplot as plt

# Draw one figure per training metric recorded by model.fit:
# first the loss, then the binary accuracy, each plotted epoch by epoch.
for history_key, figure_title in (
    ("loss", "Epoch-wise Training Loss Scores"),
    ("binary_accuracy", "Epoch-wise Training Accuracy Scores"),
):
    plt.plot(history.history[history_key], marker="x", color="blue")
    plt.title(figure_title)
    plt.show()