# Imports

In [3]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import (
    Embedding,
    LSTM,
    Bidirectional,
    Dense,
    Dropout,
    Input,
)
from tensorflow.keras.models import Model

from transformers import BertTokenizer
from transformers import TFBertForSequenceClassification
from transformers import TFBertModel

from sklearn.model_selection import train_test_split
from utils.functions import *

# Set-up

In [2]:
import warnings

warnings.filterwarnings("ignore")

# Loading IMDB Data

In [3]:
imdb_train, ds_info = tfds.load(
    name='imdb_reviews',
    split='train',
    with_info=True,
    as_supervised=True
)

imdb_test = tfds.load(
    name='imdb_reviews',
    split='test',
    as_supervised=True
)

INFO:absl:No config specified, defaulting to first: imdb_reviews/plain_text
INFO:absl:Load dataset info from /home/pop/tensorflow_datasets/imdb_reviews/plain_text/1.0.0


INFO:absl:Reusing dataset imdb_reviews (/home/pop/tensorflow_datasets/imdb_reviews/plain_text/1.0.0)
INFO:absl:Constructing tf.data.Dataset for split train, from /home/pop/tensorflow_datasets/imdb_reviews/plain_text/1.0.0
2023-10-31 14:06:50.510175: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-10-31 14:06:50.510385: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-10-31 14:06:50.510397: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2023-10-31 14:06:50.510415: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (pop-os): /proc/driver/nvidia/version does not exist
2023-10-31 14:06:50.512281: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices

In [4]:
# Check and example from the dataset
for example, label in imdb_train.take(1):
    print(example, '\n', label)

2023-10-31 14:06:50.663394: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2023-10-31 14:06:50.663966: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 3393180000 Hz


tf.Tensor(b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.", shape=(), dtype=string) 
 tf.Tensor(0, shape=(), dtype=int64)


# Create vocabulary and encode

In [5]:
tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = set()
MAX_TOKENS = 0

for example, label in imdb_train:
  some_tokens = tokenizer.tokenize(example.numpy())
  if MAX_TOKENS < len(some_tokens):
        MAX_TOKENS = len(some_tokens)
  vocabulary_set.update(some_tokens)

In [6]:
imdb_encoder = tfds.features.text.TokenTextEncoder(vocabulary_set,
                                                   lowercase=True,
                                                   tokenizer=tokenizer)
vocab_size = imdb_encoder.vocab_size

print(vocab_size, MAX_TOKENS)

93931 2525


In [7]:
def encode_pad_transform(sample):
    encoded = imdb_encoder.encode(sample.numpy())
    pad = sequence.pad_sequences([encoded], padding='post', 
                                 maxlen=150)
    return np.array(pad[0], dtype=np.int64)  


def encode_tf_fn(sample, label):
    encoded = tf.py_function(encode_pad_transform, 
                                       inp=[sample], 
                                       Tout=(tf.int64))
    encoded.set_shape([None])
    label.set_shape([])
    return encoded, label

In [8]:
# Lets verify tokenization and encoding works
for example, label in imdb_train.take(1):
    print(example)
    encoded = imdb_encoder.encode(example.numpy())
    print(imdb_encoder.decode(encoded))

tf.Tensor(b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.", shape=(), dtype=string)
this was an absolutely terrible movie don t be lured in by christopher walken or michael ironside both are great actors but this must simply be their worst role in history even their great acting could not redeem this movie s ridiculous storyline this 

In [9]:
subset = imdb_train.take(10)
tst = subset.map(encode_tf_fn)

In [10]:
encoded_train = imdb_train.map(
    encode_tf_fn,
    num_parallel_calls=tf.data.experimental.AUTOTUNE
)

encoded_test = imdb_test.map(
    encode_tf_fn,
    num_parallel_calls=tf.data.experimental.AUTOTUNE
)

# Loading pre-trained GloVe embeddings

In [11]:
dict_w2v = {}

with open('../data/raw/glove.6B.50d.txt', 'r') as file:
    for line in file:
        tokens = line.split()
        word = tokens[0]
        vector = np.array(tokens[1:], dtype=np.float32)
        
        if vector.shape[0] == 50:
            dict_w2v[word] = vector
        else:
            print('There was an issue with ' + word)

So far, we have a dataset, its vocabulary, and a dictionary of GloVe words and
their corresponding vectors. However, there is no correlation between these two
vocabularies. The way to connect them is through the creation of an embedding
matrix.

In [12]:
embedding_dim = 50
embedding_matrix = np.zeros((imdb_encoder.vocab_size, embedding_dim))

After this embedding matrix of zeros is initialized, it needs to be populated. For each word in the vocabulary of reviews, the corresponding vector is retrieved from the
GloVe dictionary.

In [13]:
unk_cnt = 0
unk_set = set()

for word in imdb_encoder.tokens:
    embedding_vector = dict_w2v.get(word)
    
    if embedding_vector is not None:
        tkn_id = imdb_encoder.encode(word)[0]
        embedding_matrix[tkn_id] = embedding_vector
    else:
        unk_cnt += 1
        unk_set.add(word)

During the data loading step, we saw that the total number of tokens was 93,931.
Out of these, 14,553 words could not be found, which is approximately 15% of
the tokens. For these words, the embedding matrix will have zeros.

# Feature Extraction Model

the **feature extraction** model freezes the pre-trained
weights and does not update them. An important issue with this approach in the
current setup is that there are a large number of tokens, over 14,000, that have
zero embedding vectors. These words could not be matched to an entry in the
GloVe word list.

## Glove based BiLSTM Model

In [14]:
vocab_size = imdb_encoder.vocab_size

rnn_units = 64

BATCH_SIZE=100

In [15]:
def build_model_bilstm(
    vocab_size: int,
    embedding_dim: int,
    rnn_units: int,
    batch_size: int,
    train_emb: bool = False
) -> tf.keras.Model:
    model = tf.keras.Sequential(
        [
            Embedding(
                vocab_size,
                embedding_dim,
                mask_zero=True,
                weights=[embedding_matrix],
                trainable=train_emb,
            ),
            Dropout(0.25),
            Bidirectional(
                LSTM(
                    rnn_units,
                    return_sequences=True,
                    dropout=0.5)
            ),
            Bidirectional(LSTM(
                rnn_units,
                dropout=0.25)),
            Dense(1, activation="sigmoid"),
        ]
    )
    return model

In [16]:
model_fe = build_model_bilstm(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE
)

model_fe.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 50)          4696550   
_________________________________________________________________
dropout (Dropout)            (None, None, 50)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, None, 128)         58880     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 4,854,375
Trainable params: 157,825
Non-trainable params: 4,696,550
_________________________________________________________________


This model has about 4.8 million trainable parameters. A simpler or smaller model will train faster and possibly be less
likely to overfit as the model capacity is lower

In [17]:
model_fe.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', 'Precision', 'Recall']
)

In [18]:
encoded_train_batched = encoded_train.batch(BATCH_SIZE).prefetch(100)

In [19]:
model_fe.fit(encoded_train_batched, epochs=20, verbose=0)

<tensorflow.python.keras.callbacks.History at 0x7fdb3379dc70>

In [20]:
model_fe.evaluate(encoded_test.batch(BATCH_SIZE))



[0.38784679770469666,
 0.8335599899291992,
 0.7792138457298279,
 0.9308800101280212]

In [21]:
model_fe.save('../models/feature-extraction-model.h5')

# Fine-tuning model

In [22]:
model_ft = build_model_bilstm(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
    train_emb=True
)

model_ft.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 50)          4696550   
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 50)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 128)         58880     
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 4,854,375
Trainable params: 4,854,375
Non-trainable params: 0
_________________________________________________________________


This model is identical to the feature extraction model in size. However, since the
embeddings will be fine-tuned, training is expected to take a little longer. There
are several thousand zero embeddings, which can now be updated. The resulting
accuracy is expected to be much better than the previous model.

In [23]:
model_ft.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', 'Precision', 'Recall']
)

In [24]:
model_fe.fit(encoded_train_batched, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fdb04464af0>

In [25]:
model_ft.evaluate(encoded_test.batch(BATCH_SIZE))



[0.6952202320098877,
 0.49779999256134033,
 0.49880772829055786,
 0.9204000234603882]

In [26]:
model_ft.save('../models/fine-tuning-model.h5')

# BERT Based Transfer Learning with HuggingFace

In [38]:
bert_name = 'bert-base-cased'

tokenizer = BertTokenizer.from_pretrained(
    bert_name,
    add_special_tokens=True,
    do_lower_case=False,
    max_length=150,
    pad_to_max_length=True
)

In [40]:
def bert_encoder(review):
    txt = review.numpy().decode('utf-8')
    encoded = tokenizer.encode_plus(txt, add_special_tokens=True, 
                                    max_length=150, pad_to_max_length=True, 
                                    return_attention_mask=True, 
                                    return_token_type_ids=True,
                                    truncation=True)
    return encoded['input_ids'], encoded['token_type_ids'], \
           encoded['attention_mask']

In [41]:
bert_train = [bert_encoder(r) for r, l in imdb_train]
bert_lbl = [l for r, l in imdb_train]
bert_train = np.array(bert_train)
bert_lbl = tf.keras.utils.to_categorical(bert_lbl, num_classes=2)

In [42]:
x_train, x_val, y_train, y_val = train_test_split(
    bert_train,
    bert_lbl,
    test_size=0.2,
    random_state=42
)

In [46]:
tr_reviews, tr_segments, tr_masks = np.split(x_train, 3, axis=1)
val_reviews, val_segments, val_masks = np.split(x_val, 3, axis=1)

tr_reviews = tr_reviews.squeeze()
tr_segments = tr_segments.squeeze()
tr_masks = tr_masks.squeeze()

val_reviews = val_reviews.squeeze()
val_segments = val_segments.squeeze()
val_masks = val_masks.squeeze()

In [47]:
train_ds = tf.data.Dataset.from_tensor_slices((tr_reviews, tr_masks, 
                                               tr_segments, y_train)).\
            map(example_to_features).shuffle(100).batch(16)

valid_ds = tf.data.Dataset.from_tensor_slices((val_reviews, val_masks, 
                                               val_segments, y_val)).\
            map(example_to_features).shuffle(100).batch(16)

# Pre-built BERT classification model

In [48]:
bert_model = TFBertForSequenceClassification.from_pretrained(bert_name)

Downloading: 100%|██████████| 433/433 [00:00<00:00, 597kB/s]
Downloading: 100%|██████████| 527M/527M [00:23<00:00, 22.3MB/s] 
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

bert_model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy']
)

In [50]:
bert_model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108310272 
_________________________________________________________________
dropout_39 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 108,311,810
Trainable params: 108,311,810
Non-trainable params: 0
_________________________________________________________________


In [51]:
print('Fine-tuning BERT on IMDB')
bert_history = bert_model.fit(
    train_ds,
    epochs=3,
    validation_data=valid_ds
)

Fine-tuning BERT on IMDB
Epoch 1/3
   6/1250 [..............................] - ETA: 3:06:34 - loss: 0.7336 - accuracy: 0.4469

In [None]:
bert_test = [bert_encoder(r) for r, l in imdb_test]
bert_tst_lbl = [l for r, l in imdb_test]
bert_test2 = np.array(bert_test)
bert_tst_lbl2 = tf.keras.utils.to_categorical(bert_tst_lbl, num_classes=2)
ts_reviews, ts_segments, ts_masks = np.split(bert_test2, 3, axis=1)
ts_reviews = ts_reviews.squeeze()
ts_segments = ts_segments.squeeze()
ts_masks = ts_masks.squeeze()
test_ds = (
    tf.data.Dataset.from_tensor_slices(
        (ts_reviews, ts_masks, ts_segments, bert_tst_lbl2)
    )
    .map(example_to_features)
    .shuffle(100)
    .batch(16)
)

In [None]:
bert_model.evaluate(test_ds)

# Custom Model with BERT

In [2]:
bert_name = 'bert-base-cased'
bert = TFBertModel.from_pretrained(bert_name)

2023-11-07 19:19:28.427635: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-11-07 19:19:28.444748: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-11-07 19:19:28.444766: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2023-11-07 19:19:28.444784: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (pop-os): /proc/driver/nvidia/version does not exist
2023-11-07 19:19:28.446847: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
Some weights of the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel fro

Model: "tf_bert_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108310272 
Total params: 108,310,272
Trainable params: 108,310,272
Non-trainable params: 0
_________________________________________________________________


In [4]:
bert.summary()

Model: "tf_bert_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108310272 
Total params: 108,310,272
Trainable params: 108,310,272
Non-trainable params: 0
_________________________________________________________________


In [5]:
max_seq_len = 150

inp_ids = Input((max_seq_len), dtype = tf.int64, name='input_ids')
att_mask = Input((max_seq_len), dtype = tf.int64, name='attention_mask')
seq_ids = Input((max_seq_len), dtype = tf.int64, name='token_type_ids')

In [None]:
train_ds.element_spec

In [None]:
inp_dict = {
    'inputs_ids': inp_ids,
    'attention_mask': att_mask,
    'token_type_ids': seq_ids
}

outputs = bert(inp_dict)

outputs

## Creating custom model

In [None]:
x = Dropout(0.2)(outputs[1])
x = Dense(200, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(2, activation='sigmoid')(x)

custom_model = Model(inputs=inp_dict, outputs=x)

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
custom_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [None]:
print('Custom Model: Fine-tuning BERT on IMDB')
custom_history = custom_model.fit(
    train_ds,
    epochs=10,
    validation_data=valid_ds
)

In [None]:
custom_model.evaluate(test_ds)

In [None]:
bert.trainable = False
optimizer = tf.keras.optimizers.Adam()
custom_model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy']
)

In [None]:
custom_model.summary()

In [None]:
print("Custom Model: Fine-tuning BERT on IMDB")
custom_history = custom_model.fit(train_ds, epochs=2,
                                  validation_data=valid_ds)

In [None]:
custom_model.evaluate(test_ds)