In [None]:
# Author: Hussain Abbas, MSc
# © 2021 Stats AI LLC 
# All Rights Reserved

In [20]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd

# Verify GPU is detected and working
tf.config.experimental.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## GloVe Feature Extraction Model

In [2]:
imdb_train, ds_info = tfds.load(name="imdb_reviews", split="train", with_info=True, as_supervised=True)
imdb_test = tfds.load(name="imdb_reviews", split="test", as_supervised=True)

In [3]:
# Use the default tokenizer settings
tokenizer = tfds.deprecated.text.Tokenizer()
 
vocabulary_set = set()
MAX_TOKENS = 0

for example, label in imdb_train:
    some_tokens = tokenizer.tokenize(example.numpy())
    if MAX_TOKENS < len(some_tokens):
        MAX_TOKENS = len(some_tokens)
    vocabulary_set.update(some_tokens)

In [4]:
imdb_encoder = tfds.deprecated.text.TokenTextEncoder(vocabulary_set, lowercase=True, tokenizer=tokenizer)

vocab_size = imdb_encoder.vocab_size
print(vocab_size, MAX_TOKENS)

93931 2525


In [5]:
# transformation functions to be used with the dataset
from tensorflow.keras.preprocessing import sequence

def encode_pad_transform(sample):
    encoded = imdb_encoder.encode(sample.numpy())
    pad = sequence.pad_sequences([encoded], padding='post', maxlen=150)
    return np.array(pad[0], dtype=np.int64)

def encode_tf_fn(sample, label):
    encoded = tf.py_function(encode_pad_transform, inp=[sample], Tout=(tf.int64))
    encoded.set_shape([None])
    label.set_shape([])
    return encoded, label

In [6]:
encoded_train = imdb_train.map(encode_tf_fn,num_parallel_calls=tf.data.experimental.AUTOTUNE)
encoded_test = imdb_test.map(encode_tf_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [7]:
# import urllib
# import zipfile
# import os

# url = "http://nlp.stanford.edu/data/glove.6B.zip"
# extract_dir = "glove_embeddings"

# zip_path, _ = urllib.request.urlretrieve(url)
# with zipfile.ZipFile(zip_path, "r") as f:
#     f.extractall(extract_dir)

In [8]:
#os.listdir(extract_dir)

In [9]:
dict_w2v = {}

with open('glove_embeddings/glove.6B.50d.txt', 'r', encoding="utf8") as file:
    for line in file:
        tokens = line.split()
        word = tokens[0]
        vector = np.array(tokens[1:], dtype=np.float32)
        
        if vector.shape[0] == 50:
            dict_w2v[word] = vector
        else:
            print("There was an issue with " + word)

# let's check the vocabulary size
print("Dictionary Size: ", len(dict_w2v))

Dictionary Size:  400000


In [10]:
embedding_dim = 50
embedding_matrix = np.zeros((imdb_encoder.vocab_size, embedding_dim))

In [11]:
unk_cnt = 0
unk_set = set()

for word in imdb_encoder.tokens:
    embedding_vector = dict_w2v.get(word)

    if embedding_vector is not None:
        tkn_id = imdb_encoder.encode(word)[0]
        embedding_matrix[tkn_id] = embedding_vector
    else:
        unk_cnt += 1
        unk_set.add(word)

# Print how many weren't found
print("Total unknown words: ", unk_cnt)

Total unknown words:  14553


In [12]:
# Length of the vocabulary in chars
vocab_size = imdb_encoder.vocab_size # len(chars)

# Number of RNN units
rnn_units = 64

#batch size
BATCH_SIZE=100

In [13]:
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense

def build_model_bilstm(vocab_size, embedding_dim, rnn_units, batch_size, train_emb=False):
    model = tf.keras.Sequential([
    Embedding(vocab_size, embedding_dim, mask_zero=True,
    weights=[embedding_matrix], trainable=train_emb),
    Bidirectional(LSTM(rnn_units, return_sequences=True, dropout=0.5)),
    Bidirectional(LSTM(rnn_units, dropout=0.25)), Dense(1, activation='sigmoid')
    ])
    
    return model

In [14]:
model_fe = build_model_bilstm(
    vocab_size = vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE)

model_fe.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 50)          4696550   
_________________________________________________________________
bidirectional (Bidirectional (None, None, 128)         58880     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 4,854,375
Trainable params: 157,825
Non-trainable params: 4,696,550
_________________________________________________________________


In [15]:
model_fe.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy', 'Precision', 'Recall'])

# Prefetch for performance
encoded_train_batched = encoded_train.batch(BATCH_SIZE).prefetch(100)

model_fe.fit(encoded_train_batched, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x292eaf57070>

In [16]:
model_fe.evaluate(encoded_test.batch(BATCH_SIZE))



[0.4264746904373169,
 0.8292800188064575,
 0.7796955704689026,
 0.9179199934005737]

## Fine-tuned GloVe Embeddings

In [17]:
model_fe = build_model_bilstm(
    vocab_size = vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
    train_emb=True)

model_fe.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 50)          4696550   
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 128)         58880     
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 4,854,375
Trainable params: 4,854,375
Non-trainable params: 0
_________________________________________________________________


In [18]:
model_fe.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy', 'Precision', 'Recall'])

# Prefetch for performance
encoded_train_batched = encoded_train.batch(BATCH_SIZE).prefetch(100)

model_fe.fit(encoded_train_batched, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x292eaf1aaf0>

In [19]:
model_fe.evaluate(encoded_test.batch(BATCH_SIZE))



[0.47013941407203674,
 0.8679199814796448,
 0.8793302774429321,
 0.8528800010681152]

In [None]:
# class CustomCallback(tf.keras.callbacks.Callback):
    
#     def __init__(self, validation = None):   
#         super(Metrics, self).__init__()
#         self.validation = validation    
            
#         print('validation shape', len(self.validation[0]))
        
#     def on_train_begin(self, logs={}):        
#         self.val_f1s = []
#         self.val_recalls = []
#         self.val_precisions = []
     
#     def on_epoch_end(self, epoch, logs={}):
#         val_targ = self.validation[1]   
#         val_predict = (np.asarray(self.model.predict(self.validation[0]))).round()        
    
#         val_f1 = f1_score(val_targ, val_predict)
#         val_recall = recall_score(val_targ, val_predict)         
#         val_precision = precision_score(val_targ, val_predict)
        
#         self.val_f1s.append(round(val_f1, 6))
#         self.val_recalls.append(round(val_recall, 6))
#         self.val_precisions.append(round(val_precision, 6))
 
#         print(f' — val_f1: {val_f1} — val_precision: {val_precision}, — val_recall: {val_recall}')

In [None]:
model_fe.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy', 'Precision', 'Recall']) 
#callbacks=[CustomCallback()])

# Prefetch for performance
encoded_train_batched = encoded_train.batch(BATCH_SIZE).prefetch(100)

model_fe.fit(encoded_train_batched, epochs=10)

In [None]:
model_fe.evaluate(encoded_test.batch(BATCH_SIZE))

## Fine-tuning Pre-trained BERT

In [1]:
# import tensorflow as tf
# import tensorflow_datasets as tfds
# import numpy as np
# import pandas as pd

# # Verify GPU is detected and working
# tf.config.experimental.list_physical_devices()

# imdb_train, ds_info = tfds.load(name="imdb_reviews", split="train", with_info=True, as_supervised=True)
# imdb_test = tfds.load(name="imdb_reviews", split="test", as_supervised=True)

In [2]:
from transformers import BertTokenizer

bert_name = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(bert_name,
add_special_tokens=True,
do_lower_case=False,
max_length=150,
pad_to_max_length=True)

In [3]:
from transformers import TFBertForSequenceClassification
bert_model = TFBertForSequenceClassification.from_pretrained(bert_name)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# tokenizer.encode_plus(" Don't be lured", add_special_tokens=True,
#                       max_length=9,
#                       pad_to_max_length=True,
#                       return_attention_mask=True,
#                       return_token_type_ids=True)

In [None]:
# tokenizer.encode_plus(" Don't be"," lured", add_special_tokens=True,
#                       max_length=10,
#                       pad_to_max_length=True,
#                       return_attention_mask=True,
#                       return_token_type_ids=True)

In [4]:
def bert_encoder(review):
    txt = review.numpy().decode('utf-8')
    encoded = tokenizer.encode_plus(txt, add_special_tokens=True,
                                    max_length=150,
                                    pad_to_max_length=True,
                                    return_attention_mask=True,
                                    return_token_type_ids=True)
    return encoded['input_ids'], encoded['token_type_ids'], encoded['attention_mask']

In [5]:
bert_train = [bert_encoder(r) for r, l in imdb_train]
bert_lbl = [l for r, l in imdb_train]

bert_train = np.array(bert_train)
bert_lbl = tf.keras.utils.to_categorical(bert_lbl, num_classes=2)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [6]:
# create training and validation splits
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(bert_train, 
                                                  bert_lbl,
                                                  test_size=0.2,
                                                  random_state=42)

print(x_train.shape, y_train.shape)

(20000, 3, 150) (20000, 2)


In [7]:
tr_reviews, tr_segments, tr_masks = np.split(x_train, 3, axis=1)
val_reviews, val_segments, val_masks = np.split(x_val, 3, axis=1)

tr_reviews = tr_reviews.squeeze()
tr_segments = tr_segments.squeeze()
tr_masks = tr_masks.squeeze()

val_reviews = val_reviews.squeeze()
val_segments = val_segments.squeeze()
val_masks = val_masks.squeeze()

In [8]:
def example_to_features(input_ids,attention_masks,token_type_ids,y):
    return {"input_ids": input_ids,
          "attention_mask": attention_masks,
          "token_type_ids": token_type_ids},y


train_ds = tf.data.Dataset.from_tensor_slices((tr_reviews, tr_masks, 
                                               tr_segments, y_train)).\
            map(example_to_features).shuffle(100).batch(16)

valid_ds = tf.data.Dataset.from_tensor_slices((val_reviews, val_masks, 
                                               val_segments, y_val)).\
            map(example_to_features).shuffle(100).batch(16)

In [10]:
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

bert_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

bert_model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108310272 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 108,311,810
Trainable params: 108,311,810
Non-trainable params: 0
_________________________________________________________________


In [11]:
print("Fine-tuning BERT on IMDB")

bert_history = bert_model.fit(train_ds, epochs=3,validation_data=valid_ds)

Fine-tuning BERT on IMDB
Epoch 1/3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
























Epoch 2/3
Epoch 3/3


In [12]:
# prep data for testing
bert_test = [bert_encoder(r) for r,l in imdb_test]
bert_tst_lbl = [l for r, l in imdb_test]

bert_test2 = np.array(bert_test)
bert_tst_lbl2 = tf.keras.utils.to_categorical (bert_tst_lbl,
num_classes=2)

ts_reviews, ts_segments, ts_masks = np.split(bert_test2, 3, axis=1)
ts_reviews = ts_reviews.squeeze()
ts_segments = ts_segments.squeeze()
ts_masks = ts_masks.squeeze()

test_ds = tf.data.Dataset.from_tensor_slices((ts_reviews, ts_masks, ts_segments, bert_tst_lbl2)).\
map(example_to_features).shuffle(100).batch(16)



In [13]:
bert_model.evaluate(test_ds)



[0.4861351549625397, 0.8657600283622742]

## Custom Fine-tuned Pre-trained BERT

In [14]:
from transformers import TFBertModel
bert_name = 'bert-base-cased'
bert = TFBertModel.from_pretrained(bert_name)
bert.summary()

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "tf_bert_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108310272 
Total params: 108,310,272
Trainable params: 108,310,272
Non-trainable params: 0
_________________________________________________________________


In [15]:
max_seq_len = 150
inp_ids = tf.keras.layers.Input((max_seq_len,), dtype=tf.int64,
name="input_ids")
att_mask = tf.keras.layers.Input((max_seq_len,), dtype=tf.int64,
name="attention_mask")
seg_ids = tf.keras.layers.Input((max_seq_len,), dtype=tf.int64,
name="token_type_ids")

In [16]:
train_ds.element_spec

({'input_ids': TensorSpec(shape=(None, 150), dtype=tf.int32, name=None),
  'attention_mask': TensorSpec(shape=(None, 150), dtype=tf.int32, name=None),
  'token_type_ids': TensorSpec(shape=(None, 150), dtype=tf.int32, name=None)},
 TensorSpec(shape=(None, 2), dtype=tf.float32, name=None))

In [17]:
inp_dict = {"input_ids": inp_ids,
"attention_mask": att_mask,
"token_type_ids": seg_ids}
outputs = bert(inp_dict)
# let's see the output structure
outputs









TFBaseModelOutputWithPooling(last_hidden_state=<KerasTensor: shape=(None, 150, 768) dtype=float32 (created by layer 'tf_bert_model')>, pooler_output=<KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'tf_bert_model')>, hidden_states=None, attentions=None)

In [18]:
x = tf.keras.layers.Dropout(0.2)(outputs[1])
x = tf.keras.layers.Dense(200, activation='relu')(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(2, activation='sigmoid')(x)
custom_model = tf.keras.models.Model(inputs=inp_dict, outputs=x)

In [19]:
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
custom_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
custom_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_mask (InputLayer)     [(None, 150)]        0                                            
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 150)]        0                                            
__________________________________________________________________________________________________
token_type_ids (InputLayer)     [(None, 150)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 108310272   attention_mask[0][0]             
                                                                 input_ids[0][0]              

In [20]:
print("Custom Model: Fine-tuning BERT on IMDB")

custom_history = custom_model.fit(train_ds, epochs=3,validation_data=valid_ds)

Custom Model: Fine-tuning BERT on IMDB
Epoch 1/3
























Epoch 2/3
Epoch 3/3


In [21]:
custom_model.evaluate(test_ds)




[0.3923428952693939, 0.8805199861526489]