In [1]:
import tensorflow_datasets
import tensorflow as tf
import numpy as np
from tensorflow.keras.mixed_precision import experimental as mixed_precision
import wandb
from wandb.tensorflow import WandbHook
wandb.init(project="nonint-transformers", sync_tensorboard=True)

from transformers import (TFBertModel, 
                          BertTokenizer,
                          TFRobertaForSequenceClassification, 
                          RobertaTokenizer)

from transformers import glue_convert_examples_to_features

fp16 = True
BATCH_SIZE = 32
if fp16:
    tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
    BATCH_SIZE = 48

In [2]:
bert_model_pre = TFBertModel.from_pretrained("bert-base-cased")
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

roberta_model_pre = TFRobertaForSequenceClassification.from_pretrained("roberta-base")
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = roberta_model_pre

In [3]:
sequence = "Systolic arrays are cool. This 🐳 is cool too."

bert_tokenized_sequence = bert_tokenizer.tokenize(sequence)
roberta_tokenized_sequence = roberta_tokenizer.tokenize(sequence)

print("BERT:", bert_tokenized_sequence)
print("RoBERTa:", roberta_tokenized_sequence)

BERT: ['S', '##ys', '##to', '##lic', 'array', '##s', 'are', 'cool', '.', 'This', '[UNK]', 'is', 'cool', 'too', '.']
RoBERTa: ['Sy', 'st', 'olic', 'Ġarrays', 'Ġare', 'Ġcool', '.', 'ĠThis', 'ĠðŁ', 'Ĳ', '³', 'Ġis', 'Ġcool', 'Ġtoo', '.']


In [5]:
# Fine tune the models
import tensorflow_datasets
data = tensorflow_datasets.load("glue/mrpc")

train_dataset = data["train"]
validation_dataset = data["validation"]
for i in range(5):
    example = list(train_dataset.__iter__())[i]
    print('',
        'idx:      ', example['idx'],       '\n',
        'label:    ', example['label'],     '\n',
        'sentence1:', example['sentence1'], '\n',
        'sentence2:', example['sentence2'],
    )

# Decode from a tensor into a UTF-8 string
seq0 = example['sentence1'].numpy().decode('utf-8')  # Obtain bytes from tensor and convert it to a string
seq1 = example['sentence2'].numpy().decode('utf-8')  # Obtain bytes from tensor and convert it to a string

# Encode string into a list of tokens
encoded_bert_sequence = bert_tokenizer.encode(seq0, seq1, add_special_tokens=True, max_length=128)
encoded_roberta_sequence = roberta_tokenizer.encode(seq0, seq1, add_special_tokens=True, max_length=128)

print("BERT tokenizer separator, cls token id:   ", bert_tokenizer.sep_token_id, bert_tokenizer.cls_token_id)
print("RoBERTa tokenizer separator, cls token id:", roberta_tokenizer.sep_token_id, roberta_tokenizer.cls_token_id)

bert_special_tokens = [bert_tokenizer.sep_token_id, bert_tokenizer.cls_token_id]
roberta_special_tokens = [roberta_tokenizer.sep_token_id, roberta_tokenizer.cls_token_id]

def print_in_red(string):
    print("\033[91m" + str(string) + "\033[0m", end=' ')

print("\nBERT tokenized sequence")
output = [print_in_red(tok) if tok in bert_special_tokens else print(tok, end=' ') for tok in encoded_bert_sequence]

print("\n\nRoBERTa tokenized sequence")
output = [print_in_red(tok) if tok in roberta_special_tokens else print(tok, end=' ') for tok in encoded_roberta_sequence]

INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset glue (C:\Users\jbetk\tensorflow_datasets\glue\mrpc\0.0.2)
INFO:absl:Constructing tf.data.Dataset for split None, from C:\Users\jbetk\tensorflow_datasets\glue\mrpc\0.0.2


 idx:       tf.Tensor(201, shape=(), dtype=int32) 
 label:     tf.Tensor(1, shape=(), dtype=int64) 
 sentence1: tf.Tensor(b'Tibco has used the Rendezvous name since 1994 for several of its technology products , according to the Palo Alto , California company .', shape=(), dtype=string) 
 sentence2: tf.Tensor(b'Tibco has used the Rendezvous name since 1994 for several of its technology products , it said .', shape=(), dtype=string)
 idx:       tf.Tensor(2977, shape=(), dtype=int32) 
 label:     tf.Tensor(0, shape=(), dtype=int64) 
 sentence1: tf.Tensor(b"Most of the alleged spammers engaged in fraudulent or deceptive practices , said Brad Smith , Microsoft 's senior VP and general counsel .", shape=(), dtype=string) 
 sentence2: tf.Tensor(b'" Spam knows no borders , " said Brad Smith , Microsoft \'s senior vice-president and general counsel .', shape=(), dtype=string)
 idx:       tf.Tensor(3482, shape=(), dtype=int32) 
 label:     tf.Tensor(1, shape=(), dtype=int64) 
 sentence1: tf.Tens

In [6]:
# Go ahead and perform the above steps to all of the train/val dataset.
bert_train_dataset = glue_convert_examples_to_features(train_dataset, bert_tokenizer, 128, 'mrpc')
bert_train_dataset = bert_train_dataset.shuffle(100).batch(BATCH_SIZE).repeat(2)
bert_validation_dataset = glue_convert_examples_to_features(validation_dataset, bert_tokenizer, 128, 'mrpc')
bert_validation_dataset = bert_validation_dataset.batch(64)

# RoBERTa requires a bit more of work as it does not use the token_type_ids, 
# which we need to remove. We use the tf.data.Dataset.map() method for this.
def token_type_ids_removal(example, label):
    del example["token_type_ids"]
    return example, label
roberta_train_dataset = glue_convert_examples_to_features(train_dataset, roberta_tokenizer, 128, 'mrpc')
roberta_train_dataset = roberta_train_dataset.map(token_type_ids_removal)
roberta_train_dataset = roberta_train_dataset.shuffle(100).batch(BATCH_SIZE).repeat(2)
roberta_validation_dataset = glue_convert_examples_to_features(validation_dataset, roberta_tokenizer, 128, 'mrpc')
roberta_validation_dataset = roberta_validation_dataset.map(token_type_ids_removal)
roberta_validation_dataset = roberta_validation_dataset.batch(64)

In [7]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)#, clipnorm=1.0)
if fp16:
    #optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic")
    tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

from tensorflow.keras.layers import Input, Dense
inputs = [Input(shape=(128,), dtype='int32', name='input_ids'),
          Input(shape=(128,), dtype='int32', name='attention_mask'), 
          Input(shape=(128,), dtype='int32', name='token_type_ids')]
tensor = bert_model_pre(inputs)[1]
tensor = Dense(activation='softmax', units=2)(tensor)
bert_model = tf.keras.Model(inputs=inputs, outputs=tensor)

bert_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
roberta_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [8]:
print("Fine-tuning BERT on MRPC")
bert_history = bert_model.fit(bert_train_dataset, epochs=5, validation_data=bert_validation_dataset)

#print("\nFine-tuning RoBERTa on MRPC")
#roberta_history = roberta_model.fit(roberta_train_dataset, epochs=3, validation_data=roberta_validation_dataset)

Fine-tuning BERT on MRPC
Epoch 1/5
Epoch 2/5






In [9]:
wandb.log(bert_history)

ValueError: wandb.log must be passed a dictionary

In [None]:
# Test it out.

test_sentence_1 = 'A whale jumped from the ocean, breathing heavily as it floated in the air.'
test_sentence_2 = 'Breathing with exertion, the whale flew out of the water into the air.'

def pad_zero(inputs, seq_len):
    for k in inputs: 
        output = np.zeros(seq_len+1, dtype='int32')
        output[:len(inputs[k])] = np.asarray(inputs[k])
        inputs[k] = output
    return inputs
 
test_sentence_bert_encoded = pad_zero(bert_tokenizer.encode_plus(test_sentence_1, test_sentence_2, add_special_tokens=True, max_length=128), 128)
print(test_sentence_bert_encoded)
test_sentence_roberta_encoded = pad_zero(roberta_tokenizer.encode_plus(test_sentence_1, test_sentence_2, add_special_tokens=True, max_length=128), 128)


test_sentence_bert_encoded_formatted = \
    [np.resize(test_sentence_bert_encoded['input_ids'], (1,-1)),
    np.resize(test_sentence_bert_encoded['token_type_ids'], (1,-1)),
    np.resize(test_sentence_bert_encoded['attention_mask'], (1,-1))]
print(bert_model.predict(test_sentence_bert_encoded_formatted))

#rbs_frm = [np.resize(np.asarray(ers['input_ids']), (1,-1)),
#          np.resize(np.asarray(ers['attention_mask']), (1,-1))]
#print(roberta_model.predict(rbs_frm))