In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 1.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 51.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 47.5 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 42.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Foun

In [2]:
import os
import tensorflow as tf
import tensorflow_datasets

In [3]:
from transformers import BertTokenizer, TFBertForSequenceClassification, glue_convert_examples_to_features

In [4]:
BATCH_SIZE = 32
FINE_TUNED_MODEL_DIR = "./data/"

In [5]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
model = TFBertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Load dataset via TensorFlow Datasets
data, info = tensorflow_datasets.load("glue/mrpc", with_info=True)

num_train = info.splits["train"].num_examples
num_valid = info.splits["validation"].num_examples
num_train, num_valid

[1mDownloading and preparing dataset glue/mrpc/1.0.0 (download: 1.43 MiB, generated: Unknown size, total: 1.43 MiB) to /root/tensorflow_datasets/glue/mrpc/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/glue/mrpc/1.0.0.incompletePNDVYW/glue-train.tfrecord


  0%|          | 0/3668 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/glue/mrpc/1.0.0.incompletePNDVYW/glue-validation.tfrecord


  0%|          | 0/408 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/glue/mrpc/1.0.0.incompletePNDVYW/glue-test.tfrecord


  0%|          | 0/1725 [00:00<?, ? examples/s]

[1mDataset glue downloaded and prepared to /root/tensorflow_datasets/glue/mrpc/1.0.0. Subsequent calls will reuse this data.[0m


(3668, 408)

In [7]:
found_0, found_1 = False, False
for d in data["train"]:
  # print(d)
  label = d["label"].numpy()
  if label == 0:
    print("0:", d["sentence1"].numpy(), "|", d["sentence2"].numpy())
    found_0 = True
  if label == 1:
    print("1:", d["sentence1"].numpy(), "|", d["sentence2"].numpy())
    found_1 = True
  if found_0 == 1 and found_1:
    break

0: b'The identical rovers will act as robotic geologists , searching for evidence of past water .' | b'The rovers act as robotic geologists , moving on six wheels .'
0: b"Less than 20 percent of Boise 's sales would come from making lumber and paper after the OfficeMax purchase is completed ." | b"Less than 20 percent of Boise 's sales would come from making lumber and paper after the OfficeMax purchase is complete , assuming those businesses aren 't sold ."
1: b'Spider-Man snatched $ 114.7 million in its debut last year and went on to capture $ 403.7 million .' | b'Spider-Man , rated PG-13 , snatched $ 114.7 million in its first weekend and went on to take in $ 403.7 million .'


In [8]:
# Prepare dataset for GLUE as a tf.data.Dataset instance
Xtrain = glue_convert_examples_to_features(data["train"], tokenizer, 128, "mrpc")
Xtrain = Xtrain.shuffle(128).batch(BATCH_SIZE).repeat(-1)

Xvalid = glue_convert_examples_to_features(data["validation"], tokenizer, 128, "mrpc")
Xvalid = Xvalid.batch(BATCH_SIZE)



In [9]:
opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
model.compile(optimizer=opt, loss=loss, metrics=[metric])

In [10]:
train_steps = num_train // BATCH_SIZE
valid_steps = num_valid // BATCH_SIZE

history = model.fit(Xtrain, epochs=2, steps_per_epoch=train_steps, 
                    validation_data=Xvalid, validation_steps=valid_steps)

Epoch 1/2
Epoch 2/2


In [11]:
model.save_pretrained(FINE_TUNED_MODEL_DIR)

In [12]:
saved_model = TFBertForSequenceClassification.from_pretrained(FINE_TUNED_MODEL_DIR)

Some layers from the model checkpoint at ./data/ were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ./data/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [28]:
def print_result(id1, id2, pred):
    if pred == 1:      
        print("sentence_{:d} is a paraphrase of sentence_{:d}".format(id1, id2))
    else:
        print("sentence_{:d} is not a paraphrase of sentence_{:d}".format(id1, id2))

sentence_0 = "At least 12 people were killed in the battle last week."
sentence_1 = "At least 12 people lost their lives in last weeks fighting."
sentence_2 = "The fires burnt down the houses on the street."

inputs_1 = tokenizer(sentence_0, sentence_1, return_tensors="tf")
inputs_2 = tokenizer(sentence_0, sentence_2, return_tensors="tf")

output_1 = saved_model(inputs_1)
output_2 = saved_model(inputs_2)

print("output_1.logits:", output_1.logits)
print("output_2.logits:", output_2.logits)

pred_1 = tf.argmax(output_1.logits, axis=-1)[0].numpy()
pred_2 = tf.argmax(output_2.logits, axis=-1)[0].numpy()

print_result(0, 1, pred_1)
print_result(0, 2, pred_2)

output_1.logits: tf.Tensor([[-1.9245654  1.7167482]], shape=(1, 2), dtype=float32)
output_2.logits: tf.Tensor([[ 1.7672827 -1.5870204]], shape=(1, 2), dtype=float32)
sentence_0 is a paraphrase of sentence_1
sentence_0 is not a paraphrase of sentence_2
