<a href="https://colab.research.google.com/github/Sarztak/nlp-authorship-attribution/blob/main/BERT_Studies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Install Hugging Face transformers with an upper version cap; %%capture keeps pip's log out of the notebook.
# NOTE(review): presumably capped so the TF model classes imported below are still available — confirm.
!pip install "transformers<=4.54.1"

In [2]:
import tensorflow_datasets as tfds
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from transformers import BertTokenizer
from transformers import TFBertForSequenceClassification

# WordPiece tokenizer that goes with the uncased BERT base checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

# IMDB reviews as (text, label) pairs. The official train split is cut 80/20
# into train/validation; the official test split is used unchanged.
# `ds_info` holds the dataset metadata returned alongside the splits.
(ds_train, ds_val, ds_test), ds_info = tfds.load(
    "imdb_reviews",
    split=[
        "train[:80%]",
        "train[80%:]",
        "test",
    ],
    with_info=True,
    as_supervised=True,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.T8JVE9_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.T8JVE9_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.T8JVE9_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [76]:
# Peek at one raw training example: a (review text, label) pair of scalar tensors.
next(iter(ds_train))

(<tf.Tensor: shape=(), dtype=string, numpy=b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.">,
 <tf.Tensor: shape=(), dtype=int64, numpy=0>)

In [47]:
# Tokenize one sentence to inspect exactly what BERT receives.
# The sample is named `sample_text` rather than `text` so that it does not
# overwrite the `tensorflow_text as text` module alias from the import cell.
sample_text = "This movie was surprisingly good!"

text_preprocessed = tokenizer(
    sample_text,
    max_length=128,
    padding="max_length",
    truncation=True,
)

# text_preprocessed maps each field name to a plain Python list;
# only the first 12 positions are printed to keep the output short.
print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Word Ids   : {text_preprocessed["input_ids"][:12]}')
print(f'Input Mask : {text_preprocessed["attention_mask"][:12]}')
print(f'Type Ids   : {text_preprocessed["token_type_ids"][:12]}')

Keys       : ['input_ids', 'token_type_ids', 'attention_mask']
Word Ids   : [101, 2023, 3185, 2001, 10889, 2204, 999, 102, 0, 0, 0, 0]
Input Mask : [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]
Type Ids   : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [6]:
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    """Bundle one encoded example into a (features, label) pair.

    The features dict uses the keyword names that
    TFBertForSequenceClassification expects: "input_ids", "token_type_ids"
    and "attention_mask". The label is passed through untouched.
    """
    features = dict(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_masks,
    )
    return features, label

def convert_example_to_feature(text):
    """Tokenize a single string with the notebook-level `tokenizer`.

    The text is truncated or padded to exactly 128 tokens and the encoding is
    returned as TensorFlow tensors instead of Python lists.
    NOTE(review): with return_tensors="tf" each field presumably carries a
    leading batch axis of size 1 — confirm against the tokenizer docs.
    """
    tokenizer_options = {
        "max_length": 128,
        "padding": "max_length",
        "truncation": True,
        "return_tensors": "tf",  # tf tensors rather than lists
    }
    return tokenizer(text, **tokenizer_options)

def encode_examples(ds, limit=-1):
    """Tokenize every (review, label) pair of `ds` into a new tf.data.Dataset.

    Args:
      ds: dataset yielding (review bytes, integer label) pairs, as produced by
        `tfds.load(..., as_supervised=True)`.
      limit: when a positive integer, only the first `limit` examples are
        encoded. Any other value (the default -1, 0, or None) encodes all of
        `ds`.

    Returns:
      A dataset whose elements are `(features, label)` where `features` is the
      dict built by `map_example_to_dict` (1-D tensors per example) and
      `label` has shape (1,).
    """
    # Accumulate per-example tensors; the final dataset is sliced from these.
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []

    if limit is not None and limit > 0:
        ds = ds.take(limit)

    for review, label in tfds.as_numpy(ds):
        bert_input = convert_example_to_feature(review.decode())

        # The tokenizer is asked for TF tensors, which come back with a
        # leading batch axis of size 1. Flatten each field to 1-D so that a
        # later `.batch(B)` produces (B, seq_len) rather than (B, 1, seq_len).
        input_ids_list.append(tf.reshape(bert_input['input_ids'], [-1]))
        token_type_ids_list.append(tf.reshape(bert_input['token_type_ids'], [-1]))
        attention_mask_list.append(tf.reshape(bert_input['attention_mask'], [-1]))
        label_list.append([label])

    # Tuple order must match map_example_to_dict's positional parameters.
    return tf.data.Dataset.from_tensor_slices(
        (input_ids_list, attention_mask_list, token_type_ids_list, label_list)
    ).map(map_example_to_dict)

In [29]:
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32

# train dataset: shuffled before batching
ds_train_encoded = (
    encode_examples(ds_train)
    .shuffle(10000)
    .batch(batch_size)
    .prefetch(AUTOTUNE)
)

# validation dataset: not shuffled — evaluation metrics do not depend on
# example order, so shuffling here only costs time and memory
ds_val_encoded = (
    encode_examples(ds_val)
    .batch(batch_size)
    .prefetch(AUTOTUNE)
)

# test dataset
ds_test_encoded = (
    encode_examples(ds_test)
    .batch(batch_size)
    .prefetch(AUTOTUNE)
)


In [18]:
# Fine-tuning hyperparameters.
lr = 2e-5
epochs = 1  # NOTE(review): not referenced by any other visible cell

# Pretrained BERT with a freshly initialised classification head, to be trained
# with Adam, sparse categorical cross-entropy on logits, and sparse accuracy.
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy()

# NOTE(review): the recorded output of this cell ends with
# "AttributeError: 'Variable' object has no attribute '_distribute_strategy'".
# Presumably the tf.keras objects above come from a different Keras generation
# than the one the transformers TF model is built on — TODO confirm.
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AttributeError: 'Variable' object has no attribute '_distribute_strategy'

# Hugging Face no longer supports TensorFlow models :( — the cells below use a TensorFlow Hub BERT instead

In [15]:
# TF Hub handles: a Small BERT encoder and the uncased-English preprocessor.
bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'

# The preprocessing layer is created under an explicit CPU device scope.
with tf.device("/CPU:0"):
    bert_text_processor = hub.KerasLayer(
        tfhub_handle_preprocess,
        name="preprocessor",
    )

# The encoder is loaded with trainable weights so it can be fine-tuned.
bert_encoder = hub.KerasLayer(tfhub_handle_encoder, name='BERT_encoder', trainable=True)

In [25]:


def prep_map(x, y):
    """Preprocess a whole batch of strings and cast its labels to float32.

    `x` is expected to be an already-batched string tensor of shape (batch,);
    the preprocessing layer is applied to the batch in one call, so the
    returned dict of tensors and the labels both stay batched.
    """
    labels_as_float = tf.cast(y, tf.float32)
    encoder_inputs = bert_text_processor(x)  # dict of tensors, each (batch, seq_len)
    return encoder_inputs, labels_as_float

# Four-example toy dataset used to try out the batched preprocessing function.
texts  = tf.constant(["good", "bad", "okay", "great"])
labels = tf.constant([1, 0, 0, 1])

# Batch first, then apply `prep_map` to whole batches. Without the map step
# `prep_map` is never used and `ds` would yield raw strings instead of the
# encoder input dict.
ds = (tf.data.Dataset.from_tensor_slices((texts, labels))
        .batch(32)
        .map(prep_map, num_parallel_calls=tf.data.AUTOTUNE)
        .prefetch(tf.data.AUTOTUNE))

In [33]:

class BertTextClassifier(tf.keras.Model):
    """Text classifier: TF Hub preprocessing + BERT encoder + single-logit head.

    The model takes either raw strings (preprocessed internally) or an
    already-preprocessed encoder input dict, and returns one un-activated
    logit per example. Pair it with a loss configured with `from_logits=True`.

    Args:
        preprocess_handle: TF Hub handle of the text preprocessing model.
        encoder_handle: TF Hub handle of the BERT encoder.
        seq_length: Recorded as `self.seq_length` only. It is not forwarded to
            the preprocessing layer, which keeps its own built-in sequence
            length; a warning is emitted when a non-default value is passed so
            the mismatch is not silent.
        dropout_rate: Dropout rate applied to the pooled encoder output.
        train_encoder: Passed as `trainable` to the encoder layer.
        pin_preprocess_cpu: Run preprocessing inside `tf.device("/CPU:0")`.
        pin_encoder_gpu: Run the encoder inside `tf.device("/GPU:0")` when a
            GPU is visible; with no GPU the default placement is used.
        **kwargs: Forwarded to `tf.keras.Model.__init__`.

    NOTE(review): verify that the hub encoder's weights appear in
    `model.trainable_variables`, i.e. that `train_encoder=True` really
    fine-tunes the encoder — this cannot be confirmed from this cell alone.
    """

    def __init__(
        self,
        preprocess_handle="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3",
        encoder_handle="https://tfhub.dev/google/experts/bert/wiki_books/sst2/2",
        seq_length=128,
        dropout_rate=0.1,
        train_encoder=True,
        pin_preprocess_cpu=True,
        pin_encoder_gpu=True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.seq_length = int(seq_length)
        self.pin_preprocess_cpu = bool(pin_preprocess_cpu)
        self.pin_encoder_gpu = bool(pin_encoder_gpu)

        # The preprocessing layer below is built without any sequence-length
        # argument, so a non-default request would otherwise be ignored silently.
        if self.seq_length != 128:
            import warnings

            warnings.warn(
                "BertTextClassifier: seq_length=%d is recorded on the instance "
                "but does not configure the preprocessing layer."
                % self.seq_length,
                stacklevel=2,
            )

        # Preprocessor (CPU-only resources like vocab hash tables)
        self.preprocess = hub.KerasLayer(
            preprocess_handle,
            name="preprocess_cpu",
        )

        # Encoder (GPU-friendly)
        self.encoder = hub.KerasLayer(
            encoder_handle, name="bert_encoder", trainable=train_encoder
        )

        # Head: dropout followed by a single linear unit that emits a logit.
        self.dropout = tf.keras.layers.Dropout(dropout_rate, name="dropout")
        self.classifier = tf.keras.layers.Dense(1, name="classifier")  # from_logits

    @staticmethod
    def _gpu_is_visible():
        """Return True when TensorFlow exposes at least one logical GPU."""
        return len(tf.config.list_logical_devices("GPU")) > 0

    def _preprocess_call(self, inputs):
        """Run the preprocessing layer on a 1-D string tensor, optionally on CPU."""
        if self.pin_preprocess_cpu:
            with tf.device("/CPU:0"):
                return self.preprocess(inputs)
        return self.preprocess(inputs)

    def _encoder_call(self, enc_inputs, training):
        """Run the encoder, pinned to GPU:0 only if requested and available."""
        if self.pin_encoder_gpu and self._gpu_is_visible():
            with tf.device("/GPU:0"):
                return self.encoder(enc_inputs, training=training)
        # No pinning requested, or no GPU to pin to: use default placement.
        return self.encoder(enc_inputs, training=training)

    def call(self, inputs, training=False):
        """
        Accepts either:
          - raw strings: Tensor(shape=(batch,), dtype=string)
          - encoder dict: {
              'input_word_ids': (batch, seq_length) int32,
              'input_mask':     (batch, seq_length) int32,
              'input_type_ids': (batch, seq_length) int32
            }

        Returns the classifier logits computed from the encoder's
        "pooled_output"; no sigmoid is applied.
        """
        if isinstance(inputs, dict):
            # Already preprocessed (e.g. inside a tf.data pipeline).
            enc_in = inputs
        else:
            # Expect 1-D string tensor; batching preserved by Keras/tf.data
            enc_in = self._preprocess_call(inputs)

        enc_out = self._encoder_call(enc_in, training=training)
        x = enc_out["pooled_output"]
        x = self.dropout(x, training=training)
        logits = self.classifier(x)
        return logits  # use from_logits=True in loss


In [34]:
# Two-example toy data shared by both smoke tests below.
toy_texts = tf.constant(["good movie", "bad film"])  # (batch,) strings
toy_labels = tf.constant([1, 0])

# 1) End-to-end strings (the model preprocesses on CPU, encodes on GPU)
model = BertTextClassifier(pin_preprocess_cpu=True, pin_encoder_gpu=True)
model.compile(optimizer=tf.keras.optimizers.Adam(3e-5),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              # threshold 0.0 on logits is the same decision as 0.5 on probabilities
              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0)],
              jit_compile=False)  # avoid XLA cross-device issues
raw_ds = (tf.data.Dataset.from_tensor_slices((toy_texts, toy_labels))
          .batch(2)
          .prefetch(tf.data.AUTOTUNE))
model.fit(raw_ds, epochs=1)

# 2) Preprocess in tf.data, feed encoder dict (keeps batching)
# `pre` is the model's own preprocessing layer, called directly here; the
# model's CPU pinning lives in `_preprocess_call` and is not involved.
pre = model.preprocess


def encode_with_model_preprocessor(x, y):
    """Preprocess a batch of strings with `pre` and cast labels to float32.

    Deliberately not named `prep_map`, so the earlier `prep_map` (which uses a
    different preprocessing layer) is not silently replaced.
    """
    return pre(x), tf.cast(y, tf.float32)


dict_ds = (tf.data.Dataset.from_tensor_slices((toy_texts, toy_labels))
           .batch(2)
           .map(encode_with_model_preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
           .prefetch(tf.data.AUTOTUNE))
model.fit(dict_ds, epochs=1)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 14s/step - binary_accuracy: 0.5000 - loss: 0.5825
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - binary_accuracy: 0.5000 - loss: 0.7135


<keras.src.callbacks.history.History at 0x7f8445916690>

In [35]:
# Pull the training split into memory in a single pass: each review is decoded
# from bytes to str and each label converted to a plain Python int.
decoded_pairs = [
    (raw_review.decode(), int(raw_label))
    for raw_review, raw_label in tfds.as_numpy(ds_train)
]
reviews = [pair[0] for pair in decoded_pairs]
labels = [pair[1] for pair in decoded_pairs]

In [36]:
# Wrap the in-memory lists as tensors, then serve them in batches of 32.
reviews_tf, labels_tf = tf.constant(reviews), tf.constant(labels)
dataset = tf.data.Dataset.from_tensor_slices((reviews_tf, labels_tf))
dataset = dataset.batch(32)

In [37]:
# Train on the raw-string training set; the recorded run shows a single pass of 625 batches.
# NOTE(review): no validation data is passed here even though `ds_val` was loaded — confirm that is intended.
model.fit(dataset)

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 316ms/step - binary_accuracy: 0.6883 - loss: 0.5909


<keras.src.callbacks.history.History at 0x7f83d1196a50>