In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
from tqdm import tqdm

import tensorflow as tf
import tensorflow_datasets as tfds

from transformers import BertTokenizer, TFBertForSequenceClassification

In [None]:
# Load the 'imdb_reviews' dataset as (review, label) pairs together with its
# metadata object, then pull out the two splits used below.
dataset, info = tfds.load(
    'imdb_reviews',
    as_supervised=True,
    with_info=True,
)
train_dataset = dataset['train']
test_dataset = dataset['test']

In [None]:
def convert_text_to_feature(review, tokenizer, max_length):
    """Encode a single review by calling ``tokenizer.encode_plus``.

    Args:
        review: Text to encode; passed as the first positional argument.
        tokenizer: Object exposing an ``encode_plus`` method.
        max_length: Forwarded as ``max_length``; the call also requests
            ``padding='max_length'`` and ``truncation=True``.

    Returns:
        Whatever ``tokenizer.encode_plus`` returns for this review.
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )
    return tokenizer.encode_plus(review, **encode_options)

def map_feature_to_dict(input_ids, attention_masks, token_type_ids, label):
    """Pack the three feature values into a dict and pair it with the label.

    Note the positional order of the arguments: ``attention_masks`` comes
    before ``token_type_ids``.

    Returns:
        A 2-tuple ``(features, label)`` where ``features`` maps
        ``"input_ids"``, ``"token_type_ids"`` and ``"attention_mask"`` to the
        corresponding arguments.
    """
    feature_names = ("input_ids", "token_type_ids", "attention_mask")
    feature_values = (input_ids, token_type_ids, attention_masks)
    features = dict(zip(feature_names, feature_values))
    return features, label

def encode_text(ds, tokenizer, max_length, limit=-1):
    """Tokenize every ``(review, label)`` pair of ``ds`` into a feature dataset.

    Args:
        ds: Dataset yielding ``(review, label)`` pairs. It is iterated through
            ``tfds.as_numpy`` and each review is ``.decode()``-d, so reviews
            are expected to arrive as bytes.
        tokenizer: Tokenizer forwarded to ``convert_text_to_feature``; its
            output must provide ``'input_ids'``, ``'token_type_ids'`` and
            ``'attention_mask'``.
        max_length: Forwarded to ``convert_text_to_feature``.
        limit: When positive, only the first ``limit`` examples are encoded
            (via ``ds.take``); otherwise the whole dataset is used.

    Returns:
        A ``tf.data.Dataset`` built with ``from_tensor_slices`` and mapped
        through ``map_feature_to_dict``, so each element is a
        ``(features_dict, label)`` pair.
    """
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []

    if limit > 0:
        ds = ds.take(limit)

    for review, label in tfds.as_numpy(ds):
        bert_input = convert_text_to_feature(review.decode(), tokenizer, max_length)

        input_ids_list.append(bert_input['input_ids'])
        # Previously this list was never filled, so from_tensor_slices got an
        # empty component next to N-length ones and the model inputs had no
        # usable token_type_ids.
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])

        # Wrap the label in a list so each label slice is a length-1 vector.
        label_list.append([label])

    # Component order must match map_feature_to_dict's positional signature:
    # (input_ids, attention_masks, token_type_ids, label).
    return tf.data.Dataset.from_tensor_slices(
        (input_ids_list, attention_mask_list, token_type_ids_list, label_list)
    ).map(map_feature_to_dict)

In [None]:
# Checkpoint to fine-tune and the training hyperparameters.
model_name = 'bert-base-uncased'
batch_size = 32
max_length = 512
learning_rate = 2e-5
epochs = 1

tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)

# Encode both splits up front; only the training split is shuffled before batching.
train_dataloader = (
    encode_text(train_dataset, tokenizer, max_length)
    .shuffle(10000)
    .batch(batch_size)
)
test_dataloader = (
    encode_text(test_dataset, tokenizer, max_length)
    .batch(batch_size)
)

model = TFBertForSequenceClassification.from_pretrained(model_name)

# The loss is configured with from_logits=True, i.e. it is fed raw scores
# rather than probabilities.
model.compile(
    optimizer=tf.keras.optimizers.Adam(
        learning_rate=learning_rate,
        epsilon=1e-08,
    ),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[
        tf.keras.metrics.SparseCategoricalAccuracy('accuracy'),
    ],
)

# Fine-tune, evaluating on the test split after each epoch.
history = model.fit(
    train_dataloader,
    epochs=epochs,
    validation_data=test_dataloader,
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


