# Transformers for TensorFlow

## 1. Loading Dataset

In [1]:
# Installing the "datasets" library
!pip install -q datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/471.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m471.0/471.6 kB[0m [31m15.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Importing the function used to fetch a dataset by name
from datasets import load_dataset

# Loading the "rotten_tomatoes" dataset.
# The result is a DatasetDict with train / validation / test splits,
# each of which exposes a 'text' column and a 'label' column.
dataset = load_dataset("rotten_tomatoes")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [3]:
# Displaying the loaded dataset: the bare last expression renders every split
# together with its column names and row count
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [4]:
# Accessing the first example from the test split of the dataset;
# indexing a split by position returns a dict holding that row's 'text' and 'label'
dataset["test"][0]

{'text': 'lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .',
 'label': 1}

## 2. Data Preprocessing

In [5]:
# Importing the tokenizer factory for pre-trained models
from transformers import AutoTokenizer

# Initializing the tokenizer for the "distilbert-base-uncased" model.
# The model itself is loaded later from the same checkpoint name, so keep the two in sync.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [6]:
# Tokenizing the text of the first training example to inspect what the tokenizer produces:
# a dict with 'input_ids' (token ids) and 'attention_mask'
tokenizer(dataset["train"][0]["text"])

{'input_ids': [101, 1996, 2600, 2003, 16036, 2000, 2022, 1996, 7398, 2301, 1005, 1055, 2047, 1000, 16608, 1000, 1998, 2008, 2002, 1005, 1055, 2183, 2000, 2191, 1037, 17624, 2130, 3618, 2084, 7779, 29058, 8625, 13327, 1010, 3744, 1011, 18856, 19513, 3158, 5477, 4168, 2030, 7112, 16562, 2140, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
# Tokenization step meant to be mapped over the dataset
def preprocess_function(examples):
    """Tokenize the "text" field of ``examples`` with the notebook-level tokenizer.

    Truncation is switched on; the tokenizer's output is handed back unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [8]:
# Applying the preprocessing function to every split of the dataset.
# batched=True hands the function a batch of examples per call instead of a single row.
# NOTE: `dataset` is rebound to the mapped result, so from here on the name refers to
# the tokenized version rather than the freshly loaded one.
dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [9]:
# Importing the data collator with padding
from transformers import DataCollatorWithPadding

# Initializing the data collator with padding.
# The preprocessing step only truncates and never pads, so padding is left to this
# collator when batches are assembled; return_tensors="tf" requests TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

## 3. Model Loading

In [10]:
# Importing the TensorFlow version of the model for sequence classification
from transformers import TFAutoModelForSequenceClassification

# Initializing a model for sequence classification using "distilbert-base-uncased".
# As the load-time log reports, the pre_classifier / classifier weights are newly
# initialized rather than loaded from the checkpoint, so the model must be trained
# before its predictions mean anything.
my_model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
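You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.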

In [11]:
# Training split as a TensorFlow dataset: shuffled, batches of 16 built by the padding collator
tf_train_set = my_model.prepare_tf_dataset(
    dataset["train"], batch_size=16, shuffle=True, collate_fn=data_collator
)

# Validation split as a TensorFlow dataset: same batching and collation, original order kept
tf_validation_set = my_model.prepare_tf_dataset(
    dataset["validation"], batch_size=16, shuffle=False, collate_fn=data_collator
)

## 4. Model Training

In [16]:
# Optimizer suited to fine-tuning: a small initial learning rate that decays over the run.
# Keras' default 'adam' (learning rate 1e-3) is far too aggressive for a pre-trained
# transformer and tends to wipe out the pre-trained weights.
from transformers import create_optimizer

# Total optimizer steps = batches per epoch * number of epochs (2, matching the fit() call).
optimizer, _lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=len(tf_train_set) * 2,
)

# No `loss` argument on purpose: the model outputs two raw logits per example and the
# labels are integer class ids, so 'binary_crossentropy' (which expects one probability
# per label) optimizes the wrong objective. Transformers models fall back to their own
# task-appropriate loss when none is given to compile().
my_model.compile(optimizer=optimizer, metrics=['accuracy'])

In [17]:
# Training the model for 2 epochs on the training set, with the validation set
# supplied for evaluation. fit() returns a History object, which is what this
# cell displays as its last expression.
my_model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2)

Epoch 1/2
Epoch 2/2


<tf_keras.src.callbacks.History at 0x7ec5c6235f90>

## 5. Prediction

In [18]:
# Defining a text for inference (a single raw string, not part of the dataset)
text = "I love AI. It's fun to analyze the NLP tasks with Hugging Face."

In [19]:
# Tokenizing the text for inference.
# return_tensors="tf" makes the tokenizer return TensorFlow tensors with a leading
# batch dimension of 1, so the result can be fed to the model directly.
tokenized_text = tokenizer(text, return_tensors="tf")
tokenized_text

{'input_ids': <tf.Tensor: shape=(1, 20), dtype=int32, numpy=
array([[  101,  1045,  2293,  9932,  1012,  2009,  1005,  1055,  4569,
         2000, 17908,  1996, 17953,  2361,  8518,  2007, 17662,  2227,
         1012,   102]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 20), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
      dtype=int32)>}

In [20]:
# Obtaining model logits for the tokenized text.
# `**tokenized_text` unpacks 'input_ids' and 'attention_mask' as keyword arguments;
# `.logits` picks the raw, unnormalized class scores from the model output.
logits = my_model(**tokenized_text).logits

In [21]:
# Importing TensorFlow under its conventional alias. Binding the bare name `math` to
# tensorflow.math would shadow Python's standard-library `math` module in the kernel.
import tensorflow as tf

# Finding the index of the class with the highest logit score
# ([0] selects the first, and only, example in the batch)
int(tf.math.argmax(logits, axis=-1)[0])

0