In [1]:
# Install the required packages once per environment (uncomment to run):
# %pip install -q transformers datasets evaluate
import warnings

# Suppress every warning category so library warnings do not clutter the cell outputs.
warnings.simplefilter("ignore")

In [2]:
# Load the IMDB dataset through the `datasets` library.
# Printing the returned DatasetDict lists its splits together with their columns and row counts.

from datasets import load_dataset
imdb_dataset = load_dataset("imdb")
print(imdb_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [3]:
# Look at the training split on its own: its columns and number of rows.
imdb_dataset["train"]

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [4]:
from pprint import pprint
# Pretty-print a single training example to see the raw review text and its label.
# Label encoding: 0 = negative review, 1 = positive review.
pprint(imdb_dataset["train"][10])

{'label': 0,
 'text': 'It was great to see some of my favorite stars of 30 years ago '
         'including John Ritter, Ben Gazarra and Audrey Hepburn. They looked '
         'quite wonderful. But that was it. They were not given any characters '
         'or good lines to work with. I neither understood or cared what the '
         'characters were doing.<br /><br />Some of the smaller female roles '
         'were fine, Patty Henson and Colleen Camp were quite competent and '
         'confident in their small sidekick parts. They showed some talent and '
         "it is sad they didn't go on to star in more and better films. Sadly, "
         "I didn't think Dorothy Stratten got a chance to act in this her only "
         'important film role.<br /><br />The film appears to have some fans, '
         'and I was very open-minded when I started watching it. I am a big '
         'Peter Bogdanovich fan and I enjoyed his last movie, "Cat\'s Meow" '
         'and all his early ones from 

In [5]:
from transformers import AutoTokenizer

# Each pre-trained checkpoint has its own tokenizer; giving AutoTokenizer the checkpoint
# name loads the tokenizer that corresponds to it.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [6]:
# Tokenization step: turn the raw review text of every split into model inputs.
def preprocess(reviews):
    """Tokenize a batch of reviews.

    `reviews` is a batch taken from the dataset; its "text" entry is handed to the
    tokenizer. truncation=True cuts sequences down to the maximum length the model
    supports.
    """
    review_texts = reviews["text"]
    return tokenizer(review_texts, truncation=True)


# batched=True passes many examples to preprocess per call, which is more efficient.
tokenized_imdb_dataset = imdb_dataset.map(preprocess, batched=True)

Map: 100%|██████████| 50000/50000 [00:13<00:00, 3610.77 examples/s]


In [7]:
from transformers import DataCollatorWithPadding

# Collator that dynamically pads the sequences of each batch to the longest one in that
# batch and returns TensorFlow tensors (return_tensors="tf").
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [8]:
import evaluate

# Load the accuracy metric; compute_metrics calls accuracy.compute to score predictions.
accuracy = evaluate.load("accuracy")

In [9]:
import numpy as np


def compute_metrics(eval_pred):
    """Score a batch of model outputs with the accuracy metric.

    `eval_pred` unpacks into (predictions, labels). The predictions are reduced to
    class ids by taking the argmax along axis 1 before being compared with the labels.
    Returns whatever `accuracy.compute` returns.
    """
    raw_scores, references = eval_pred
    predicted_ids = np.argmax(raw_scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [10]:
# Mapping from the integer class ids to readable sentiment names.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
# Inverse lookup, built from id2label so the two tables always agree.
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
from transformers import create_optimizer  # returns an optimizer together with its learning-rate schedule
import tensorflow as tf

# Training hyperparameters.
batch_size = 16
num_epochs = 5
subset_size = 1024  # train on a subset so the run finishes faster

# The schedule needs the total number of training steps up front.
steps_per_epoch = subset_size // batch_size
total_train_steps = steps_per_epoch * num_epochs

# num_warmup_steps=0 means there is no warmup phase.
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=total_train_steps,
    num_warmup_steps=0,
)




In [None]:
# Load a DistilBERT checkpoint with a sequence-classification head rather than building the model from scratch.
# The task maps a sequence (a review) to a class (sentiment label: positive/negative).
from transformers import TFAutoModelForSequenceClassification

# from_pt=True initializes the TensorFlow model from the PyTorch weights; id2label/label2id
# store the readable class names on the model config.
# NOTE(review): the checkpoint name indicates a model already fine-tuned on SST-2, while the
# tokenizer is loaded from "distilbert/distilbert-base-uncased" — confirm this starting point is intended.
model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english",
                                                     from_pt=True,
                                                     num_labels=2,
                                                     id2label=id2label,
                                                     label2id=label2id)




TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [None]:
# Shuffle each split once with a fixed seed so the subsets selected below are random but reproducible.
shuffled_train_dataset = tokenized_imdb_dataset["train"].shuffle(seed=42)
shuffled_validation_dataset = tokenized_imdb_dataset["test"].shuffle(seed=42)

# The shuffled splits are Hugging Face datasets, so model.prepare_tf_dataset can turn them into
# TensorFlow datasets, applying data_collator to every batch.
# batch_size is the same variable that steps_per_epoch (and therefore the learning-rate schedule)
# was derived from, so the batches and the schedule cannot drift apart.

tf_train_set = model.prepare_tf_dataset(
    shuffled_train_dataset.select(range(subset_size)),
    shuffle=True,  # serve training samples in random order instead of one fixed order
    batch_size=batch_size,
    collate_fn=data_collator
)

tf_validation_set = model.prepare_tf_dataset(
    shuffled_validation_dataset.select(range(subset_size)),
    shuffle=False,  # keep evaluation order fixed
    batch_size=batch_size,
    collate_fn=data_collator
)

In [14]:
# Compile the model with the optimizer created earlier. No loss argument is passed here.
# NOTE(review): presumably the model falls back to its own built-in loss — confirm against the Transformers docs.
model.compile(optimizer=optimizer)

In [15]:
# Print the model's layers and their parameter counts.
model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  66362880  
 inLayer)                                                        
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
Total params: 66955010 (255.41 MB)
Trainable params: 66955010 (255.41 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [16]:
from transformers.keras_callbacks import KerasMetricCallback

# Keras callback wired to compute_metrics and the validation dataset.
# NOTE(review): presumably the metric is computed at the end of every epoch — confirm.
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

In [17]:
# Fine-tune for num_epochs epochs on tf_train_set, validating on tf_validation_set with metric_callback attached.
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs, callbacks=[metric_callback])

Epoch 1/5

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tf_keras.src.callbacks.History at 0x20fc81d82f0>

In [19]:
# Sample review to classify; its wording ("completely blown away", "never felt bored") is positive.
text = (
    "Saw an early screening of this film at the Tilton Square Theatre in New Jersey, "
    "and I was completely blown away. From the opening scene all the way until the "
    "credits I never felt bored, which is impressive for a 2 hour and 45 minute film."
)

# Tokenize into TensorFlow tensors and run the model to obtain the class scores.
inputs = tokenizer(text, return_tensors="tf")
logits = model(**inputs).logits

# Index of the highest-scoring class for the single input sequence.
predicted_class_idx = tf.math.argmax(logits, axis=-1)[0]

# NOTE(review): the recorded output for this positive-sounding review is NEGATIVE — worth investigating.
print(f"Predicted class: {model.config.id2label[predicted_class_idx.numpy()]}")

Predicted class: NEGATIVE
