<a href="https://colab.research.google.com/github/Nithya07shree/colab-notes-aiml/blob/main/BERT_IMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Implement a basic attention layer in PyTorch

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class ScaledDotProductAttention(nn.Module):
  def __init__(self,dk):
    super(ScaledDotProductAttention,self).__init__()
    self.dk = dk

  def forward(self, Q,K,V, mask=None):
    # calculate score
    score = torch.matmul(Q,K.transpose(-2,-1))/math.sqrt(self.dk)
    # apply mask
    if mask is not None:
      score = score.masked_fill(mask==0,-1e9)
    # softmax
    attn_weights = F.soft,ax(score, dim=-1)
    # multiply weights by V
    output = torch.matmul(attn_weights,V)
    return output, attn_weights

In [None]:
# num_heads: number of parallel attention heads (e.g., 8)
multihead_attn = nn.MultiheadAttention(embed_dim=512, num_heads=8, batch_first=True)
# attn_output shape: [batch, seq_len, embed_dim]
attn_output, attn_weights = multihead_attn(query, key, value)

use a pretrained BERT Model

In [1]:
!pip install transformers datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
model_name = "bert-base-uncased" # The 'base' version (110M parameters)

# Load the Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load Model with 2 output labels (Positive/Negative)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# BERT doesn't split text on spaces, it uses "wordpiece tokenization" , requiring special tokens. so we use tokenizer for the model
def tokenize(examples):
  return tokenizer(examples["text"], padding= "max_length", truncation = True, max_length = 128)

# load imdb via huggingface
from datasets import load_dataset
dataset_imdb = load_dataset("imdb")

# tokenize the dataset
tokenized_imdb = dataset_imdb.map(tokenize, batched = True)

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [6]:
# the Trainer API for BERT handles the GPU distribution, mixed-precision training (making it faster), and logging.
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="test_trainer",
    eval_strategy="epoch", # Calculate accuracy after every epoch
    per_device_train_batch_size=16,
    num_train_epochs=1,          # BERT often only needs 1-3 epochs
    weight_decay=0.01,
    optim="adamw_torch" # Explicitly use non-fused AdamW for XLA compatibility
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"].shuffle(seed=42).select(range(2000)), # Sample for speed
    eval_dataset=tokenized_imdb["test"].select(range(500)),
    compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.315808,0.856




TrainOutput(global_step=125, training_loss=0.4481163024902344, metrics={'train_runtime': 93.424, 'train_samples_per_second': 21.408, 'train_steps_per_second': 1.338, 'total_flos': 131555527680000.0, 'train_loss': 0.4481163024902344, 'epoch': 1.0})

The inference

In [7]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 1. Load the model and tokenizer
# If you just finished training, 'model' and 'tokenizer' are already in memory.
# If loading from a saved folder:
# model_path = "./test_trainer/checkpoint-500"
# model = AutoModelForSequenceClassification.from_pretrained(model_path)
# tokenizer = AutoTokenizer.from_pretrained(model_path)

In [8]:
# ensure model is in evaluation mode
model.eval()

# move to device(to gpu/cpu)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def predict_sentiment(text):
  # preprocess (tokenize) the input text
  inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length= 128)

  # move inputs to device same as model
  inputs = {k: v.to(device) for k, v in inputs.items()}

  # forward pass
  with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

  # convert logits to probabilities
  probs = F.softmax(logits, dim=-1)

  # get the label (0 or 1) and confidence score
  pred_label = torch.argmax(probs, dim=1).item()
  confidence = probs[0][pred_label].item()

  # map 0/1 to human readable text
  label_map = {0: "Negative", 1: "Positive"}

  return label_map[pred_label], confidence

In [9]:
sentence1 = "The cinematography was breathtaking, but the script was garbage."
result, conf = predict_sentiment(sentence1)
print(f"Review: '{sentence1}'\nSentiment: {result} ({conf:.2f}%)\n")

Review: 'The cinematography was breathtaking, but the script was garbage.'
Sentiment: Negative (0.95%)



In [10]:
sentence2 = "I honestly think this is the best film of the year."
result, conf = predict_sentiment(sentence2)
print(f"Review: '{sentence2}'\nSentiment: {result} ({conf:.2f}%)")

Review: 'I honestly think this is the best film of the year.'
Sentiment: Positive (0.92%)


In [11]:
sentence2 = "The movie vibes was lit."
result, conf = predict_sentiment(sentence2)
print(f"Review: '{sentence2}'\nSentiment: {result} ({conf:.2f}%)")

Review: 'The movie vibes was lit.'
Sentiment: Positive (0.73%)


In [12]:
sentence2 = "The movie vibes was off."
result, conf = predict_sentiment(sentence2)
print(f"Review: '{sentence2}'\nSentiment: {result} ({conf:.2f}%)")

Review: 'The movie vibes was off.'
Sentiment: Negative (0.79%)


Save the model

In [13]:
from google.colab import drive
drive.mount('/content/drive')
save_dir = "/content/drive/MyDrive/imdb_bert_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')

load the model back later:

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

save_dir = "/content/drive/MyDrive/imdb_bert_model"

# Load the saved model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(save_dir)
tokenizer = AutoTokenizer.from_pretrained(save_dir)

alternative approach: host the model on huggingface hub

In [None]:
from huggingface_hub import notebook_login
notebook_login()
model.push_to_hub("imdb-bert-model")
tokenizer.push_to_hub("imdb-bert-model")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Replace 'username' with your actual Hugging Face username
my_model = AutoModelForSequenceClassification.from_pretrained("username/imdb-bert-model")
my_tokenizer = AutoTokenizer.from_pretrained("username/imdb-bert-model")