In [2]:
# First, we train our baseline model: bert-base-uncased on the SST-2 dataset. Start by loading the dataset.
from datasets import load_dataset
# SST-2 is the binary sentiment classification task of the GLUE benchmark;
# `task` selects that configuration when loading GLUE.
task = "sst2"
dataset = load_dataset(path="glue", name=task)

In [3]:
# load the tokenizer and tokenize the dataset
from transformers import AutoTokenizer
import torch
# Checkpoint name shared by the tokenizer here and the model loaded in a later cell.
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_checkpoint,
)

# Padding is deliberately skipped at this stage; it is applied later, per batch,
# by the DataCollator (dynamic padding).
def tokenize(examples):
    """Tokenize the ``sentence`` field of a batch of examples, with truncation enabled."""
    sentences = examples["sentence"]
    return tokenizer(sentences, truncation=True)

# Tokenize every split in batches, then drop the raw-text and index columns
# that are no longer needed once the token ids exist.
tokenized_dataset = dataset.map(tokenize, batched=True).remove_columns(["sentence", "idx"])
tokenized_dataset.set_format(type="torch")

# Pull out the train, validation and test splits under their own names.
train_dataset, eval_dataset, test_dataset = (
    tokenized_dataset[split_name] for split_name in ("train", "validation", "test")
)

In [4]:
# load the model
from transformers import AutoModelForSequenceClassification, set_seed

# Seed the RNGs *before* instantiating the model. The 2-label classification head is not
# part of the pretrained checkpoint and is randomly initialized here; the Trainer only
# applies TrainingArguments(seed=42) when it is constructed, i.e. after this cell has
# already drawn the head's initial weights. Seeding here (same value as the training
# seed) makes the head initialization, and therefore the whole run, reproducible.
set_seed(42)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

In [6]:
# train the model
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    # Where checkpoints are written (an existing directory may be overwritten).
    output_dir="./bert_sst2_baseline",
    overwrite_output_dir=True,
    # Optimisation hyper-parameters.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    seed=42,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with the
    # best "accuracy" when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Log every 50 steps.
    logging_strategy="steps",
    logging_steps=50,
)

# define the metrics from evaluate
import evaluate
import numpy as np
# Accuracy metric object from the `evaluate` library, reused on every evaluation pass.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into the accuracy metric result.

    The predicted class of each example is the index of its largest logit
    along the last axis.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_classes)

from transformers import DataCollatorWithPadding
# Collator responsible for the per-batch (dynamic) padding skipped during tokenization.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Wire the model, arguments, data splits, collator and metric function into a
# Trainer, then run fine-tuning.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)
trainer.train()

In [7]:
# save the model and tokenizer
# Persist the trained model and the tokenizer in separate sub-directories of the run folder.
trainer.save_model(output_dir="./bert_sst2_baseline/best_model")
tokenizer.save_pretrained(save_directory="./bert_sst2_baseline/tokenizer")

In [8]:
# params: total parameter count and the in-memory size of the parameters
params = sum(p.numel() for p in model.parameters())
# Use each tensor's actual element size instead of assuming 4 bytes (fp32) per parameter,
# so the estimate stays correct if the model is later cast to half precision or quantized.
param_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
print("Parameters:", params, "≈ %.1f MB" % (param_bytes / (1024**2)))

# file size of saved model
import os
def folder_size_mb(path):
    """Return the combined size of all files under ``path``, in units of 1024**2 bytes.

    The directory tree is walked recursively with ``os.walk``; a path that yields
    no files results in ``0.0``.
    """
    byte_count = sum(
        os.path.getsize(os.path.join(dirpath, filename))
        for dirpath, _subdirs, filenames in os.walk(path)
        for filename in filenames
    )
    return byte_count / (1024**2)
# Measure only the exported model directory. The parent "./bert_sst2_baseline" is also the
# Trainer output_dir, so it additionally contains the per-epoch checkpoints and the
# separately saved tokenizer, which would inflate the reported model size.
print("Saved model folder size (MB):", folder_size_mb("./bert_sst2_baseline/best_model"))

In [10]:
import time
import torch

# Put the fine-tuned, in-memory model into evaluation mode and move it to the GPU
model.eval().to("cuda")

# Prepare a single example for batch=1, padded/truncated to a fixed length of 128 tokens
sample_sentence = "This is a sample sentence to measure inference latency."
encoded_input = tokenizer(sample_sentence, return_tensors="pt", max_length=128, truncation=True, padding="max_length")
input_ids = encoded_input['input_ids'].to("cuda")
attention_mask = encoded_input['attention_mask'].to("cuda")

# Warm-up runs (not timed)
with torch.no_grad():
    for _ in range(10):
        _ = model(input_ids=input_ids, attention_mask=attention_mask)

# Timed runs
N = 200  # number of repetitions, to average out timing noise
torch.cuda.synchronize()  # finish any pending GPU work before starting the clock
# perf_counter() is a monotonic, high-resolution clock intended for measuring short
# durations; time.time() follows the wall clock and can jump if the system time changes.
start_time = time.perf_counter()
with torch.no_grad():
    for _ in range(N):
        _ = model(input_ids=input_ids, attention_mask=attention_mask)
torch.cuda.synchronize()  # wait for all queued forward passes before stopping the clock
end_time = time.perf_counter()

# Calculate the average latency per forward pass, in milliseconds
latency_ms = (end_time - start_time) / N * 1000
print(f"Batch=1 latency (average over {N} runs): {latency_ms:.2f} ms")

In [None]:
"""
COMMENT:
Fine-tuned bert-base-uncased on SST-2 (GLUE): baseline validation accuracy 92.5459%, model size 417.6 MB, inference latency 16.46 ms averaged over 200 runs (batch size = 1) on a Google Colab T4 GPU.
"""