<a href="https://colab.research.google.com/github/Shubham-Gattani/Hugging_face_basics/blob/main/fine_tuning_bert_for_sentimentClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# =============================
# End-to-End BERT Training Pipeline in Google Colab
# Includes Dataset Loading, Tokenization, Training, Evaluation, and Saving
# =============================

# STEP 1: Install Required Libraries
# ---------------------------
!pip install transformers datasets torch -q  # Install Hugging Face tools
!pip install accelerate -q  # Helps optimize training speed


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# STEP 2: Import Required Libraries
# ---------------------------
import torch  # For PyTorch-based deep learning
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import os

In [3]:
# STEP 3: Load Dataset (Rotten Tomatoes Reviews)
# ---------------------------
# The dataset contains movie reviews labeled as "Positive" (1) or "Negative" (0).
dataset = load_dataset("rotten_tomatoes")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [4]:
# STEP 4: Load BERT Tokenizer
# ---------------------------
# We use "bert-base-uncased" because:
# - "base" means 12 layers (smaller than "large" version)
# - "uncased" means it ignores capitalization (e.g., "Movie" = "movie")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
# STEP 5: Tokenize Dataset
# ---------------------------
# Tokenization converts text into input IDs that BERT understands.
def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=128) # padding="max_length": This ensures that all tokenized sequences are padded to a fixed length (max_length). If a sequence is shorter than max_length, padding tokens will be added to reach this length.
# truncation=True: meaning any sequence longer than the specified max_length is truncated to fit within this length. max_length=128: This sets the fixed length for the sequences to 128 tokens. Both padding and truncation will adjust the sequence length to this value.#\

# Apply tokenization to the entire dataset
dataset = dataset.map(tokenize_function, batched=True) # Apply the tokenize_function to each example in the dataset, enabling batched processing for efficiency

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [6]:
# STEP 6: Convert Labels & Remove Unnecessary Columns
# ---------------------------
# We must rename the label column to "labels" for Hugging Face's Trainer
dataset = dataset.rename_column("label", "labels")
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1066
    })
})

In [8]:
dataset["train"][0]

{'labels': tensor(1),
 'input_ids': tensor([  101,  1996,  2600,  2003, 16036,  2000,  2022,  1996,  7398,  2301,
          1005,  1055,  2047,  1000, 16608,  1000,  1998,  2008,  2002,  1005,
          1055,  2183,  2000,  2191,  1037, 17624,  2130,  3618,  2084,  7779,
         29058,  8625, 13327,  1010,  3744,  1011, 18856, 19513,  3158,  5477,
          4168,  2030,  7112, 16562,  2140,  1012,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,  

In [10]:
# STEP 7: Load Pretrained BERT Model for Classification
# ---------------------------
# - num_labels=2 (binary classification: positive/negative)
# - BERT outputs **logits** which we later convert into class labels
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# STEP 8: Define Training Arguments
# ---------------------------
training_args = TrainingArguments(
    output_dir="./bert_sentiment",  # Where the model checkpoints will be saved
    evaluation_strategy="epoch",  # Evaluate model at the end of every epoch
    save_strategy="epoch",  # Save model at the end of every epoch
    logging_dir="./logs",  # Directory for logs
    per_device_train_batch_size=8,  # Choose batch size (balance speed & memory)
    per_device_eval_batch_size=8,  # Same batch size for evaluation
    num_train_epochs=2,  # Train for 2 full passes over the dataset
    learning_rate=2e-5,  # Standard learning rate for fine-tuning BERT
    weight_decay=0.01,  # Helps prevent overfitting
    logging_steps=500,  # Log training status every 500 steps
    fp16=True,  # Enables mixed precision training (faster & uses less memory)
    load_best_model_at_end=True,  # Saves the best model based on evaluation loss
    report_to="none",  # Disable W&B
)



In [17]:
# STEP 9: Create Trainer Object
# ---------------------------
# The Trainer API simplifies training and evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [18]:
# STEP 10: Disable Weights & Biases (W&B) Logging (Optional)
# ---------------------------
# W&B is enabled by default in Hugging Face's Trainer, so we turn it off.
os.environ["WANDB_DISABLED"] = "true"

In [19]:
# STEP 11: Start Training 🚀
# ---------------------------
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.3677,0.397948
2,0.2416,0.592844


TrainOutput(global_step=2134, training_loss=0.31230010199792607, metrics={'train_runtime': 259.3723, 'train_samples_per_second': 65.774, 'train_steps_per_second': 8.228, 'total_flos': 1122168651110400.0, 'train_loss': 0.31230010199792607, 'epoch': 2.0})

In [22]:
# STEP 12: Evaluate Model Performance
# ---------------------------
# The Trainer automatically returns accuracy and loss.
from pprint import pprint
metrics = trainer.evaluate()
pprint(metrics)

{'epoch': 2.0,
 'eval_loss': 0.39794766902923584,
 'eval_runtime': 5.1771,
 'eval_samples_per_second': 205.907,
 'eval_steps_per_second': 25.883}


In [23]:
# STEP 13: Save the Fine-Tuned Model
# ---------------------------
# Save the trained model & tokenizer to local storage.
model.save_pretrained("fine_tuned_bert")
tokenizer.save_pretrained("fine_tuned_bert")

('fine_tuned_bert/tokenizer_config.json',
 'fine_tuned_bert/special_tokens_map.json',
 'fine_tuned_bert/vocab.txt',
 'fine_tuned_bert/added_tokens.json')

In [35]:
# STEP 14: Load the Model for Inference (Test on New Sentences)
# ---------------------------
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to("cuda")
    outputs = model(**inputs)
    prediction = outputs.logits.argmax(-1).item()
    return "Positive" if prediction == 1 else "Negative"

# Test with new reviews:
print(predict_sentiment("This movie was not so great!"))
print(predict_sentiment("The storyline was magnificent, but the acting was too boring."))
print(predict_sentiment("The storyline was magnificent"))

Negative
Negative
Positive
