In [1]:
# ============================================================
# 📌 1) Install dependencies
# ============================================================
!pip install transformers datasets torch --quiet


# ============================================================
# 📌 2) Imports
# ============================================================
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)


# ============================================================
# 📌 3) Load Dataset + Clean Invalid Text Rows
# ============================================================
dataset = load_dataset("SetFit/bbc-news")

# Remove None / empty / non-string rows
clean_train = [
    (t, l) for t, l in zip(dataset["train"]["text"], dataset["train"]["label"])
    if isinstance(t, str) and len(t.strip()) > 0
]

clean_test = [
    (t, l) for t, l in zip(dataset["test"]["text"], dataset["test"]["label"])
    if isinstance(t, str) and len(t.strip()) > 0
]

train_texts = [t for t, _ in clean_train]
train_labels = [l for _, l in clean_train]

test_texts = [t for t, _ in clean_test]
test_labels = [l for _, l in clean_test]

num_classes = 5

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


# ============================================================
# 📌 4) PyTorch Dataset Class
# ============================================================
class NewsDataset(Dataset):
    def __init__(self, texts, labels):
        self.enc = tokenizer(
            texts, truncation=True, padding=True, return_tensors="pt"
        )
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.enc["input_ids"][idx],
            "attention_mask": self.enc["attention_mask"][idx],
            "labels": self.labels[idx]
        }

train_ds = NewsDataset(train_texts, train_labels)
test_ds = NewsDataset(test_texts, test_labels)

train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=16)


# ============================================================
# 📌 5) Build BERT Linear Probe Model (freeze encoder)
# ============================================================
class BERTClassifier(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")

        # Freeze all BERT encoder layers
        for param in self.bert.parameters():
            param.requires_grad = False

        # Trainable classification layer
        self.classifier = nn.Linear(768, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        cls_output = outputs.pooler_output
        return self.classifier(cls_output)


model = BERTClassifier(num_classes).to(device)


# ============================================================
# 📌 6) Train Setup
# ============================================================
optimizer = torch.optim.Adam(model.classifier.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()


# ============================================================
# 📌 7) Training Loop — Show Loss & Accuracy Per Epoch
# ============================================================
for epoch in range(10):
    model.train()
    running_loss = 0
    all_preds = []
    all_labels = []

    for batch in train_loader:
        optimizer.zero_grad()

        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(ids, mask)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    epoch_acc = accuracy_score(all_labels, all_preds)

    print(f"📌 Epoch {epoch+1}")
    print(f"   🔹 Loss: {running_loss:.4f}")
    print(f"   🔹 Accuracy: {epoch_acc:.4f}\n")


# ============================================================
# 📌 8) Evaluate on Test Set
# ============================================================
model.eval()
test_preds = []
test_true = []

with torch.no_grad():
    for batch in test_loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(ids, mask)
        preds = torch.argmax(outputs, dim=1)

        test_preds.extend(preds.cpu().numpy())
        test_true.extend(labels.cpu().numpy())

test_accuracy = accuracy_score(test_true, test_preds)

print("🎯 Final Test Accuracy:", test_accuracy)


# ============================================================
# 📌 9) Prediction Function
# ============================================================
categories = ["business", "entertainment", "politics", "sport", "tech"]

def predict(text):
    enc = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    enc = {k: v.to(device) for k, v in enc.items()}

    with torch.no_grad():
        # Filter out 'token_type_ids' if present, as the model's forward method doesn't use it
        model_inputs = {
            k: v for k, v in enc.items() if k in ['input_ids', 'attention_mask']
        }
        outputs = model(**model_inputs)
        label = torch.argmax(outputs, dim=1).item()

    return categories[label]

print("\n🔍 Prediction Examples:")
print(predict("Apple releases new AI laptop"))
print(predict("Government announces new policy"))
print(predict("Manchester United wins the match"))

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/880 [00:00<?, ?B/s]

train.jsonl: 0.00B [00:00, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/1225 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

📌 Epoch 1
   🔹 Loss: 123.0401
   🔹 Accuracy: 0.2359

📌 Epoch 2
   🔹 Loss: 113.9572
   🔹 Accuracy: 0.4000

📌 Epoch 3
   🔹 Loss: 108.0274
   🔹 Accuracy: 0.5020

📌 Epoch 4
   🔹 Loss: 103.0572
   🔹 Accuracy: 0.5265

📌 Epoch 5
   🔹 Loss: 99.1615
   🔹 Accuracy: 0.6147

📌 Epoch 6
   🔹 Loss: 94.6884
   🔹 Accuracy: 0.6180

📌 Epoch 7
   🔹 Loss: 91.0516
   🔹 Accuracy: 0.6686

📌 Epoch 8
   🔹 Loss: 87.7761
   🔹 Accuracy: 0.7012

📌 Epoch 9
   🔹 Loss: 85.4116
   🔹 Accuracy: 0.7224

📌 Epoch 10
   🔹 Loss: 82.1041
   🔹 Accuracy: 0.7216

🎯 Final Test Accuracy: 0.787

🔍 Prediction Examples:
tech
tech
tech


In [2]:
print(predict("Royal challengers won the first ipl trophy in 2025"))

sport


In [3]:
print(predict("modi won the election last year"))

sport


In [4]:
print(predict("ai is taking the jobs of all the enginners out there,the main reason behind it is copy and paste"))

sport


In [5]:
print(predict("Government announces new policy"))

tech


In [6]:
print(predict("ambani is taking all the industry in india and ,becoming the worst richest person"))

sport
