In [1]:
# Step 1: install + check
!pip -q install -U transformers datasets evaluate accelerate gradio

import torch, transformers, datasets, evaluate, gradio, sys, platform, accelerate

print("Python:", sys.version.split()[0])
print("Torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
print("Transformers:", transformers.__version__)
print("Datasets:", datasets.__version__)
print("Evaluate:", evaluate.__version__)
print("Accelerate:", accelerate.__version__)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m78.8 MB/s[0m eta [36m0:00:00[0m
[?25hPython: 3.12.11
Torch: 2.8.0+cu126 | CUDA available: True
GPU: Tesla T4
Transformers: 4.56.0
Datasets: 4.0.0
Evaluate: 0.4.5
Accelerate: 1.10.1


In [2]:
from datasets import load_dataset

# Load the AG News dataset
dataset = load_dataset("ag_news")

# Show dataset structure
print(dataset)

# Show a sample
print(dataset["train"][0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})
{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'label': 2}


In [4]:
from transformers import BertTokenizer

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Function to tokenize dataset
def tokenize_function(example):
    return tokenizer(
        example["text"],
        padding="max_length",  # make all sequences the same length
        truncation=True,       # cut off if text too long
        max_length=128         # standard size for short texts
    )

# Apply tokenization to YOUR dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# View structure
print(tokenized_datasets)
print("\nSample tokenized item:")
print(tokenized_datasets["train"][0])


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7600
    })
})

Sample tokenized item:
{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'label': 2, 'input_ids': [101, 2813, 2358, 1012, 6468, 15020, 2067, 2046, 1996, 2304, 1006, 26665, 1007, 26665, 1011, 2460, 1011, 19041, 1010, 2813, 2395, 1005, 1055, 1040, 11101, 2989, 1032, 2316, 1997, 11087, 1011, 22330, 8713, 2015, 1010, 2024, 3773, 2665, 2153, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [5]:
import torch
from torch.utils.data import DataLoader

# Convert dataset into PyTorch format
tokenized_datasets.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"]
)

# Create dataloaders
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=16, shuffle=True)
test_dataloader = DataLoader(tokenized_datasets["test"], batch_size=16)

# Check one batch
batch = next(iter(train_dataloader))
print({k: v.shape for k, v in batch.items()})


{'label': torch.Size([16]), 'input_ids': torch.Size([16, 128]), 'attention_mask': torch.Size([16, 128])}


In [6]:
{'input_ids': torch.Size([16, 128]),
 'attention_mask': torch.Size([16, 128]),
 'label': torch.Size([16])}


{'input_ids': torch.Size([16, 128]),
 'attention_mask': torch.Size([16, 128]),
 'label': torch.Size([16])}

In [7]:
from transformers import BertForSequenceClassification

# Load pretrained BERT model with classification head (4 classes in AG News)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)

# Send model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("Model loaded on:", device)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded on: cuda


In [8]:
from torch.optim import AdamW
from transformers import get_scheduler

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Scheduler (linear decay of learning rate)
num_training_steps = len(train_dataloader) * 2  # let's do 2 epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

print("Optimizer & scheduler ready. Training steps:", num_training_steps)


Optimizer & scheduler ready. Training steps: 15000


In [9]:
from torch.nn import CrossEntropyLoss
import torch

loss_fn = CrossEntropyLoss()


In [13]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [14]:
# Subset the dataset for speed
small_train = tokenized_datasets["train"].shuffle(seed=42).select(range(2000))
small_test  = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

from torch.utils.data import DataLoader

train_dataloader = DataLoader(small_train, shuffle=True, batch_size=16, collate_fn=data_collator)
eval_dataloader  = DataLoader(small_test, batch_size=16, collate_fn=data_collator)


In [15]:
from torch.optim import AdamW
from transformers import get_scheduler

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Scheduler
num_training_steps = len(train_dataloader) * 3  # 3 epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

print("Optimizer & scheduler ready. Total training steps:", num_training_steps)


Optimizer & scheduler ready. Total training steps: 375


In [16]:
from tqdm import tqdm

num_epochs = 2  # keep it small for now
progress_bar = tqdm(range(num_epochs * len(train_dataloader)))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

print("✅ Training finished (lightweight run).")


 28%|██▊       | 2127/7500 [23:44<59:57,  1.49it/s]
100%|██████████| 250/250 [01:22<00:00,  2.89it/s]

✅ Training finished (lightweight run).


In [17]:
import evaluate

metric = evaluate.load("accuracy")

model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

final_result = metric.compute()
print("📊 Evaluation Result:", final_result)


Downloading builder script: 0.00B [00:00, ?B/s]

📊 Evaluation Result: {'accuracy': 0.924}


In [18]:
from transformers import AutoTokenizer

# Define save directory
save_dir = "./bert_agnews_model"

# Save model
model.save_pretrained(save_dir)

# Save tokenizer
tokenizer.save_pretrained(save_dir)

print(f"Model saved at {save_dir}")


Model saved at ./bert_agnews_model
