# Task 01: News Topic Classifier using BERT (HuggingFace)
# Advanced Internship – DevelopersHub

Install Dependicidies

In [1]:
%pip install transformers datasets evaluate torch gradio scikit-learn -q

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
%pip install scikit-learn


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import sklearn
print(sklearn.__version__)

1.8.0


In [4]:
%pip install accelerate>=0.26.0

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# 2. Import Libraries

import torch
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import TrainingArguments, Trainer
import numpy as np
from datasets import load_dataset
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# 3. Load Dataset

dataset = load_dataset("gimmaru/ag_news")

# Check dataset
print(dataset)

# Access the single dataset
data = dataset["test"]

# Split train/test
split = data.train_test_split(test_size=0.2, seed=42)

train_data = split["train"]
test_data = split["test"]

print(train_data)
print(test_data)

DatasetDict({
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
})
Dataset({
    features: ['text', 'label'],
    num_rows: 800
})
Dataset({
    features: ['text', 'label'],
    num_rows: 200
})


In [None]:
# 4. Load Tokenizer

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [None]:
# 5. Tokenize Dataset

def tokenize_batch(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

train_encoded = train_data.map(tokenize_batch, batched=True, remove_columns=["text"])
test_encoded = test_data.map(tokenize_batch, batched=True, remove_columns=["text"])


In [24]:
#Sanity Check
print(train_encoded.column_names)
print(test_encoded.column_names)

['label', 'input_ids', 'token_type_ids', 'attention_mask']
['label', 'input_ids', 'token_type_ids', 'attention_mask']


In [None]:
# 6. Rename label -> labels

def rename_label(batch):
    batch["labels"] = batch["label"]
    return batch

train_encoded = train_encoded.map(rename_label, batched=True, remove_columns=["label"])
test_encoded = test_encoded.map(rename_label, batched=True, remove_columns=["label"])

# Set PyTorch tensor format
train_encoded.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"]
)

test_encoded.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"]
)

Map: 100%|██████████| 800/800 [00:00<00:00, 4915.68 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 3113.86 examples/s]


In [None]:
# 7. Load Model

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 8. Training Arguments

training_args = TrainingArguments(
    output_dir="./bert-news-classifier",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

In [None]:
# 9. Define Metrics

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1_score = f1.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    return {"accuracy": acc, "f1": f1_score}

In [None]:
# 10. Initialize Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encoded,
    eval_dataset=test_encoded,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [None]:
# 11. Start Training

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.690161,0.785,0.777925
2,No log,0.512828,0.83,0.829554
3,No log,0.478717,0.84,0.841019




TrainOutput(global_step=150, training_loss=0.6322743733723958, metrics={'train_runtime': 13254.8839, 'train_samples_per_second': 0.181, 'train_steps_per_second': 0.011, 'total_flos': 157869468057600.0, 'train_loss': 0.6322743733723958, 'epoch': 3.0})

In [None]:
# 12. Save Model

trainer.model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

('./saved_model\\tokenizer_config.json',
 './saved_model\\special_tokens_map.json',
 './saved_model\\vocab.txt',
 './saved_model\\added_tokens.json',
 './saved_model\\tokenizer.json')

In [None]:
# 13. Quick Test Inference

from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="./saved_model",
    tokenizer="./saved_model"
)

classifier("Apple releases new iPhone with advanced AI features")

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


[{'label': 'LABEL_3', 'score': 0.7422987818717957}]