# Task 1 — BERT News Topic Classifier

Fine-tune `bert-base-uncased` on the AG News dataset (4 topic classes) using the Hugging Face `Trainer` API.

In [None]:
import pandas as pd, numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
import evaluate
import torch


## 1) Load dataset

In [None]:
# Load the AG News dataset through the `datasets` library. Later cells map
# over `ds` and index the result by split name ('train' and 'test').
ds = load_dataset('ag_news')
# Print the dataset summary as a quick sanity check before tokenizing.
print(ds)


## 2) Tokenize

In [None]:
# Single checkpoint name reused for both the tokenizer (here) and the model
# (loaded in the next section), so the two always come from the same checkpoint.
model_ckpt = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def tokenize(batch):
    """Tokenize the ``'text'`` entries of ``batch`` with the module-level tokenizer.

    Args:
        batch: Mapping with a ``'text'`` key. It is called through
            ``ds.map(tokenize, batched=True)``, so the value is a batch of
            examples rather than a single one.

    Returns:
        The tokenizer's output for those texts, with truncation enabled.
        No padding is requested here; batches are padded later by the
        ``DataCollatorWithPadding`` passed to the ``Trainer``.
    """
    texts = batch['text']
    encoding = tokenizer(texts, truncation=True)
    return encoding

# AG News examples expose a 'text' column and a 'label' column.
# Tokenize every split in batches, drop the raw text once it has been encoded,
# and rename the target column to 'labels' (the name used for targets when the
# batches are fed to the model).
encoded = (
    ds.map(tokenize, batched=True)
    .remove_columns(['text'])
    .rename_column('label', 'labels')
)
# Switch the output format in place so indexing yields torch tensors.
encoded.set_format('torch')


## 3) Prepare model & trainer

In [None]:
# Read the class names from the dataset's own label feature instead of
# hard-coding the class count, and register the id <-> name mappings on the
# model config. Without them the classifier (and the inference pipeline built
# on it later) reports generic LABEL_<n> names instead of the topic names.
label_names = ds['train'].features['label'].names
num_labels = len(label_names)
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

# Pretrained encoder with a classification head sized to `num_labels`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Metric objects consumed by compute_metrics during evaluation.
metric_acc = evaluate.load('accuracy')
metric_f1 = evaluate.load('f1')

# tokenize() does not pad, so padding is applied per batch by this collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

import inspect

# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and the old name is no longer
# accepted there. Pick whichever keyword the installed version understands so
# this cell runs on both old and new releases.
_eval_arg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments).parameters
    else 'evaluation_strategy'
)

train_args = TrainingArguments(
    output_dir='./outputs',
    # Checkpoint once per epoch, matching the evaluation schedule below.
    save_strategy='epoch',
    logging_steps=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    # Evaluate once per epoch (keyword name resolved above).
    **{_eval_arg: 'epoch'},
)

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``. The logits
            must support ``.argmax(axis=-1)``.

    Returns:
        Dict with the keys ``'accuracy'`` and ``'f1_macro'``, taken from the
        module-level ``metric_acc`` and ``metric_f1`` metric objects.
    """
    scores, targets = eval_pred
    # Predicted class = index of the largest logit along the last axis.
    predicted = scores.argmax(axis=-1)
    accuracy = metric_acc.compute(predictions=predicted, references=targets)
    f1 = metric_f1.compute(predictions=predicted, references=targets, average='macro')
    return {'accuracy': accuracy['accuracy'], 'f1_macro': f1['f1']}

import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Use whichever keyword the installed Trainer declares.
_tok_arg = (
    'processing_class'
    if 'processing_class' in inspect.signature(Trainer.__init__).parameters
    else 'tokenizer'
)

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=encoded['train'],
    # The 'test' split doubles as the evaluation set for the per-epoch
    # evaluation and for the final trainer.evaluate() call.
    eval_dataset=encoded['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tok_arg: tokenizer},
)


## 4) Train & Evaluate

In [None]:
# Fine-tune the model with the settings in train_args.
trainer.train()
# Score the fine-tuned model on the Trainer's eval_dataset (the 'test' split).
metrics = trainer.evaluate()
# Bare expression: shown as the cell output when run in a notebook.
metrics

## 5) Save model

In [None]:
# Persist the fine-tuned model under ./model so it can be reloaded without retraining.
trainer.save_model('./model')


## 6) Inference helper

In [None]:
from transformers import TextClassificationPipeline
# Wrap the in-memory fine-tuned model and tokenizer for inference, asking for
# a score per class rather than only the top prediction.
# NOTE(review): return_all_scores appears to be deprecated in recent
# transformers releases in favor of top_k=None. Confirm against the installed
# version before switching, since the output ordering/nesting may differ.
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)

def predict(text):
    """Classify ``text`` with the module-level ``pipe`` and return its output unchanged.

    Args:
        text: Input forwarded as-is to ``pipe``.

    Returns:
        Whatever ``pipe(text)`` returns.
    """
    scores = pipe(text)
    return scores

# Smoke test of the helper on a sample headline; as a bare expression its
# result is shown as the cell output when run in a notebook.
predict('Stocks rally as market optimism grows')
