In [None]:
!pip install datasets


In [None]:

from datasets import load_dataset

# Load the dataset registered under the name "ag_news" and print the returned
# object to inspect what was loaded.
dataset = load_dataset("ag_news")
print(dataset)


In [None]:
!pip install datasets transformers gradio evaluate

from datasets import load_dataset
dataset = load_dataset("ag_news")

from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(example):
    """Tokenize the ``"text"`` entry of an example (or batch of examples).

    Sequences are truncated to at most 128 tokens but deliberately not padded
    here. The ``DataCollatorWithPadding`` given to the ``Trainer`` pads each
    batch to the length of its longest member, so padding everything to 128
    tokens up front would only add wasted computation and storage.

    Args:
        example: Mapping with a ``"text"`` entry. When called through
            ``Dataset.map(..., batched=True)`` that entry holds a batch of
            texts rather than a single one.

    Returns:
        The tokenizer's output for the given text(s).
    """
    return tokenizer(example["text"], truncation=True, max_length=128)

# Apply tokenize_function across the dataset; batched=True hands it a batch of
# examples per call instead of one example at a time.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

from transformers import DataCollatorWithPadding, BertForSequenceClassification
# Padding collator built from the same tokenizer; handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Pretrained "bert-base-uncased" checkpoint loaded for sequence classification
# with four output labels.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)

from transformers import TrainingArguments, Trainer
import evaluate
# Metric objects consumed by compute_metrics.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Turn one evaluation pass into accuracy and weighted-F1 scores.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``. The
            predicted class of each example is the argmax of the logits over
            the last axis.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``; the F1 score uses
        ``average="weighted"``.
    """
    scores, references = eval_pred
    predicted = scores.argmax(axis=-1)

    accuracy_result = accuracy.compute(predictions=predicted, references=references)
    f1_result = f1.compute(predictions=predicted, references=references, average="weighted")

    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

import inspect

# The packages above are installed unpinned, and two keyword names differ
# between transformers releases:
#   * TrainingArguments: `eval_strategy` (current) vs `evaluation_strategy` (older)
#   * Trainer:           `processing_class` (current) vs `tokenizer` (older)
# Passing a name the installed release does not know raises a TypeError, so
# pick whichever one the installed signatures actually accept.
_args_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _args_params else "evaluation_strategy"
)
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = (
    "processing_class" if "processing_class" in _trainer_params else "tokenizer"
)

# Evaluate and save a checkpoint at the end of every epoch.
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    **{_eval_strategy_key: "epoch"},
)

# Train on a fixed-seed shuffled subset of 20,000 training examples and
# evaluate on the whole test split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"].shuffle(seed=42).select(range(20000)),
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

trainer.train()
# Final evaluation on the test split after training has finished.
print(trainer.evaluate())

# gradio builds the demo UI; torch is used for the gradient-free forward pass.
import gradio as gr
import torch

# Display names looked up by predicted class index.
label_names = [
    "World",
    "Sports",
    "Business",
    "Sci/Tech",
]

def classify_news(text):
    """Predict the topic label for one piece of news text.

    Args:
        text: The text to classify.

    Returns:
        The entry of ``label_names`` whose index equals the argmax of the
        model's logits for ``text``.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # The tokenizer builds CPU tensors, but the Trainer may have moved the
    # model to an accelerator. Put the inputs on the model's device so the
    # forward pass does not fail with a device mismatch.
    inputs = inputs.to(model.device)
    # Inference only: make sure dropout is disabled.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    return label_names[torch.argmax(outputs.logits, dim=-1).item()]

# Wire classify_news into a Gradio interface (text input, label output) and
# start it.
demo = gr.Interface(
    fn=classify_news,
    inputs="text",
    outputs="label",
    title="News Topic Classifier",
)
demo.launch()


Task: News Topic Classifier Using BERT

Dataset → Loaded the AG News dataset (4 classes: World, Sports, Business, Sci/Tech).

Preprocessing → Used the BERT tokenizer (bert-base-uncased) to convert each news text into token IDs, truncated to at most 128 tokens and padded.

Model → Fine-tuned bert-base-uncased (pretrained BERT) with a classification head for 4 labels.

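Training → Used the Hugging Face Trainer API on a shuffled 20,000-example subset of the training split, with explicit training arguments (learning rate 2e-5, batch size 16, 2 epochs, weight decay 0.01).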

Evaluation → Measured accuracy and weighted F1-score on the test set, after each epoch and once more when training finished.

Deployment → Built a Gradio app so users can type a news headline and instantly see the predicted topic.