In [None]:
#  Step 1: Install required libraries
!pip install --upgrade transformers datasets scikit-learn


In [None]:

#  Step 2: Import libraries
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score
import torch


In [None]:

# Step 3: Fetch the AG News corpus; the returned object holds the train and
# test splits that are used further down.
dataset = load_dataset(path="ag_news")


In [None]:

# Step 4: Instantiate the tokenizer belonging to the pre-trained BERT checkpoint
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
)


In [None]:

# Step 5: Tokenize the text column
def tokenize(example):
    """Run the notebook's BERT tokenizer over the ``"text"`` field of ``example``.

    Args:
        example: Mapping with a ``"text"`` entry. The ``map`` call that uses
            this function passes ``batched=True``, so in practice this is a
            batch of rows rather than a single row.

    Returns:
        Whatever the tokenizer call returns for the given text, with
        truncation enabled. No padding is requested here.
    """
    texts = example["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded
# Run `tokenize` over every sample of the dataset, handing it rows in batches
tokenized_dataset = dataset.map(function=tokenize, batched=True)

In [None]:



# Step 6: Build the collator that takes care of padding when batches are
# assembled (the tokenize step above truncates but does not pad).
data_collator = DataCollatorWithPadding(tokenizer)


In [None]:

# Step 7: Load pre-trained BERT with a sequence-classification head.
# The label set is read from the dataset's label feature instead of being
# hard-coded, and the id <-> name mappings are stored in the model config so
# that the saved model reports readable class names rather than the generic
# LABEL_0 ... LABEL_3 placeholders.
# NOTE: relies on the "label" column being a ClassLabel feature (it is for AG News).
label_names = dataset["train"].features["label"].names
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_names),
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
)


In [None]:

# Step 8: Define evaluation metrics (accuracy and F1-score)
def compute_metrics(eval_pred):
    """Score a set of predictions with accuracy and weighted F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class for each
            row is the index of the largest logit along dimension 1.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    # Collapse the per-class scores to a single predicted class id per row.
    predicted_classes = torch.tensor(logits).argmax(dim=1)
    return {
        "accuracy": accuracy_score(labels, predicted_classes),
        "f1": f1_score(labels, predicted_classes, average="weighted"),
    }


In [None]:

# Step 9: Set training arguments.
# `eval_strategy` is the current name of the former `evaluation_strategy`
# keyword; since the install cell upgrades transformers to the latest release,
# the old spelling would be rejected as an unexpected keyword argument.
training_args = TrainingArguments(
    output_dir="./results",               # where checkpoints and results are written
    eval_strategy="epoch",                # evaluate after each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",                # save a checkpoint after every epoch
    logging_dir="./logs",
)


In [None]:

# Step 10: Create the Trainer object.
# The tokenizer is passed as `processing_class`: the older `tokenizer=`
# keyword of `Trainer` is deprecated in recent transformers releases (which
# the install cell upgrades to) and is scheduled for removal.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],   # split used for fine-tuning
    eval_dataset=tokenized_dataset["test"],     # split used for evaluation
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [None]:

# Step 11: Fine-tune the model on the AG News training split. With the
# training arguments configured earlier in this notebook, evaluation on the
# test split and a checkpoint save are both scheduled once per epoch.
trainer.train()


In [None]:

# Step 12: Write the fine-tuned model and the tokenizer files into the same
# output directory.
trainer.save_model(output_dir="news_bert_model")
tokenizer.save_pretrained(save_directory="news_bert_model")
