# Fine-Tuning a BERT Model (reference: [Hugging Face](https://huggingface.co/docs/transformers/en/model_doc/bert))

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load and preprocess data
df = pd.read_csv("../data/clean/cleaned_imdb_data.csv")

# Encode the sentiment strings as integer class ids. Series.map() yields NaN
# for any value that is not a key of the mapping (different casing, stray
# whitespace, missing values), which would silently turn the label column into
# floats. Fail here with a clear message instead of later in the split/training.
df['sentiment'] = df['sentiment'].map({'negative': 0, 'positive': 1})
n_unmapped = int(df['sentiment'].isna().sum())
if n_unmapped:
    raise ValueError(
        f"{n_unmapped} rows have a sentiment other than 'negative'/'positive'"
    )
df['sentiment'] = df['sentiment'].astype('int64')
df = df[['review', 'sentiment']].rename(columns={'review': 'text', 'sentiment': 'label'})

# Two-stage stratified split: hold out 15% of the data as the test set, then
# take 17.6% of the remaining 85% (about 15% of the full data) for validation.
# Both splits are stratified on the label and use a fixed random_state.
train_val, test = train_test_split(df, test_size=0.15, random_state=42, stratify=df['label'])
train, val = train_test_split(train_val, test_size=0.176, random_state=42, stratify=train_val['label'])

# Wrap each pandas split in a Hugging Face Dataset (same order: train, val, test)
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (train, val, test)
)

# Load the pretrained tokenizer that matches the "bert-base-uncased" checkpoint
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def tokenize_fn(batch):
    """Tokenize the ``"text"`` entries of ``batch`` with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 256 tokens, and the
    tokenizer's output is returned unchanged.
    """
    texts = batch["text"]
    encoded = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=256,
    )
    return encoded

# Tokenize each split in batches
train_dataset = train_dataset.map(tokenize_fn, batched=True)
val_dataset = val_dataset.map(tokenize_fn, batched=True)
test_dataset = test_dataset.map(tokenize_fn, batched=True)


def _to_model_inputs(ds):
    """Return ``ds`` without the raw-text/index columns, formatted as torch tensors.

    ``remove_columns`` returns a new dataset instead of modifying ``ds``, so the
    cleaned dataset is returned and the caller has to rebind its own variable.
    """
    unwanted = [col for col in ["text", "__index_level_0__"] if col in ds.column_names]
    cleaned = ds.remove_columns(unwanted)
    cleaned.set_format("torch")
    return cleaned


# Clean and format datasets.
# The results are assigned back explicitly: rebinding a loop variable
# (`for ds in [...]: ds = ds.remove_columns(...)`) would leave
# train_dataset / val_dataset / test_dataset untouched.
train_dataset = _to_model_inputs(train_dataset)
val_dataset = _to_model_inputs(val_dataset)
test_dataset = _to_model_inputs(test_dataset)

Map: 100%|██████████| 35020/35020 [00:05<00:00, 6175.21 examples/s]
Map: 100%|██████████| 7480/7480 [00:01<00:00, 7410.24 examples/s]
Map: 100%|██████████| 7500/7500 [00:01<00:00, 6098.33 examples/s]


In [3]:
# Pretrained "bert-base-uncased" weights with a 2-label classification head
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

# Fine-tuning configuration, grouped by concern
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir="./bert_sentiment",
    logging_dir="./logs",
    logging_steps=50,
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    fp16=True,
    # Evaluate and checkpoint once per epoch; reload the checkpoint with the
    # best "accuracy" when training finishes
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Metric callback handed to the Trainer
def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` pair.

    The predicted class for each example is the argmax of its logits along
    the last axis. Returns a dict with the single key ``"accuracy"``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(references, predicted_classes)
    return {"accuracy": accuracy}

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated in Trainer.__init__ (it emits a FutureWarning);
    # `processing_class=` is the replacement keyword.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [11]:
# Fine-tune on the training split, then score the held-out test split
trainer.train()
results = trainer.evaluate(test_dataset)
test_accuracy = results['eval_accuracy']
print(f"Final Test Accuracy: {test_accuracy:.4f}")

Epoch,Training Loss,Validation Loss,Accuracy
1,0.313,0.271815,0.902005
2,0.1634,0.272562,0.911096
3,0.1708,0.381633,0.911898


Final Test Accuracy: 0.9141


In [12]:
# Persist the fine-tuned weights and the tokenizer files side by side in one
# directory, so both can be restored with from_pretrained() on that path
save_dir = "./bert_sentiment_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./bert_sentiment_model/tokenizer_config.json',
 './bert_sentiment_model/special_tokens_map.json',
 './bert_sentiment_model/vocab.txt',
 './bert_sentiment_model/added_tokens.json',
 './bert_sentiment_model/tokenizer.json')

***

In [None]:
# Reload the saved model and tokenizer from disk and re-check the test metrics.
# NOTE: this cell still relies on `compute_metrics` and `test_dataset` defined
# in earlier cells.
model_path = "./bert_sentiment_model"

model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizerFast.from_pretrained(model_path)

model.eval()

# Reload Trainer with the loaded model. No `args` are passed, so the Trainer
# falls back to its default TrainingArguments.
trainer = Trainer(
    model=model,
    # `tokenizer=` is deprecated in Trainer.__init__ (it emits a FutureWarning);
    # `processing_class=` is the replacement keyword.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Evaluate on test dataset
results = trainer.evaluate(test_dataset)

print(f"Test Accuracy : {results['eval_accuracy']:.4f}")
print(f"Test Loss : {results['eval_loss']:.4f}")


  trainer = Trainer(


Test Accuracy : 0.9141
Test Loss : 0.3739


#### Summary of Results

-   Fine-tuned BERT outperforms all other models in test accuracy, achieving 91.4%, which is about 3% higher than the LSTM-based models and the custom transformer.

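-   Test loss for BERT is higher (0.3739) than for the custom transformer and LSTM models. Cross-entropy loss penalizes confident wrong predictions heavily, so a higher loss together with a higher accuracy suggests that BERT is over-confident on the examples it misclassifies (i.e. it is less well calibrated), not that it is less accurate. This is consistent with the validation loss rising from 0.27 to 0.38 by epoch 3 while validation accuracy stayed at about 91%.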

-   Despite a higher loss, BERT generalizes better, making more correct predictions overall.

Overall, the fine-tuned BERT model achieved the highest test accuracy (91.4%) compared to all other models, outperforming both the LSTM-based and custom Transformer models by a margin of about 3%. Despite its slightly higher test loss, BERT’s superior accuracy can be attributed to its deep contextual understanding of language, learned from massive pretraining on large text corpora. This enables BERT to generalize better on unseen data, making it overall the most effective model for the sentiment classification task.