# Fine-tuning the BERT Model

In [32]:
# !pip install -r requirements.txt

## Import Libraries

In [34]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import TrainerCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from datasets import Dataset
import matplotlib.pyplot as plt
import sys
import os
# Add the src directory to the system path
sys.path.append(os.path.abspath('../src'))
from config import MODEL_NAME, RANDOM_SEED, BATCH_SIZE, LEARNING_RATE, EPOCHS, MODEL_SAVE_DIR


## Load the Processed Dataset

In [35]:
# Load the processed dataset; later cells read its 'Review' and 'Liked' columns
df = pd.read_csv('../data/processed/processed_data.csv')

In [36]:
# Shuffle the rows using the fixed seed from config (RANDOM_SEED) and reset the index
# NOTE(review): this overwrites `df`, so re-running the cell without reloading
# the CSV shuffles already-shuffled rows and may produce a different split.
df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# Stratified split on 'Liked': hold out 20% of the rows, then split that
# hold-out in half, giving train / validation / test subsets of 80% / 10% / 10%
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df['Liked'], random_state=RANDOM_SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['Liked'], random_state=RANDOM_SEED)


In [37]:
# Expose the target under the name 'labels', the key that tokenize_function and
# set_format read later. `assign` returns a new frame instead of writing into
# the objects returned by train_test_split, which avoids pandas'
# SettingWithCopyWarning and keeps this cell safe to re-run.
train_df = train_df.assign(labels=train_df['Liked'])
val_df = val_df.assign(labels=val_df['Liked'])
test_df = test_df.assign(labels=test_df['Liked'])

## Construct HF Datasets

In [38]:
# Wrap each pandas split in a Hugging Face Dataset (train, validation, test order)
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_df) for split_df in (train_df, val_df, test_df)
)

In [None]:
# Display the training Dataset's summary
train_dataset

In [None]:
# Display the test Dataset's summary
test_dataset

In [None]:
# Display the validation Dataset's summary
val_dataset

## Tokenizing the `Review` from the Dataset

In [None]:
# Load the pretrained tokenizer and a BERT sequence classifier, both identified
# by MODEL_NAME from config; num_labels=2 gives the classifier two output classes
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

In [43]:
# Tokenize helper function
def tokenize_function(examples):
    """Tokenize a batch of reviews with the notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping as passed by ``Dataset.map(..., batched=True)``.
            It must contain a 'Review' entry and may contain a 'labels' entry.

    Returns:
        The tokenizer output for the batch, with 'labels' copied over when the
        batch provides them.
    """
    # Pad to a fixed maximum length and truncate longer reviews
    tokenized = tokenizer(examples['Review'], padding='max_length', truncation=True)
    # Carry the labels along only when the batch has them, so the helper also
    # works on unlabeled data instead of raising KeyError
    if 'labels' in examples:
        tokenized['labels'] = examples['labels']
    return tokenized

In [None]:
# Run the batched tokenizer helper over the train, validation and test splits
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

In [45]:
# Return PyTorch tensors and expose only the columns the model consumes
for split in (train_dataset, val_dataset, test_dataset):
    split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


## Training the `bert-base-uncased` Model

### Training Arguments with Version Control for Checkpoints

In [46]:
# Find the first "<MODEL_NAME>_v<N>" directory under MODEL_SAVE_DIR that does
# not exist yet, so a new run never overwrites an earlier one
version = 1
while True:
    model_path = os.path.join(MODEL_SAVE_DIR, f"{MODEL_NAME}_v{version}")
    if not os.path.exists(model_path):
        break
    version += 1

By default, Hugging Face's Trainer uses the AdamW optimizer and cross-entropy loss for classification tasks. Here we keep those defaults, but they can be overridden later with custom functions.

In [None]:
# Training configuration; tunable hyperparameters are imported from config.
# Optional knobs left at library defaults: gradient_accumulation_steps, lr_scheduler_type.
# NOTE(review): newer transformers releases appear to rename `evaluation_strategy`
# to `eval_strategy` and `no_cuda` to `use_cpu` — confirm against the pinned version.
training_args = TrainingArguments(
    # Output locations
    output_dir=model_path,
    logging_dir=f"{model_path}/logs",
    # Evaluate, checkpoint and log once per epoch, then restore the best checkpoint
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation settings
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Reproducibility and device selection (CPU only)
    seed=RANDOM_SEED,
    no_cuda=True,
)

In [48]:
class LossLoggerCallback(TrainerCallback):
    """Trainer callback that collects training and evaluation losses from log events."""

    def __init__(self):
        # Histories grow by one entry for each log event carrying the matching key
        self.train_losses = []
        self.eval_losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Append ``logs["loss"]`` and ``logs["eval_loss"]`` to their histories when present."""
        if logs is None:
            return
        for key, history in (("loss", self.train_losses), ("eval_loss", self.eval_losses)):
            if key in logs:
                history.append(logs[key])

In [49]:
def save_losses_and_plot(train_losses, eval_losses, output_dir=".", filename="losses.txt"):
    """Write the loss history to a tab-separated file and plot both curves.

    Args:
        train_losses: Sequence of training losses, one entry per table row.
        eval_losses: Sequence of validation losses, one entry per table row.
        output_dir: Directory receiving the outputs; created if it is missing.
        filename: Name of the text file written inside ``output_dir``.

    Side effects:
        Writes ``filename`` and "loss_plot.png" into ``output_dir`` and shows
        the figure.
    """
    from itertools import zip_longest

    # The directory may not exist yet (e.g. when training was skipped)
    os.makedirs(output_dir, exist_ok=True)

    # Save losses to a text file
    with open(os.path.join(output_dir, filename), "w", encoding="utf-8") as f:
        f.write("Epoch\tTraining Loss\tValidation Loss\n")
        # zip_longest keeps rows where only one of the two losses was recorded;
        # the missing value is written as an empty field instead of dropping the row
        paired = zip_longest(train_losses, eval_losses, fillvalue="")
        for epoch, (train_loss, eval_loss) in enumerate(paired, start=1):
            f.write(f"{epoch}\t{train_loss}\t{eval_loss}\n")

    # Plot the losses on an explicit figure/axes pair
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(range(1, len(train_losses) + 1), train_losses, label="Training Loss", marker='o')
    ax.plot(range(1, len(eval_losses) + 1), eval_losses, label="Validation Loss", marker='o')
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Loss")
    ax.set_title("Training and Validation Loss")
    ax.legend()
    ax.grid()
    fig.savefig(os.path.join(output_dir, "loss_plot.png"))
    plt.show()

In [50]:
# Callback instance that will collect the losses during training
loss_logger = LossLoggerCallback()

### Model Training

In [None]:
# Trainer setup: train `model` on the training split, evaluate on the
# validation split, and record losses through the loss_logger callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[loss_logger]
)

In [None]:
# Train the model
trainer.train()

# Save the final model and tokenizer into the versioned run directory so both
# can later be reloaded from the same path
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
print(f"Model saved to {model_path}")

In [None]:
# Write the collected losses and the loss plot next to the saved model
save_losses_and_plot(loss_logger.train_losses, loss_logger.eval_losses, output_dir=model_path)

Simply load the model checkpoint for testing.

In [22]:
# Load the fine-tuned model and tokenizer
# NOTE(review): this hard-coded path replaces the `model_path` computed in the
# training section, so the evaluation below always uses the v1 checkpoint even
# if a later version was just trained — confirm this is intended.
model_path = '../data/models/bert-base-uncased_v1'
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

In [None]:
# Prediction-only Trainer: no training arguments are passed, so library defaults apply
trainer = Trainer(model=model)

# Run the test split through the model and pick the highest-scoring class per row
predictions = trainer.predict(test_dataset)
predicted_labels = torch.tensor(predictions.predictions).argmax(dim=1)

In [None]:
# Attach the predictions as a new 'Predicted' column. `assign` builds a new
# frame rather than writing into the split returned by train_test_split, which
# avoids pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
test_df = test_df.assign(Predicted=predicted_labels.numpy())

# Display the first few rows to compare actual vs. predicted
print(test_df[['Review', 'Liked', 'Predicted']].head())

In [None]:
# Score the predictions against the true 'Liked' labels
accuracy = accuracy_score(test_df['Liked'], test_df['Predicted'])
conf_matrix = confusion_matrix(test_df['Liked'], test_df['Predicted'])

# Report the confusion matrix first, then the accuracy
print("Confusion Matrix:", conf_matrix, sep="\n")
print("\nAccuracy:", accuracy)

In [None]:
# Keep only the rows where the predicted label differs from the true label
is_wrong = test_df['Liked'].ne(test_df['Predicted'])
incorrect_predictions = test_df.loc[is_wrong]

# Display the incorrect predictions
print(incorrect_predictions)

In [None]:
# Number of misclassified test rows
len(incorrect_predictions)

If you don't have CUDA installed on your machine, use the following approach to run the model on the CPU.

In [28]:
# Keep the model on the CPU
model = model.to("cpu")

# Minimal arguments for evaluation only: output location, eval batch size, and
# CUDA disabled; every other option is left at the library default
training_args = TrainingArguments(
    output_dir=model_path,
    per_device_eval_batch_size=BATCH_SIZE,
    no_cuda=True,
)

# Trainer instance that uses the CPU-only settings above
trainer = Trainer(model=model, args=training_args)


In [None]:
# Perform prediction on the test dataset
predictions = trainer.predict(test_dataset)

# Extract the predicted labels (index of the highest score in each row)
predicted_labels = torch.argmax(torch.tensor(predictions.predictions), axis=1)

# Attach the predictions as a new 'Predicted' column. `assign` builds a new
# frame rather than writing into the split returned by train_test_split, which
# avoids pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
test_df = test_df.assign(Predicted=predicted_labels.numpy())

# Display the first few rows to compare actual vs. predicted
print(test_df[['Review', 'Liked', 'Predicted']].head())

In [None]:
# Score the predictions against the true 'Liked' labels
accuracy = accuracy_score(test_df['Liked'], test_df['Predicted'])
conf_matrix = confusion_matrix(test_df['Liked'], test_df['Predicted'])

# Report the confusion matrix first, then the accuracy
print("Confusion Matrix:", conf_matrix, sep="\n")
print("\nAccuracy:", accuracy)

In [None]:
# Keep only the rows where the predicted label differs from the true label
is_wrong = test_df['Liked'].ne(test_df['Predicted'])
incorrect_predictions = test_df.loc[is_wrong]

# Display the incorrect predictions, followed by how many there are
print(incorrect_predictions)
print(len(incorrect_predictions))