## Import

In [None]:
import pandas as pd

# Read the IMDB review dataset from the working directory and preview the
# first rows to confirm the columns loaded as expected.
csv_path = 'IMDB dataset.csv'
df = pd.read_csv(csv_path)
df.head()

## Cleaning

In [5]:
# The raw reviews contain HTML line breaks. Swap each one for a space rather
# than deleting it: the tags frequently sit directly between two sentences
# ("...great.<br /><br />The plot..."), so removing them outright would glue
# the neighbouring words together before tokenization.
df['review_cleaned'] = df['review'].str.replace('<br />', ' ', regex=False)
# Collapse every run of whitespace (including the spaces introduced above)
# into a single space and trim the ends.
df['review_cleaned'] = (
    df['review_cleaned']
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

## Encode sentiment

In [None]:
# Map the textual labels onto integer class ids (negative -> 0, positive -> 1).
# An explicit mapping is used so that an unexpected or missing label is
# reported instead of being silently counted as positive.
label_to_id = {'negative': 0, 'positive': 1}
sentiment_ids = df['sentiment'].map(label_to_id)

if sentiment_ids.isna().any():
    unknown_labels = df.loc[sentiment_ids.isna(), 'sentiment'].unique().tolist()
    raise ValueError(f'Unexpected sentiment labels: {unknown_labels}')

# `map` yields floats whenever a value is unmapped; after the check above every
# row is mapped, so cast back to plain integers.
df['sentiment_encoded'] = sentiment_ids.astype('int64')
df.head()

## Tokenize data

With the data prepared it needs to be tokenized in preparation for BERT. This uses the `BertTokenizer` from Hugging Face `transformers` to do so.

We use the `bert-base-uncased` tokenizer, which ignores the casing of the reviews. This may or may not be appropriate: casing can carry 'implied' meaning, but internet users are also historically notorious for poor casing in general.

In [None]:
from transformers import BertTokenizer
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# # Example tokenization
# sample_sentence = 'I liked this movie'
# token_ids = tokenizer.encode(sample_sentence, return_tensors='np')[0]
# print(f'Token IDs: {token_ids}')
# # Convert the token IDs back to tokens to reveal the special tokens added
# tokens = tokenizer.convert_ids_to_tokens(token_ids)
# print(f'Tokens   : {tokens}')

def _encode_review(text):
  """Tokenize one review into fixed-length BERT inputs.

  The review is truncated / padded to 512 tokens. Short reviews therefore
  end in many padding tokens, so the attention mask is returned alongside
  the ids to let the model ignore that padding.

  Returns:
      (input_ids, attention_mask): PyTorch tensors for this single review.
  """
  encoded = tokenizer.encode_plus(
    text,
    max_length=512,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
  )
  return encoded['input_ids'], encoded['attention_mask']

# Encode every cleaned review, collecting the per-review tensors.
id_rows, mask_rows = [], []
for review_text in df['review_cleaned']:
  ids, mask = _encode_review(review_text)
  id_rows.append(ids)
  mask_rows.append(mask)

# Join the per-review tensors along the first dimension into one PyTorch
# tensor each, with one row per review.
token_ids = torch.cat(id_rows, dim=0)
attention_masks = torch.cat(mask_rows, dim=0)

## Preparing training & evaluation datasets

See [3.3 - Create the Train and Validation Dataloaders](https://towardsdatascience.com/a-complete-guide-to-bert-with-code-9f87602e4a11/).

In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

# Fraction of the reviews held out for validation.
val_size = 0.1

labels = torch.tensor(df['sentiment_encoded'].values)

# Split ids, masks and labels in a single call so that the three stay
# row-aligned: each review's tokens, mask and label land in the same subset.
# The split is shuffled with a fixed seed so it is reproducible.
train_ids, val_ids, train_masks, val_masks, train_labels, val_labels = train_test_split(
    token_ids,
    attention_masks,
    labels,
    test_size=val_size,
    shuffle=True,
    random_state=42
)

train_data = TensorDataset(train_ids, train_masks, train_labels)
val_data   = TensorDataset(val_ids,   val_masks,   val_labels)

# Training batches are reshuffled on every pass; validation order is kept fixed.
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=16)
val_dataloader   = DataLoader(val_data,   batch_size=16)

In [None]:
from transformers import BertForSequenceClassification
import torch
from torch.optim import AdamW
import torch.nn as nn
from transformers import get_linear_schedule_with_warmup

# Prefer the first CUDA GPU when one is available; otherwise run on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Pretrained BERT with a two-class classification head, moved to the device.
model = BertForSequenceClassification.from_pretrained(
  'bert-base-uncased',
  num_labels=2 # positive or negative
).to(device)

print("Using Device: ", device)

EPOCHS = 2

# AdamW updates the weights and biases to drive the loss towards a minimum.
optimizer = AdamW(model.parameters(), lr=2e-5)
# Cross entropy is the usual loss function for classification with language
# models.
loss_function = nn.CrossEntropyLoss()

# The scheduler controls how large each update is over the course of
# training: larger changes early on, progressively smaller ones as training
# approaches convergence. It is configured with no warm-up steps and spans
# one scheduler step per training batch across all epochs.
num_training_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=num_training_steps
)

## Training

Training on an Apple M4 chip's CPU is horrendously slow (no surprise!). It takes X on a T4 GPU.

In [None]:
from tqdm import tqdm

# The number of batches per epoch does not change, so compute it once.
num_batches = len(train_dataloader)

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch} — total batches: {num_batches}")

    # Put the model in training mode and reset the running loss for this epoch.
    model.train()
    training_loss = 0

    progress_bar = tqdm(
        enumerate(train_dataloader),
        total=num_batches,
        desc=f"Epoch {epoch}",
        position=0,
        leave=True,
    )

    for step, batch in progress_bar:
        # Each batch holds (token ids, attention mask, labels); move all of
        # them onto the same device as the model.
        batch_token_ids, batch_attention_mask, batch_labels = (
            tensor.to(device) for tensor in batch
        )

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Passing labels makes the model return the loss with the logits.
        loss, logits = model(
            batch_token_ids,
            token_type_ids=None,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
            return_dict=False,
        )

        step_loss = loss.item()
        training_loss += step_loss

        # Backpropagate, cap the gradient norm at 1.0, then advance both the
        # optimizer and the learning-rate schedule.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        # Show the current batch loss on the progress bar.
        progress_bar.set_postfix(loss=step_loss)

    average_train_loss = training_loss / num_batches
    print(f"Average training loss: {average_train_loss:.4f}")

## Save model

In [None]:
import shutil

# Write the fine-tuned model and the tokenizer into the same directory.
output_dir = "movie_sentiment_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

# Bundle that directory into a ZIP archive for easier sharing.
shutil.make_archive(base_name=output_dir, format='zip', root_dir=output_dir)
print("Saved model and tokenizer to 'movie_sentiment_model/' and zipped as 'movie_sentiment_model.zip'")

## Evaluation

In [None]:
from tqdm import tqdm
import numpy as np
from transformers import BertForSequenceClassification
from transformers import BertTokenizer

# Optional: reload a previously saved model and tokenizer instead of using the
# ones already in memory. The paths below point at a mounted Google Drive
# folder (presumably from a Colab session; adjust to your own location).
# While this stays commented out, the rest of this cell uses the existing
# `model`, `tokenizer` and `device` variables.
# print("Loading model from drive...");

# model = BertForSequenceClassification.from_pretrained("/content/drive/MyDrive/movie_sentiment_model")
# tokenizer = BertTokenizer.from_pretrained("/content/drive/MyDrive/movie_sentiment_model")
# model.to(device)

print("Model ready")

def calculate_accuracy(preds, labels):
    """ Return the fraction of predictions that match the true labels.

    Parameters:
        preds (np.array): Per-class scores with one row per sample; the
            predicted class is the index of the largest score in each row.
        labels (np.array): The true class ids.

    Returns:
        accuracy (float): Number of correct predictions divided by the
            number of labels, i.e. a value between 0 and 1.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    num_correct = np.sum(predicted_classes == true_classes)

    return num_correct / len(true_classes)

# Evaluation mode switches off dropout so predictions are deterministic.
model.eval()

# Running totals, weighted by the number of samples in each batch. The final
# batch can be smaller than the others, so averaging the per-batch figures
# with equal weight would not give the true average over the validation set.
val_loss = 0
val_accuracy = 0
num_samples = 0

total_batches = len(val_dataloader)
print(f"\nValidation — total batches: {total_batches}")

progress_bar = tqdm(
    enumerate(val_dataloader),
    total=total_batches,
    desc="Validating",
    position=0,
    leave=True
)

for idx, batch in progress_bar:

    batch_token_ids = batch[0].to(device)
    batch_attention_mask = batch[1].to(device)
    batch_labels = batch[2].to(device)

    # No gradients are needed for evaluation.
    with torch.no_grad():
        loss, logits = model(
            batch_token_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
            token_type_ids=None,
            return_dict=False
        )

    logits = logits.detach().cpu().numpy()
    label_ids = batch_labels.cpu().numpy()

    # Compute each per-batch figure once and reuse it for both the running
    # totals and the progress bar.
    batch_size = len(label_ids)
    batch_loss = loss.item()  # presumably the mean loss over the batch
    batch_accuracy = calculate_accuracy(logits, label_ids)

    val_loss += batch_loss * batch_size
    val_accuracy += batch_accuracy * batch_size
    num_samples += batch_size

    # Update tqdm bar
    progress_bar.set_postfix({
        "loss": f"{batch_loss:.4f}",
        "acc": f"{batch_accuracy:.4f}"
    })

# Per-sample averages over the whole validation set.
average_val_loss = val_loss / num_samples
average_val_accuracy = val_accuracy / num_samples

print(f"\nAverage validation loss: {average_val_loss:.4f}")
print(f"Average validation accuracy: {average_val_accuracy:.4f}")

## Usage

In [None]:
import torch
import torch.nn.functional as F

review = "Movie review goes here"

# Make sure dropout is disabled before predicting. Without this, running the
# cell straight after training (while the model is still in training mode)
# gives noisy, non-repeatable predictions.
model.eval()

# Tokenize with the same 512-token limit that was used for the training data
# so long reviews are truncated consistently. A single sequence needs no
# padding, so none is requested.
encoded = tokenizer.encode_plus(
    review,
    add_special_tokens=True,
    max_length=512,
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)

input_ids = encoded['input_ids'].to(device)
attention_mask = encoded['attention_mask'].to(device)

# Inference only: skip gradient tracking.
with torch.no_grad():
    outputs = model(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=None,
        return_dict=False
    )

logits = outputs[0]              # shape: (1, num_labels)
probs = F.softmax(logits, dim=1) # convert logits → probabilities

# The predicted class is the most probable one; its probability is reported
# as the confidence.
pred_label = torch.argmax(probs, dim=1).item()
confidence = probs[0][pred_label].item()

# Index order matches the label encoding used for training
# (0 = negative, 1 = positive).
label_names = ["Negative", "Positive"]

print(f"Prediction: {label_names[pred_label]}")
print(f"Confidence: {confidence:.4f}")