## Import

In [None]:
import pandas as pd

# Read the IMDB review dataset from the CSV file next to this notebook.
df = pd.read_csv(filepath_or_buffer='IMDB dataset.csv')
# Preview the first five rows to sanity-check the load.
df.head(n=5)

## Cleaning

In [3]:
# Build the cleaned text from the raw `review` column in a single chained
# assignment, so re-running this cell always yields the same result.
df['review_cleaned'] = (
    df['review']
    # Replace HTML line breaks with a space (not an empty string) so the words
    # on either side of a break are not fused together,
    # e.g. "great movie<br /><br />The plot" must not become "great movieThe plot".
    .str.replace('<br />', ' ', regex=False)
    # Collapse every run of whitespace (including the spaces just inserted)
    # into a single space.
    .str.replace(r'\s+', ' ', regex=True)
    # Remove leading/trailing whitespace left over from the substitutions.
    .str.strip()
)

## Encode sentiment

In [None]:
# Binary target: 'negative' -> 0, any other label -> 1.
# The vectorised comparison yields booleans, which are cast to 64-bit integers.
df['sentiment_encoded'] = df['sentiment'].ne('negative').astype('int64')
df.head()

## Tokenize data

With the data prepared, the reviews now need to be tokenized before they can be passed to BERT. This is done with the `BertTokenizer` from the Hugging Face `transformers` library.

We use the `bert-base-uncased` tokenizer, which ignores the casing of the reviews. This may or may not be appropriate: casing can carry 'implied' meaning, but internet users are also historically notorious for poor casing in general.

In [9]:
from transformers import BertTokenizer
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# # Example tokenization
# sample_sentence = 'I liked this movie'
# token_ids = tokenizer.encode(sample_sentence, return_tensors='np')[0]
# print(f'Token IDs: {token_ids}')
# # Convert the token IDs back to tokens to reveal the special tokens added
# tokens = tokenizer.convert_ids_to_tokens(token_ids)
# print(f'Tokens   : {tokens}')

# Per-review tensors are collected in separate lists so that `token_ids` and
# `attention_masks` only ever refer to the final stacked tensors.
token_id_rows = []
attention_mask_rows = []

for review in df['review_cleaned']:
  # Every review is padded or truncated to exactly 512 tokens. Short reviews
  # therefore contain many padding tokens; the attention mask returned by the
  # tokenizer marks them so the model does not attend to them.
  # The tokenizer is called directly: `encode_plus` is deprecated in favour of
  # `__call__`, which takes the same arguments and returns the same fields.
  batch_encoder = tokenizer(
    review,
    max_length = 512,
    padding = 'max_length',
    truncation = True,
    return_tensors = 'pt'
  )

  token_id_rows.append(batch_encoder['input_ids'])
  attention_mask_rows.append(batch_encoder['attention_mask'])

# Concatenate the per-review (1, 512) PyTorch tensors along the first
# dimension into (num_reviews, 512) tensors.
token_ids = torch.cat(token_id_rows, dim=0)
attention_masks = torch.cat(attention_mask_rows, dim=0)



## Preparing training & evaluation datasets

See [3.3 - Create the Train and Validation Dataloaders](https://towardsdatascience.com/a-complete-guide-to-bert-with-code-9f87602e4a11/).

In [12]:
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

# Fraction of the reviews held out for validation.
val_size = 0.1

# Target tensor built from the encoded sentiment column.
labels = torch.tensor(df['sentiment_encoded'].values)

# Split token IDs, attention masks and labels in a single call so all three
# are cut at the same row boundary. With `shuffle=False` the validation set is
# simply the trailing `val_size` fraction of the rows.
(
  train_ids, val_ids,
  train_masks, val_masks,
  train_labels, val_labels,
) = train_test_split(
  token_ids,
  attention_masks,
  labels,
  test_size=val_size,
  shuffle=False
)

# Bundle the aligned tensors into datasets.
train_data = TensorDataset(train_ids, train_masks, train_labels)
val_data = TensorDataset(val_ids, val_masks, val_labels)

# Only the training batches are shuffled; validation keeps the original order.
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=16)
val_dataloader = DataLoader(val_data, batch_size=16)

In [None]:
from transformers import BertForSequenceClassification
from torch.optim import AdamW
import torch.nn as nn
from transformers import get_linear_schedule_with_warmup

EPOCHS = 2
# Fine-tuning a pretrained BERT needs a small learning rate (commonly in the
# 2e-5 to 5e-5 range). AdamW's default of 1e-3 is far too large and tends to
# destroy the pretrained weights.
LEARNING_RATE = 2e-5

# Pick the device first so the model can be placed on it before training.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print("Using Device: ", device)

model = BertForSequenceClassification.from_pretrained(
  'bert-base-uncased',
  num_labels=2 # positive or negative
)
# The training loop moves every batch to `device`; the model must live on the
# same device, otherwise the forward pass fails with a device mismatch on GPU.
model.to(device)

# The optimizer updates the weights and biases to move them towards an optimum.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# Language models typically use the cross entropy loss function.
# Note: when `labels` are passed to the model it computes and returns its own
# loss, so this object is only needed where the loss is computed manually.
loss_function = nn.CrossEntropyLoss()

# The scheduler controls the learning rate, i.e. the size of the changes made
# to the weights and biases. Larger steps early on help the (pretrained) model
# adapt to the task; as training progresses the steps should shrink towards
# convergence. The linear schedule decays the learning rate from its initial
# value down to 0 over `num_training_steps`, here without a warmup phase.
# The scheduler is stepped once per batch, hence epochs * batches per epoch.
num_training_steps = EPOCHS * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=num_training_steps
)

## Training

In [None]:
# Fine-tune the model for EPOCHS passes over the training dataloader.
# NOTE(review): assumes `model` has already been moved to `device`, since every
# batch is moved there below — verify before running on GPU.
for epoch in range(0, EPOCHS):

    # Switch the model to training mode and reset the running loss for this epoch.
    model.train()
    training_loss = 0

    for batch in train_dataloader:

        # Each batch is a (token IDs, attention mask, labels) triple, in the
        # order the tensors were given to the training TensorDataset.
        batch_token_ids = batch[0].to(device)
        batch_attention_mask = batch[1].to(device)
        batch_labels = batch[2].to(device)

        # Clear the gradients left over from the previous batch.
        model.zero_grad()

        # Because `labels` are supplied, the model returns the loss together
        # with the logits; `return_dict=False` makes it return a plain tuple,
        # which is unpacked here.
        loss, logits = model(
            batch_token_ids,
            token_type_ids = None,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
            return_dict=False)

        # Accumulate the scalar loss, then backpropagate.
        training_loss += loss.item()
        loss.backward()
        # Clip the total gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update the parameters, then advance the learning-rate schedule
        # (the scheduler is stepped once per batch, not once per epoch).
        optimizer.step()
        scheduler.step()

    # Mean loss per batch over this epoch.
    average_train_loss = training_loss / len(train_dataloader)